org.apache.flink.streaming.api.datastream.DataStream.filter java code examples

/**
 * Filters this stream by delegating to {@link DataStream#filter(FilterFunction)}.
 *
 * <p>The supplied filter is first wrapped in a {@code PythonFilterFunction}; the wrapper is
 * what gets handed to the underlying stream.
 *
 * @param filter The FilterFunction that is called for each element of the DataStream.
 * @return The filtered stream, wrapped in a {@link PythonSingleOutputStreamOperator}.
 * @throws IOException declared by the signature; presumably raised while wrapping the
 *     filter (TODO confirm which call throws).
 */
public PythonSingleOutputStreamOperator filter(FilterFunction<PyObject> filter) throws IOException {
  PythonFilterFunction wrappedFilter = new PythonFilterFunction(filter);
  return new PythonSingleOutputStreamOperator(stream.filter(wrappedFilter));
}

  /**
   * Adds a small bounded pipeline to {@code env}: the sequence 1..100, a filter that rejects
   * every element, and a print sink. Each of the three operators is given the same
   * parallelism, and the filter is marked as the start of a new operator chain.
   *
   * @param env the environment the pipeline is added to
   * @param parallelism the parallelism applied to source, filter and sink
   */
  private static void addSmallBoundedJob(StreamExecutionEnvironment env, int parallelism) {
    env.generateSequence(1, 100).setParallelism(parallelism)
        // rejects everything, so no element ever reaches the print sink
        .filter(unused -> false).setParallelism(parallelism)
        .startNewChain()
        .print().setParallelism(parallelism);
  }
}

/**
 * Verifies how a filter placed after a union picks its slot sharing group: when the united
 * inputs are in different groups the filter must not inherit the explicitly named one, and
 * when both inputs share a group the filter must inherit it.
 */
@Test
public void testUnion() {
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

  // drops every element; only the operator's placement in the job graph is of interest
  FilterFunction<Long> rejectAll = new FilterFunction<Long>() {
    @Override
    public boolean filter(Long element) {
      return false;
    }
  };

  // first pipeline: inputs in different groups, so the filter should not inherit "src-1"
  DataStream<Long> ungroupedSource = env.generateSequence(1, 10);
  DataStream<Long> src1GroupSource = env.generateSequence(1, 10).slotSharingGroup("src-1");
  ungroupedSource.union(src1GroupSource).filter(rejectAll);

  // second pipeline: both inputs in "group-1", so the filter should inherit "group-1"
  DataStream<Long> firstGroupedSource = env.generateSequence(1, 10).slotSharingGroup("group-1");
  DataStream<Long> secondGroupedSource = env.generateSequence(1, 10).slotSharingGroup("group-1");
  firstGroupedSource.union(secondGroupedSource).filter(rejectAll);

  List<JobVertex> sorted =
      env.getStreamGraph().getJobGraph().getVerticesSortedTopologicallyFromSources();

  // first pipeline
  assertEquals(sorted.get(0).getSlotSharingGroup(), sorted.get(4).getSlotSharingGroup());
  assertNotEquals(sorted.get(0).getSlotSharingGroup(), sorted.get(1).getSlotSharingGroup());
  assertNotEquals(sorted.get(1).getSlotSharingGroup(), sorted.get(4).getSlotSharingGroup());

  // second pipeline
  assertEquals(sorted.get(2).getSlotSharingGroup(), sorted.get(3).getSlotSharingGroup());
  assertEquals(sorted.get(2).getSlotSharingGroup(), sorted.get(5).getSlotSharingGroup());
  assertEquals(sorted.get(3).getSlotSharingGroup(), sorted.get(5).getSlotSharingGroup());
}

/**
 * Verifies that inheritance of the inputs' slot sharing group can be switched off by
 * explicitly assigning the downstream operator to the "default" group.
 */
@Test
public void testInheritOverride() {
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

  // drops every element; only the operator's placement in the job graph is of interest
  FilterFunction<Long> rejectAll = new FilterFunction<Long>() {
    @Override
    public boolean filter(Long element) {
      return false;
    }
  };

  DataStream<Long> firstGroupedSource = env.generateSequence(1, 10).slotSharingGroup("group-1");
  DataStream<Long> secondGroupedSource = env.generateSequence(1, 10).slotSharingGroup("group-1");

  // explicitly placed in "default", so the filter should not end up in "group-1"
  firstGroupedSource.union(secondGroupedSource).filter(rejectAll).slotSharingGroup("default");

  List<JobVertex> sorted =
      env.getStreamGraph().getJobGraph().getVerticesSortedTopologicallyFromSources();

  // the first two vertices share a group; the third one is expected to differ from both
  assertEquals(sorted.get(0).getSlotSharingGroup(), sorted.get(1).getSlotSharingGroup());
  assertNotEquals(sorted.get(0).getSlotSharingGroup(), sorted.get(2).getSlotSharingGroup());
  assertNotEquals(sorted.get(1).getSlotSharingGroup(), sorted.get(2).getSlotSharingGroup());
}

.filter(new LongRichFilterFunction())

/**
 * Runs the following program.
 * <pre>
 *     [ (source)->(filter)] -> [ (map) -> (map) ] -> [ (groupBy/reduce)->(sink) ]
 * </pre>
 *
 * <p>A failure position is drawn at random from the range
 * [0.4, 0.7) * NUM_STRINGS / PARALLELISM and handed to {@code OnceFailingAggregator}.
 * Checkpointing is enabled with an interval of 200.
 *
 * @param env the environment the program is built on
 */
@Override
public void testProgram(StreamExecutionEnvironment env) {
  assertTrue("Broken test setup", NUM_STRINGS % 40 == 0);

  final long failurePosMin = (long) (0.4 * NUM_STRINGS / PARALLELISM);
  final long failurePosMax = (long) (0.7 * NUM_STRINGS / PARALLELISM);
  // nextLong() may be negative and Java's % keeps the sign of the dividend, which would let
  // the position drop below failurePosMin. floorMod always yields a value in
  // [0, failurePosMax - failurePosMin), so the result stays in [failurePosMin, failurePosMax).
  final long failurePos =
      Math.floorMod(new Random().nextLong(), failurePosMax - failurePosMin) + failurePosMin;

  env.enableCheckpointing(200);

  DataStream<String> stream = env.addSource(new StringGeneratingSourceFunction(NUM_STRINGS));
  stream
      // first vertex, chained to the source
      // this filter throttles the flow until at least one checkpoint
      // is complete, to make sure this program does not run without
      // a checkpoint (end of sentence inferred; the original comment was cut off)
      .filter(new StringRichFilterFunction())
      // -------------- second vertex - one-to-one connected ----------------
      .map(new StringPrefixCountRichMapFunction())
      .startNewChain()
      .map(new StatefulCounterFunction())
      // -------------- third vertex - reducer and the sink ----------------
      .keyBy("prefix")
      .flatMap(new OnceFailingAggregator(failurePos))
      .addSink(new ValidatingSink());
}

.filter(new FilterFunction<Integer>() {
  @Override
  public boolean filter(Integer value) throws Exception {
.filter(new FilterFunction<Tuple2<Integer, Integer>>() {

DataStream<Tuple2<Integer, Integer>> filter = map.filter(new FilterFunction<Tuple2<Integer, Integer>>() {
  @Override
  public boolean filter(Tuple2<Integer, Integer> value) throws Exception {

DataStream<Integer> increment = flatMap.filter(new FilterFunction<Integer>() {
  @Override
  public boolean filter(Integer value) throws Exception {

/**
 * Runs the following program.
 * <pre>
 *     [ (source)->(filter) ]-s->[ (map) ] -> [ (map) ] -> [ (groupBy/count)->(sink) ]
 * </pre>
 *
 * @param env the environment the program is built on
 */
@Override
public void testProgram(StreamExecutionEnvironment env) {
  // sink that accepts every record and discards it
  SinkFunction<PrefixCount> discardingSink = new SinkFunction<PrefixCount>() {
    @Override
    public void invoke(PrefixCount value) throws Exception {
      // Do nothing here
    }
  };

  env.addSource(new StringGeneratingSourceFunction(NUM_STRINGS))
      // -------------- first vertex, chained to the source ----------------
      .filter(new StringRichFilterFunction())
      .shuffle()
      // -------------- second vertex - the stateful one that also fails ----------------
      .map(new StringPrefixCountRichMapFunction())
      .startNewChain()
      .map(new StatefulCounterFunction())
      // -------------- third vertex - counter and the sink ----------------
      .keyBy("prefix")
      .map(new OnceFailingPrefixCounter(NUM_STRINGS))
      .addSink(discardingSink);
}

.filter(filterFunction);

.filter(new StringRichFilterFunction())

/**
 * Keeps only the edges of this graph stream that are accepted by the given filter.
 *
 * @param filter the filter function to apply to each edge.
 * @return a new {@code SimpleEdgeStream} over the remaining edges, sharing this stream's context.
 */
@Override
public SimpleEdgeStream<K, EV> filterEdges(FilterFunction<Edge<K, EV>> filter) {
  return new SimpleEdgeStream<>(this.edges.filter(filter), this.context);
}

/**
 * Applies a vertex filter to this graph stream.
 * Because this is an edge-only stream, the vertex filter can only access the key of vertices.
 * The filter is wrapped in an {@code ApplyVertexFilterToEdges}, which is then run over the edges.
 *
 * @param filter the filter function to apply.
 * @return a new {@code SimpleEdgeStream} over the remaining edges, sharing this stream's context.
 */
@Override
public SimpleEdgeStream<K, EV> filterVertices(FilterFunction<Vertex<K, NullValue>> filter) {
  ApplyVertexFilterToEdges<K, EV> edgeLevelFilter = new ApplyVertexFilterToEdges<K, EV>(filter);
  DataStream<Edge<K, EV>> keptEdges = this.edges.filter(edgeLevelFilter);
  return new SimpleEdgeStream<>(keptEdges, this.context);
}

/**
 * Entry point of the ride cleansing job: reads taxi rides from a {@code TaxiRideSource},
 * passes them through {@code NYCFilter}, prints the result and executes the pipeline.
 *
 * @param args command line arguments; an "input" parameter, when present, replaces
 *     {@code ExerciseBase.pathToRideData}
 * @throws Exception declared by the signature; failures in the body propagate to the caller
 */
public static void main(String[] args) throws Exception {
  final int maxEventDelay = 60;       // events are out of order by max 60 seconds
  final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

  final String input = ParameterTool.fromArgs(args).get("input", ExerciseBase.pathToRideData);

  // streaming environment: event time, parallelism taken from ExerciseBase
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
  env.setParallelism(ExerciseBase.parallelism);

  // data generator
  TaxiRideSource rideSource = new TaxiRideSource(input, maxEventDelay, servingSpeedFactor);
  DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(rideSource));

  // filter out rides that do not start or stop in NYC, then print what is left
  DataStream<TaxiRide> filteredRides = rides.filter(new NYCFilter());
  printOrTest(filteredRides);

  // run the cleansing pipeline
  env.execute("Taxi Ride Cleansing");
}

/**
 * Builds and runs the "Taxi Ride Cleansing" pipeline: taxi rides are generated by a
 * {@code TaxiRideSource}, filtered with {@code NYCFilter} and printed.
 *
 * @param args command line arguments; an "input" parameter, when present, replaces
 *     {@code pathToRideData}
 * @throws Exception declared by the signature; failures in the body propagate to the caller
 */
public static void main(String[] args) throws Exception {
  ParameterTool parameters = ParameterTool.fromArgs(args);
  final String input = parameters.get("input", pathToRideData);

  final int maxEventDelay = 60;       // events are out of order by max 60 seconds
  final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

  // set up streaming execution environment
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
  env.setParallelism(ExerciseBase.parallelism);

  // start the data generator and
  // keep only those rides that both start and end in NYC
  DataStream<TaxiRide> filteredRides = env
      .addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor)))
      .filter(new NYCFilter());

  // print the filtered stream
  printOrTest(filteredRides);

  // run the cleansing pipeline
  env.execute("Taxi Ride Cleansing");
}

/**
 * Builds and runs the "Taxi Ride Cleansing" pipeline with a Kafka sink: taxi rides are
 * generated by a {@code TaxiRideSource}, filtered with {@code NYCFilter} and written to
 * {@code CLEANSED_RIDES_TOPIC} on {@code LOCAL_KAFKA_BROKER}.
 *
 * @param args command line arguments; the "input" parameter is required
 * @throws Exception declared by the signature; failures in the body propagate to the caller
 */
public static void main(String[] args) throws Exception {
  final int maxEventDelay = 60;       // events are out of order by max 60 seconds
  final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

  String input = ParameterTool.fromArgs(args).getRequired("input");

  // set up streaming execution environment
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

  // start the data generator
  TaxiRideSource rideSource = new TaxiRideSource(input, maxEventDelay, servingSpeedFactor);
  DataStream<TaxiRide> rides = env.addSource(rideSource);

  // filter out rides that do not start or stop in NYC
  DataStream<TaxiRide> filteredRides = rides.filter(new NYCFilter());

  // write the filtered data to a Kafka sink
  FlinkKafkaProducer011<TaxiRide> kafkaSink = new FlinkKafkaProducer011<TaxiRide>(
      LOCAL_KAFKA_BROKER,
      CLEANSED_RIDES_TOPIC,
      new TaxiRideSchema());
  filteredRides.addSink(kafkaSink);

  // run the cleansing pipeline
  env.execute("Taxi Ride Cleansing");
}

/**
 * Builds and runs the "Popular Places" pipeline as far as it is implemented here: taxi rides
 * are filtered with {@code NYCFilter}, mapped with {@code GridCellMatcher} and printed.
 *
 * @param args command line arguments; "input" (default {@code ExerciseBase.pathToRideData})
 *     and "threshold" (default 20) are read
 * @throws Exception declared by the signature; failures in the body propagate to the caller
 */
public static void main(String[] args) throws Exception {
  ParameterTool parameters = ParameterTool.fromArgs(args);
  final String input = parameters.get("input", ExerciseBase.pathToRideData);
  // parsed from the arguments but not used anywhere else in this method
  final int popThreshold = parameters.getInt("threshold", 20);

  final int maxEventDelay = 60;       // events are out of order by max 60 seconds
  final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

  // set up streaming execution environment
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
  env.setParallelism(ExerciseBase.parallelism);

  // start the data generator
  TaxiRideSource rideSource = new TaxiRideSource(input, maxEventDelay, servingSpeedFactor);
  DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(rideSource));

  // find n most popular spots
  DataStream<?> popularPlaces = rides
      // remove all rides which are not within NYC
      .filter(new NYCFilter())
      // match ride to grid cell and event type (start or end)
      .map(new GridCellMatcher());

  printOrTest(popularPlaces);

  env.execute("Popular Places");
}

/**
 * Builds and runs the "Long Taxi Rides (checkpointed)" pipeline: rides from a
 * {@code CheckpointedTaxiRideSource} are filtered with {@code NYCFilter}, keyed by ride id,
 * processed by {@code MatchFunction} and printed. Checkpointing, a file system state backend
 * and a fixed-delay restart strategy are configured on the environment.
 *
 * @param args command line arguments; an "input" parameter, when present, replaces
 *     {@code ExerciseBase.pathToRideData}
 * @throws Exception declared by the signature; failures in the body propagate to the caller
 */
public static void main(String[] args) throws Exception {
  final int servingSpeedFactor = 1800; // 30 minutes worth of events are served every second

  final String input = ParameterTool.fromArgs(args).get("input", ExerciseBase.pathToRideData);

  // set up streaming execution environment
  StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
  env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
  env.setParallelism(ExerciseBase.parallelism);

  // set up checkpointing: state backend, checkpoint interval, restart strategy
  env.setStateBackend(new FsStateBackend("file:///tmp/checkpoints"));
  env.enableCheckpointing(1000);
  env.setRestartStrategy(RestartStrategies.fixedDelayRestart(60, Time.of(10, TimeUnit.SECONDS)));

  CheckpointedTaxiRideSource rideSource = new CheckpointedTaxiRideSource(input, servingSpeedFactor);
  DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(rideSource));

  DataStream<TaxiRide> longRides = rides
      .filter(new NYCFilter())
      // all events of one ride go to the same key
      .keyBy((TaxiRide ride) -> ride.rideId)
      .process(new MatchFunction());

  printOrTest(longRides);

  env.execute("Long Taxi Rides (checkpointed)");
}

.filter(new NYCFilter())

Javadoc

Applies a Filter transformation on a DataStream. The transformation calls a FilterFunction for each element of the DataStream and retains only those elements for which the function returns true. Elements for which the function returns false are filtered out. The user can also extend RichFilterFunction to gain access to other features provided by the org.apache.flink.api.common.functions.RichFunction interface.

Popular methods of DataStream

addSink
Adds the given sink to this DataStream. Only streams with sinks added will be executed once the Stre
keyBy
Partitions the operator state of a DataStream using field expressions. A field expression is either
map
Applies a Map transformation on a DataStream. The transformation calls a MapFunction for each elemen
flatMap
Applies a FlatMap transformation on a DataStream. The transformation calls a FlatMapFunction for eac
getType
Gets the type of the stream.
union
Creates a new DataStream by merging DataStream outputs of the same type with each other. The DataStr
print
Writes a DataStream to the standard output stream (stdout). For each element of the DataStream the re
transform
Method for passing user defined operators along with the type information that will transform the Da
getExecutionEnvironment
Returns the StreamExecutionEnvironment that was used to create this DataStream.
getTransformation
Returns the StreamTransformation that represents the operation that logically creates this DataStrea
rebalance
Sets the partitioning of the DataStream so that the output elements are distributed evenly to instan
writeAsText
Writes a DataStream to the file specified by path in text format. For every element of the DataStream

Popular in Java

Reading from database using SQL prepared statement
onCreateOptionsMenu (Activity)
putExtra (Intent)
addToBackStack (FragmentTransaction)
Pointer (com.sun.jna)
An abstraction for a native pointer data type. A Pointer instance represents, on the Java side, a na
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal. A value is represented by an arbitrary-precision "un
Socket (java.net)
Provides a client-side TCP socket.
DecimalFormat (java.text)
A concrete subclass of NumberFormat that formats decimal numbers. It has a variety of features desig
Reflections (org.reflections)
Reflections one-stop-shop object. Reflections scans your classpath, indexes the metadata, allows you t
Option (scala)
Best IntelliJ plugins

How to use filter method in org.apache.flink.streaming.api.datastream.DataStream

Best Java code snippets using org.apache.flink.streaming.api.datastream.DataStream.filter (Showing top 20 results out of 315)

How to use
filter
method
in
org.apache.flink.streaming.api.datastream.DataStream