/**
 * Filters this stream by delegating to {@link DataStream#filter(FilterFunction)}.
 *
 * <p>The supplied function is first wrapped in a {@code PythonFilterFunction}; the wrapper is
 * what gets handed to the underlying stream.
 *
 * @param filter The FilterFunction that is called for each element of the DataStream.
 * @return A {@link PythonSingleOutputStreamOperator} wrapping the filtered stream.
 * @throws IOException declared by this method; NOTE(review): presumably raised while the
 *         function is being wrapped - confirm against {@code PythonFilterFunction}.
 */
public PythonSingleOutputStreamOperator filter(FilterFunction<PyObject> filter) throws IOException {
    PythonFilterFunction wrappedFilter = new PythonFilterFunction(filter);
    return new PythonSingleOutputStreamOperator(stream.filter(wrappedFilter));
}
/**
 * Adds a small bounded pipeline to the given environment: a sequence source over 1..100,
 * followed by a filter that rejects every element (with {@code startNewChain()} called on it),
 * followed by a print sink. Every operator is given the same parallelism.
 *
 * @param env the environment the pipeline is added to
 * @param parallelism the parallelism applied to the source, the filter and the sink
 */
private static void addSmallBoundedJob(StreamExecutionEnvironment env, int parallelism) {
    env.generateSequence(1, 100).setParallelism(parallelism)
        // the predicate is constantly false, so nothing passes the filter
        .filter(unused -> false).setParallelism(parallelism)
        .startNewChain()
        .print().setParallelism(parallelism);
}
}
@Test public void testUnion() { StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); FilterFunction<Long> dummyFilter = new FilterFunction<Long>() { @Override public boolean filter(Long value) { return false; } }; DataStream<Long> src1 = env.generateSequence(1, 10); DataStream<Long> src2 = env.generateSequence(1, 10).slotSharingGroup("src-1"); // this should not inherit group "src-1" src1.union(src2).filter(dummyFilter); DataStream<Long> src3 = env.generateSequence(1, 10).slotSharingGroup("group-1"); DataStream<Long> src4 = env.generateSequence(1, 10).slotSharingGroup("group-1"); // this should inherit "group-1" now src3.union(src4).filter(dummyFilter); JobGraph jobGraph = env.getStreamGraph().getJobGraph(); List<JobVertex> vertices = jobGraph.getVerticesSortedTopologicallyFromSources(); // first pipeline assertEquals(vertices.get(0).getSlotSharingGroup(), vertices.get(4).getSlotSharingGroup()); assertNotEquals(vertices.get(0).getSlotSharingGroup(), vertices.get(1).getSlotSharingGroup()); assertNotEquals(vertices.get(1).getSlotSharingGroup(), vertices.get(4).getSlotSharingGroup()); // second pipeline assertEquals(vertices.get(2).getSlotSharingGroup(), vertices.get(3).getSlotSharingGroup()); assertEquals(vertices.get(2).getSlotSharingGroup(), vertices.get(5).getSlotSharingGroup()); assertEquals(vertices.get(3).getSlotSharingGroup(), vertices.get(5).getSlotSharingGroup()); }
@Test public void testInheritOverride() { // verify that we can explicitly disable inheritance of the input slot sharing groups StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); FilterFunction<Long> dummyFilter = new FilterFunction<Long>() { @Override public boolean filter(Long value) { return false; } }; DataStream<Long> src1 = env.generateSequence(1, 10).slotSharingGroup("group-1"); DataStream<Long> src2 = env.generateSequence(1, 10).slotSharingGroup("group-1"); // this should not inherit group but be in "default" src1.union(src2).filter(dummyFilter).slotSharingGroup("default"); JobGraph jobGraph = env.getStreamGraph().getJobGraph(); List<JobVertex> vertices = jobGraph.getVerticesSortedTopologicallyFromSources(); assertEquals(vertices.get(0).getSlotSharingGroup(), vertices.get(1).getSlotSharingGroup()); assertNotEquals(vertices.get(0).getSlotSharingGroup(), vertices.get(2).getSlotSharingGroup()); assertNotEquals(vertices.get(1).getSlotSharingGroup(), vertices.get(2).getSlotSharingGroup()); }
.filter(new LongRichFilterFunction())
/** * Runs the following program. * <pre> * [ (source)->(filter)] -> [ (map) -> (map) ] -> [ (groupBy/reduce)->(sink) ] * </pre> */ @Override public void testProgram(StreamExecutionEnvironment env) { assertTrue("Broken test setup", NUM_STRINGS % 40 == 0); final long failurePosMin = (long) (0.4 * NUM_STRINGS / PARALLELISM); final long failurePosMax = (long) (0.7 * NUM_STRINGS / PARALLELISM); final long failurePos = (new Random().nextLong() % (failurePosMax - failurePosMin)) + failurePosMin; env.enableCheckpointing(200); DataStream<String> stream = env.addSource(new StringGeneratingSourceFunction(NUM_STRINGS)); stream // first vertex, chained to the source // this filter throttles the flow until at least one checkpoint // is complete, to make sure this program does not run without .filter(new StringRichFilterFunction()) // -------------- seconds vertex - one-to-one connected ---------------- .map(new StringPrefixCountRichMapFunction()) .startNewChain() .map(new StatefulCounterFunction()) // -------------- third vertex - reducer and the sink ---------------- .keyBy("prefix") .flatMap(new OnceFailingAggregator(failurePos)) .addSink(new ValidatingSink()); }
DataStream<Tuple2<Integer, Integer>> filter = map.filter(new FilterFunction<Tuple2<Integer, Integer>>() { @Override public boolean filter(Tuple2<Integer, Integer> value) throws Exception {
DataStream<Integer> increment = flatMap.filter(new FilterFunction<Integer>() { @Override public boolean filter(Integer value) throws Exception {
/** * Runs the following program. * <pre> * [ (source)->(filter) ]-s->[ (map) ] -> [ (map) ] -> [ (groupBy/count)->(sink) ] * </pre> */ @Override public void testProgram(StreamExecutionEnvironment env) { DataStream<String> stream = env.addSource(new StringGeneratingSourceFunction(NUM_STRINGS)); stream // -------------- first vertex, chained to the source ---------------- .filter(new StringRichFilterFunction()) .shuffle() // -------------- seconds vertex - the stateful one that also fails ---------------- .map(new StringPrefixCountRichMapFunction()) .startNewChain() .map(new StatefulCounterFunction()) // -------------- third vertex - counter and the sink ---------------- .keyBy("prefix") .map(new OnceFailingPrefixCounter(NUM_STRINGS)) .addSink(new SinkFunction<PrefixCount>() { @Override public void invoke(PrefixCount value) throws Exception { // Do nothing here } }); }
.filter(filterFunction);
.filter(new StringRichFilterFunction())
/**
 * Applies a filter to each edge in the graph stream.
 *
 * <p>The result is a new {@code SimpleEdgeStream} built from the filtered edges and this
 * stream's context.
 *
 * @param filter the filter function to apply.
 * @return the filtered graph stream.
 */
@Override
public SimpleEdgeStream<K, EV> filterEdges(FilterFunction<Edge<K, EV>> filter) {
    return new SimpleEdgeStream<>(this.edges.filter(filter), this.context);
}
/**
 * Applies a filter to each vertex in the graph stream.
 * Since this is an edge-only stream, the vertex filter can only access the key of vertices.
 *
 * <p>The vertex filter is wrapped in an {@code ApplyVertexFilterToEdges} function, which is
 * then used to filter the edge stream.
 *
 * @param filter the filter function to apply.
 * @return the filtered graph stream.
 */
@Override
public SimpleEdgeStream<K, EV> filterVertices(FilterFunction<Vertex<K, NullValue>> filter) {
    ApplyVertexFilterToEdges<K, EV> edgeLevelFilter = new ApplyVertexFilterToEdges<K, EV>(filter);
    return new SimpleEdgeStream<>(this.edges.filter(edgeLevelFilter), this.context);
}
public static void main(String[] args) throws Exception { ParameterTool params = ParameterTool.fromArgs(args); final String input = params.get("input", ExerciseBase.pathToRideData); final int maxEventDelay = 60; // events are out of order by max 60 seconds final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.setParallelism(ExerciseBase.parallelism); // start the data generator DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor))); DataStream<TaxiRide> filteredRides = rides // filter out rides that do not start or stop in NYC .filter(new NYCFilter()); // print the filtered stream printOrTest(filteredRides); // run the cleansing pipeline env.execute("Taxi Ride Cleansing"); }
public static void main(String[] args) throws Exception { ParameterTool params = ParameterTool.fromArgs(args); final String input = params.get("input", pathToRideData); final int maxEventDelay = 60; // events are out of order by max 60 seconds final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.setParallelism(ExerciseBase.parallelism); // start the data generator DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor))); DataStream<TaxiRide> filteredRides = rides // keep only those rides and both start and end in NYC .filter(new NYCFilter()); // print the filtered stream printOrTest(filteredRides); // run the cleansing pipeline env.execute("Taxi Ride Cleansing"); }
public static void main(String[] args) throws Exception { ParameterTool params = ParameterTool.fromArgs(args); String input = params.getRequired("input"); final int maxEventDelay = 60; // events are out of order by max 60 seconds final int servingSpeedFactor = 600; // events of 10 minute are served in 1 second // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); // start the data generator DataStream<TaxiRide> rides = env.addSource(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor)); DataStream<TaxiRide> filteredRides = rides // filter out rides that do not start or stop in NYC .filter(new NYCFilter()); // write the filtered data to a Kafka sink filteredRides.addSink(new FlinkKafkaProducer011<TaxiRide>( LOCAL_KAFKA_BROKER, CLEANSED_RIDES_TOPIC, new TaxiRideSchema())); // run the cleansing pipeline env.execute("Taxi Ride Cleansing"); }
public static void main(String[] args) throws Exception { ParameterTool params = ParameterTool.fromArgs(args); final String input = params.get("input", ExerciseBase.pathToRideData); final int popThreshold = params.getInt("threshold", 20); final int maxEventDelay = 60; // events are out of order by max 60 seconds final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.setParallelism(ExerciseBase.parallelism); // start the data generator DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor))); // find n most popular spots DataStream<?> popularPlaces = rides // remove all rides which are not within NYC .filter(new NYCFilter()) // match ride to grid cell and event type (start or end) .map(new GridCellMatcher()); printOrTest(popularPlaces); env.execute("Popular Places"); }
public static void main(String[] args) throws Exception { ParameterTool params = ParameterTool.fromArgs(args); final String input = params.get("input", ExerciseBase.pathToRideData); final int servingSpeedFactor = 1800; // 30 minutes worth of events are served every second // set up streaming execution environment StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); env.setParallelism(ExerciseBase.parallelism); // set up checkpointing env.setStateBackend(new FsStateBackend("file:///tmp/checkpoints")); env.enableCheckpointing(1000); env.setRestartStrategy(RestartStrategies.fixedDelayRestart(60, Time.of(10, TimeUnit.SECONDS))); DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new CheckpointedTaxiRideSource(input, servingSpeedFactor))); DataStream<TaxiRide> longRides = rides .filter(new NYCFilter()) .keyBy((TaxiRide ride) -> ride.rideId) .process(new MatchFunction()); printOrTest(longRides); env.execute("Long Taxi Rides (checkpointed)"); }
.filter(new NYCFilter())