@Override
protected Dataset<Triple<TimeInterval, Integer, Set<String>>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  // Use the pair's second element as the event timestamp.
  final Dataset<Pair<String, Integer>> timestamped =
      AssignEventTime.of(input).using(Pair::getSecond).output();
  // Key by the leading digit of the string value; collect the strings that
  // fall into the same 5 ms session window into a set.
  final Dataset<Pair<Integer, Set<String>>> sessions =
      ReduceByKey.of(timestamped)
          .keyBy(e -> e.getFirst().charAt(0) - '0')
          .valueBy(Pair::getFirst)
          .reduceBy(s -> s.collect(Collectors.toSet()))
          .windowBy(Session.of(Duration.ofMillis(5)))
          .output();
  // Attach the window each reduced element was assigned to.
  return FlatMap.of(sessions)
      .using((UnaryFunctor<Pair<Integer, Set<String>>,
              Triple<TimeInterval, Integer, Set<String>>>) (elem, context) ->
          context.collect(Triple.of(
              (TimeInterval) context.getWindow(), elem.getFirst(), elem.getSecond())))
      .output();
}
/** Verifies that a user-supplied reducer function is retained by the operator. */
@Test
public void testBuild_ReduceBy() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue avoids the deprecated Long(long) boxing constructor
      // that Long::new would invoke just to unbox again.
      .reduceBy(n -> StreamSupport.stream(n.spliterator(), false)
          .mapToLong(Long::longValue)
          .sum())
      .output();

  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertNotNull(reduce.reducer);
}
.withSortedValues(comparator::compare) .applyIf(windowing != null, b -> b.windowBy(windowing)) .outputValues();
.keyBy(Object::hashCode) .reduceBy((Stream<Cell> s, Collector<Cell> ctx) -> s.forEach(ctx::collect)) .outputValues();
.valueBy(e -> e) .reduceBy(s -> s.collect(Collectors.toMap(Pair::getFirst, Pair::getSecond))) .output();
.keyBy(e -> e) .reduceBy(values -> 1L) .output(ComputationHint.EXPENSIVE);
.valueBy(e -> 1L) .reduceBy(Sums.ofLongs()) .windowBy(TimeSliding.of(Duration.ofMillis(10), Duration.ofMillis(5))) .output();
.valueBy(e -> e) .reduceBy(s -> s.collect(Collectors.toSet())) .windowBy(Count.of(3)) .output();
.keyBy(e -> e) .reduceBy(values -> 1L) .windowBy(Time.of(Duration.ofSeconds(1))) .output(new Util.TestHint(), new Util.TestHint2());
@Test(timeout = 5000L) public void testInputMultiConsumption() throws InterruptedException, ExecutionException { final int N = 1000; Dataset<Integer> input = flow.createInput( ListDataSource.unbounded(sequenceInts(0, N))); // there seems to be bug in LocalExecutor // that makes it impossible to consume the // same dataset twice by single union operator Dataset<Integer> first = MapElements.of(input) .using(e -> e) .output(); Dataset<Integer> second = MapElements.of(input) .using(e -> e) .output(); // ~ consume the input another time Dataset<Integer> union = Union.of(first, second) .output(); Dataset<Pair<Integer, Integer>> sum = ReduceByKey .of(union) .keyBy(e -> 0) .valueBy(e -> e) .reduceBy(Sums.ofInts()) .output(); ListDataSink<Pair<Integer, Integer>> sumOut = ListDataSink.get(); sum.persist(sumOut); executor.submit(flow).get(); DatasetAssert.unorderedEquals( sumOut.getOutputs(), Pair.of(0, 2 * (N - 1) * N / 2)); }
/**
 * End-to-end check of the fluent API: window a slowly-arriving stream into
 * count-of-3 windows, flatten, then re-reduce and expect the original
 * windowing to be reconstructed downstream.
 */
@Test
public void testBasics() throws Exception {
  final Duration READ_DELAY = Duration.ofMillis(100L);
  ListDataSink<Set<String>> out = ListDataSink.get();
  Fluent.flow("Test")
      // ~ read an unbounded source with an artificial delay between elements
      .read(ListDataSource.unbounded(
          asList("0-one 1-two 0-three 1-four 0-five 1-six 0-seven".split(" ")))
          .withReadDelay(READ_DELAY))
      // ~ create windows of size three
      .apply(input -> ReduceByKey.of(input)
          .keyBy(e -> "")
          .valueBy(e -> e)
          .reduceBy(s -> s.collect(Collectors.toSet()))
          .windowBy(Count.of(3)))
      // ~ strip the needless key and flatten out the elements thereby
      // creating multiple elements in the output belonging to the same window
      .flatMap((Pair<String, Set<String>> e, Collector<String> c) ->
          e.getSecond().stream().forEachOrdered(c::collect))
      // ~ we now expect to reconstruct the same windowing
      // as the very initial step
      .apply(input -> ReduceByKey.of(input)
          .keyBy(e -> "")
          .valueBy(e -> e)
          .reduceBy(s -> s.collect(Collectors.toSet())))
      // ~ strip the needless key
      .mapElements(Pair::getSecond)
      .persist(out)
      .execute(new LocalExecutor());
}
@Override
protected Dataset<List<Pair<Integer, List<Integer>>>> getOutput(Dataset<Integer> input) {
  // First stage: split into even/odd keys and collect sorted value lists
  // per count-of-3 window.
  final Dataset<Pair<Integer, List<Integer>>> perWindow =
      ReduceByKey.of(input)
          .keyBy(e -> e % 2, Integer.class)
          .valueBy(e -> e)
          .reduceBy(s -> s.collect(Collectors.toList()))
          .withSortedValues(Integer::compare)
          .windowBy(Count.of(3))
          .output();
  // Second stage: gather all pairs into one global window, ordered by key
  // and then by the first element of each value list.
  return ReduceWindow.of(perWindow)
      .reduceBy(s -> s.collect(Collectors.toList()))
      .withSortedValues((left, right) -> {
        final int byKey = left.getFirst().compareTo(right.getFirst());
        if (byKey != 0) {
          return byKey;
        }
        return Integer.compare(left.getSecond().get(0), right.getSecond().get(0));
      })
      .windowBy(GlobalWindowing.get())
      .output();
}
@Override
protected Dataset<List<Pair<Integer, List<Integer>>>> getOutput(Dataset<Integer> input) {
  // Stage one: parity-keyed reduction producing sorted lists per count window.
  final Dataset<Pair<Integer, List<Integer>>> windowed = ReduceByKey
      .of(input)
      .keyBy(e -> e % 2, Integer.class)
      .valueBy(e -> e)
      .reduceBy(s -> s.collect(Collectors.toList()))
      .withSortedValues(Integer::compare)
      .windowBy(Count.of(3))
      .output();
  // Stage two: collapse everything into a single global window; order pairs
  // by key first, breaking ties on the leading element of the value list.
  return ReduceWindow
      .of(windowed)
      .reduceBy(s -> s.collect(Collectors.toList()))
      .withSortedValues((a, b) -> {
        int order = a.getFirst().compareTo(b.getFirst());
        return order != 0
            ? order
            : Integer.compare(a.getSecond().get(0), b.getSecond().get(0));
      })
      .windowBy(GlobalWindowing.get())
      .output();
}
/**
 * Persisting two different datasets into the same sink instance is illegal;
 * flow unfolding must reject it with {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testMultipleOutputsToSameSink() throws Exception {
  flow = Flow.create(getClass().getSimpleName());
  input = flow.createInput(new MockStreamDataSource<>());
  Dataset<Object> mapped = MapElements.of(input).using(e -> e).output();
  Dataset<Pair<Object, Long>> reduced = ReduceByKey
      .of(mapped)
      .keyBy(e -> e).reduceBy(values -> 1L)
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
  Dataset<Pair<Object, Long>> output = Join.of(mapped, reduced)
      .by(e -> e, Pair::getFirst)
      .using((Object l, Pair<Object, Long> r, Collector<Long> c) -> {
        c.collect(r.getSecond());
      })
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
  ListDataSink<Pair<Object, Long>> sink = ListDataSink.get();
  output.persist(sink);
  // second persist into the very same sink — this is what must be rejected
  reduced.persist(sink);
  FlowUnfolder.unfold(flow, Executor.getBasicOps());
}
/** Verifies operator wiring when only the reduced values are requested as output. */
@Test
public void testBuild_OutputValues() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  Dataset<Long> reduced = ReduceByKey.named("ReduceByKeyValues")
      .of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue avoids the deprecated Long(long) boxing constructor
      // that Long::new would invoke just to unbox again.
      .reduceBy(n -> StreamSupport.stream(n.spliterator(), false)
          .mapToLong(Long::longValue)
          .sum())
      .outputValues();

  assertEquals(flow, reduced.getFlow());
  // outputValues() implicitly appends a value-extracting operator → two in total
  assertEquals(2, flow.size());
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertEquals(flow, reduce.getFlow());
  assertEquals("ReduceByKeyValues", reduce.getName());
  assertNotNull(reduce.getKeyExtractor());
  assertNotNull(reduce.getValueExtractor());
  assertNotNull(reduce.getReducer());
  assertNull(reduce.getWindowing());
}
@Override
protected Dataset<Triple<TimeInterval, Integer, Set<String>>> getOutput(
    Dataset<Pair<String, Integer>> input) {
  // Event time comes from the pair's second (integer) component.
  final Dataset<Pair<String, Integer>> withTime =
      AssignEventTime.of(input).using(Pair::getSecond).output();
  // Group strings by their first digit within 5 ms session windows.
  final Dataset<Pair<Integer, Set<String>>> bySession = ReduceByKey
      .of(withTime)
      .keyBy(e -> e.getFirst().charAt(0) - '0')
      .valueBy(Pair::getFirst)
      .reduceBy(s -> s.collect(Collectors.toSet()))
      .windowBy(Session.of(Duration.ofMillis(5)))
      .output();
  // Emit each result together with the session window it landed in.
  return FlatMap.of(bySession)
      .using((UnaryFunctor<Pair<Integer, Set<String>>,
              Triple<TimeInterval, Integer, Set<String>>>) (elem, context) ->
          context.collect(Triple.of(
              (TimeInterval) context.getWindow(), elem.getFirst(), elem.getSecond())))
      .output();
}
@Override
protected Dataset<Pair<Integer, Integer>> getOutput(Dataset<Integer> input) {
  // Sum all values per parity key, bumping an "evens" or "odds" counter
  // for every element folded in.
  return ReduceByKey.of(input)
      .keyBy(e -> e % 2)
      .valueBy(e -> e)
      .reduceBy(Fold.of(0, (Integer acc, Integer value, Collector<Integer> ctx) -> {
        final String counter = value % 2 == 0 ? "evens" : "odds";
        ctx.getCounter(counter).increment();
        ctx.collect(acc + value);
      }))
      .windowBy(GlobalWindowing.get())
      .output();
}
/** Builds a map → reduce → join pipeline persisted to stdout before each test. */
@Before
public void before() throws Exception {
  flow = Flow.create(getClass().getSimpleName());
  input = flow.createInput(new MockStreamDataSource<>());

  final Dataset<Object> identity = MapElements.of(input).using(e -> e).output();
  final Dataset<Pair<Object, Long>> counts = ReduceByKey
      .of(identity)
      .keyBy(e -> e)
      .reduceBy(values -> 1L)
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
  final Dataset<Pair<Object, Long>> joined = Join
      .of(identity, counts)
      .by(e -> e, Pair::getFirst)
      .using((Object l, Pair<Object, Long> r, Collector<Long> c) ->
          c.collect(r.getSecond()))
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
  joined.persist(new StdoutSink<>());
}
@Override
protected Dataset<Pair<Integer, Integer>> getOutput(Dataset<Integer> input) {
  // Fold values into a running sum per parity key; each folded element
  // increments either the "evens" or the "odds" counter.
  return ReduceByKey
      .of(input)
      .keyBy(e -> e % 2)
      .valueBy(e -> e)
      .reduceBy(Fold.of(0, (Integer sum, Integer next, Collector<Integer> ctx) -> {
        if (next % 2 == 0) {
          ctx.getCounter("evens").increment();
        } else {
          ctx.getCounter("odds").increment();
        }
        ctx.collect(sum + next);
      }))
      .windowBy(GlobalWindowing.get())
      .output();
}
/** Verifies that applyIf(true, ...) actually installs the windowing it is given. */
@Test
public void testWindow_applyIf() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue avoids the deprecated Long(long) boxing constructor
      // that Long::new would invoke just to unbox again.
      .reduceBy(n -> StreamSupport.stream(n.spliterator(), false)
          .mapToLong(Long::longValue)
          .sum())
      .withSortedValues(Long::compare)
      .applyIf(true, b -> b.windowBy(Time.of(Duration.ofHours(1))))
      .output();

  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertTrue(reduce.getWindowing() instanceof Time);
}