/**
 * Builds the operator under test: pairs are keyed by their first element and,
 * per key, a single list holding at most the first two values handed to the
 * reducer is emitted.
 *
 * <p>{@code withSortedValues(Long::compareTo)} is configured, so the reducer
 * presumably sees the values in ascending order -- confirm against ReduceByKey.
 */
@Override
protected Dataset<Pair<String, List<Long>>> getOutput(Dataset<Pair<String, Long>> input) {
  return ReduceByKey.of(input)
      .keyBy(Pair::getFirst, String.class)
      .valueBy(Pair::getSecond)
      .reduceBy((Stream<Long> sorted, Collector<List<Long>> out) -> {
        // keep only the first two values of the stream and emit them as one list
        List<Long> firstTwo = sorted.limit(2).collect(Collectors.toList());
        out.collect(firstTwo);
      })
      .withSortedValues(Long::compareTo)
      .output();
}
}); // closes the enclosing scope opened outside this snippet
/**
 * Builds the operator under test: pairs are keyed by their first element and,
 * per key, a single list holding at most the first two values handed to the
 * reducer is emitted.
 *
 * <p>{@code withSortedValues(Long::compareTo)} is configured, so the reducer
 * presumably sees the values in ascending order -- confirm against ReduceByKey.
 */
@Override
protected Dataset<Pair<String, List<Long>>> getOutput(Dataset<Pair<String, Long>> input) {
  return ReduceByKey.of(input)
      .keyBy(Pair::getFirst, String.class)
      .valueBy(Pair::getSecond)
      .reduceBy((Stream<Long> sorted, Collector<List<Long>> out) -> {
        // keep only the first two values of the stream and emit them as one list
        List<Long> firstTwo = sorted.limit(2).collect(Collectors.toList());
        out.collect(firstTwo);
      })
      .withSortedValues(Long::compareTo)
      .output();
}
}); // closes the enclosing scope opened outside this snippet
/**
 * Builds the operator under test: pairs are keyed by their first element and,
 * per key, all values handed to the reducer are emitted as one list.
 *
 * <p>{@code withSortedValues(Long::compareTo)} is configured, so the list is
 * presumably in ascending order -- confirm against ReduceByKey.
 */
@Override
protected Dataset<Pair<String, List<Long>>> getOutput(Dataset<Pair<String, Long>> input) {
  return ReduceByKey.of(input)
      .keyBy(Pair::getFirst, String.class)
      .valueBy(Pair::getSecond)
      .reduceBy((Stream<Long> sorted, Collector<List<Long>> out) -> {
        // materialize the whole value stream and emit it as a single element
        List<Long> all = sorted.collect(Collectors.toList());
        out.collect(all);
      })
      .withSortedValues(Long::compareTo)
      .output();
}
}); // closes the enclosing scope opened outside this snippet
.valueBy(new ToHashMap<>(Pair::getFirst, Pair::getSecond)) .combineBy(new MergeMaps<>()) .output();
.valueBy(e -> 1L) .combineBy(Sums.ofLongs()) .output();
.valueBy(e -> 1L) .combineBy(Sums.ofLongs()) .output();
@Test public void testWordCountBatch() throws Exception { Flow flow = Flow.create("Test"); Dataset<String> lines = flow.createInput(ListDataSource.bounded( asList("one two three four", "one two three", "one two", "one"))); // expand it to words Dataset<Pair<String, Long>> words = FlatMap.of(lines) .using(toWordCountPair()) .output(); // reduce it to counts, use windowing, so the output is batch or stream // depending on the type of input Dataset<Pair<String, Long>> streamOutput = ReduceByKey .of(words) .keyBy(Pair::getFirst) .valueBy(Pair::getSecond) .combineBy(Sums.ofLongs()) .output(); ListDataSink<Pair<String, Long>> out = ListDataSink.get(); streamOutput.persist(out); executor.submit(flow).get(); DatasetAssert.unorderedEquals( out.getOutputs(), Pair.of("one", 4L), Pair.of("two", 3L), Pair.of("three", 2L), Pair.of("four", 1L)); }
@Test public void testMapWithOutputGroupping() throws InterruptedException, ExecutionException { ListDataSource<String> input = ListDataSource.unbounded(asList( "one two three four four two two", "one one one two two three")); Flow flow = Flow.create("Test"); Dataset<String> lines = flow.createInput(input); // expand it to words Dataset<Pair<String, Long>> words = FlatMap.of(lines) .using(toWordCountPair()) .output(); ListDataSink<Pair<String, Long>> sink = ListDataSink.get(); // apply wordcount transform in output sink words.persist( sink.withPrepareDataset(d -> ReduceByKey.of(d) .keyBy(Pair::getFirst) .valueBy(Pair::getSecond) .combineBy(Sums.ofLongs()) .output() .persist(sink))); executor.submit(flow).get(); DatasetAssert.unorderedEquals( sink.getOutputs(), Pair.of("one", 4L), Pair.of("two", 5L), Pair.of("three", 2L), Pair.of("four", 2L)); }
/**
 * Verifies that the builder step passed to {@code applyIf} with a {@code true}
 * condition takes effect: the resulting operator must report {@link Time}
 * windowing.
 */
@Test
public void testWindow_applyIf() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue unboxes directly; the previously used Long(long)
      // constructor is deprecated and allocated a throwaway box per element
      .reduceBy(n ->
          StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .withSortedValues(Long::compare)
      .applyIf(true, b -> b.windowBy(Time.of(Duration.ofHours(1))))
      .output();

  // the operator built above is the first one registered in the flow
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertTrue(reduce.getWindowing() instanceof Time);
}
/**
 * Verifies the three-argument {@code applyIf} with a {@code false} condition:
 * the test expects the second transformation (which sets {@link Time}
 * windowing) to be the one that takes effect.
 */
@Test
public void testWindow_applyIfNot() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue unboxes directly; the previously used Long(long)
      // constructor is deprecated and allocated a throwaway box per element
      .reduceBy(n ->
          StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .withSortedValues(Long::compare)
      .applyIf(false, b -> b, b -> b.windowBy(Time.of(Duration.ofHours(1))))
      .output();

  // the operator built above is the first one registered in the flow
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertTrue(reduce.getWindowing() instanceof Time);
}
/**
 * Verifies that {@code withSortedValues} can be used without any windowing
 * step and that the built operator then carries a non-null value comparator.
 */
@Test
public void testBuild_sortedValuesWithNoWindowing() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue unboxes directly; the previously used Long(long)
      // constructor is deprecated and allocated a throwaway box per element
      .reduceBy(n ->
          StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .withSortedValues(Long::compare)
      .output();

  // the operator built above is the first one registered in the flow
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertNotNull(reduce.valueComparator);
}
/**
 * Wraps the given sink with a prepare step that groups pairs by the remainder
 * of their first element divided by two, forwards every value of each group
 * unchanged, and persists the result into the same sink.
 *
 * <p>{@code withSortedValues(Long::compare)} is configured, so values are
 * presumably forwarded in ascending order per key -- confirm against ReduceByKey.
 */
@Override
public ListDataSink<Pair<Integer, Long>> modifySink(
    ListDataSink<Pair<Integer, Long>> sink) {
  return sink.withPrepareDataset(dataset -> {
    ReduceByKey.of(dataset)
        .keyBy(pair -> pair.getFirst() % 2)
        .valueBy(Pair::getSecond)
        // pass-through reducer: emit each value as its own output element
        .reduceBy((Stream<Long> sorted, Collector<Long> out) ->
            sorted.forEach(value -> out.collect(value)))
        .withSortedValues(Long::compare)
        .output()
        .persist(sink);
  });
}
/**
 * Wraps the given sink with a prepare step that groups pairs by the remainder
 * of their first element divided by two, forwards every value of each group
 * unchanged, and persists the result into the same sink.
 *
 * <p>{@code withSortedValues(Long::compare)} is configured, so values are
 * presumably forwarded in ascending order per key -- confirm against ReduceByKey.
 */
@Override
public ListDataSink<Pair<Integer, Long>> modifySink(
    ListDataSink<Pair<Integer, Long>> sink) {
  return sink.withPrepareDataset(dataset -> {
    ReduceByKey.of(dataset)
        .keyBy(pair -> pair.getFirst() % 2)
        .valueBy(Pair::getSecond)
        // pass-through reducer: emit each value as its own output element
        .reduceBy((Stream<Long> sorted, Collector<Long> out) ->
            sorted.forEach(value -> out.collect(value)))
        .withSortedValues(Long::compare)
        .output()
        .persist(sink);
  });
}
/**
 * Builds the operator under test: pairs are keyed by their first element and,
 * per key, all values handed to the reducer are emitted as one list.
 *
 * <p>{@code withSortedValues(Long::compareTo)} is configured, so the list is
 * presumably in ascending order -- confirm against ReduceByKey.
 */
@Override
protected Dataset<Pair<String, List<Long>>> getOutput(Dataset<Pair<String, Long>> input) {
  return ReduceByKey.of(input)
      .keyBy(Pair::getFirst, String.class)
      .valueBy(Pair::getSecond)
      .reduceBy((Stream<Long> sorted, Collector<List<Long>> out) -> {
        // materialize the whole value stream and emit it as a single element
        List<Long> all = sorted.collect(Collectors.toList());
        out.collect(all);
      })
      .withSortedValues(Long::compareTo)
      .output();
}
}); // closes the enclosing scope opened outside this snippet
/**
 * Builds the operator under test: each input string is its own key, every
 * occurrence contributes the value {@code 1L}, and the values of a key are
 * combined with {@code Sums.ofLongs()}.
 */
@Override
protected Dataset<Pair<String, Long>> getOutput(Dataset<String> input) {
  return ReduceByKey.of(input)
      .keyBy(element -> element)
      // every element counts once
      .valueBy(element -> 1L)
      .combineBy(Sums.ofLongs())
      .output();
}
}); // closes the enclosing scope opened outside this snippet
/**
 * Verifies that an operator built without an explicit name reports the
 * default name {@code "ReduceByKey"}.
 */
@Test
public void testBuild_ImplicitName() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);

  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue unboxes directly; the previously used Long(long)
      // constructor is deprecated and allocated a throwaway box per element
      .combineBy(n ->
          StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .output();

  // the operator built above is the first one registered in the flow
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertEquals("ReduceByKey", reduce.getName());
}
/**
 * Builds the operator under test: each input string is its own key, every
 * occurrence contributes the value {@code 1L}, and the values of a key are
 * combined with {@code Sums.ofLongs()}.
 */
@Override
protected Dataset<Pair<String, Long>> getOutput(Dataset<String> input) {
  return ReduceByKey.of(input)
      .keyBy(element -> element)
      // every element counts once
      .valueBy(element -> 1L)
      .combineBy(Sums.ofLongs())
      .output();
}
}); // closes the enclosing scope opened outside this snippet