@Test public void testBuild_ImplicitName() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue replaces Long::new: the Long(long) constructor is
      // deprecated and allocated a fresh box only to unbox it again for sum().
      .combineBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .output();
  // With no explicit name given, the operator defaults to its class name.
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertEquals("ReduceByKey", reduce.getName());
}
@Test public void testBuild_Windowing() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue replaces Long::new: the Long(long) constructor is
      // deprecated and needlessly re-boxed each value before the sum.
      .combineBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .windowBy(Time.of(Duration.ofHours(1)))
      .output();
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  // The windowing passed to the builder must be carried by the operator.
  assertTrue(reduce.getWindowing() instanceof Time);
  // No sorting was requested, so no value comparator should be set.
  assertNull(reduce.valueComparator);
}
@Override protected Dataset<Pair<Integer, Long>> getOutput(Dataset<Pair<Integer, Long>> input) {
  // Event time is taken from the second element of each pair.
  Dataset<Pair<Integer, Long>> timestamped =
      AssignEventTime.of(input).using(Pair::getSecond).output();
  // Count occurrences per Integer key within one-second tumbling windows.
  Dataset<Pair<Integer, Long>> counted = ReduceByKey.of(timestamped)
      .keyBy(Pair::getFirst, Integer.class)
      .valueBy(e -> 1L)
      .combineBy(Sums.ofLongs())
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
  return counted;
}
// Fragment of a ReduceByKey builder chain (the chain's start lies outside
// this view): counts one per key and sums within 5 ms time windows.
.keyBy(Pair::getFirst) .valueBy(e -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofMillis(5))) .output();
@Test public void testWordCountBatch() throws Exception { Flow flow = Flow.create("Test"); Dataset<String> lines = flow.createInput(ListDataSource.bounded( asList("one two three four", "one two three", "one two", "one"))); // expand it to words Dataset<Pair<String, Long>> words = FlatMap.of(lines) .using(toWordCountPair()) .output(); // reduce it to counts, use windowing, so the output is batch or stream // depending on the type of input Dataset<Pair<String, Long>> streamOutput = ReduceByKey .of(words) .keyBy(Pair::getFirst) .valueBy(Pair::getSecond) .combineBy(Sums.ofLongs()) .output(); ListDataSink<Pair<String, Long>> out = ListDataSink.get(); streamOutput.persist(out); executor.submit(flow).get(); DatasetAssert.unorderedEquals( out.getOutputs(), Pair.of("one", 4L), Pair.of("two", 3L), Pair.of("three", 2L), Pair.of("four", 1L)); }
// The word-count reduction is applied inside the sink's prepare hook:
// withPrepareDataset transforms the dataset right before it is persisted, so
// the counting happens "in the output" rather than in the main flow.
// NOTE(review): the reduced dataset is persisted to `sink` inside the hook
// while `words` is persisted via the wrapping sink — confirm this double
// persist is the intended usage pattern for withPrepareDataset.
@Test public void testMapWithOutputGroupping() throws InterruptedException, ExecutionException { ListDataSource<String> input = ListDataSource.unbounded(asList( "one two three four four two two", "one one one two two three")); Flow flow = Flow.create("Test"); Dataset<String> lines = flow.createInput(input); // expand it to words Dataset<Pair<String, Long>> words = FlatMap.of(lines) .using(toWordCountPair()) .output(); ListDataSink<Pair<String, Long>> sink = ListDataSink.get(); // apply wordcount transform in output sink words.persist( sink.withPrepareDataset(d -> ReduceByKey.of(d) .keyBy(Pair::getFirst) .valueBy(Pair::getSecond) .combineBy(Sums.ofLongs()) .output() .persist(sink))); executor.submit(flow).get(); DatasetAssert.unorderedEquals( sink.getOutputs(), Pair.of("one", 4L), Pair.of("two", 5L), Pair.of("three", 2L), Pair.of("four", 2L)); }
// Distinct implemented via raw ReduceStateByKey: DistinctState keeps a single
// occurrence per (type, value) key within hourly windows; valueBy(t -> null)
// because only key presence matters, and the merge function is a no-op.
// The follow-up ReduceByKey counts distinct values per type under the same
// windowing, and the final FlatMap tags each count with the end timestamp of
// the TimeInterval window it was produced in.
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
// Hand-rolled distinct using ReduceStateByKey: each (type, value) pair maps
// to a DistinctState keyed by ComparablePair within one-hour windows (null
// values — presence is all that is tracked; state merge is intentionally
// empty). Distinct keys are then counted per type with ReduceByKey in the
// same windowing, and FlatMap attaches the window's end millis as an Instant.
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput( Dataset<Triple<Instant, Type, String>> input) { input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<ComparablePair<Type, String>> distinct = Distinct.of(input) .mapped(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window end timestamp return FlatMap .of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Test public void testBuild() {
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  Time<String> windowing = Time.of(Duration.ofHours(1));
  Dataset<Pair<String, Long>> reduced = ReduceByKey.named("ReduceByKey1")
      .of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue replaces Long::new: the Long(long) constructor is
      // deprecated and boxed each value again just to unbox it for sum().
      .combineBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .windowBy(windowing)
      .output();
  // The operator must be registered in the flow with all pieces wired up.
  assertEquals(flow, reduced.getFlow());
  assertEquals(1, flow.size());
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertEquals(flow, reduce.getFlow());
  assertEquals("ReduceByKey1", reduce.getName());
  assertNotNull(reduce.getKeyExtractor());
  assertNotNull(reduce.valueExtractor);
  assertNotNull(reduce.reducer);
  assertEquals(reduced, reduce.output());
  assertSame(windowing, reduce.getWindowing());
}
// Distinct via the Distinct operator: reduces the stream to unique
// (type, value) ComparablePairs per hourly window; ReduceByKey then counts
// distinct values per type under the same windowing, and the final FlatMap
// attaches the end timestamp of each TimeInterval window to the count.
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput( Dataset<Triple<Instant, Type, String>> input) { input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<ComparablePair<Type, String>> distinct = Distinct.of(input) .mapped(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window end timestamp return FlatMap .of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Override protected Dataset<Pair<Integer, Long>> getOutput(Dataset<Pair<Integer, Long>> input) {
  // Use the pair's second element as the event timestamp, then count
  // elements per Integer key within one-second tumbling windows.
  Dataset<Pair<Integer, Long>> withTime =
      AssignEventTime.of(input).using(p -> p.getSecond()).output();
  return ReduceByKey.of(withTime)
      .keyBy(p -> p.getFirst(), Integer.class)
      .valueBy(element -> 1L)
      .combineBy(Sums.ofLongs())
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
}
@Override protected Dataset<Pair<Word, Long>> getOutput(Dataset<Pair<Word, Long>> input) {
  // Event time is the Long in the pair's second slot.
  Dataset<Pair<Word, Long>> timestamped =
      AssignEventTime.of(input).using(p -> p.getSecond()).output();
  // Count occurrences per Word key within one-second tumbling windows.
  return ReduceByKey.of(timestamped)
      .keyBy(p -> p.getFirst())
      .valueBy(p -> 1L)
      .combineBy(Sums.ofLongs())
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();
}
// Event time is taken from the Long in the pair's second slot; occurrences
// per Word key are then counted within one-second tumbling windows.
@Override protected Dataset<Pair<Word, Long>> getOutput(Dataset<Pair<Word, Long>> input) { input = AssignEventTime.of(input).using(Pair::getSecond).output(); return ReduceByKey.of(input) .keyBy(Pair::getFirst) .valueBy(e -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofSeconds(1))) .output(); }
@Override protected Dataset<Pair<String, Long>> getOutput(Dataset<String> input) {
  // Classic word count: the element itself is the key; each occurrence adds 1.
  return ReduceByKey
      .of(input)
      .keyBy(word -> word)
      .valueBy(word -> 1L)
      .combineBy(Sums.ofLongs())
      .output();
} });
@Override protected Dataset<Pair<String, Long>> getOutput(Dataset<Pair<String, Long>> input) {
  // Aggregate values per String key inside CWindowing(3) windows.
  // NOTE(review): CWindowing(3) appears to window by element count — confirm
  // against its definition elsewhere in this file.
  return ReduceByKey.of(input)
      .keyBy(pair -> pair.getFirst())
      .valueBy(pair -> pair.getSecond())
      .combineBy(Sums.ofLongs())
      .windowBy(new CWindowing<>(3))
      .output();
}
// Sums values per String key under CWindowing(3).
// NOTE(review): CWindowing is a test-local windowing; the argument 3 looks
// like a window size in elements — verify against its definition.
@Override protected Dataset<Pair<String, Long>> getOutput(Dataset<Pair<String, Long>> input) { return ReduceByKey.of(input) .keyBy(Pair::getFirst) .valueBy(Pair::getSecond) .combineBy(Sums.ofLongs()) .windowBy(new CWindowing<>(3)) .output(); }
@Override protected Dataset<Pair<Integer, Long>> getOutput(Dataset<Integer> input) {
  // Key each element by its value modulo 3 and count occurrences per key,
  // windowed by the custom TestWindowing defined elsewhere in this file.
  return ReduceByKey.of(input)
      .keyBy(element -> element % 3, Integer.class)
      .valueBy(element -> 1L)
      .combineBy(Sums.ofLongs())
      .windowBy(new TestWindowing())
      .output();
}
// Word count with no explicit windowing: elements keyed by themselves, each
// occurrence contributing 1, summed with Sums.ofLongs(). The trailing "});"
// closes an enclosing anonymous class begun outside this view.
@Override protected Dataset<Pair<String, Long>> getOutput(Dataset<String> input) { return ReduceByKey.of(input) .keyBy(e -> e) .valueBy(e -> 1L) .combineBy(Sums.ofLongs()) .output(); } });
// Keys each Integer by value % 3 and counts occurrences per key, using the
// custom TestWindowing defined elsewhere in this test file.
@Override protected Dataset<Pair<Integer, Long>> getOutput(Dataset<Integer> input) { return ReduceByKey.of(input) .keyBy(e -> e % 3, Integer.class) .valueBy(e -> 1L) .combineBy(Sums.ofLongs()) .windowBy(new TestWindowing()) .output(); }