.using(((Pair<String, Long> elem, Collector<String> context) -> { Date d = new Date(((TimeInterval) context.getWindow()).getStartMillis());
.using(ResultUtil.toCells()) .output();
.using((Pair<String, Integer> in, Collector<Triple<Instant, Instant, Integer>> out) -> { long windowBegin = ((TimeInterval) out.getWindow()).getStartMillis(); long windowEnd = ((TimeInterval) out.getWindow()).getEndMillis();
.using((UnaryFunctor<Pair<String, Pair<String, Integer>>, Pair<String, Integer>>) (elem, context) -> context.collect(elem.getSecond())) .eventTimeBy(e -> e.getSecond().getSecond())
.using((String line, Collector<String> c) -> SPLIT_RE.splitAsStream(line).forEach(c::collect)) .output();
.using((Pair<String, Integer> in, Collector<Triple<Instant, Instant, Integer>> out) -> { long windowBegin = ((TimeInterval) out.getWindow()).getStartMillis(); long windowEnd = ((TimeInterval) out.getWindow()).getEndMillis();
uri.getPath().substring(1), settings)); return FlatMap.of(input) .using(new UnaryFunctor<Pair<byte[], byte[]>, Pair<Long, String>>() { private final SearchEventsParser parser = new SearchEventsParser(); @Override return FlatMap.named("PARSE-INPUT") .of(in) .using(new UnaryFunctor<String, Pair<Long, String>>() { SearchEventsParser parser = new SearchEventsParser(); @Override
.using((UnaryFunctor<String, String>) (elem, c) -> { TimeInterval w = (TimeInterval) c.getWindow(); c.collect(w.getStartMillis() / 1000L + ": " + elem);
.using((Set<Integer> grp, Collector<Integer> c) -> { for (Integer i : grp) { c.collect(i);
.using((UnaryFunctor<Pair<String, Void>, TimeInterval>) (elem, context) -> context.collect((TimeInterval) context.getWindow())) .output();
.using((UnaryFunctor<Pair<String, Set<String>>, String>) (e, c) -> { e.getSecond().stream().forEachOrdered(c::collect); })
.using(toWordCountPair()) .output();
@Test public void testReduceByKeyWithSortStateAndCustomWindowing() throws InterruptedException, ExecutionException { Dataset<Integer> ints = flow.createInput( ListDataSource.unbounded( reversed(sequenceInts(0, 100)), reversed(sequenceInts(100, 1100)))); SizedCountWindowing<Integer> windowing = new SizedCountWindowing<>(i -> (i % 10) + 1); // the key for sort will be the last digit Dataset<Pair<Integer, Integer>> output = ReduceStateByKey.of(ints) .keyBy(i -> i % 10) .valueBy(e -> e) .stateFactory(SortState::new) .mergeStatesBy(SortState::combine) .windowBy(windowing) .output(); // collector of outputs ListDataSink<Triple<SizedCountWindow, Integer, Integer>> sink = ListDataSink.get(); FlatMap.of(output) .using((UnaryFunctor<Pair<Integer, Integer>, Triple<SizedCountWindow, Integer, Integer>>) (elem, context) -> context.collect(Triple.of((SizedCountWindow) context.getWindow(), elem.getFirst(), elem.getSecond()))) .output() .persist(sink); executor.submit(flow).get(); List<Triple<SizedCountWindow, Integer, Integer>> outputs = sink.getOutputs(); assertEquals(4 * 550, outputs.size()); checkKeyAlignedSortedList(outputs); }
@Test public void testWordCountBatch() throws Exception { Flow flow = Flow.create("Test"); Dataset<String> lines = flow.createInput(ListDataSource.bounded( asList("one two three four", "one two three", "one two", "one"))); // expand it to words Dataset<Pair<String, Long>> words = FlatMap.of(lines) .using(toWordCountPair()) .output(); // reduce it to counts, use windowing, so the output is batch or stream // depending on the type of input Dataset<Pair<String, Long>> streamOutput = ReduceByKey .of(words) .keyBy(Pair::getFirst) .valueBy(Pair::getSecond) .combineBy(Sums.ofLongs()) .output(); ListDataSink<Pair<String, Long>> out = ListDataSink.get(); streamOutput.persist(out); executor.submit(flow).get(); DatasetAssert.unorderedEquals( out.getOutputs(), Pair.of("one", 4L), Pair.of("two", 3L), Pair.of("three", 2L), Pair.of("four", 1L)); }
@Test public void testMapWithOutputGroupping() throws InterruptedException, ExecutionException { ListDataSource<String> input = ListDataSource.unbounded(asList( "one two three four four two two", "one one one two two three")); Flow flow = Flow.create("Test"); Dataset<String> lines = flow.createInput(input); // expand it to words Dataset<Pair<String, Long>> words = FlatMap.of(lines) .using(toWordCountPair()) .output(); ListDataSink<Pair<String, Long>> sink = ListDataSink.get(); // apply wordcount transform in output sink words.persist( sink.withPrepareDataset(d -> ReduceByKey.of(d) .keyBy(Pair::getFirst) .valueBy(Pair::getSecond) .combineBy(Sums.ofLongs()) .output() .persist(sink))); executor.submit(flow).get(); DatasetAssert.unorderedEquals( sink.getOutputs(), Pair.of("one", 4L), Pair.of("two", 5L), Pair.of("three", 2L), Pair.of("four", 2L)); }
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }