// Re-assign event time from each triple's Instant (first field, as epoch millis),
// then begin a keyed reduction. NOTE(review): fragment — the ReduceByKey builder
// chain continues beyond this line; context truncated.
input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<String, Integer>> pairs = ReduceByKey.of(input)
/**
 * Specifies the input dataset of the operator being built.
 *
 * @param <IN> the element type of the supplied dataset
 * @param input the dataset to process; must not be {@code null}
 * @return a builder to continue setting up the operator
 */
@Override
public <IN> UsingBuilder<IN> of(Dataset<IN> input) {
  final Dataset<IN> checked = Objects.requireNonNull(input);
  return new UsingBuilder<>(name, checked);
}
}
// Fragment of a builder chain: derives the event timestamp from the element's
// Date field (epoch millis via Date#getTime). Preceding chain is truncated.
.using(line -> line.getDate().getTime()) .output();
// Assigns event time from the triple's Instant component before a keyed
// reduction. NOTE(review): truncated fragment — ReduceByKey chain continues.
input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<String, Integer>> pairs = ReduceByKey.of(input)
// Treats each int element as a time in seconds (scaled by 1000 to millis) and
// uses it as the event time; then starts a named ReduceWindow counting elements
// (value 1L each). NOTE(review): fragment — the chain continues past this line.
input = AssignEventTime.of(input).using(e -> e * 1000L).output(); ReduceWindow.named("foo").of(input) .valueBy(e -> 1L)
// Re-assigns event time from each element's evtTs field.
// NOTE(review): presumably evtTs is epoch millis — confirm against the element type.
lines = AssignEventTime.of(lines).using(i -> i.evtTs).output();
// Fragment of a builder chain: event time comes from the pair's second field.
// Preceding chain is truncated from this view.
.using(Pair::getSecond) .output();
// Tail of a test-source definition (elements carry explicit long timestamps,
// read with a 100 ms delay), followed by event-time assignment from the pair's
// second field. NOTE(review): the source construction is truncated above.
Pair.of("four", 2000000001004L))) .withReadDelay(Duration.ofMillis(100L))); input = AssignEventTime.of(input).using(Pair::getSecond).output();
// Seconds-to-millis event-time assignment followed by an element count via
// ReduceWindow (value 1L each). NOTE(review): fragment — chain continues.
input = AssignEventTime.of(input).using(e -> e * 1000L).output(); ReduceWindow.of(input) .valueBy(e -> 1L)
// Wraps a freshly created input in AssignEventTime, taking the pair's second
// field as the timestamp. NOTE(review): fragment — the declaration of
// `reduced` is truncated at end of line.
Dataset<Pair<String, Integer>> input = AssignEventTime.of(f.createInput(source)) .using(Pair::getSecond) .output(); Dataset<Pair<String, Long>> reduced =
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
// Seconds-to-millis event-time assignment followed by a ReduceWindow over the
// raw element values. NOTE(review): fragment — the builder chain continues.
input = AssignEventTime.of(input).using(e -> e * 1000L).output(); ReduceWindow.of(input) .valueBy(e -> e)
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput(Dataset<Triple<Instant, Type, String>> input) { // distinct implemented using raw ReduceStateByKey input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<Pair<ComparablePair<Type, String>, Object>> pairs = ReduceStateByKey.of(input) .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .valueBy(t -> null) .stateFactory(DistinctState::new) .mergeStatesBy((t, os) -> {}) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<ComparablePair<Type, String>> distinct = MapElements.of(pairs) .using(Pair::getFirst) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window timestamp return FlatMap.of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput( Dataset<Triple<Instant, Type, String>> input) { input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<ComparablePair<Type, String>> distinct = Distinct.of(input) .mapped(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window end timestamp return FlatMap .of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Override protected Dataset<Triple<Instant, Type, Long>> getOutput( Dataset<Triple<Instant, Type, String>> input) { input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output(); Dataset<ComparablePair<Type, String>> distinct = Distinct.of(input) .mapped(t -> new ComparablePair<>(t.getSecond(), t.getThird())) .windowBy(Time.of(Duration.ofHours(1))) .output(); Dataset<Pair<Type, Long>> reduced = ReduceByKey.of(distinct) .keyBy(ComparablePair::getFirst) .valueBy(p -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofHours(1))) .output(); // extract window end timestamp return FlatMap .of(reduced) .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> { long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis(); ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond())); }) .output(); }
@Test public void testWithWatermarkAndEventTimeAndDiscarding() throws Exception { int N = 2000; // generate some small ints, use them as event time and count them // in 10s windows Dataset<Integer> input = flow.createInput( ListDataSource.unbounded(reversed(sequenceInts(0, N)))); ListDataSink<Long> outputs = ListDataSink.get(); input = AssignEventTime.of(input).using(e -> e * 1000L).output(); ReduceWindow.of(input) .valueBy(e -> 1L) .combineBy(Sums.ofLongs()) .windowBy(Time.of(Duration.ofSeconds(10))) .output() .persist(outputs); // watermarking 100 ms executor.setTriggeringSchedulerSupplier( () -> new WatermarkTriggerScheduler(100)); executor.submit(flow).get(); // there should be only one element on output - the first element // all other windows are discarded List<Long> output = outputs.getOutputs(); assertEquals(1, output.size()); }
@Override
protected Dataset<Triple<TimeInterval, Integer, Set<String>>> getOutput (Dataset<Pair<String, Integer>> input) {
  // The pair's second field carries the event timestamp.
  input = AssignEventTime.of(input).using(Pair::getSecond).output();

  // Group element names by their leading digit within 5 ms session windows,
  // collecting each group into a set.
  Dataset<Pair<Integer, Set<String>>> grouped =
      ReduceByKey.of(input)
          .keyBy(e -> e.getFirst().charAt(0) - '0')
          .valueBy(Pair::getFirst)
          .reduceBy(s -> s.collect(Collectors.toSet()))
          .windowBy(Session.of(Duration.ofMillis(5)))
          .output();

  // Expose the session window alongside each key/set pair.
  return FlatMap.of(grouped)
      .using((UnaryFunctor<Pair<Integer, Set<String>>, Triple<TimeInterval, Integer, Set<String>>>)
          (pair, ctx) -> ctx.collect(Triple.of(
              (TimeInterval) ctx.getWindow(), pair.getFirst(), pair.getSecond())))
      .output();
}
/**
 * Starts building a nameless {@link AssignEventTime} operator to (re-)assign
 * event time the given input dataset's elements.
 *
 * @param <IN> the type of elements of the input dataset
 *
 * @param input the input data set to be processed; must not be {@code null}
 *
 * @return a builder to complete the setup of the new {@link AssignEventTime}
 *          operator
 *
 * @throws NullPointerException if {@code input} is {@code null}
 *
 * @see #named(String)
 * @see OfBuilder#of(Dataset)
 */
public static <IN> UsingBuilder<IN> of(Dataset<IN> input) {
  // Fail fast on a null input here — consistently with the named OfBuilder
  // variant, which null-checks its input — instead of deferring an obscure
  // NPE to flow-construction time.
  if (input == null) {
    throw new NullPointerException("input");
  }
  return new UsingBuilder<>("AssignEventTime", input);
}
}
@Override
protected Dataset<Triple<TimeInterval, String, String>> getOutput(Dataset<Pair<String, Long>> left, Dataset<Pair<String, Long>> right) {
  // Both sides carry their event timestamp in the pair's second field.
  left = AssignEventTime.of(left).using(Pair::getSecond).output();
  right = AssignEventTime.of(right).using(Pair::getSecond).output();

  // Cross-join (constant key) inside 3 ms time windows; the join function
  // also exercises counter and histogram accumulators.
  Dataset<Pair<String, Triple<TimeInterval, String, String>>> joinedPairs =
      Join.of(left, right)
          .by(p -> "", p -> "", String.class)
          .using((Pair<String, Long> lhs, Pair<String, Long> rhs,
                  Collector<Triple<TimeInterval, String, String>> ctx) -> {
            TimeInterval win = (TimeInterval) ctx.getWindow();
            ctx.getCounter("cntr").increment(10);
            ctx.getHistogram("hist-" + lhs.getFirst().charAt(1)).add(2345, 8);
            ctx.collect(Triple.of(win, lhs.getFirst(), rhs.getFirst()));
          })
          .windowBy(Time.of(Duration.ofMillis(3)))
          .output();

  // Strip the synthetic join key before returning.
  return MapElements.of(joinedPairs).using(Pair::getSecond).output();
}
@Override
protected Dataset<Triple<TimeInterval, String, String>> getOutput(Dataset<Pair<String, Long>> left, Dataset<Pair<String, Long>> right) {
  // Event times are read from the second component on both inputs.
  left = AssignEventTime.of(left).using(Pair::getSecond).output();
  right = AssignEventTime.of(right).using(Pair::getSecond).output();

  // Join everything against everything (empty-string key) within 3 ms
  // windows, updating a counter and a per-character histogram on the way.
  Dataset<Pair<String, Triple<TimeInterval, String, String>>> joined =
      Join.of(left, right)
          .by(p -> "", p -> "", String.class)
          .using((Pair<String, Long> l, Pair<String, Long> r,
                  Collector<Triple<TimeInterval, String, String>> out) -> {
            TimeInterval interval = (TimeInterval) out.getWindow();
            out.getCounter("cntr").increment(10);
            out.getHistogram("hist-" + l.getFirst().charAt(1)).add(2345, 8);
            out.collect(Triple.of(interval, l.getFirst(), r.getFirst()));
          })
          .windowBy(Time.of(Duration.ofMillis(3)))
          .output();

  // Drop the dummy join key; keep only the payload triples.
  return MapElements.of(joined).using(Pair::getSecond).output();
}