@Override
protected Dataset<Pair<String, List<Long>>> getOutput(Dataset<Pair<String, Long>> input) {
  // Group pairs by their first element; values are fed to the reducer in
  // Long::compareTo order, so limiting to 2 keeps the first two values per key.
  return ReduceByKey.of(input)
      .keyBy(Pair::getFirst, String.class)
      .valueBy(Pair::getSecond)
      .reduceBy(
          (Stream<Long> sortedValues, Collector<List<Long>> out) ->
              out.collect(sortedValues.limit(2).collect(Collectors.toList())))
      .withSortedValues(Long::compareTo)
      .output();
} });
@Test
public void testBuild_OutputValues() {
  // Verifies that outputValues() builds a named ReduceByKey whose operator
  // metadata (name, extractors, reducer, windowing) is wired as configured.
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  Dataset<Long> reduced =
      ReduceByKey.named("ReduceByKeyValues")
          .of(dataset)
          .keyBy(s -> s)
          .valueBy(s -> 1L)
          // Long::longValue instead of the deprecated Long(long) boxing
          // constructor that Long::new resolves to.
          .reduceBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
          .outputValues();
  assertEquals(flow, reduced.getFlow());
  // Two operators: presumably outputValues() appends a value-unwrapping step
  // after the ReduceByKey itself — TODO confirm against Flow internals.
  assertEquals(2, flow.size());
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertEquals(flow, reduce.getFlow());
  assertEquals("ReduceByKeyValues", reduce.getName());
  assertNotNull(reduce.getKeyExtractor());
  assertNotNull(reduce.getValueExtractor());
  assertNotNull(reduce.getReducer());
  assertNull(reduce.getWindowing());
}
@Override
public Dataset<Pair<KEY, OUT>> output(OutputHint... outputHints) {
  // Materialize the builder state into a ReduceByKey operator, register it
  // with the input's flow, and expose the operator's output dataset.
  final Flow flow = input.getFlow();
  final ReduceByKey<IN, KEY, VALUE, OUT, W> operator =
      new ReduceByKey<>(
          name,
          flow,
          input,
          keyExtractor,
          keyClass,
          valueExtractor,
          windowing,
          reducer,
          valuesComparator,
          Sets.newHashSet(outputHints));
  flow.add(operator);
  return operator.output();
}
}
static boolean wantTranslate(ReduceByKey operator) {
  // Only combinable reductions can be translated here.
  if (!operator.isCombinable()) {
    return false;
  }
  // No windowing is always fine; otherwise it must be non-merging and its
  // trigger must be stateless.
  if (operator.getWindowing() == null) {
    return true;
  }
  return !(operator.getWindowing() instanceof MergingWindowing)
      && !operator.getWindowing().getTrigger().isStateful();
}
static boolean wantTranslate(ReduceByKey operator, SparkFlowTranslator.AcceptorContext context) {
  // A value comparator is only supported when the key type is comparable
  // (short-circuit preserved: key class is inspected only if a comparator is set).
  if (operator.getValueComparator() != null && !ClassUtils.isComparable(operator.getKeyClass())) {
    return false;
  }
  // Windowing, when present, must be non-merging with a stateless trigger.
  return operator.getWindowing() == null
      || (!(operator.getWindowing() instanceof MergingWindowing)
          && !operator.getWindowing().getTrigger().isStateful());
}
// NOTE(review): garbled excerpt of a Spark ReduceByKey translator — several
// statements are disconnected (e.g. the bare ternary below has lost its
// `final Windowing windowing = ...` assignment, and the trailing
// setName/partitioner lines belong to branches whose bodies are missing).
// Left byte-identical; restore from the original translator source before use.
final ReduceFunctor<VALUE, OUT> reducer = operator.getReducer(); operator.getWindowing() == null ? AttachedWindowing.INSTANCE : operator.getWindowing(); final UnaryFunction<IN, KEY> keyExtractor = operator.getKeyExtractor(); final UnaryFunction<IN, VALUE> valueExtractor = operator.getValueExtractor(); input .flatMapToPair(new CompositeKeyExtractor<>(keyExtractor, valueExtractor, windowing)) .setName(operator.getName() + "::extract-key-values"); if (operator.isCombinable()) { @SuppressWarnings("unchecked") final ReduceFunctor<VALUE, VALUE> combiner = (ReduceFunctor<VALUE, VALUE>) reducer; tuples .reduceByKey(new CombinableReducer<>(combiner)) .setName(operator.getName() + "::combine-by-key"); return new SparkElement<>(kw.window(), kw.timestamp(), Pair.of(kw.key(), el)); }) .setName(operator.getName() + "::wrap-in-spark-element"); if (ClassUtils.isComparable(operator.getKeyClass())) { final Partitioner partitioner = new HashPartitioner(input.getNumPartitions()); if (operator.getValueComparator() != null) { .setName(operator.getName() + "::create-composite-key")
@Test
public void testBuild() {
  // Verifies the full builder chain: naming, key/value extraction, combining
  // reducer and explicit windowing all land on the built operator.
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  Time<String> windowing = Time.of(Duration.ofHours(1));
  Dataset<Pair<String, Long>> reduced =
      ReduceByKey.named("ReduceByKey1")
          .of(dataset)
          .keyBy(s -> s)
          .valueBy(s -> 1L)
          // Long::longValue instead of the deprecated Long(long) boxing
          // constructor that Long::new resolves to.
          .combineBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
          .windowBy(windowing)
          .output();
  assertEquals(flow, reduced.getFlow());
  assertEquals(1, flow.size());
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertEquals(flow, reduce.getFlow());
  assertEquals("ReduceByKey1", reduce.getName());
  assertNotNull(reduce.getKeyExtractor());
  // Use accessors rather than reaching into fields directly, for consistency
  // with the sibling tests (see testBuild_OutputValues).
  assertNotNull(reduce.getValueExtractor());
  assertNotNull(reduce.getReducer());
  assertEquals(reduced, reduce.output());
  assertSame(windowing, reduce.getWindowing());
}
@Test
public void testBuild_Windowing() {
  // Verifies that windowBy(Time) is recorded on the operator and that no
  // value comparator is set when withSortedValues was not used.
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue instead of the deprecated Long(long) boxing
      // constructor that Long::new resolves to.
      .combineBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .windowBy(Time.of(Duration.ofHours(1)))
      .output();
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertTrue(reduce.getWindowing() instanceof Time);
  // Accessor instead of direct field access, for consistency with the
  // translator code which calls getValueComparator().
  assertNull(reduce.getValueComparator());
}
// NOTE(review): garbled excerpt of a ReduceByKey translator — the second
// Preconditions.checkState( call is cut off mid-argument-list, so this does
// not compile as-is. Left byte-identical; restore from the original source.
final UnaryFunctor<Stream, Object> reducer = origOperator.getReducer(); final Windowing windowing = origOperator.getWindowing() == null ? AttachedWindowing.INSTANCE : origOperator.getWindowing(); Preconditions.checkState(origOperator.isCombinable(), "Non-combinable ReduceByKey not supported!"); Preconditions.checkState( final UnaryFunction udfKey = origOperator.getKeyExtractor(); final UnaryFunction udfValue = origOperator.getValueExtractor();
@Test
public void testBuild_ImplicitName() {
  // When no explicit name is given, the operator defaults to "ReduceByKey".
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue instead of the deprecated Long(long) boxing
      // constructor that Long::new resolves to.
      .combineBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .output();
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertEquals("ReduceByKey", reduce.getName());
}
@SuppressWarnings("unchecked")
@Override
public DAG<Operator<?, ?>> getBasicOps() {
  // Expand ReduceByKey into the more general ReduceStateByKey operator.
  StateSupport.MergeFromStateMerger stateCombine = new StateSupport.MergeFromStateMerger<>();
  // Combinable reducers get a combining state; otherwise fall back to a
  // non-combining state that can honor the value comparator.
  StateFactory stateFactory;
  if (reducer.isCombinable()) {
    stateFactory = new CombiningReduceState.Factory<>((ReduceFunctor) reducer);
  } else {
    stateFactory = new NonCombiningReduceState.Factory<>(reducer, valueComparator);
  }
  Operator reduceState =
      new ReduceStateByKey(
          getName(),
          getFlow(),
          input,
          keyExtractor,
          valueExtractor,
          windowing,
          stateFactory,
          stateCombine,
          getHints());
  return DAG.of(reduceState);
}
@Test
public void testWindow_applyIfNot() {
  // Exercises the three-arg applyIf: with a false condition the third
  // ("otherwise") transform runs, so the windowing IS applied here.
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue instead of the deprecated Long(long) boxing
      // constructor that Long::new resolves to.
      .reduceBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .withSortedValues(Long::compare)
      .applyIf(false, b -> b, b -> b.windowBy(Time.of(Duration.ofHours(1))))
      .output();
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertTrue(reduce.getWindowing() instanceof Time);
}
@SuppressWarnings("unchecked")
@Override
public DAG<Operator<?, ?>> getBasicOps() {
  // ReduceByKey is lowered to ReduceStateByKey; combinable reducers use a
  // combining state, others a non-combining state honoring the comparator.
  final StateSupport.MergeFromStateMerger merger = new StateSupport.MergeFromStateMerger<>();
  final StateFactory factory =
      reducer.isCombinable()
          ? new CombiningReduceState.Factory<>((ReduceFunctor) reducer)
          : new NonCombiningReduceState.Factory<>(reducer, valueComparator);
  final Flow parentFlow = getFlow();
  final Operator lowered =
      new ReduceStateByKey(
          getName(),
          parentFlow,
          input,
          keyExtractor,
          valueExtractor,
          windowing,
          factory,
          merger,
          getHints());
  return DAG.of(lowered);
}
// NOTE(review): builder chain cut off mid-expression (valueBy/reduce/output
// steps are missing from this excerpt). Left byte-identical.
final Dataset<Pair<String, Long>> aggregated = ReduceByKey.named("AGGREGATE") .of(parsedWithEventTime) .keyBy(LogLine::getIp)
@Override
protected Dataset<Pair<String, List<Long>>> getOutput(Dataset<Pair<String, Long>> input) {
  // For each key, emit the first two values in Long::compareTo order.
  return ReduceByKey.of(input)
      .keyBy(Pair::getFirst, String.class)
      .valueBy(Pair::getSecond)
      .reduceBy(
          (Stream<Long> vals, Collector<List<Long>> collector) -> {
            List<Long> firstTwo = vals.limit(2).collect(Collectors.toList());
            collector.collect(firstTwo);
          })
      .withSortedValues(Long::compareTo)
      .output();
} });
@Test
public void testWindow_applyIf() {
  // Exercises the two-arg applyIf: with a true condition the transform runs
  // and the windowing is applied.
  Flow flow = Flow.create("TEST");
  Dataset<String> dataset = Util.createMockDataset(flow, 2);
  ReduceByKey.of(dataset)
      .keyBy(s -> s)
      .valueBy(s -> 1L)
      // Long::longValue instead of the deprecated Long(long) boxing
      // constructor that Long::new resolves to.
      .reduceBy(n -> StreamSupport.stream(n.spliterator(), false).mapToLong(Long::longValue).sum())
      .withSortedValues(Long::compare)
      .applyIf(true, b -> b.windowBy(Time.of(Duration.ofHours(1))))
      .output();
  ReduceByKey reduce = (ReduceByKey) flow.operators().iterator().next();
  assertTrue(reduce.getWindowing() instanceof Time);
}
@Override
public Dataset<Pair<KEY, OUT>> output(OutputHint... outputHints) {
  // Build the operator from the collected builder settings, attach it to the
  // input's flow and hand back its output dataset.
  final Flow targetFlow = input.getFlow();
  final ReduceByKey<IN, KEY, VALUE, OUT, W> rbk =
      new ReduceByKey<>(
          name, targetFlow, input, keyExtractor, keyClass, valueExtractor,
          windowing, reducer, valuesComparator, Sets.newHashSet(outputHints));
  targetFlow.add(rbk);
  return rbk.output();
}
}
// NOTE(review): builder chain cut off mid-expression (valueBy/reduce/output
// steps are missing from this excerpt). Left byte-identical.
final Dataset<Pair<String, Long>> counted = ReduceByKey.named("REDUCE") .of(words) .keyBy(String::toLowerCase)
@Override
protected Dataset<Pair<Integer, Set<Integer>>> getOutput(Dataset<Integer> input) {
  // Key elements by parity and collect each group's distinct values into a
  // set, using Count windowing of 3 elements per window.
  return ReduceByKey.of(input)
      .keyBy(element -> element % 2, Integer.class)
      .valueBy(element -> element)
      .reduceBy(values -> values.collect(Collectors.toSet()))
      .windowBy(Count.of(3))
      .output();
}