/**
 * Creates a {@link GroupedValues Combine.GroupedValues} {@code PTransform} that reduces
 * already-grouped values with a {@code CombineFn}.
 *
 * <p>The input is a {@code PCollection} of {@code KV}s in which each key maps to an {@code
 * Iterable} of values (for example, the output of a {@code GroupByKey}). For every element the
 * supplied {@code CombineFn} combines all of the values associated with the key; the key itself
 * is ignored while combining. The input and output value types are allowed to differ.
 *
 * <p>Every output element keeps the timestamp and the window of the input element it was derived
 * from, and the output {@code PCollection} carries the same {@link
 * org.apache.beam.sdk.transforms.windowing.WindowFn} as the input.
 *
 * <p>{@link #perKey(CombineFnBase.GlobalCombineFn)} is usually more convenient than a {@link
 * GroupByKey} followed by {@code groupedValues(...)}. See {@link GroupedValues
 * Combine.GroupedValues} for more information.
 *
 * @param fn the combining function applied to the values of each key
 * @param <K> the key type, passed through unchanged
 * @param <InputT> the type of the grouped input values
 * @param <OutputT> the type of the combined result
 * @return a transform that combines the grouped values of every key
 */
public static <K, InputT, OutputT> GroupedValues<K, InputT, OutputT> groupedValues(
    GlobalCombineFn<? super InputT, ?, OutputT> fn) {
  // Delegate to the two-argument overload, deriving the display data from the function itself.
  return groupedValues(
      fn,
      displayDataForFn(fn));
}
/**
 * Expands this transform in two steps: the input is first passed through {@code byFields}, and
 * the result of that step is then combined with {@code combineFn} via {@code
 * Combine.groupedValues}.
 *
 * @param input the collection to group and combine
 * @return a collection of {@code KV}s pairing a {@link Row} key with the combined value
 */
@Override
public PCollection<KV<Row, OutputT>> expand(PCollection<InputT> input) {
  return input
      // Step 1: grouping transform held by this instance.
      .apply(byFields)
      // Step 2: fold each group's values with the configured combine function.
      .apply(Combine.groupedValues(combineFn));
}
}
/**
 * Creates a {@link GroupedValues Combine.GroupedValues} {@code PTransform} that reduces
 * already-grouped values with a plain {@code SerializableFunction}.
 *
 * <p>The input is a {@code PCollection} of {@code KV}s in which each key maps to an {@code
 * Iterable} of values (for example, the output of a {@code GroupByKey}). For every element the
 * supplied function combines all of the values associated with the key; the key itself is
 * ignored while combining. The input and output value types must be the same.
 *
 * <p>Every output element keeps the timestamp and the window of the input element it was derived
 * from, and the output {@code PCollection} carries the same {@link
 * org.apache.beam.sdk.transforms.windowing.WindowFn} as the input.
 *
 * <p>{@link #perKey(SerializableFunction)} is usually more convenient than a {@link GroupByKey}
 * followed by {@code groupedValues(...)}. See {@link GroupedValues Combine.GroupedValues} for
 * more information.
 *
 * @param fn the function that collapses an {@code Iterable} of values into a single value
 * @param <K> the key type, passed through unchanged
 * @param <V> the type of both the grouped input values and the combined result
 * @return a transform that combines the grouped values of every key
 */
public static <K, V> GroupedValues<K, V, V> groupedValues(
    SerializableFunction<Iterable<V>, V> fn) {
  // Adapt the iterable-reducing function to the combine-function interface first; the display
  // data is still derived from the caller's original function, not from the adapter.
  IterableCombineFn<V> adapted = IterableCombineFn.of(fn);
  return groupedValues(adapted, displayDataForFn(fn));
}
/**
 * Expands this transform: binds the schema aggregation function to the input's schema and
 * to-row function, passes the input through {@code byFields}, and combines the result with the
 * bound aggregation function via {@code Combine.groupedValues}.
 *
 * @param input the collection to group and aggregate; its schema and to-row function are read
 * @return a collection of {@code KV}s pairing a {@link Row} key with an aggregated {@link Row}
 */
@Override
public PCollection<KV<Row, Row>> expand(PCollection<InputT> input) {
  // The aggregation function needs the concrete input schema before it can be applied.
  SchemaAggregateFn.Inner<InputT> boundAggregateFn =
      schemaAggregateFn.withSchema(
          input.getSchema(),
          input.getToRowFunction());

  return input
      .apply(byFields)
      .apply(Combine.groupedValues(boundAggregateFn));
}
}
/**
 * Builds a minimal pipeline that contains a {@link Combine.GroupedValues}.
 *
 * <p>The pipeline creates a single {@code KV.of("key", 1)} element, groups it with {@code
 * GroupByKey}, and combines the grouped values with a {@code SumCombineFn}. Abandoned-node
 * enforcement is switched off on the returned pipeline.
 *
 * @return the constructed (not yet run) test pipeline
 */
private static TestPipeline createCombineGroupedValuesPipeline() {
  TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  // Explicit coder for the String -> Integer pairs produced below.
  KvCoder<String, Integer> kvCoder = KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of());
  PCollection<KV<String, Integer>> keyedInts =
      p.apply(Create.of(KV.of("key", 1))).setCoder(kvCoder);

  // Only the shape of the pipeline is of interest; the combined output is not kept.
  keyedInts
      .apply(GroupByKey.create())
      .apply(Combine.groupedValues(new SumCombineFn()));

  return p;
}
/**
 * Builds a minimal pipeline that contains a {@link Combine.GroupedValues} with side inputs.
 *
 * <p>The pipeline creates a single {@code KV.of("key", 1)} element, groups it with {@code
 * GroupByKey}, and combines the grouped values with a {@code SumCombineFnWithContext} that is
 * given a singleton view over the string {@code "side input"}. Abandoned-node enforcement is
 * switched off on the returned pipeline.
 *
 * @return the constructed (not yet run) test pipeline
 */
private static TestPipeline createCombineGroupedValuesWithSideInputsPipeline() {
  TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  // Main input: one String -> Integer pair with an explicit coder.
  KvCoder<String, Integer> kvCoder = KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of());
  PCollection<KV<String, Integer>> keyedInts =
      p.apply(Create.of(KV.of("key", 1))).setCoder(kvCoder);

  // Side input: a one-element collection exposed as a singleton view.
  PCollection<String> sideValues = p.apply(Create.of("side input"));
  PCollectionView<String> sideView = sideValues.apply(View.asSingleton());

  // Combine transform over the grouped values; the side input is attached when it is applied.
  Combine.GroupedValues<String, Integer, Integer> summing =
      Combine.<String, Integer, Integer>groupedValues(new SumCombineFnWithContext());

  keyedInts
      .apply(GroupByKey.create())
      .apply(summing.withSideInputs(sideView));

  return p;
}
// Apply GroupByKey to dataset1, then combine the grouped values with a CountFn via
// Combine.groupedValues. The result of the final apply is not assigned here.
dataset1.apply(GroupByKey.create()).apply(Combine.groupedValues(new CountFn()));