/** * Applies a GroupReduce transformation on a non-grouped {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} once with the full DataSet. * The GroupReduceFunction can iterate over all elements of the DataSet and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see org.apache.flink.api.java.operators.GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } String callLocation = Utils.getCallLocationName(); TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, getType(), callLocation, true); return new GroupReduceOperator<>(this, resultType, clean(reducer), callLocation); }
/** * Applies a GroupReduce transformation on a grouped and sorted {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} for each group of the DataSet. * A GroupReduceFunction can iterate over all elements of a group and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on each group of the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, inputDataSet.getType(), Utils.getCallLocationName(), true); return new GroupReduceOperator<>(this, resultType, inputDataSet.clean(reducer), Utils.getCallLocationName()); }
/** * Applies a GroupReduce transformation on a grouped {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} for each group of the DataSet. * A GroupReduceFunction can iterate over all elements of a group and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on each group of the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, this.getInputDataSet().getType(), Utils.getCallLocationName(), true); return new GroupReduceOperator<T, R>(this, resultType, inputDataSet.clean(reducer), Utils.getCallLocationName()); }
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
/** * Applies a GroupReduce transformation on a non-grouped {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} once with the full DataSet. * The GroupReduceFunction can iterate over all elements of the DataSet and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see org.apache.flink.api.java.operators.GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } String callLocation = Utils.getCallLocationName(); TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, getType(), callLocation, true); return new GroupReduceOperator<>(this, resultType, clean(reducer), callLocation); }
/** * Applies a GroupReduce transformation on a non-grouped {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} once with the full DataSet. * The GroupReduceFunction can iterate over all elements of the DataSet and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see org.apache.flink.api.java.operators.GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } String callLocation = Utils.getCallLocationName(); TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, getType(), callLocation, true); return new GroupReduceOperator<>(this, resultType, clean(reducer), callLocation); }
/** * Applies a GroupReduce transformation on a grouped and sorted {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} for each group of the DataSet. * A GroupReduceFunction can iterate over all elements of a group and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on each group of the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, inputDataSet.getType(), Utils.getCallLocationName(), true); return new GroupReduceOperator<>(this, resultType, inputDataSet.clean(reducer), Utils.getCallLocationName()); }
/** * Applies a GroupReduce transformation on a grouped and sorted {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} for each group of the DataSet. * A GroupReduceFunction can iterate over all elements of a group and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on each group of the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, inputDataSet.getType(), Utils.getCallLocationName(), true); return new GroupReduceOperator<>(this, resultType, inputDataSet.clean(reducer), Utils.getCallLocationName()); }
/** * Applies a GroupReduce transformation on a grouped {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} for each group of the DataSet. * A GroupReduceFunction can iterate over all elements of a group and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on each group of the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, this.getInputDataSet().getType(), Utils.getCallLocationName(), true); return new GroupReduceOperator<T, R>(this, resultType, inputDataSet.clean(reducer), Utils.getCallLocationName()); }
/** * Applies a GroupReduce transformation on a grouped {@link DataSet}. * * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction} for each group of the DataSet. * A GroupReduceFunction can iterate over all elements of a group and emit any * number of output elements including none. * * @param reducer The GroupReduceFunction that is applied on each group of the DataSet. * @return A GroupReduceOperator that represents the reduced DataSet. * * @see org.apache.flink.api.common.functions.RichGroupReduceFunction * @see GroupReduceOperator * @see DataSet */ public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) { if (reducer == null) { throw new NullPointerException("GroupReduce function must not be null."); } TypeInformation<R> resultType = TypeExtractor.getGroupReduceReturnTypes(reducer, this.getInputDataSet().getType(), Utils.getCallLocationName(), true); return new GroupReduceOperator<T, R>(this, resultType, inputDataSet.clean(reducer), Utils.getCallLocationName()); }
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
new GroupReduceOperator(grouping, typeInformation, doFnWrapper, transform.getName());
@Override public void translateNode(GroupByKey.GroupByKeyOnly<K, V> transform, FlinkBatchTranslationContext context) { DataSet<KV<K, V>> inputDataSet = context.getInputDataSet(context.getInput(transform)); GroupReduceFunction<KV<K, V>, KV<K, Iterable<V>>> groupReduceFunction = new FlinkKeyedListAggregationFunction<>(); TypeInformation<KV<K, Iterable<V>>> typeInformation = context.getTypeInfo(context.getOutput(transform)); Grouping<KV<K, V>> grouping = new UnsortedGrouping<>(inputDataSet, new Keys.ExpressionKeys<>(new String[]{"key"}, inputDataSet.getType())); GroupReduceOperator<KV<K, V>, KV<K, Iterable<V>>> outputDataSet = new GroupReduceOperator<>(grouping, typeInformation, groupReduceFunction, transform.getName()); context.setOutputDataSet(context.getOutput(transform), outputDataSet); } }
@Override public void translateNode(GroupByKey<K, V> transform, FlinkBatchTranslationContext context) { DataSet<KV<K, V>> inputDataSet = context.getInputDataSet(context.getInput(transform)); GroupReduceFunction<KV<K, V>, KV<K, Iterable<V>>> groupReduceFunction = new FlinkKeyedListAggregationFunction<>(); TypeInformation<KV<K, Iterable<V>>> typeInformation = context.getTypeInfo(context.getOutput(transform)); Grouping<KV<K, V>> grouping = new UnsortedGrouping<>(inputDataSet, new Keys.ExpressionKeys<>(new String[]{"key"}, inputDataSet.getType())); GroupReduceOperator<KV<K, V>, KV<K, Iterable<V>>> outputDataSet = new GroupReduceOperator<>(grouping, typeInformation, groupReduceFunction, transform.getName()); context.setOutputDataSet(context.getOutput(transform), outputDataSet); } }
new GroupReduceOperator<>( intermediateGrouping, partialReduceTypeInfo, reduceFunction, fullName);
new GroupReduceOperator<>( intermediateGrouping, partialReduceTypeInfo, reduceFunction, fullName);
new GroupReduceOperator<>( intermediateGrouping, partialReduceTypeInfo, reduceFunction, transform.getName());
new GroupReduceOperator<>( intermediateGrouping, reduceTypeInfo, reduceFunction, transform.getName()); new GroupReduceOperator<>( grouping, reduceTypeInfo, reduceFunction, transform.getName());
new GroupReduceOperator<>(intermediateGrouping, reduceTypeInfo, reduceFunction, transform.getName());