/** * Applies a Map-style operation to the entire partition of the data. * The function is called once per parallel partition of the data, * and the entire partition is available through the given Iterator. * The number of elements that each instance of the MapPartition function * sees is non deterministic and depends on the parallelism of the operation. * * <p>This function is intended for operations that cannot transform individual elements, * requires no grouping of elements. To transform individual elements, * the use of {@code map()} and {@code flatMap()} is preferable. * * @param mapPartition The MapPartitionFunction that is called for the full DataSet. * @return A MapPartitionOperator that represents the transformed DataSet. * * @see MapPartitionFunction * @see MapPartitionOperator */ public <R> MapPartitionOperator<T, R> mapPartition(MapPartitionFunction<T, R> mapPartition) { if (mapPartition == null) { throw new NullPointerException("MapPartition function must not be null."); } String callLocation = Utils.getCallLocationName(); TypeInformation<R> resultType = TypeExtractor.getMapPartitionReturnTypes(mapPartition, getType(), callLocation, true); return new MapPartitionOperator<>(this, resultType, clean(mapPartition), callLocation); }
}).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() { @Override public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {
}).withBroadcastSet(elementCount, "counts");
@Override protected MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> translateToDataFlow(Operator<IN> input) { String name = getName() != null ? getName() : "MapPartition at " + defaultName; // create operator MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> po = new MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>>(function, new UnaryOperatorInformation<IN, OUT>(getInputType(), getResultType()), name); // set input po.setInput(input); // set parallelism if (this.getParallelism() > 0) { // use specified parallelism po.setParallelism(this.getParallelism()); } else { // if no parallelism has been specified, use parallelism of input operator to enable chaining po.setParallelism(input.getParallelism()); } return po; } }
private <IN, OUT> void createMapOperation(PythonOperationInfo info, TypeInformation<OUT> type) { DataSet<IN> op1 = sets.getDataSet(info.parentID); sets.add(info.setID, op1 .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type)) .setParallelism(info.parallelism).name(info.name)); }
private DataSet<Tuple> translateMap(DataSet<Tuple> input, FlowNode node) { Fields outFields = getOutScope(node).getOutValuesFields(); registerKryoTypes(outFields); int dop = ((Operator)input).getParallelism(); return input .mapPartition(new EachMapper(node)) .returns(new TupleTypeInfo(outFields)) .withParameters(this.getFlinkNodeConfig(node)) .setParallelism(dop) .name("map-" + node.getID()); }
/** * {@inheritDoc} */ @Override public double updateModel(DataFlink<DataInstance> dataUpdate) { try { this.initLearning(); Configuration config = new Configuration(); config.setString(ParameterLearningAlgorithm.BN_NAME, this.dag.getName()); config.setBytes(EFBN_NAME, Serialization.serializeObject(efBayesianNetwork)); DataSet<DataInstance> dataset = dataUpdate.getDataSet(); this.sumSS = dataset.mapPartition(new SufficientSatisticsMAP()) .withParameters(config) .reduce(new SufficientSatisticsReduce()) .collect().get(0); //Add the prior sumSS.sum(efBayesianNetwork.createInitSufficientStatistics()); JobExecutionResult result = dataset.getExecutionEnvironment().getLastJobExecutionResult(); numInstances = result.getAccumulatorResult(ParallelMaximumLikelihood2.COUNTER_NAME+"_"+this.dag.getName()); numInstances++;//Initial counts }catch(Exception ex){ throw new UndeclaredThrowableException(ex); } return this.getLogMarginalProbability(); }
@Override protected void testProgram() throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple2<String, String>> data = env.fromCollection(input); data.mapPartition(new TestMapPartition()).output(new LocalCollectionOutputFormat<Tuple2<String, Integer>>(result)); env.execute(); }
@Override protected MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> translateToDataFlow(Operator<IN> input) { String name = getName() != null ? getName() : "MapPartition at " + defaultName; // create operator MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> po = new MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>>(function, new UnaryOperatorInformation<IN, OUT>(getInputType(), getResultType()), name); // set input po.setInput(input); // set parallelism if (this.getParallelism() > 0) { // use specified parallelism po.setParallelism(this.getParallelism()); } else { // if no parallelism has been specified, use parallelism of input operator to enable chaining po.setParallelism(input.getParallelism()); } return po; } }
private <IN, OUT> void createFlatMapOperation(PythonOperationInfo info, TypeInformation<OUT> type) { DataSet<IN> op1 = sets.getDataSet(info.parentID); sets.add(info.setID, op1 .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type)) .setParallelism(info.parallelism).name(info.name)); }
.withParameters(this.getFlinkNodeConfig(node)) .setParallelism(probeSideDOP) .returns(new TupleTypeInfo(outFields)) .name("hashjoin-" + node.getID());
.partitionCustom(part, 0) .mapPartition(new IdentityPartitionerMapper<Tuple2<Integer,Integer>>()) .output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());
/** * Applies a Map-style operation to the entire partition of the data. * The function is called once per parallel partition of the data, * and the entire partition is available through the given Iterator. * The number of elements that each instance of the MapPartition function * sees is non deterministic and depends on the parallelism of the operation. * * <p>This function is intended for operations that cannot transform individual elements, * requires no grouping of elements. To transform individual elements, * the use of {@code map()} and {@code flatMap()} is preferable. * * @param mapPartition The MapPartitionFunction that is called for the full DataSet. * @return A MapPartitionOperator that represents the transformed DataSet. * * @see MapPartitionFunction * @see MapPartitionOperator */ public <R> MapPartitionOperator<T, R> mapPartition(MapPartitionFunction<T, R> mapPartition) { if (mapPartition == null) { throw new NullPointerException("MapPartition function must not be null."); } String callLocation = Utils.getCallLocationName(); TypeInformation<R> resultType = TypeExtractor.getMapPartitionReturnTypes(mapPartition, getType(), callLocation, true); return new MapPartitionOperator<>(this, resultType, clean(mapPartition), callLocation); }
@Override protected MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> translateToDataFlow(Operator<IN> input) { String name = getName() != null ? getName() : "MapPartition at " + defaultName; // create operator MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> po = new MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>>(function, new UnaryOperatorInformation<IN, OUT>(getInputType(), getResultType()), name); // set input po.setInput(input); // set parallelism if (this.getParallelism() > 0) { // use specified parallelism po.setParallelism(this.getParallelism()); } else { // if no parallelism has been specified, use parallelism of input operator to enable chaining po.setParallelism(input.getParallelism()); } return po; } }
private <IN, OUT> void createFilterOperation(PythonOperationInfo info, TypeInformation<OUT> type) { DataSet<IN> op1 = sets.getDataSet(info.parentID); sets.add(info.setID, op1 .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type)) .setParallelism(info.parallelism).name(info.name)); }
private static void transformSideInputs(List<PCollectionView<?>> sideInputs, MapPartitionOperator<?, ?> outputDataSet, FlinkBatchTranslationContext context) { // get corresponding Flink broadcast DataSets for(PCollectionView<?> input : sideInputs) { DataSet<?> broadcastSet = context.getSideInputDataSet(input); outputDataSet.withBroadcastSet(broadcastSet, input.getTagInternal().getId()); } }
}).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() { @Override public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {
.partitionCustom(part, "a") .mapPartition(new IdentityPartitionerMapper<Pojo>()) .output(new DiscardingOutputFormat<Pojo>());
/** * Applies a Map-style operation to the entire partition of the data. * The function is called once per parallel partition of the data, * and the entire partition is available through the given Iterator. * The number of elements that each instance of the MapPartition function * sees is non deterministic and depends on the parallelism of the operation. * * <p>This function is intended for operations that cannot transform individual elements, * requires no grouping of elements. To transform individual elements, * the use of {@code map()} and {@code flatMap()} is preferable. * * @param mapPartition The MapPartitionFunction that is called for the full DataSet. * @return A MapPartitionOperator that represents the transformed DataSet. * * @see MapPartitionFunction * @see MapPartitionOperator */ public <R> MapPartitionOperator<T, R> mapPartition(MapPartitionFunction<T, R> mapPartition) { if (mapPartition == null) { throw new NullPointerException("MapPartition function must not be null."); } String callLocation = Utils.getCallLocationName(); TypeInformation<R> resultType = TypeExtractor.getMapPartitionReturnTypes(mapPartition, getType(), callLocation, true); return new MapPartitionOperator<>(this, resultType, clean(mapPartition), callLocation); }
private <IN, OUT> void createMapPartitionOperation(PythonOperationInfo info, TypeInformation<OUT> type) { DataSet<IN> op1 = sets.getDataSet(info.parentID); sets.add(info.setID, op1 .mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type)) .setParallelism(info.parallelism).name(info.name)); }