org.apache.flink.api.java.operators.MapPartitionOperator java code examples

/**
 * Runs a Map-style function over whole partitions of this DataSet instead of single elements.
 *
 * <p>The function is invoked once per parallel partition and receives that partition's
 * elements through the given Iterator. How many elements one invocation sees is not
 * deterministic; it depends on the parallelism of the operation.
 *
 * <p>Use this for operations that cannot be expressed one element at a time and that
 * need no grouping of elements. For per-element transformations, prefer
 * {@code map()} or {@code flatMap()}.
 *
 * @param mapPartition The MapPartitionFunction that is called for the full DataSet.
 * @return A MapPartitionOperator that represents the transformed DataSet.
 * @throws NullPointerException if {@code mapPartition} is {@code null}.
 *
 * @see MapPartitionFunction
 * @see MapPartitionOperator
 */
public <R> MapPartitionOperator<T, R> mapPartition(MapPartitionFunction<T, R> mapPartition) {
  if (mapPartition == null) {
    throw new NullPointerException("MapPartition function must not be null.");
  }

  // Captured once; used both for return-type extraction and for the new operator.
  final String location = Utils.getCallLocationName();
  final TypeInformation<R> producedType =
      TypeExtractor.getMapPartitionReturnTypes(mapPartition, getType(), location, true);

  return new MapPartitionOperator<>(this, producedType, clean(mapPartition), location);
}

}).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {
  @Override
  public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {

}).withBroadcastSet(elementCount, "counts");

  /**
   * Creates the {@code MapPartitionOperatorBase} for this operator and connects it to {@code input}.
   *
   * <p>The operator is named after {@code getName()} when that is set, otherwise
   * {@code "MapPartition at " + defaultName}. Its parallelism is this operator's own value
   * when positive, otherwise the parallelism of {@code input}.
   *
   * @param input the operator whose output feeds the created operator
   * @return the configured operator with its input and parallelism set
   */
  @Override
  protected MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> translateToDataFlow(Operator<IN> input) {
    // Fall back to a generated name when none was assigned.
    final String operatorName;
    if (getName() != null) {
      operatorName = getName();
    } else {
      operatorName = "MapPartition at " + defaultName;
    }

    final UnaryOperatorInformation<IN, OUT> operatorInfo =
        new UnaryOperatorInformation<IN, OUT>(getInputType(), getResultType());
    final MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> base =
        new MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>>(function, operatorInfo, operatorName);
    base.setInput(input);

    // A positive, explicitly specified parallelism wins; otherwise the input's
    // parallelism is reused to enable chaining.
    base.setParallelism(this.getParallelism() > 0 ? this.getParallelism() : input.getParallelism());

    return base;
  }
}

/**
 * Applies a {@code PythonMapPartition} to the data set registered under {@code info.parentID}
 * and registers the result under {@code info.setID}.
 *
 * <p>The resulting operator takes its parallelism and name from {@code info}.
 *
 * @param info descriptor supplying the parent/result set IDs, environment ID, parallelism and name
 * @param type type information handed to the {@code PythonMapPartition}
 */
private <IN, OUT> void createMapOperation(PythonOperationInfo info, TypeInformation<OUT> type) {
  DataSet<IN> parent = sets.getDataSet(info.parentID);
  PythonMapPartition<IN, OUT> pythonFunction =
    new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type);
  sets.add(
    info.setID,
    parent.mapPartition(pythonFunction).setParallelism(info.parallelism).name(info.name));
}

/**
 * Appends an {@code EachMapper}-based {@code mapPartition} step for {@code node} to {@code input}.
 *
 * <p>The output fields of the node's out scope are registered via {@code registerKryoTypes}
 * and used as the declared result type. The new step reuses the parallelism of
 * {@code input}, receives the node's Flink configuration as parameters, and is named
 * {@code "map-" + node.getID()}.
 *
 * @param input the data set to transform
 * @param node the flow node being translated
 * @return the data set produced by the new step
 */
private DataSet<Tuple> translateMap(DataSet<Tuple> input, FlowNode node) {
  Fields outputFields = getOutScope(node).getOutValuesFields();
  registerKryoTypes(outputFields);

  // The new step runs with the same parallelism as its input.
  int inputParallelism = ((Operator)input).getParallelism();
  EachMapper mapper = new EachMapper(node);

  return input
      .mapPartition(mapper)
      .returns(new TupleTypeInfo(outputFields))
      .withParameters(this.getFlinkNodeConfig(node))
      .setParallelism(inputParallelism)
      .name("map-" + node.getID());
}

/**
 * {@inheritDoc}
 *
 * <p>This implementation calls {@code initLearning()}, then runs a
 * {@code mapPartition}/{@code reduce} pipeline over the data set of {@code dataUpdate} and
 * takes the first collected element as the summed sufficient statistics. The prior
 * statistics created by {@code efBayesianNetwork} are added to that sum, and the instance
 * count is read from an accumulator of the last job execution and incremented by one.
 *
 * @throws UndeclaredThrowableException wrapping any exception raised during the steps above
 */
@Override
public double updateModel(DataFlink<DataInstance> dataUpdate) {
  try {
    this.initLearning();

    // Parameters handed to the partition-level function.
    Configuration jobParameters = new Configuration();
    jobParameters.setString(ParameterLearningAlgorithm.BN_NAME, this.dag.getName());
    jobParameters.setBytes(EFBN_NAME, Serialization.serializeObject(efBayesianNetwork));

    DataSet<DataInstance> instances = dataUpdate.getDataSet();
    this.sumSS = instances
        .mapPartition(new SufficientSatisticsMAP())
        .withParameters(jobParameters)
        .reduce(new SufficientSatisticsReduce())
        .collect()
        .get(0);

    // Add the prior
    sumSS.sum(efBayesianNetwork.createInitSufficientStatistics());

    JobExecutionResult lastJob = instances.getExecutionEnvironment().getLastJobExecutionResult();
    String counterName = ParallelMaximumLikelihood2.COUNTER_NAME+"_"+this.dag.getName();
    numInstances = lastJob.getAccumulatorResult(counterName);
    numInstances++; // Initial counts
  } catch (Exception ex) {
    throw new UndeclaredThrowableException(ex);
  }

  return this.getLogMarginalProbability();
}

/**
 * Builds a data set from {@code input}, applies {@code TestMapPartition} to it, writes the
 * outcome into {@code result} through a {@code LocalCollectionOutputFormat}, and executes
 * the program.
 *
 * @throws Exception if building or executing the program fails
 */
@Override
protected void testProgram() throws Exception {
  ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();

  DataSet<Tuple2<String, String>> source = environment.fromCollection(input);
  DataSet<Tuple2<String, Integer>> mapped = source.mapPartition(new TestMapPartition());
  mapped.output(new LocalCollectionOutputFormat<Tuple2<String, Integer>>(result));

  environment.execute();
}

  /**
   * Creates the {@code MapPartitionOperatorBase} for this operator and connects it to {@code input}.
   *
   * <p>The operator is named after {@code getName()} when that is set, otherwise
   * {@code "MapPartition at " + defaultName}. Its parallelism is this operator's own value
   * when positive, otherwise the parallelism of {@code input}.
   *
   * @param input the operator whose output feeds the created operator
   * @return the configured operator with its input and parallelism set
   */
  @Override
  protected MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> translateToDataFlow(Operator<IN> input) {
    // Fall back to a generated name when none was assigned.
    final String operatorName;
    if (getName() != null) {
      operatorName = getName();
    } else {
      operatorName = "MapPartition at " + defaultName;
    }

    final UnaryOperatorInformation<IN, OUT> operatorInfo =
        new UnaryOperatorInformation<IN, OUT>(getInputType(), getResultType());
    final MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> base =
        new MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>>(function, operatorInfo, operatorName);
    base.setInput(input);

    // A positive, explicitly specified parallelism wins; otherwise the input's
    // parallelism is reused to enable chaining.
    base.setParallelism(this.getParallelism() > 0 ? this.getParallelism() : input.getParallelism());

    return base;
  }
}

/**
 * Applies a {@code PythonMapPartition} to the data set registered under {@code info.parentID}
 * and registers the result under {@code info.setID}.
 *
 * <p>The resulting operator takes its parallelism and name from {@code info}.
 *
 * @param info descriptor supplying the parent/result set IDs, environment ID, parallelism and name
 * @param type type information handed to the {@code PythonMapPartition}
 */
private <IN, OUT> void createFlatMapOperation(PythonOperationInfo info, TypeInformation<OUT> type) {
  DataSet<IN> parent = sets.getDataSet(info.parentID);
  PythonMapPartition<IN, OUT> pythonFunction =
    new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type);
  sets.add(
    info.setID,
    parent.mapPartition(pythonFunction).setParallelism(info.parallelism).name(info.name));
}

.withParameters(this.getFlinkNodeConfig(node))
.setParallelism(probeSideDOP)
.returns(new TupleTypeInfo(outFields))
.name("hashjoin-" + node.getID());

.partitionCustom(part, 0)
.mapPartition(new IdentityPartitionerMapper<Tuple2<Integer,Integer>>())
.output(new DiscardingOutputFormat<Tuple2<Integer, Integer>>());

/**
 * Runs a Map-style function over whole partitions of this DataSet instead of single elements.
 *
 * <p>The function is invoked once per parallel partition and receives that partition's
 * elements through the given Iterator. How many elements one invocation sees is not
 * deterministic; it depends on the parallelism of the operation.
 *
 * <p>Use this for operations that cannot be expressed one element at a time and that
 * need no grouping of elements. For per-element transformations, prefer
 * {@code map()} or {@code flatMap()}.
 *
 * @param mapPartition The MapPartitionFunction that is called for the full DataSet.
 * @return A MapPartitionOperator that represents the transformed DataSet.
 * @throws NullPointerException if {@code mapPartition} is {@code null}.
 *
 * @see MapPartitionFunction
 * @see MapPartitionOperator
 */
public <R> MapPartitionOperator<T, R> mapPartition(MapPartitionFunction<T, R> mapPartition) {
  if (mapPartition == null) {
    throw new NullPointerException("MapPartition function must not be null.");
  }

  // Captured once; used both for return-type extraction and for the new operator.
  final String location = Utils.getCallLocationName();
  final TypeInformation<R> producedType =
      TypeExtractor.getMapPartitionReturnTypes(mapPartition, getType(), location, true);

  return new MapPartitionOperator<>(this, producedType, clean(mapPartition), location);
}

  /**
   * Creates the {@code MapPartitionOperatorBase} for this operator and connects it to {@code input}.
   *
   * <p>The operator is named after {@code getName()} when that is set, otherwise
   * {@code "MapPartition at " + defaultName}. Its parallelism is this operator's own value
   * when positive, otherwise the parallelism of {@code input}.
   *
   * @param input the operator whose output feeds the created operator
   * @return the configured operator with its input and parallelism set
   */
  @Override
  protected MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> translateToDataFlow(Operator<IN> input) {
    // Fall back to a generated name when none was assigned.
    final String operatorName;
    if (getName() != null) {
      operatorName = getName();
    } else {
      operatorName = "MapPartition at " + defaultName;
    }

    final UnaryOperatorInformation<IN, OUT> operatorInfo =
        new UnaryOperatorInformation<IN, OUT>(getInputType(), getResultType());
    final MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> base =
        new MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>>(function, operatorInfo, operatorName);
    base.setInput(input);

    // A positive, explicitly specified parallelism wins; otherwise the input's
    // parallelism is reused to enable chaining.
    base.setParallelism(this.getParallelism() > 0 ? this.getParallelism() : input.getParallelism());

    return base;
  }
}

/**
 * Applies a {@code PythonMapPartition} to the data set registered under {@code info.parentID}
 * and registers the result under {@code info.setID}.
 *
 * <p>The resulting operator takes its parallelism and name from {@code info}.
 *
 * @param info descriptor supplying the parent/result set IDs, environment ID, parallelism and name
 * @param type type information handed to the {@code PythonMapPartition}
 */
private <IN, OUT> void createFilterOperation(PythonOperationInfo info, TypeInformation<OUT> type) {
  DataSet<IN> parent = sets.getDataSet(info.parentID);
  PythonMapPartition<IN, OUT> pythonFunction =
    new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type);
  sets.add(
    info.setID,
    parent.mapPartition(pythonFunction).setParallelism(info.parallelism).name(info.name));
}

/**
 * Attaches every side input as a broadcast set to {@code outputDataSet}.
 *
 * <p>For each view, the matching data set is looked up in {@code context} and registered
 * under the ID of the view's internal tag.
 *
 * @param sideInputs the side-input views to attach
 * @param outputDataSet the operator that receives the broadcast sets
 * @param context translation context used to resolve each view to its data set
 */
private static void transformSideInputs(
    List<PCollectionView<?>> sideInputs,
    MapPartitionOperator<?, ?> outputDataSet,
    FlinkBatchTranslationContext context) {
  for (PCollectionView<?> view : sideInputs) {
    // Resolve the Flink data set backing this view, then register it by tag ID.
    DataSet<?> broadcast = context.getSideInputDataSet(view);
    String broadcastName = view.getTagInternal().getId();
    outputDataSet.withBroadcastSet(broadcast, broadcastName);
  }
}

}).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {
  @Override
  public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {

.partitionCustom(part, "a")
.mapPartition(new IdentityPartitionerMapper<Pojo>())
.output(new DiscardingOutputFormat<Pojo>());

/**
 * Runs a Map-style function over whole partitions of this DataSet instead of single elements.
 *
 * <p>The function is invoked once per parallel partition and receives that partition's
 * elements through the given Iterator. How many elements one invocation sees is not
 * deterministic; it depends on the parallelism of the operation.
 *
 * <p>Use this for operations that cannot be expressed one element at a time and that
 * need no grouping of elements. For per-element transformations, prefer
 * {@code map()} or {@code flatMap()}.
 *
 * @param mapPartition The MapPartitionFunction that is called for the full DataSet.
 * @return A MapPartitionOperator that represents the transformed DataSet.
 * @throws NullPointerException if {@code mapPartition} is {@code null}.
 *
 * @see MapPartitionFunction
 * @see MapPartitionOperator
 */
public <R> MapPartitionOperator<T, R> mapPartition(MapPartitionFunction<T, R> mapPartition) {
  if (mapPartition == null) {
    throw new NullPointerException("MapPartition function must not be null.");
  }

  // Captured once; used both for return-type extraction and for the new operator.
  final String location = Utils.getCallLocationName();
  final TypeInformation<R> producedType =
      TypeExtractor.getMapPartitionReturnTypes(mapPartition, getType(), location, true);

  return new MapPartitionOperator<>(this, producedType, clean(mapPartition), location);
}

/**
 * Applies a {@code PythonMapPartition} to the data set registered under {@code info.parentID}
 * and registers the result under {@code info.setID}.
 *
 * <p>The resulting operator takes its parallelism and name from {@code info}.
 *
 * @param info descriptor supplying the parent/result set IDs, environment ID, parallelism and name
 * @param type type information handed to the {@code PythonMapPartition}
 */
private <IN, OUT> void createMapPartitionOperation(PythonOperationInfo info, TypeInformation<OUT> type) {
  DataSet<IN> parent = sets.getDataSet(info.parentID);
  PythonMapPartition<IN, OUT> pythonFunction =
    new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type);
  sets.add(
    info.setID,
    parent.mapPartition(pythonFunction).setParallelism(info.parallelism).name(info.name));
}

Javadoc

This operator represents the application of a "mapPartition" function on a data set, and the result data set produced by the function.

Most used methods

Popular in Java

Running tasks concurrently on multiple threads
getExternalFilesDir (Context)
scheduleAtFixedRate (ScheduledExecutorService)
getSupportFragmentManager (FragmentActivity)
HttpServer (com.sun.net.httpserver)
This class implements a simple HTTP server. A HttpServer is bound to an IP address and port number a
OutputStream (java.io)
A writable sink for bytes. Most clients will use output streams that write data to the file system (
RandomAccessFile (java.io)
Allows reading from and writing to a file in a random-access manner. This is different from the uni-
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
Locale (java.util)
Locale represents a language/country/variant combination. Locales are used to alter the presentatio
UUID (java.util)
UUID is an immutable representation of a 128-bit universally unique identifier (UUID). There are mul
Best IntelliJ plugins

How to use MapPartitionOperator in org.apache.flink.api.java.operators

Best Java code snippets using org.apache.flink.api.java.operators.MapPartitionOperator (Showing top 20 results out of 315)

How to use
MapPartitionOperator
in
org.apache.flink.api.java.operators