@Override public Graph<LongValue, NullValue, NullValue> generate() { int scale = Long.SIZE - Long.numberOfLeadingZeros(vertexCount - 1); // Edges int cyclesPerEdge = noiseEnabled ? 5 * scale : scale; List<BlockInfo<T>> generatorBlocks = randomGenerableFactory .getRandomGenerables(edgeCount, cyclesPerEdge); DataSet<Edge<LongValue, NullValue>> edges = env .fromCollection(generatorBlocks) .name("Random generators") .rebalance() .setParallelism(parallelism) .name("Rebalance") .flatMap(new GenerateEdges<>(vertexCount, scale, a, b, c, noiseEnabled, noise)) .setParallelism(parallelism) .name("RMat graph edges"); // Vertices DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSet(edges, parallelism); // Graph return Graph.fromDataSet(vertices, edges, env); }
/**
 * Registers a hash-partition operation for the Python plan.
 *
 * <p>The parent data set is hash-partitioned on the configured key fields and the
 * keys are then discarded, leaving only the serialized payload bytes.
 *
 * @param info operation descriptor carrying parent/target set IDs, key fields and parallelism
 */
private <K extends Tuple> void createHashPartitionOperation(PythonOperationInfo info) {
    DataSet<Tuple2<K, byte[]>> op1 = sets.getDataSet(info.parentID);
    // toArray(new String[0]) is the preferred idiom over a pre-sized array.
    DataSet<byte[]> result = op1
        .partitionByHash(info.keys.toArray(new String[0])).setParallelism(info.parallelism)
        .map(new KeyDiscarder<K>()).setParallelism(info.parallelism).name("HashPartitionPostStep");
    sets.add(info.setID, result);
}
/**
 * Enforces a re-balancing of the DataSet, i.e., the DataSet is evenly distributed over all
 * parallel instances of the following task. This can help to improve performance in case of
 * heavy data skew and compute intensive operations.
 *
 * <p><b>Important:</b> This operation shuffles the whole DataSet over the network and can take
 * significant amount of time.
 *
 * @return The re-balanced DataSet.
 */
public PartitionOperator<T> rebalance() {
    return new PartitionOperator<>(this, PartitionMethod.REBALANCE, Utils.getCallLocationName());
}
/**
 * Registers a rebalance operation for the Python plan: the parent data set is
 * redistributed evenly with the requested parallelism.
 *
 * @param info operation descriptor carrying parent/target set IDs and parallelism
 */
private void createRebalanceOperation(PythonOperationInfo info) {
    DataSet<?> parent = sets.getDataSet(info.parentID);
    DataSet<?> rebalanced = parent
        .rebalance()
        .setParallelism(info.parallelism)
        .name("Rebalance");
    sets.add(info.setID, rebalanced);
}
// NOTE(review): fragment of a partition-translation method — the enclosing method
// signature is outside this view, so the code is left unchanged.
// TODO confirm: the duplicate 'operatorInfo' declarations suggest these are
// alternative branches of the enclosing method, not sequential statements.

// Branch A: rebalance — no partition keys.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> rebalancedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, name);
rebalancedInput.setInput(input);
rebalancedInput.setParallelism(getParallelism());

// Branch B: partition on logical key positions, carrying over the data
// distribution, custom partitioner and key ordering.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> partitionedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, logicalKeyPositions, name);
partitionedInput.setInput(input);
partitionedInput.setParallelism(getParallelism());
partitionedInput.setDistribution(distribution);
partitionedInput.setCustomPartitioner(customPartitioner);
partitionedInput.setOrdering(computeOrdering(pKeys, orders));

// Branch C: keys given as a selector function — delegate to the selector-based translation.
return translateSelectorFunctionPartitioner(selectorKeys, pMethod, name, input, getParallelism(), customPartitioner, orders);
@Test
public void testCustomPartitioningTupleInvalidType() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Tuple2<Integer, Integer>> data = env
            .fromElements(new Tuple2<Integer, Integer>(0, 0))
            .rebalance().setParallelism(4);

        try {
            // Partitioner operates on Long keys while the grouping field is Integer.
            data.groupBy(0).withPartitioner(new TestPartitionerLong());
            fail("Should throw an exception");
        } catch (InvalidProgramException ignored) {
            // expected
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void testRangePartitionWithKeyExpression() throws Exception {
    /*
     * Test range partition with key expression
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);

    DataSet<POJO> pojos = CollectionDataSets.getDuplicatePojoDataSet(env);

    // Range-partition on a nested key expression with a higher parallelism,
    // then collect the distinct nested long values per partition.
    List<Long> result = pojos
        .partitionByRange("nestedPojo.longNumber").setParallelism(4)
        .mapPartition(new UniqueNestedPojoLongMapper())
        .collect();

    String expected = "10000\n" + "20000\n" + "30000\n";
    compareResultAsText(result, expected);
}
@Test
public void reuseBothPartitioningCoGroup6() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> set1 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> set2 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Both inputs are pre-partitioned on field 2 and the mappers forward it.
    DataSet<Tuple3<Integer, Integer, Integer>> left = set1
        .partitionByHash(2)
        .map(new MockMapper()).withForwardedFields("2");
    DataSet<Tuple3<Integer, Integer, Integer>> right = set2
        .partitionByHash(2)
        .map(new MockMapper()).withForwardedFields("2");

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped = left
        .coGroup(right)
        .where(0, 2).equalTo(1, 2).with(new MockCoGroup());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    SinkPlanNode sink = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sink.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}
/**
 * Registers a rebalance operation: the parent data set is redistributed evenly
 * across the parallel instances of the following task.
 *
 * @param info operation descriptor carrying the parent and target set IDs
 * @throws IOException declared by the operation-creation contract of this binder
 */
private void createRebalanceOperation(OperationInfo info) throws IOException {
    // Use a wildcard instead of the raw DataSet type to avoid unchecked usage,
    // matching the typed variant of this method elsewhere in the project.
    DataSet<?> op = (DataSet<?>) sets.get(info.parentID);
    sets.put(info.setID, op.rebalance().name("Rebalance"));
}
.flatMap(new RichFlatMapFunction<Long, Long>() {
@Test
public void testIncorrectSerializer4() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(PARLLELISM);
        env.getConfig().disableSysoutLogging();

        env.generateSequence(1, 10 * PARLLELISM)
            .map(new MapFunction<Long, ConsumesTooLittleSpanning>() {
                @Override
                public ConsumesTooLittleSpanning map(Long value) throws Exception {
                    return new ConsumesTooLittleSpanning();
                }
            })
            .rebalance()
            .output(new DiscardingOutputFormat<ConsumesTooLittleSpanning>());

        env.execute();
    } catch (ProgramInvocationException e) {
        // The faulty serializer is expected to surface as a doubly-wrapped IOException.
        Throwable cause = e.getCause().getCause();
        assertTrue(cause instanceof IOException);
        assertTrue(cause.getMessage().contains("broken serialization"));
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
@Test
public void CoGroupWithDifferentDistributionTest() throws Exception {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> set1 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> set2 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Inputs are range-partitioned with two DIFFERENT distributions.
    TestDistribution testDistribution1 = new TestDistribution(3);
    TestDistribution testDistribution2 = new TestDistribution(2);

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped =
        DataSetUtils.partitionByRange(set1, testDistribution1, 0)
            .coGroup(DataSetUtils.partitionByRange(set2, testDistribution2, 0))
            .where(0).equalTo(0).with(new CoGroupFunc());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    SinkPlanNode sink = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sink.getInput().getSource();

    // Mismatched distributions cannot be reused: both inputs must be re-partitioned by hash.
    Channel input1 = coGroupNode.getInput1();
    Channel input2 = coGroupNode.getInput2();
    assertEquals(ShipStrategyType.PARTITION_HASH, input1.getShipStrategy());
    assertEquals(ShipStrategyType.PARTITION_HASH, input2.getShipStrategy());
}
@Test
public void testCustomPartitioningTupleInvalidType() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<Pojo2> data = env
            .fromElements(new Pojo2())
            .rebalance().setParallelism(4);

        try {
            // Partitioner operates on Long keys while the grouping field "a" does not.
            data.groupBy("a").withPartitioner(new TestPartitionerLong());
            fail("Should throw an exception");
        } catch (InvalidProgramException ignored) {
            // expected
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
.setParallelism(dop) .name("reduce-" + node.getID());
@Test
public void testHashPartitionWithKeyExpression() throws Exception {
    /*
     * Test hash partition with key expression
     */
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(3);

    DataSet<POJO> pojos = CollectionDataSets.getDuplicatePojoDataSet(env);

    // Hash-partition on a nested key expression with a higher parallelism,
    // then collect the distinct nested long values per partition.
    List<Long> result = pojos
        .partitionByHash("nestedPojo.longNumber").setParallelism(4)
        .mapPartition(new UniqueNestedPojoLongMapper())
        .collect();

    String expected = "10000\n" + "20000\n" + "30000\n";
    compareResultAsText(result, expected);
}
@Test
public void reuseBothPartitioningCoGroup5() {
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> set1 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> set2 =
        env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Inputs are pre-partitioned on different fields (2 vs. 1), each forwarded by its mapper.
    DataSet<Tuple3<Integer, Integer, Integer>> left = set1
        .partitionByHash(2)
        .map(new MockMapper()).withForwardedFields("2");
    DataSet<Tuple3<Integer, Integer, Integer>> right = set2
        .partitionByHash(1)
        .map(new MockMapper()).withForwardedFields("1");

    DataSet<Tuple3<Integer, Integer, Integer>> coGrouped = left
        .coGroup(right)
        .where(0, 2).equalTo(2, 1).with(new MockCoGroup());

    coGrouped.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    Plan plan = env.createProgramPlan();
    OptimizedPlan optimizedPlan = compileWithStats(plan);

    SinkPlanNode sink = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sink.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}
// NOTE(review): fragment of a partition-translation method — the enclosing method
// signature is outside this view, so the code is left unchanged.
// TODO confirm: the duplicate 'operatorInfo' declarations suggest these are
// alternative branches of the enclosing method, not sequential statements.

// Branch A: rebalance — no partition keys.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> rebalancedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, name);
rebalancedInput.setInput(input);
rebalancedInput.setParallelism(getParallelism());

// Branch B: partition on logical key positions, carrying over the data
// distribution, custom partitioner and key ordering.
UnaryOperatorInformation<T, T> operatorInfo = new UnaryOperatorInformation<>(getType(), getType());
PartitionOperatorBase<T> partitionedInput = new PartitionOperatorBase<>(operatorInfo, pMethod, logicalKeyPositions, name);
partitionedInput.setInput(input);
partitionedInput.setParallelism(getParallelism());
partitionedInput.setDistribution(distribution);
partitionedInput.setCustomPartitioner(customPartitioner);
partitionedInput.setOrdering(computeOrdering(pKeys, orders));

// Branch C: keys given as a selector function — delegate to the selector-based translation.
return translateSelectorFunctionPartitioner(selectorKeys, pMethod, name, input, getParallelism(), customPartitioner, orders);
/**
 * Creates the initial working set from the edge candidates.
 *
 * @return initial working set with the expand embeddings
 */
private DataSet<ExpandEmbedding> preProcess() {
    // For incoming expansion, traverse the candidate edges in reverse direction.
    if (direction == ExpandDirection.IN) {
        candidateEdges = candidateEdges
            .map(new ReverseEdgeEmbedding())
            .name(getName() + " - Reverse Edges");
    }

    // Key the candidate edges and hash-partition them for the subsequent join.
    this.candidateEdgeTuples = candidateEdges
        .map(new ExtractKeyedCandidateEdges())
        .name(getName() + " - Create candidate edge tuples")
        .partitionByHash(0)
        .name(getName() + " - Partition edge tuples");

    // Join the input embeddings with the keyed candidate edges to build the
    // initial expand embeddings.
    return input
        .join(candidateEdgeTuples, joinHint)
        .where(new ExtractExpandColumn(expandColumn)).equalTo(0)
        .with(new CreateExpandEmbedding(distinctVertexColumns, distinctEdgeColumns, closingColumn))
        .name(getName() + " - Initial expansion");
}
.flatMap(new ConnectedComponents.UndirectEdge());
@Test
public void testIncorrectSerializer3() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(PARLLELISM);
        env.getConfig().disableSysoutLogging();

        env.generateSequence(1, 10 * PARLLELISM)
            .map(new MapFunction<Long, ConsumesTooLittle>() {
                @Override
                public ConsumesTooLittle map(Long value) throws Exception {
                    return new ConsumesTooLittle();
                }
            })
            .rebalance()
            .output(new DiscardingOutputFormat<ConsumesTooLittle>());

        env.execute();
    } catch (JobExecutionException e) {
        // The faulty serializer is expected to surface as a wrapped IOException.
        Throwable cause = e.getCause();
        assertTrue(cause instanceof IOException);
        assertTrue(cause.getMessage().contains("broken serialization"));
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}