@Override public DataSet<Tuple3<K, K, K>> run(Graph<K, VV, EV> input) throws Exception { DataSet<Edge<K, EV>> edges = input.getEdges(); // annotate edges with degrees DataSet<EdgeWithDegrees<K>> edgesWithDegrees = edges.flatMap(new EdgeDuplicator<>()) .groupBy(0).sortGroup(1, Order.ASCENDING).reduceGroup(new DegreeCounter<>()) .groupBy(EdgeWithDegrees.V1, EdgeWithDegrees.V2).reduce(new DegreeJoiner<>()); // project edges by degrees DataSet<Edge<K, NullValue>> edgesByDegree = edgesWithDegrees.map(new EdgeByDegreeProjector<>()); // project edges by vertex id DataSet<Edge<K, NullValue>> edgesById = edgesByDegree.map(new EdgeByIdProjector<>()); DataSet<Tuple3<K, K, K>> triangles = edgesByDegree // build triads .groupBy(EdgeWithDegrees.V1).sortGroup(EdgeWithDegrees.V2, Order.ASCENDING) .reduceGroup(new TriadBuilder<>()) // filter triads .join(edgesById, JoinHint.REPARTITION_HASH_SECOND).where(Triad.V2, Triad.V3).equalTo(0, 1).with(new TriadFilter<>()); return triangles; }
.withForwardedFields("*").setParallelism(p).name("Map1") .groupBy("*").reduceGroup(new IdentityGroupReducer<Long>()) .withForwardedFields("*").setParallelism(p).name("Reduce1") .map(new IdentityMapper<Long>()) .withForwardedFields("*").setParallelism(p * 2).name("Map2") .groupBy("*").reduceGroup(new IdentityGroupReducer<Long>()) .withForwardedFields("*").setParallelism(p * 2).name("Reduce2") .output(new DiscardingOutputFormat<Long>()).setParallelism(p * 2).name("Sink");
.withForwardedFields("f1->f0") .groupBy(0).reduceGroup(new ApplyGroupReduceFunction<>(edgesFunction)) .name("GroupReduce on in-edges").returns(typeInfo); case OUT: return edges.map(new ProjectVertexIdMap<>(0)).name("Vertex ID") .withForwardedFields("f0") .groupBy(0).reduceGroup(new ApplyGroupReduceFunction<>(edgesFunction)) .name("GroupReduce on out-edges").returns(typeInfo); case ALL: return edges.flatMap(new EmitOneEdgePerNode<>()).name("Emit edge") .groupBy(0).reduceGroup(new ApplyGroupReduceFunction<>(edgesFunction)) .name("GroupReduce on in- and out-edges").returns(typeInfo); default: throw new IllegalArgumentException("Illegal edge direction");
/**
 * Chains an identity pre-step (combining disabled) ahead of a Python
 * map-partition operator, so each partition's grouped data is streamed
 * through the Python worker identified by the operation info.
 */
private <IN, OUT> DataSet<OUT> applyGroupReduceOperation(
		DataSet<IN> op1, PythonOperationInfo info, TypeInformation<OUT> type) {
	return op1
		// identity reduce as a non-combinable pre-step
		.reduceGroup(new IdentityGroupReduce<IN>())
			.setCombinable(false)
			.name("PythonGroupReducePreStep")
			.setParallelism(info.parallelism)
		// hand each partition to the Python worker
		.mapPartition(new PythonMapPartition<IN, OUT>(operatorConfig, info.envID, info.setID, type))
			.setParallelism(info.parallelism)
			.name(info.name);
}
/**
 * Translates a Python-side first(n) call. The parent may be a plain DataSet
 * or an (un)sorted grouping; grouped variants carry an artificial key that
 * is stripped in a post step.
 */
private <K extends Tuple> void createFirstOperation(PythonOperationInfo info) {
	if (sets.isDataSet(info.parentID)) {
		DataSet<byte[]> parent = sets.getDataSet(info.parentID);
		sets.add(info.setID, parent
			.first(info.count).setParallelism(info.parallelism).name("First"));
	} else if (sets.isUnsortedGrouping(info.parentID)) {
		UnsortedGrouping<Tuple2<K, byte[]>> parent = sets.getUnsortedGrouping(info.parentID);
		sets.add(info.setID, parent
			.first(info.count).setParallelism(info.parallelism).name("First")
			// drop the grouping key, keep only the serialized payload
			.map(new KeyDiscarder<K>()).setParallelism(info.parallelism).name("FirstPostStep"));
	} else if (sets.isSortedGrouping(info.parentID)) {
		SortedGrouping<Tuple2<K, byte[]>> parent = sets.getSortedGrouping(info.parentID);
		sets.add(info.setID, parent
			.first(info.count).setParallelism(info.parallelism).name("First")
			// drop the grouping key, keep only the serialized payload
			.map(new KeyDiscarder<K>()).setParallelism(info.parallelism).name("FirstPostStep"));
	}
}
@Override public EdgeMetrics<K, VV, EV> run(Graph<K, VV, EV> input) throws Exception { super.run(input); // s, t, (d(s), d(t)) DataSet<Edge<K, Tuple3<EV, Degrees, Degrees>>> edgeDegreesPair = input .run(new EdgeDegreesPair<K, VV, EV>() .setParallelism(parallelism)); // s, d(s), count of (u, v) where deg(u) < deg(v) or (deg(u) == deg(v) and u < v) DataSet<Tuple3<K, Degrees, LongValue>> edgeStats = edgeDegreesPair .flatMap(new EdgeStats<>()) .setParallelism(parallelism) .name("Edge stats") .groupBy(0, 1) .reduceGroup(new ReduceEdgeStats<>()) .setParallelism(parallelism) .name("Reduce edge stats") .groupBy(0) .reduce(new SumEdgeStats<>()) .setCombineHint(CombineHint.HASH) .setParallelism(parallelism) .name("Sum edge stats"); edgeMetricsHelper = new EdgeMetricsHelper<>(); edgeStats .output(edgeMetricsHelper) .setParallelism(parallelism) .name("Edge metrics"); return this; }
.sortGroup(1, Order.ASCENDING) .reduceGroup(new GenerateGroupSpans<>()) .setParallelism(parallelism) .name("Generate group spans"); .sortGroup(2, Order.ASCENDING) .reduceGroup(new GenerateGroupPairs<>()) .name("Generate group pairs"); .groupBy(0, 1) .reduceGroup(new ComputeScores<>(minimumScore, minimumRatio)) .name("Compute scores"); .withBroadcastSet(sumOfScoresAndNumberOfNeighborPairs, SUM_OF_SCORES_AND_NUMBER_OF_NEIGHBOR_PAIRS); .flatMap(new MirrorResult<>()) .name("Mirror results"); } else {
GroupReduceOperator<Long, Long> reduced = data.reduceGroup(new CombineReducer2()).name("reducer"); reduced.setCombinable(true); reduced.output(new DiscardingOutputFormat<Long>()).name("sink");
@Override public DataSet<Vertex<K, Degrees>> runInternal(Graph<K, VV, EV> input) throws Exception { // s, t, bitmask DataSet<Tuple2<K, ByteValue>> vertexWithEdgeOrder = input.getEdges() .flatMap(new EmitAndFlipEdge<>()) .setParallelism(parallelism) .name("Emit and flip edge") .groupBy(0, 1) .reduceGroup(new ReduceBitmask<>()) .setParallelism(parallelism) .name("Reduce bitmask"); // s, d(s) DataSet<Vertex<K, Degrees>> vertexDegrees = vertexWithEdgeOrder .groupBy(0) .reduceGroup(new DegreeCount<>()) .setParallelism(parallelism) .name("Degree count"); if (includeZeroDegreeVertices.get()) { vertexDegrees = input.getVertices() .leftOuterJoin(vertexDegrees) .where(0) .equalTo(0) .with(new JoinVertexWithVertexDegrees<>()) .setParallelism(parallelism) .name("Zero degree vertices"); } return vertexDegrees; }
.join(edgesById).where(Triad.V2, Triad.V3).equalTo(Edge.V1, Edge.V2).with(new TriadFilter());
private DataSet<Tuple> translateGlobalGroupBy(DataSet<Tuple> input, FlowNode node, int dop, String[] sortKeys, Order sortOrder, Fields outFields) { DataSet<Tuple> result = input; // sort on sorting keys if necessary if(sortKeys != null && sortKeys.length > 0) { result = result .sortPartition(sortKeys[0], sortOrder) .setParallelism(1) .name("reduce-"+ node.getID()); for(int i=1; i<sortKeys.length; i++) { result = result .sortPartition(sortKeys[i], sortOrder) .setParallelism(1); } } // group all data return result .reduceGroup(new GroupByReducer(node)) .returns(new TupleTypeInfo(outFields)) .withParameters(this.getFlinkNodeConfig(node)) .setParallelism(dop) .name("reduce-"+ node.getID()); }
.reduceGroup(new IdentityGroupReducerCombinable<Tuple3<Long,Long,Long>>()).withForwardedFields("0", "1", "2") .groupBy(0) .reduceGroup(new IdentityGroupReducerCombinable<Tuple3<Long,Long,Long>>()) .output(new DiscardingOutputFormat<Tuple3<Long, Long, Long>>());
.withForwardedFields("*").name(NEXT_WORKSET_REDUCER_NAME);
.sortGroup(1, Order.ASCENDING) .reduceGroup(new GenerateGroupSpans<K>()) .setParallelism(littleParallelism) .name("Generate group spans"); .sortGroup(2, Order.ASCENDING) .reduceGroup(new GenerateGroupPairs<K>()) .name("Generate group pairs"); .groupBy(0, 1) .reduceGroup(new ComputeScores<K>(minimumScore, minimumRatio)) .name("Compute scores"); .withBroadcastSet(sumOfScoresAndNumberOfNeighborPairs, SUM_OF_SCORES_AND_NUMBER_OF_NEIGHBOR_PAIRS);
/**
 * Applies a GroupReduce transformation on a non-grouped {@link DataSet}.
 *
 * <p>The transformation calls a {@link org.apache.flink.api.common.functions.RichGroupReduceFunction}
 * once with the full DataSet. The GroupReduceFunction can iterate over all elements of the
 * DataSet and emit any number of output elements including none.
 *
 * @param reducer The GroupReduceFunction that is applied on the DataSet.
 * @return A GroupReduceOperator that represents the reduced DataSet.
 *
 * @see org.apache.flink.api.common.functions.RichGroupReduceFunction
 * @see org.apache.flink.api.java.operators.GroupReduceOperator
 * @see DataSet
 */
public <R> GroupReduceOperator<T, R> reduceGroup(GroupReduceFunction<T, R> reducer) {
	if (reducer == null) {
		throw new NullPointerException("GroupReduce function must not be null.");
	}

	final String callLocationName = Utils.getCallLocationName();
	final TypeInformation<R> resultType =
		TypeExtractor.getGroupReduceReturnTypes(reducer, getType(), callLocationName, true);

	return new GroupReduceOperator<>(this, resultType, clean(reducer), callLocationName);
}
@Test
public void testWithKryoGenericSer() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableForceKryo();

	Path inputPath = new Path(inFile.getAbsoluteFile().toURI());
	AvroInputFormat<User> inputFormat = new AvroInputFormat<>(inputPath, User.class);
	DataSet<User> usersDS = env.createInput(inputFormat);

	// group by user name via a key selector, then emit (name, 1) per user
	DataSet<Tuple2<String, Integer>> res = usersDS
		.groupBy((KeySelector<User, String>) value -> String.valueOf(value.getName()))
		.reduceGroup((GroupReduceFunction<User, Tuple2<String, Integer>>) (values, out) -> {
			for (User u : values) {
				out.collect(new Tuple2<>(u.getName().toString(), 1));
			}
		})
		.returns(Types.TUPLE(Types.STRING, Types.INT));

	res.writeAsText(resultPath);
	env.execute("Avro Key selection");

	expected = "(Charlie,1)\n(Alyssa,1)\n";
}
@Override
protected void testProgram() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(4);

	DataSet<String> initialInput = env.fromElements("1", "2", "3", "4", "5").name("input");

	IterativeDataSet<String> iteration = initialInput.iterate(5).name("Loop");

	// NOTE(review): the operator name below is missing its closing parenthesis;
	// kept byte-identical in case tooling matches on the exact name.
	DataSet<String> sumReduce = iteration
		.reduceGroup(new SumReducer())
		.name("Compute sum (GroupReduce");
	DataSet<String> terminationFilter = iteration
		.filter(new TerminationFilter())
		.name("Compute termination criterion (Map)");

	List<String> result = iteration.closeWith(sumReduce, terminationFilter).collect();

	containsResultAsText(result, EXPECTED);
}
@Test
public void testSemanticPropsWithKeySelector5() {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple5<Integer, Long, String, Long, Integer>> tupleDs =
		env.fromCollection(emptyTupleData, tupleTypeInfo);

	// key selector prepends key fields, so input fields are offset by the
	// key arity when querying forwarding information below
	GroupReduceOperator<Tuple5<Integer, Long, String, Long, Integer>, Tuple5<Integer, Long, String, Long, Integer>> reduceOp =
		tupleDs.groupBy(new DummyTestKeySelector())
			.reduceGroup(new DummyGroupReduceFunction3())
			.withForwardedFields("4->0;3;3->1;2");

	SemanticProperties semProps = reduceOp.getSemanticProperties();

	// key fields and unforwarded fields have no targets
	assertTrue(semProps.getForwardingTargetFields(0, 0).size() == 0);
	assertTrue(semProps.getForwardingTargetFields(0, 1).size() == 0);
	assertTrue(semProps.getForwardingTargetFields(0, 2).size() == 0);
	assertTrue(semProps.getForwardingTargetFields(0, 3).size() == 0);

	// "2" forwards input field 2 (offset 4) to output field 2
	assertTrue(semProps.getForwardingTargetFields(0, 4).size() == 1);
	assertTrue(semProps.getForwardingTargetFields(0, 4).contains(2));

	// "3;3->1" forwards input field 3 (offset 5) to output fields 1 and 3
	assertTrue(semProps.getForwardingTargetFields(0, 5).size() == 2);
	assertTrue(semProps.getForwardingTargetFields(0, 5).contains(1));
	assertTrue(semProps.getForwardingTargetFields(0, 5).contains(3));

	// "4->0" forwards input field 4 (offset 6) to output field 0
	assertTrue(semProps.getForwardingTargetFields(0, 6).size() == 1);
	assertTrue(semProps.getForwardingTargetFields(0, 6).contains(0));

	// reverse lookup: output field -> forwarding source field
	assertTrue(semProps.getForwardingSourceField(0, 0) == 6);
	assertTrue(semProps.getForwardingSourceField(0, 1) == 5);
	assertTrue(semProps.getForwardingSourceField(0, 2) == 4);
	assertTrue(semProps.getForwardingSourceField(0, 3) == 5);
	assertTrue(semProps.getForwardingSourceField(0, 4) < 0);

	assertTrue(semProps.getReadFields(0) == null);
}
@Override
protected void testProgram() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Integer> data = env.fromElements(1, 2, 3, 4, 5, 6, 7, 8);

	IterativeDataSet<Integer> iteration = data.iterate(10);

	// the iteration's partial solution reaches the reducer only via broadcast set
	DataSet<Integer> result = data
		.reduceGroup(new PickOneAllReduce())
		.withBroadcastSet(iteration, "bc");

	final List<Integer> resultList = new ArrayList<Integer>();
	iteration.closeWith(result).output(new LocalCollectionOutputFormat<Integer>(resultList));

	env.execute();

	Assert.assertEquals(8, resultList.get(0).intValue());
}
@Test
public void testFirstNOnUngroupedDS() throws Exception {
	/*
	 * First-n on ungrouped data set
	 */
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);

	// take 7 arbitrary records, map each to 1, and sum: total must equal 7
	DataSet<Tuple1<Integer>> seven = ds
		.first(7)
		.map(new OneMapper())
		.sum(0);

	List<Tuple1<Integer>> result = seven.collect();

	String expected = "(7)\n";
	compareResultAsText(result, expected);
}