/**
 * Plans a co-group of two CSV inputs with no explicit partitioning, keyed on fields (0, 1) of
 * the first input and (2, 1) of the second, and passes the resulting co-group plan node to
 * {@code checkValidCoGroupInputProperties}.
 */
@Test
public void noPreviousPartitioningCoGroup2() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> left =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> right =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // The co-group result goes straight into a discarding sink; only the plan matters here.
    left.coGroup(right)
            .where(0, 1)
            .equalTo(2, 1)
            .with(new MockCoGroup())
            .output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    OptimizedPlan optimizedPlan = compileWithStats(env.createProgramPlan());

    // The single sink's input source is the co-group node under inspection.
    SinkPlanNode sinkNode = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sinkNode.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}
/**
 * Plans a co-group of two CSV inputs with no explicit partitioning, keyed on fields (0, 1) on
 * both sides, and passes the resulting co-group plan node to
 * {@code checkValidCoGroupInputProperties}.
 */
@Test
public void noPreviousPartitioningCoGroup1() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> firstInput =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> secondInput =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    DataSet<Tuple3<Integer, Integer, Integer>> result = firstInput
            .coGroup(secondInput)
            .where(0, 1)
            .equalTo(0, 1)
            .with(new MockCoGroup());
    result.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    OptimizedPlan optimized = compileWithStats(env.createProgramPlan());
    SinkPlanNode dataSink = optimized.getDataSinks().iterator().next();

    // The sink is fed directly by the co-group operator.
    checkValidCoGroupInputProperties((DualInputPlanNode) dataSink.getInput().getSource());
}
/**
 * Plans a co-group whose first input is hash-partitioned on field 0 and then mapped with field
 * 0 declared as forwarded. The co-group keys are (0, 1) on the first input and (2, 1) on the
 * second; the resulting plan node is passed to {@code checkValidCoGroupInputProperties}.
 */
@Test
public void reuseSinglePartitioningCoGroup4() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> left =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> right =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // First input: partition on field 0, then map while forwarding that field.
    DataSet<Tuple3<Integer, Integer, Integer>> partitionedLeft = left
            .partitionByHash(0)
            .map(new MockMapper())
            .withForwardedFields("0");

    DataSet<Tuple3<Integer, Integer, Integer>> result = partitionedLeft
            .coGroup(right)
            .where(0, 1)
            .equalTo(2, 1)
            .with(new MockCoGroup());
    result.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    OptimizedPlan optimizedPlan = compileWithStats(env.createProgramPlan());
    SinkPlanNode sinkNode = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sinkNode.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}
/** * Removes the given list of vertices and its edges from the graph. * * @param verticesToBeRemoved the DataSet of vertices to be removed * @return the resulted graph containing the initial vertices and edges minus the vertices * and edges removed. */ private Graph<K, VV, EV> removeVertices(DataSet<Vertex<K, VV>> verticesToBeRemoved) { DataSet<Vertex<K, VV>> newVertices = getVertices().coGroup(verticesToBeRemoved).where(0).equalTo(0) .with(new VerticesRemovalCoGroup<>()).name("Remove vertices"); DataSet <Edge< K, EV>> newEdges = newVertices.join(getEdges()).where(0).equalTo(0) // if the edge source was removed, the edge will also be removed .with(new ProjectEdgeToBeRemoved<>()).name("Edges to be removed") // if the edge target was removed, the edge will also be removed .join(newVertices).where(1).equalTo(0) .with(new ProjectEdge<>()).name("Remove edges"); return new Graph<>(newVertices, newEdges, context); }
/**
 * Checks the correctness of a co-group whose UDF returns objects of the right-hand input.
 * Both inputs are the 5-tuple collection data set, co-grouped on field 0.
 */
@Test
public void testCorrectnessOfCoGroupIfUDFReturnsRightInputObjects() throws Exception {
    final String expected =
            "1,1,0,Hallo,1\n"
                    + "2,2,1,Hallo Welt,2\n"
                    + "2,3,2,Hallo Welt wie,1\n"
                    + "3,4,3,Hallo Welt wie gehts?,2\n"
                    + "3,5,4,ABC,2\n"
                    + "3,6,5,BCD,3\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> left =
            CollectionDataSets.get5TupleDataSet(env);
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> right =
            CollectionDataSets.get5TupleDataSet(env);

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> coGrouped = left
            .coGroup(right)
            .where(0)
            .equalTo(0)
            .with(new Tuple5ReturnRight());

    List<Tuple5<Integer, Long, Integer, String, Long>> actual = coGrouped.collect();
    compareResultAsTuples(actual, expected);
}
/**
 * Co-groups two custom-type inputs using the expression key {@code "myInt"} on both sides and
 * compares the collected result, as text, with the expected lines.
 */
@Test
public void testCoGroupTwoCustomTypeInputsWithExpressionKeys() throws Exception {
    final String expected =
            "1,0,test\n"
                    + "2,6,test\n"
                    + "3,24,test\n"
                    + "4,60,test\n"
                    + "5,120,test\n"
                    + "6,210,test\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<CustomType> first = CollectionDataSets.getCustomTypeDataSet(env);
    DataSet<CustomType> second = CollectionDataSets.getCustomTypeDataSet(env);

    DataSet<CustomType> coGrouped = first
            .coGroup(second)
            .where("myInt")
            .equalTo("myInt")
            .with(new CustomTypeCoGroup());

    List<CustomType> actual = coGrouped.collect();
    compareResultAsText(actual, expected);
}
/**
 * Plans a co-group whose second input is hash-partitioned on fields (2, 1) and then mapped with
 * fields 2 and 1 declared as forwarded. The co-group keys are (0, 1) on the first input and
 * (2, 1) on the second; the resulting plan node is passed to
 * {@code checkValidCoGroupInputProperties}.
 */
@Test
public void reuseSinglePartitioningCoGroup3() {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Integer, Integer>> left =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);
    DataSet<Tuple3<Integer, Integer, Integer>> right =
            env.readCsvFile(IN_FILE).types(Integer.class, Integer.class, Integer.class);

    // Second input: partition on the co-group key fields, then map while forwarding them.
    DataSet<Tuple3<Integer, Integer, Integer>> partitionedRight = right
            .partitionByHash(2, 1)
            .map(new MockMapper())
            .withForwardedFields("2;1");

    DataSet<Tuple3<Integer, Integer, Integer>> result = left
            .coGroup(partitionedRight)
            .where(0, 1)
            .equalTo(2, 1)
            .with(new MockCoGroup());
    result.output(new DiscardingOutputFormat<Tuple3<Integer, Integer, Integer>>());

    OptimizedPlan optimizedPlan = compileWithStats(env.createProgramPlan());
    SinkPlanNode sinkNode = optimizedPlan.getDataSinks().iterator().next();
    DualInputPlanNode coGroupNode = (DualInputPlanNode) sinkNode.getInput().getSource();

    checkValidCoGroupInputProperties(coGroupNode);
}
/**
 * Co-groups a POJO input, keyed through a key selector function, with a tuple input keyed on
 * field position 6. The key selector used here is the simple one.
 */
@Test
public void testCoGroupFieldSelectorAndKeySelector() throws Exception {
    final String expected =
            "-1,20000,Flink\n"
                    + "-1,10000,Flink\n"
                    + "-1,30000,Flink\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<POJO> pojos = CollectionDataSets.getSmallPojoDataSet(env);
    DataSet<Tuple7<Integer, String, Integer, Integer, Long, String, Long>> tuples =
            CollectionDataSets.getSmallTuplebasedDataSet(env);

    DataSet<CustomType> coGrouped = pojos
            .coGroup(tuples)
            .where(new KeySelector1())
            .equalTo(6)
            .with(new CoGroup2());

    List<CustomType> actual = coGrouped.collect();
    compareResultAsText(actual, expected);
}
/**
 * Assembles and executes a delta-iteration program that joins the workset with the edges and
 * co-groups the join result with the solution set.
 *
 * <p>Arguments, by position: {@code args[0]} parallelism, {@code args[1]} vertex CSV path,
 * {@code args[2]} edge CSV path, {@code args[3]} output CSV path, {@code args[4]} integer
 * passed as the second argument of {@code iterateDelta}.
 *
 * @param args positional program arguments as described above
 * @throws Exception if argument parsing, plan construction or execution fails
 */
public static void connectedComponentsWithCoGroup(String[] args) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    final int parallelism = Integer.parseInt(args[0]);
    env.setParallelism(parallelism);

    // Sources
    DataSet<Tuple1<Long>> vertexSource =
            env.readCsvFile(args[1]).types(Long.class).name(VERTEX_SOURCE);
    DataSet<Tuple2<Long, Long>> edgeSource =
            env.readCsvFile(args[2]).types(Long.class, Long.class).name(EDGES_SOURCE);

    DataSet<Tuple2<Long, Long>> idAssignedVertices =
            vertexSource.flatMap(new DummyMapFunction());

    // The same data set serves as initial solution set and initial workset, keyed on field 0.
    final int iterationArg = Integer.parseInt(args[4]);
    DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> deltaIteration = idAssignedVertices
            .iterateDelta(idAssignedVertices, iterationArg, 0)
            .name(ITERATION_NAME);

    DataSet<Tuple2<Long, Long>> neighborJoin = deltaIteration.getWorkset()
            .join(edgeSource)
            .where(0)
            .equalTo(0)
            .with(new DummyJoinFunction())
            .name(JOIN_NEIGHBORS_MATCH);

    DataSet<Tuple2<Long, Long>> updates = neighborJoin
            .coGroup(deltaIteration.getSolutionSet())
            .where(0)
            .equalTo(0)
            .with(new DummyCoGroupFunction())
            .name(MIN_ID_AND_UPDATE);

    // The co-group output is used both as solution-set delta and as next workset.
    DataSet<Tuple2<Long, Long>> iterationResult = deltaIteration.closeWith(updates, updates);
    iterationResult.writeAsCsv(args[3]).name(SINK);

    env.execute();
}
/**
 * Checks the correctness of a co-group whose UDF returns objects of the left-hand input.
 * Both inputs are the 3-tuple collection data set, co-grouped on field 0.
 */
@Test
public void testCorrectnessOfCoGroupIfUDFReturnsLeftInputObjects() throws Exception {
    final String expected =
            "1,1,Hi\n"
                    + "2,2,Hello\n"
                    + "3,2,Hello world\n"
                    + "4,3,Hello world, how are you?\n"
                    + "5,3,I am fine.\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Long, String>> left = CollectionDataSets.get3TupleDataSet(env);
    DataSet<Tuple3<Integer, Long, String>> right = CollectionDataSets.get3TupleDataSet(env);

    DataSet<Tuple3<Integer, Long, String>> coGrouped = left
            .coGroup(right)
            .where(0)
            .equalTo(0)
            .with(new Tuple3ReturnLeft());

    List<Tuple3<Integer, Long, String>> actual = coGrouped.collect();
    compareResultAsTuples(actual, expected);
}
/**
 * Co-groups a POJO input, keyed through a key selector function, with a tuple input keyed on
 * field position 6. The key selector used here is deliberately more complicated than
 * necessary (Tuple1).
 */
@Test
public void testCoGroupFieldSelectorAndComplicatedKeySelector() throws Exception {
    final String expected =
            "-1,20000,Flink\n"
                    + "-1,10000,Flink\n"
                    + "-1,30000,Flink\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<POJO> pojos = CollectionDataSets.getSmallPojoDataSet(env);
    DataSet<Tuple7<Integer, String, Integer, Integer, Long, String, Long>> tuples =
            CollectionDataSets.getSmallTuplebasedDataSet(env);

    DataSet<CustomType> coGrouped = pojos
            .coGroup(tuples)
            .where(new KeySelector6())
            .equalTo(6)
            .with(new CoGroup3());

    List<CustomType> actual = coGrouped.collect();
    compareResultAsText(actual, expected);
}
/**
 * Intersects this graph's edge set with the supplied edge set. For every matching pair, both
 * edges end up in the resulting data set.
 *
 * @param edges the edges to intersect with
 * @return an edge set containing both edges of every matching pair of the same edge
 */
private DataSet<Edge<K, EV>> getPairwiseEdgeIntersection(DataSet<Edge<K, EV>> edges) {
    // Edges are co-grouped on all three fields (0, 1, 2) of both inputs.
    DataSet<Edge<K, EV>> matchingEdges = getEdges()
            .coGroup(edges)
            .where(0, 1, 2)
            .equalTo(0, 1, 2)
            .with(new MatchingEdgeReducer<>())
            .name("Intersect edges");

    return matchingEdges;
}
/**
 * Verifies that the edge set only references vertex ids that are also present in the vertex
 * input set.
 *
 * @return {@code true} if the graph is valid with respect to its vertex ids, i.e. no invalid
 *         id was found; {@code false} otherwise
 */
@Override
public boolean validate(Graph<K, VV, EV> graph) throws Exception {
    // Distinct ids produced from the edges.
    DataSet<Tuple1<K>> distinctEdgeIds = graph.getEdges()
            .flatMap(new MapEdgeIds<>())
            .distinct();

    // At most one element of the co-group output is kept.
    DataSet<K> firstInvalidId = graph.getVertices()
            .coGroup(distinctEdgeIds)
            .where(0)
            .equalTo(0)
            .with(new GroupInvalidIds<>())
            .first(1);

    long invalidCount = firstInvalidId.map(new KToTupleMap<>()).count();
    return invalidCount == 0;
}
/** * Adds the list of vertices, passed as input, to the graph. * If the vertices already exist in the graph, they will not be added once more. * * @param verticesToAdd the list of vertices to add * @return the new graph containing the existing and newly added vertices */ public Graph<K, VV, EV> addVertices(List<Vertex<K, VV>> verticesToAdd) { // Add the vertices DataSet<Vertex<K, VV>> newVertices = this.vertices.coGroup(this.context.fromCollection(verticesToAdd)) .where(0).equalTo(0).with(new VerticesUnionCoGroup<>()).name("Add vertices"); return new Graph<>(newVertices, this.edges, this.context); }
/**
 * Co-groups two 5-tuple inputs on field position 0 and compares the collected
 * {@code Tuple2} results with the expected lines.
 */
@Test
public void testCoGroupTuplesWithKeyFieldSelector() throws Exception {
    final String expected =
            "1,0\n"
                    + "2,6\n"
                    + "3,24\n"
                    + "4,60\n"
                    + "5,120\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple5<Integer, Long, Integer, String, Long>> left =
            CollectionDataSets.get5TupleDataSet(env);
    DataSet<Tuple5<Integer, Long, Integer, String, Long>> right =
            CollectionDataSets.get5TupleDataSet(env);

    DataSet<Tuple2<Integer, Integer>> coGrouped = left
            .coGroup(right)
            .where(0)
            .equalTo(0)
            .with(new Tuple5CoGroup());

    List<Tuple2<Integer, Integer>> actual = coGrouped.collect();
    compareResultAsTuples(actual, expected);
}
/**
 * Removes from the graph every edge that matches one of the given edges.
 *
 * @param edgesToBeRemoved the list of edges to be removed
 * @return a new graph without the removed edges; the vertices are left intact
 */
public Graph<K, VV, EV> removeEdges(List<Edge<K, EV>> edgesToBeRemoved) {
    DataSet<Edge<K, EV>> removals = this.context.fromCollection(edgesToBeRemoved);

    // Edges are matched against the removal set on fields (0, 1).
    DataSet<Edge<K, EV>> remainingEdges = getEdges()
            .coGroup(removals)
            .where(0, 1)
            .equalTo(0, 1)
            .with(new EdgeRemovalCoGroup<>())
            .name("Remove edges");

    return new Graph<>(this.vertices, remainingEdges, this.context);
}
/**
 * Returns the in-degree of every vertex in the graph.
 *
 * @return A DataSet of {@code Tuple2<vertexId, inDegree>}
 */
public DataSet<Tuple2<K, LongValue>> inDegrees() {
    // Vertex field 0 is matched against edge field 1.
    DataSet<Tuple2<K, LongValue>> degrees = this.vertices
            .coGroup(this.edges)
            .where(0)
            .equalTo(1)
            .with(new CountNeighborsCoGroup<>())
            .name("In-degree");

    return degrees;
}
/**
 * Co-groups an atomic {@code Integer} input (keyed with the {@code "*"} expression) with a
 * small 3-tuple input keyed on field position 0.
 */
@Test
public void testCoGroupWithAtomicType2() throws Exception {
    final String expected =
            "(1,1,Hi)\n"
                    + "(2,2,Hello)";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Integer> atomics = env.fromElements(0, 1, 2);
    DataSet<Tuple3<Integer, Long, String>> tuples =
            CollectionDataSets.getSmall3TupleDataSet(env);

    DataSet<Tuple3<Integer, Long, String>> coGrouped = atomics
            .coGroup(tuples)
            .where("*")
            .equalTo(0)
            .with(new CoGroupAtomic2());

    List<Tuple3<Integer, Long, String>> actual = coGrouped.collect();
    compareResultAsText(actual, expected);
}
/**
 * Co-groups a small 3-tuple input keyed on field position 0 with an atomic {@code Integer}
 * input (keyed with the {@code "*"} expression).
 */
@Test
public void testCoGroupWithAtomicType1() throws Exception {
    final String expected =
            "(1,1,Hi)\n"
                    + "(2,2,Hello)";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple3<Integer, Long, String>> tuples =
            CollectionDataSets.getSmall3TupleDataSet(env);
    DataSet<Integer> atomics = env.fromElements(0, 1, 2);

    DataSet<Tuple3<Integer, Long, String>> coGrouped = tuples
            .coGroup(atomics)
            .where(0)
            .equalTo("*")
            .with(new CoGroupAtomic1());

    List<Tuple3<Integer, Long, String>> actual = coGrouped.collect();
    compareResultAsText(actual, expected);
}
/**
 * Returns the out-degree of every vertex in the graph.
 *
 * @return A DataSet of {@code Tuple2<vertexId, outDegree>}
 */
public DataSet<Tuple2<K, LongValue>> outDegrees() {
    // Vertex field 0 is matched against edge field 0.
    DataSet<Tuple2<K, LongValue>> degrees = this.vertices
            .coGroup(this.edges)
            .where(0)
            .equalTo(0)
            .with(new CountNeighborsCoGroup<>())
            .name("Out-degree");

    return degrees;
}