/**
 * Builds a word-count program plan: tokenize each input line, group by word,
 * sum the counts, and write the result as CSV.
 *
 * @param inFile the text file to read.
 * @param outFile the file the CSV result is written to.
 * @param parallelism default parallelism for the program.
 * @return the program plan produced by the environment.
 */
private Plan getWordCountPlan(File inFile, File outFile, int parallelism) {
    final ExecutionEnvironment executionEnv = ExecutionEnvironment.getExecutionEnvironment();
    executionEnv.setParallelism(parallelism);

    executionEnv.readTextFile(inFile.getAbsolutePath())
        .flatMap(new Tokenizer())
        .groupBy(0)
        .sum(1)
        .writeAsCsv(outFile.getAbsolutePath());

    return executionEnv.createProgramPlan();
}
}
@Test
public void testFlatMapWithClassTypeHint() throws Exception {
    // Verifies that supplying the result type via returns(Class) works for flatMap.
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.getConfig().disableSysoutLogging();

    DataSet<Tuple3<Integer, Long, String>> source = CollectionDataSets.getSmall3TupleDataSet(env);

    DataSet<Integer> hinted = source
        .flatMap(new FlatMapper<Tuple3<Integer, Long, String>, Integer>())
        .returns(Integer.class);

    List<Integer> result = hinted.collect();

    String expectedResult = "2\n" + "3\n" + "1\n";

    compareResultAsText(result, expectedResult);
}
@Override public Graph<LongValue, NullValue, NullValue> generate() { // Vertices DataSet<Vertex<LongValue, NullValue>> vertices = GraphGeneratorUtils.vertexSequence(env, parallelism, vertexCount); // Edges LongValueSequenceIterator iterator = new LongValueSequenceIterator(0, this.vertexCount - 1); // Validate ranges Collections.sort(offsetRanges); Iterator<OffsetRange> iter = offsetRanges.iterator(); OffsetRange lastRange = iter.next(); while (iter.hasNext()) { OffsetRange nextRange = iter.next(); if (lastRange.overlaps(nextRange)) { throw new IllegalArgumentException("Overlapping ranges " + lastRange + " and " + nextRange); } lastRange = nextRange; } DataSet<Edge<LongValue, NullValue>> edges = env .fromParallelCollection(iterator, LongValue.class) .setParallelism(parallelism) .name("Edge iterators") .flatMap(new LinkVertexToOffsets(vertexCount, offsetRanges)) .setParallelism(parallelism) .name("Circulant graph edges"); // Graph return Graph.fromDataSet(vertices, edges, env); }
/**
 * Creates a graph from a DataSet of edges; the vertex set is derived from the
 * distinct edge endpoints and every vertex value is set to NullValue.
 *
 * @param edges a DataSet of edges.
 * @param context the flink execution environment.
 * @return the newly created graph.
 */
public static <K, EV> Graph<K, NullValue, EV> fromDataSet(
        DataSet<Edge<K, EV>> edges, ExecutionEnvironment context) {

    DataSet<Vertex<K, NullValue>> vertexSet = edges
        .flatMap(new EmitSrcAndTarget<>())
        .name("Source and target IDs")
        .distinct()
        .name("IDs");

    return new Graph<>(vertexSet, edges, context);
}
@Override public Graph<K, VV, EV> runInternal(Graph<K, VV, EV> input) throws Exception { // Edges DataSet<Edge<K, EV>> edges = input .getEdges() .flatMap(new SymmetrizeAndRemoveSelfLoops<>(clipAndFlip)) .setParallelism(parallelism) .name("Remove self-loops") .distinct(0, 1) .setCombineHint(CombineHint.NONE) .setParallelism(parallelism) .name("Remove duplicate edges"); // Graph return Graph.fromDataSet(input.getVertices(), edges, input.getContext()); }
/**
 * Translates a window-assign transform into a flatMap over the input
 * dataset, assigning each element to its windows via the strategy's WindowFn.
 */
@Override
public void translateNode(Window.Assign<T> transform, FlinkBatchTranslationContext context) {
    // Resolve the input dataset and the output's type information.
    PValue input = context.getInput(transform);
    TypeInformation<WindowedValue<T>> outputTypeInfo =
        context.getTypeInfo(context.getOutput(transform));
    DataSet<WindowedValue<T>> source = context.getInputDataSet(input);

    @SuppressWarnings("unchecked")
    final WindowingStrategy<T, ? extends BoundedWindow> windowingStrategy =
        (WindowingStrategy<T, ? extends BoundedWindow>)
            context.getOutput(transform).getWindowingStrategy();

    WindowFn<T, ? extends BoundedWindow> windowFn = windowingStrategy.getWindowFn();
    FlinkAssignWindows<T, ? extends BoundedWindow> assignWindows =
        new FlinkAssignWindows<>(windowFn);

    DataSet<WindowedValue<T>> windowed = source
        .flatMap(assignWindows)
        .name(context.getOutput(transform).getName())
        .returns(outputTypeInfo);

    context.setOutputDataSet(context.getOutput(transform), windowed);
}
}
/**
 * Runs the edge-metrics pipeline: annotates each edge with its endpoint
 * degrees, reduces per-vertex statistics, and sinks the result into the
 * metrics helper. Returns {@code this} so calls can be chained.
 */
@Override
public EdgeMetrics<K, VV, EV> run(Graph<K, VV, EV> input) throws Exception {
    super.run(input);

    // s, t, (d(s), d(t)): each edge paired with the degrees of both endpoints
    DataSet<Edge<K, Tuple3<EV, Degrees, Degrees>>> edgeDegreesPair = input
        .run(new EdgeDegreesPair<K, VV, EV>()
            .setParallelism(parallelism));

    // s, d(s), count of (u, v) where deg(u) < deg(v) or (deg(u) == deg(v) and u < v)
    DataSet<Tuple3<K, Degrees, LongValue>> edgeStats = edgeDegreesPair
        .flatMap(new EdgeStats<>())
        .setParallelism(parallelism)
        .name("Edge stats")
        .groupBy(0, 1)
        .reduceGroup(new ReduceEdgeStats<>())
        .setParallelism(parallelism)
        .name("Reduce edge stats")
        .groupBy(0)
        // NOTE(review): hash combine hint presumably chosen for the expected
        // key distribution -- confirm against the algorithm documentation
        .reduce(new SumEdgeStats<>())
        .setCombineHint(CombineHint.HASH)
        .setParallelism(parallelism)
        .name("Sum edge stats");

    // The helper output format receives the reduced statistics; results are
    // read back from it after the job executes.
    edgeMetricsHelper = new EdgeMetricsHelper<>();

    edgeStats
        .output(edgeMetricsHelper)
        .setParallelism(parallelism)
        .name("Edge metrics");

    return this;
}
// ALL: every edge contributes to both of its endpoints; the edge value is
// forwarded from field f2 to f1 before the per-vertex reduce.
case ALL: return edges.flatMap(new EmitOneVertexWithEdgeValuePerNode<>()) .withForwardedFields("f2->f1") .name("Vertex with all edges") .groupBy(0).reduce(new ApplyReduceFunction<>(reduceEdgesFunction)) .name("Reduce on edges"); default:
// Tail of the input-mapping operator chain: name the stage after the
// operator, apply the input's parallelism, and declare the produced element
// type explicitly via a TypeHint.
.name(operator.getName() + "::map-input") .setParallelism(inputParallelism) .returns(new TypeHint<BatchElement<Window, Pair>>() {});
/**
 * Adds the inverse of every edge to the graph, producing its undirected form.
 *
 * @return the undirected graph.
 */
public Graph<K, VV, EV> getUndirected() {
    DataSet<Edge<K, EV>> withReversedEdges =
        edges.flatMap(new RegularAndReversedEdgesMap<>()).name("To undirected graph");
    return new Graph<>(vertices, withReversedEdges, this.context);
}
/**
 * Applies a FlatMap transformation on a {@link DataSet}.
 *
 * <p>The transformation calls a
 * {@link org.apache.flink.api.common.functions.RichFlatMapFunction} for each
 * element of the DataSet; each call may emit any number of elements,
 * including none.
 *
 * @param flatMapper The FlatMapFunction that is called for each element of the DataSet.
 * @return A FlatMapOperator that represents the transformed DataSet.
 *
 * @see org.apache.flink.api.common.functions.RichFlatMapFunction
 * @see FlatMapOperator
 * @see DataSet
 */
public <R> FlatMapOperator<T, R> flatMap(FlatMapFunction<T, R> flatMapper) {
    if (flatMapper == null) {
        throw new NullPointerException("FlatMap function must not be null.");
    }

    final String callSite = Utils.getCallLocationName();
    final TypeInformation<R> outputType =
        TypeExtractor.getFlatMapReturnTypes(flatMapper, getType(), callSite, true);

    return new FlatMapOperator<>(this, outputType, clean(flatMapper), callSite);
}
/**
 * Checks that the edge set input contains valid vertex Ids, i.e. that they
 * also exist in the vertex input set.
 *
 * @return a boolean stating whether a graph is valid
 * with respect to its vertex ids.
 */
@Override
public boolean validate(Graph<K, VV, EV> graph) throws Exception {
    // Distinct vertex ids referenced by the edge set.
    DataSet<Tuple1<K>> referencedIds =
        graph.getEdges().flatMap(new MapEdgeIds<>()).distinct();

    // Ids referenced by edges but missing from the vertex set;
    // a single occurrence is enough to invalidate the graph.
    DataSet<K> danglingIds = graph.getVertices()
        .coGroup(referencedIds).where(0).equalTo(0)
        .with(new GroupInvalidIds<>())
        .first(1);

    return danglingIds.map(new KToTupleMap<>()).count() == 0;
}
// Join against the vertex set to attach the neighbor's vertex value
// (field 1 presumably holds the neighbor id -- confirm against the
// preceding operator in this chain).
.join(this.vertices).where(1).equalTo(0) .with(new ProjectNeighborValue<>()).name("Vertex with neighbor value");
// Emit the result to stdout.
res.print();
/**
 * Translates the transform into a flatMap over the input dataset, assigning
 * every element to its windows via the windowing strategy's WindowFn.
 */
@Override
public void translateNode(
    PTransform<PCollection<T>, PCollection<T>> transform,
    FlinkBatchTranslationContext context) {
    // Resolve the input dataset and the output's type information.
    PValue input = context.getInput(transform);
    TypeInformation<WindowedValue<T>> outputTypeInfo =
        context.getTypeInfo(context.getOutput(transform));
    DataSet<WindowedValue<T>> source = context.getInputDataSet(input);

    @SuppressWarnings("unchecked")
    final WindowingStrategy<T, ? extends BoundedWindow> windowingStrategy =
        (WindowingStrategy<T, ? extends BoundedWindow>)
            context.getOutput(transform).getWindowingStrategy();

    WindowFn<T, ? extends BoundedWindow> windowFn = windowingStrategy.getWindowFn();
    FlinkAssignWindows<T, ? extends BoundedWindow> assignWindows =
        new FlinkAssignWindows<>(windowFn);

    DataSet<WindowedValue<T>> windowed = source
        .flatMap(assignWindows)
        .name(context.getOutput(transform).getName())
        .returns(outputTypeInfo);

    context.setOutputDataSet(context.getOutput(transform), windowed);
}
}
@Override public DataSet<Vertex<K, Degrees>> runInternal(Graph<K, VV, EV> input) throws Exception { // s, t, bitmask DataSet<Tuple2<K, ByteValue>> vertexWithEdgeOrder = input.getEdges() .flatMap(new EmitAndFlipEdge<>()) .setParallelism(parallelism) .name("Emit and flip edge") .groupBy(0, 1) .reduceGroup(new ReduceBitmask<>()) .setParallelism(parallelism) .name("Reduce bitmask"); // s, d(s) DataSet<Vertex<K, Degrees>> vertexDegrees = vertexWithEdgeOrder .groupBy(0) .reduceGroup(new DegreeCount<>()) .setParallelism(parallelism) .name("Degree count"); if (includeZeroDegreeVertices.get()) { vertexDegrees = input.getVertices() .leftOuterJoin(vertexDegrees) .where(0) .equalTo(0) .with(new JoinVertexWithVertexDegrees<>()) .setParallelism(parallelism) .name("Zero degree vertices"); } return vertexDegrees; }
@Override public Graph<K, VV, EV> runInternal(Graph<K, VV, EV> input) throws Exception { // Edges DataSet<Edge<K, EV>> edges = input .getEdges() .flatMap(new SymmetrizeAndRemoveSelfLoops<K, EV>(clipAndFlip)) .setParallelism(parallelism) .name("Remove self-loops") .distinct(0, 1) .setParallelism(parallelism) .name("Remove duplicate edges"); // Graph return Graph.fromDataSet(input.getVertices(), edges, input.getContext()); }