@Override
public void emitDataSet(DataSet<Row> dataSet) {
    dataSet
        .output(new Utils.CollectHelper<>(accumulatorName, serializer))
        .name("SQL Client Batch Collect Sink");
}
private void createTextSink(PythonOperationInfo info) {
    DataSet<byte[]> parent = sets.getDataSet(info.parentID);
    parent.map(new StringDeserializerMap()).setParallelism(info.parallelism)
        .writeAsText(info.path, info.writeMode).setParallelism(info.parallelism).name("TextSink");
}
/**
 * Emits a DataSet using an {@link OutputFormat}. This method adds a data sink to the program.
 * Programs may have multiple data sinks. A DataSet may also have multiple consumers (data sinks
 * or transformations) at the same time.
 *
 * @param outputFormat The OutputFormat to process the DataSet.
 * @return The DataSink that processes the DataSet.
 *
 * @see OutputFormat
 * @see DataSink
 */
public DataSink<T> output(OutputFormat<T> outputFormat) {
    Preconditions.checkNotNull(outputFormat);

    // configure the type if needed
    if (outputFormat instanceof InputTypeConfigurable) {
        ((InputTypeConfigurable) outputFormat).setInputType(getType(), context.getConfig());
    }

    DataSink<T> sink = new DataSink<>(this, outputFormat, getType());
    this.context.registerDataSink(sink);
    return sink;
}
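A minimal usage sketch of the contract described in the Javadoc above: a single DataSet is handed to two sinks via output(...), matching the note that a DataSet may have multiple consumers. This is an illustration, not code from this section; the driver class name is hypothetical, and it assumes a local ExecutionEnvironment. PrintingOutputFormat and DiscardingOutputFormat are Flink's built-in formats, which also appear elsewhere in this section.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.DiscardingOutputFormat;
import org.apache.flink.api.java.io.PrintingOutputFormat;

// Hypothetical driver class, named here only for illustration.
public class MultiSinkExample {

    public static void main(String[] args) throws Exception {
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // A single DataSet that will be consumed by two sinks.
        DataSet<Long> numbers = env.generateSequence(1, 10);

        // First sink: print every element to standard out.
        numbers.output(new PrintingOutputFormat<Long>(false)).name("Print sink");

        // Second sink on the same DataSet: discard the elements.
        numbers.output(new DiscardingOutputFormat<Long>()).name("Discarding sink");

        // Both sinks were registered with the environment and run in one job.
        env.execute("Multiple sinks example");
    }
}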
private <T> GenericDataSinkBase<T> translate(DataSink<T> sink) {
    // translate the input recursively
    Operator<T> input = translate(sink.getDataSet());

    // translate the sink itself and connect it to the input
    GenericDataSinkBase<T> translatedSink = sink.translateToDataFlow(input);

    translatedSink.setResources(sink.getMinResources(), sink.getPreferredResources());

    return translatedSink;
}
@Test
public void testPojoSortingNestedParallelism1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CollectionDataSets.POJO> ds = CollectionDataSets.getMixedPojoDataSet(env);
    ds.writeAsText(resultPath)
        .sortLocalOutput("nestedTupleWithCustom.f0", Order.ASCENDING)
        .sortLocalOutput("nestedTupleWithCustom.f1.myInt", Order.DESCENDING)
        .sortLocalOutput("nestedPojo.longNumber", Order.ASCENDING)
        .setParallelism(1);

    env.execute();

    String expected =
        "2 First_ (10,105,1000,One) 10200\n" +
        "1 First (10,100,1000,One) 10100\n" +
        "4 First_ (11,106,1000,One) 10300\n" +
        "5 First (11,102,2000,One) 10100\n" +
        "3 First (11,102,3000,One) 10200\n" +
        "6 Second_ (20,200,2000,Two) 10100\n" +
        "8 Third_ (30,300,1000,Three) 10100\n" +
        "7 Third (31,301,2000,Three) 10200\n";

    compareResultsByLinesInMemoryWithStrictOrder(expected, resultPath);
}
private void createPrintSink(PythonOperationInfo info) {
    DataSet<byte[]> parent = sets.getDataSet(info.parentID);
    parent.map(new StringDeserializerMap()).setParallelism(info.parallelism).name("PrintSinkPreStep")
        .output(new PrintingOutputFormat<String>(info.toError)).setParallelism(info.parallelism);
}
.name(tap.getIdentifier())
.setParallelism(dop)
.withParameters(FlinkConfigConverter.toFlinkConfig(sinkConfig));
@Test
public void testTupleSortingNestedParallelism1_2() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Tuple2<Integer, Integer>, String, Integer>> ds =
        CollectionDataSets.getGroupSortedNestedTupleDataSet2(env);
    ds.writeAsText(resultPath)
        .sortLocalOutput(1, Order.ASCENDING)
        .sortLocalOutput(2, Order.DESCENDING)
        .setParallelism(1);

    env.execute();

    String expected =
        "((2,1),a,3)\n" +
        "((1,3),a,2)\n" +
        "((1,2),a,1)\n" +
        "((2,2),b,4)\n" +
        "((4,9),c,7)\n" +
        "((3,6),c,6)\n" +
        "((3,3),c,5)\n";

    compareResultsByLinesInMemoryWithStrictOrder(expected, resultPath);
}
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
        new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    DataSet<Tuple2<Text, LongWritable>> words =
        text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
            .groupBy(0)
            .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
@Override
public Count<T> run(DataSet<T> input) throws Exception {
    super.run(input);

    countHelper = new CountHelper<>();

    input
        .output(countHelper)
        .name("Count");

    return this;
}
private void createCsvSink(PythonOperationInfo info) {
    DataSet<byte[]> parent = sets.getDataSet(info.parentID);
    parent.map(new StringTupleDeserializerMap()).setParallelism(info.parallelism).name("CsvSinkPreStep")
        .writeAsCsv(info.path, info.lineDelimiter, info.fieldDelimiter, info.writeMode)
        .setParallelism(info.parallelism).name("CsvSink");
}
@Test
public void testPojoSortingDualParallelism1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CollectionDataSets.POJO> ds = CollectionDataSets.getMixedPojoDataSet(env);
    ds.writeAsText(resultPath)
        .sortLocalOutput("str", Order.ASCENDING)
        .sortLocalOutput("number", Order.DESCENDING)
        .setParallelism(1);

    env.execute();

    String expected =
        "5 First (11,102,2000,One) 10100\n" +
        "3 First (11,102,3000,One) 10200\n" +
        "1 First (10,100,1000,One) 10100\n" +
        "4 First_ (11,106,1000,One) 10300\n" +
        "2 First_ (10,105,1000,One) 10200\n" +
        "6 Second_ (20,200,2000,Two) 10100\n" +
        "7 Third (31,301,2000,Three) 10200\n" +
        "8 Third_ (30,300,1000,Three) 10100\n";

    compareResultsByLinesInMemoryWithStrictOrder(expected, resultPath);
}
.writeAsText(outputPath, FileSystem.WriteMode.OVERWRITE).setParallelism(1);
@Override
public ChecksumHashCode<T> run(DataSet<T> input) throws Exception {
    super.run(input);

    checksumHashCodeHelper = new ChecksumHashCodeHelper<>();

    input
        .output(checksumHashCodeHelper)
        .name("ChecksumHashCode");

    return this;
}
@Override
public EdgeMetrics<K, VV, EV> run(Graph<K, VV, EV> input) throws Exception {
    super.run(input);

    // s, t, (d(s), d(t))
    DataSet<Edge<K, Tuple3<EV, LongValue, LongValue>>> edgeDegreePair = input
        .run(new EdgeDegreePair<K, VV, EV>()
            .setReduceOnTargetId(reduceOnTargetId)
            .setParallelism(parallelism));

    // s, d(s), count of (u, v) where deg(u) < deg(v) or (deg(u) == deg(v) and u < v)
    DataSet<Tuple3<K, LongValue, LongValue>> edgeStats = edgeDegreePair
        .map(new EdgeStats<>())
        .setParallelism(parallelism)
        .name("Edge stats")
        .groupBy(0)
        .reduce(new SumEdgeStats<>())
        .setCombineHint(CombineHint.HASH)
        .setParallelism(parallelism)
        .name("Sum edge stats");

    edgeMetricsHelper = new EdgeMetricsHelper<>();

    edgeStats
        .output(edgeMetricsHelper)
        .setParallelism(parallelism)
        .name("Edge metrics");

    return this;
}
@Test
public void testTupleSortingNestedParallelism1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Tuple2<Integer, Integer>, String, Integer>> ds =
        CollectionDataSets.getGroupSortedNestedTupleDataSet2(env);
    ds.writeAsText(resultPath)
        .sortLocalOutput("f0.f1", Order.ASCENDING)
        .sortLocalOutput("f1", Order.DESCENDING)
        .setParallelism(1);

    env.execute();

    String expected =
        "((2,1),a,3)\n" +
        "((2,2),b,4)\n" +
        "((1,2),a,1)\n" +
        "((3,3),c,5)\n" +
        "((1,3),a,2)\n" +
        "((3,6),c,6)\n" +
        "((4,9),c,7)\n";

    compareResultsByLinesInMemoryWithStrictOrder(expected, resultPath);
}
.output(new DiscardingOutputFormat<Long>()).setParallelism(5);