/**
 * Attaches a text sink to the data set registered under {@code info.parentID}.
 *
 * <p>The parent's {@code byte[]} records are mapped through a
 * {@code StringDeserializerMap} and the mapped records are written with
 * {@code writeAsText} to {@code info.path} using {@code info.writeMode}.
 * Both the map step and the sink run with {@code info.parallelism}.
 *
 * @param info operation description supplying the parent id, path, write mode and parallelism
 */
private void createTextSink(PythonOperationInfo info) {
    DataSet<byte[]> parent = sets.getDataSet(info.parentID);
    parent
        // Name the pre-step like the CSV and print sinks do, so it is identifiable in the plan.
        .map(new StringDeserializerMap()).setParallelism(info.parallelism).name("TextSinkPreStep")
        .writeAsText(info.path, info.writeMode).setParallelism(info.parallelism).name("TextSink");
}
/**
 * Attaches a printing sink to the data set registered under {@code info.parentID}.
 *
 * <p>The parent's {@code byte[]} records are mapped through a
 * {@code StringDeserializerMap} and the mapped records are handed to a
 * {@code PrintingOutputFormat<String>} constructed with {@code info.toError}.
 * Both the map step and the sink run with {@code info.parallelism}.
 *
 * @param info operation description supplying the parent id, the {@code toError} flag and parallelism
 */
private void createPrintSink(PythonOperationInfo info) {
    DataSet<byte[]> parent = sets.getDataSet(info.parentID);
    parent
        .map(new StringDeserializerMap()).setParallelism(info.parallelism).name("PrintSinkPreStep")
        // Name the sink itself, matching "TextSink" / "CsvSink" used by the sibling sinks.
        .output(new PrintingOutputFormat<String>(info.toError)).setParallelism(info.parallelism).name("PrintSink");
}
/**
 * Attaches a CSV sink to the data set registered under {@code info.parentID}.
 *
 * <p>Records are first mapped through a {@code StringTupleDeserializerMap}
 * (named "CsvSinkPreStep") and then written with {@code writeAsCsv} using the
 * path, line delimiter, field delimiter and write mode carried by {@code info}.
 * Both steps run with {@code info.parallelism}.
 *
 * @param info operation description supplying all sink settings
 */
private void createCsvSink(PythonOperationInfo info) {
    DataSet<byte[]> serializedRecords = sets.getDataSet(info.parentID);

    serializedRecords
        .map(new StringTupleDeserializerMap())
        .setParallelism(info.parallelism)
        .name("CsvSinkPreStep")
        .writeAsCsv(info.path, info.lineDelimiter, info.fieldDelimiter, info.writeMode)
        .setParallelism(info.parallelism)
        .name("CsvSink");
}
@Test public void testSortingParallelism4() throws Exception { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Long> ds = env.generateSequence(0, 1000); // randomize ds.map(new MapFunction<Long, Long>() { Random rand = new Random(1234L); @Override public Long map(Long value) throws Exception { return rand.nextLong(); } }).writeAsText(resultPath) .sortLocalOutput("*", Order.ASCENDING) .setParallelism(4); env.execute(); BufferedReader[] resReaders = getResultReader(resultPath); for (BufferedReader br : resReaders) { long cmp = Long.MIN_VALUE; while (br.ready()) { long cur = Long.parseLong(br.readLine()); assertTrue("Invalid order of sorted output", cmp <= cur); cmp = cur; } br.close(); } }
/**
 * Writes the integer data set as text, locally sorted on the whole value in
 * descending order with a single parallel sink, and compares the output
 * line by line in strict order.
 */
@Test
public void testIntSortingParallelism1() throws Exception {
    final String expectedLines = "5\n5\n5\n5\n5\n4\n4\n4\n4\n3\n3\n3\n2\n2\n1\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Integer> numbers = CollectionDataSets.getIntegerDataSet(env);

    numbers
        .writeAsText(resultPath)
        .sortLocalOutput("*", Order.DESCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
/**
 * Writes the string data set as text, locally sorted on the whole value in
 * ascending order with a single parallel sink, and compares the output
 * line by line in strict order.
 */
@Test
public void testStringSortingParallelism1() throws Exception {
    final String expectedLines =
        "Hello\n"
            + "Hello world\n"
            + "Hello world, how are you?\n"
            + "Hi\n"
            + "I am fine.\n"
            + "LOL\n"
            + "Luke Skywalker\n"
            + "Random comment\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<String> strings = CollectionDataSets.getStringDataSet(env);

    strings
        .writeAsText(resultPath)
        .sortLocalOutput("*", Order.ASCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
/**
 * Writes the mixed POJO data set as text, locally sorted on the
 * {@code number} field in ascending order with a single parallel sink,
 * and compares the output line by line in strict order.
 */
@Test
public void testPojoSortingSingleParallelism1() throws Exception {
    final String expectedLines =
        "1 First (10,100,1000,One) 10100\n"
            + "2 First_ (10,105,1000,One) 10200\n"
            + "3 First (11,102,3000,One) 10200\n"
            + "4 First_ (11,106,1000,One) 10300\n"
            + "5 First (11,102,2000,One) 10100\n"
            + "6 Second_ (20,200,2000,Two) 10100\n"
            + "7 Third (31,301,2000,Three) 10200\n"
            + "8 Third_ (30,300,1000,Three) 10100\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CollectionDataSets.POJO> pojos = CollectionDataSets.getMixedPojoDataSet(env);

    pojos
        .writeAsText(resultPath)
        .sortLocalOutput("number", Order.ASCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: WordCount <input path> <result path>"); return; } final String inputPath = args[0]; final String outputPath = args[1]; final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // Set up the Hadoop Input Format HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf()); TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath)); // Create a Flink job with it DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat); DataSet<Tuple2<Text, LongWritable>> words = text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer())) .groupBy(0).reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter())); // Set up Hadoop Output Format HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf()); hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " "); TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath)); // Output & Execute words.output(hadoopOutputFormat).setParallelism(1); env.execute("Hadoop Compat WordCount"); }
/**
 * Writes the nested-tuple data set as text, locally sorted on tuple
 * position 1 ascending and then position 2 descending with a single
 * parallel sink, and compares the output line by line in strict order.
 */
@Test
public void testTupleSortingNestedParallelism1_2() throws Exception {
    final String expectedLines =
        "((2,1),a,3)\n"
            + "((1,3),a,2)\n"
            + "((1,2),a,1)\n"
            + "((2,2),b,4)\n"
            + "((4,9),c,7)\n"
            + "((3,6),c,6)\n"
            + "((3,3),c,5)\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Tuple2<Integer, Integer>, String, Integer>> nestedTuples =
        CollectionDataSets.getGroupSortedNestedTupleDataSet2(env);

    nestedTuples
        .writeAsText(resultPath)
        .sortLocalOutput(1, Order.ASCENDING)
        .sortLocalOutput(2, Order.DESCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
/**
 * Writes the mixed POJO data set as text, locally sorted on {@code str}
 * ascending and then {@code number} descending with a single parallel sink,
 * and compares the output line by line in strict order.
 */
@Test
public void testPojoSortingDualParallelism1() throws Exception {
    final String expectedLines =
        "5 First (11,102,2000,One) 10100\n"
            + "3 First (11,102,3000,One) 10200\n"
            + "1 First (10,100,1000,One) 10100\n"
            + "4 First_ (11,106,1000,One) 10300\n"
            + "2 First_ (10,105,1000,One) 10200\n"
            + "6 Second_ (20,200,2000,Two) 10100\n"
            + "7 Third (31,301,2000,Three) 10200\n"
            + "8 Third_ (30,300,1000,Three) 10100\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CollectionDataSets.POJO> pojos = CollectionDataSets.getMixedPojoDataSet(env);

    pojos
        .writeAsText(resultPath)
        .sortLocalOutput("str", Order.ASCENDING)
        .sortLocalOutput("number", Order.DESCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
/**
 * Writes the nested-tuple data set as text, locally sorted on the field
 * expression {@code f0.f1} ascending and then {@code f1} descending with a
 * single parallel sink, and compares the output line by line in strict order.
 */
@Test
public void testTupleSortingNestedParallelism1() throws Exception {
    final String expectedLines =
        "((2,1),a,3)\n"
            + "((2,2),b,4)\n"
            + "((1,2),a,1)\n"
            + "((3,3),c,5)\n"
            + "((1,3),a,2)\n"
            + "((3,6),c,6)\n"
            + "((4,9),c,7)\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Tuple2<Integer, Integer>, String, Integer>> nestedTuples =
        CollectionDataSets.getGroupSortedNestedTupleDataSet2(env);

    nestedTuples
        .writeAsText(resultPath)
        .sortLocalOutput("f0.f1", Order.ASCENDING)
        .sortLocalOutput("f1", Order.DESCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
/**
 * Writes the 3-tuple data set as CSV, locally sorted on tuple position 0
 * in ascending order with a single parallel sink, and compares the output
 * line by line in strict order.
 */
@Test
public void testTupleSortingSingleAscParallelism1() throws Exception {
    final String expectedLines =
        "1,1,Hi\n"
            + "2,2,Hello\n"
            + "3,2,Hello world\n"
            + "4,3,Hello world, how are you?\n"
            + "5,3,I am fine.\n"
            + "6,3,Luke Skywalker\n"
            + "7,4,Comment#1\n"
            + "8,4,Comment#2\n"
            + "9,4,Comment#3\n"
            + "10,4,Comment#4\n"
            + "11,5,Comment#5\n"
            + "12,5,Comment#6\n"
            + "13,5,Comment#7\n"
            + "14,5,Comment#8\n"
            + "15,5,Comment#9\n"
            + "16,6,Comment#10\n"
            + "17,6,Comment#11\n"
            + "18,6,Comment#12\n"
            + "19,6,Comment#13\n"
            + "20,6,Comment#14\n"
            + "21,6,Comment#15\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    tuples
        .writeAsCsv(resultPath)
        .sortLocalOutput(0, Order.ASCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
/**
 * Writes the 3-tuple data set as CSV, locally sorted on tuple position 0
 * in descending order with a single parallel sink, and compares the output
 * line by line in strict order.
 */
@Test
public void testTupleSortingSingleDescParallelism1() throws Exception {
    final String expectedLines =
        "21,6,Comment#15\n"
            + "20,6,Comment#14\n"
            + "19,6,Comment#13\n"
            + "18,6,Comment#12\n"
            + "17,6,Comment#11\n"
            + "16,6,Comment#10\n"
            + "15,5,Comment#9\n"
            + "14,5,Comment#8\n"
            + "13,5,Comment#7\n"
            + "12,5,Comment#6\n"
            + "11,5,Comment#5\n"
            + "10,4,Comment#4\n"
            + "9,4,Comment#3\n"
            + "8,4,Comment#2\n"
            + "7,4,Comment#1\n"
            + "6,3,Luke Skywalker\n"
            + "5,3,I am fine.\n"
            + "4,3,Hello world, how are you?\n"
            + "3,2,Hello world\n"
            + "2,2,Hello\n"
            + "1,1,Hi\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<Tuple3<Integer, Long, String>> tuples = CollectionDataSets.get3TupleDataSet(env);

    tuples
        .writeAsCsv(resultPath)
        .sortLocalOutput(0, Order.DESCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
/**
 * Writes the mixed POJO data set as text, locally sorted on three nested
 * field expressions ({@code nestedTupleWithCustom.f0} ascending,
 * {@code nestedTupleWithCustom.f1.myInt} descending,
 * {@code nestedPojo.longNumber} ascending) with a single parallel sink,
 * and compares the output line by line in strict order.
 */
@Test
public void testPojoSortingNestedParallelism1() throws Exception {
    final String expectedLines =
        "2 First_ (10,105,1000,One) 10200\n"
            + "1 First (10,100,1000,One) 10100\n"
            + "4 First_ (11,106,1000,One) 10300\n"
            + "5 First (11,102,2000,One) 10100\n"
            + "3 First (11,102,3000,One) 10200\n"
            + "6 Second_ (20,200,2000,Two) 10100\n"
            + "8 Third_ (30,300,1000,Three) 10100\n"
            + "7 Third (31,301,2000,Three) 10200\n";

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    DataSet<CollectionDataSets.POJO> pojos = CollectionDataSets.getMixedPojoDataSet(env);

    pojos
        .writeAsText(resultPath)
        .sortLocalOutput("nestedTupleWithCustom.f0", Order.ASCENDING)
        .sortLocalOutput("nestedTupleWithCustom.f1.myInt", Order.DESCENDING)
        .sortLocalOutput("nestedPojo.longNumber", Order.ASCENDING)
        .setParallelism(1);
    env.execute();

    compareResultsByLinesInMemoryWithStrictOrder(expectedLines, resultPath);
}
// CSV sink with local output sorted on field 1 descending, then field 0 ascending; sink parallelism is 1.
ds.writeAsCsv(resultPath) .sortLocalOutput(1, Order.DESCENDING).sortLocalOutput(0, Order.ASCENDING) .setParallelism(1);
@Override public EdgeMetrics<K, VV, EV> run(Graph<K, VV, EV> input) throws Exception { super.run(input); // s, t, (d(s), d(t)) DataSet<Edge<K, Tuple3<EV, LongValue, LongValue>>> edgeDegreePair = input .run(new EdgeDegreePair<K, VV, EV>() .setReduceOnTargetId(reduceOnTargetId) .setParallelism(parallelism)); // s, d(s), count of (u, v) where deg(u) < deg(v) or (deg(u) == deg(v) and u < v) DataSet<Tuple3<K, LongValue, LongValue>> edgeStats = edgeDegreePair .map(new EdgeStats<>()) .setParallelism(parallelism) .name("Edge stats") .groupBy(0) .reduce(new SumEdgeStats<>()) .setCombineHint(CombineHint.HASH) .setParallelism(parallelism) .name("Sum edge stats"); edgeMetricsHelper = new EdgeMetricsHelper<>(); edgeStats .output(edgeMetricsHelper) .setParallelism(parallelism) .name("Edge metrics"); return this; }
@Override public EdgeMetrics<K, VV, EV> run(Graph<K, VV, EV> input) throws Exception { super.run(input); // s, t, (d(s), d(t)) DataSet<Edge<K, Tuple3<EV, Degrees, Degrees>>> edgeDegreesPair = input .run(new EdgeDegreesPair<K, VV, EV>() .setParallelism(parallelism)); // s, d(s), count of (u, v) where deg(u) < deg(v) or (deg(u) == deg(v) and u < v) DataSet<Tuple3<K, Degrees, LongValue>> edgeStats = edgeDegreesPair .flatMap(new EdgeStats<>()) .setParallelism(parallelism) .name("Edge stats") .groupBy(0, 1) .reduceGroup(new ReduceEdgeStats<>()) .setParallelism(parallelism) .name("Reduce edge stats") .groupBy(0) .reduce(new SumEdgeStats<>()) .setCombineHint(CombineHint.HASH) .setParallelism(parallelism) .name("Sum edge stats"); edgeMetricsHelper = new EdgeMetricsHelper<>(); edgeStats .output(edgeMetricsHelper) .setParallelism(parallelism) .name("Edge metrics"); return this; }
.groupBy("*").reduceGroup(new IdentityGroupReducer<Long>()) .withForwardedFields("*").setParallelism(p * 2).name("Reduce2") .output(new DiscardingOutputFormat<Long>()).setParallelism(p * 2).name("Sink");
.groupBy("*").reduceGroup(new IdentityGroupReducer<Long>()) .withForwardedFields("*").setParallelism(p * 2).name("Reduce2") .output(new DiscardingOutputFormat<Long>()).setParallelism(p * 2).name("Sink");
.groupBy("*").reduceGroup(new IdentityGroupReducer<Long>()) .withForwardedFields("*").setParallelism(p).name("Reduce2") .output(new DiscardingOutputFormat<Long>()).setParallelism(p).name("Sink");