@Test
public void testUngroupedHadoopReducer() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple2<IntWritable, IntWritable>> ds = HadoopTestData.getKVPairDataSet(env).
			map(new Mapper2());

	// Reduce the whole (ungrouped) data set with a wrapped Hadoop Reducer,
	// which is also used as the combiner.
	DataSet<Tuple2<IntWritable, IntWritable>> sum = ds.
			reduceGroup(new HadoopReduceCombineFunction<IntWritable, IntWritable, IntWritable, IntWritable>(
					new SumReducer(), new SumReducer()));

	String resultPath = tempFolder.newFile().toURI().toString();

	sum.writeAsText(resultPath);
	env.execute();

	String expected = "(0,231)\n";

	compareResultsByLinesInMemory(expected, resultPath);
}
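// The SumReducer wrapped above is defined in the surrounding test code and is not shown in
// this section. A minimal sketch of a summing Hadoop (mapred API) Reducer that could back
// these tests is given below; it assumes imports from org.apache.hadoop.mapred
// (Reducer, OutputCollector, Reporter, JobConf) plus java.io.IOException and
// java.util.Iterator, and the real implementation may differ in detail.
public static class SumReducer implements Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {

	@Override
	public void reduce(IntWritable key, Iterator<IntWritable> values,
			OutputCollector<IntWritable, IntWritable> out, Reporter reporter) throws IOException {
		// Sum all values for this key and emit a single (key, sum) pair.
		int sum = 0;
		while (values.hasNext()) {
			sum += values.next().get();
		}
		out.collect(key, new IntWritable(sum));
	}

	@Override
	public void configure(JobConf conf) { /* no configuration needed */ }

	@Override
	public void close() throws IOException { /* nothing to clean up */ }
}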
public static void main(String[] args) throws Exception {
	if (args.length < 2) {
		System.err.println("Usage: WordCount <input path> <result path>");
		return;
	}

	final String inputPath = args[0];
	final String outputPath = args[1];

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// Set up the Hadoop Input Format
	HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
			new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
	TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

	// Create a Flink job with it
	DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
	DataSet<Tuple2<Text, LongWritable>> words =
			text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
				.groupBy(0)
				.reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(
						new Counter(), new Counter()));

	// Set up Hadoop Output Format
	HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
			new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
	hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
	TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

	// Output & Execute
	words.output(hadoopOutputFormat).setParallelism(1);
	env.execute("Hadoop Compat WordCount");
}
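// Tokenizer and Counter are the usual WordCount mapper and reducer written against Hadoop's
// mapred API; they are not shown in this section. The sketches below illustrate what such
// functions could look like (same imports as the SumReducer sketch above, plus Mapper and
// Text/LongWritable); the versions used by the example may differ.
public static final class Tokenizer implements Mapper<LongWritable, Text, Text, LongWritable> {

	@Override
	public void map(LongWritable key, Text value, OutputCollector<Text, LongWritable> out, Reporter reporter)
			throws IOException {
		// Normalize the line, split it into words, and emit (word, 1) for each word.
		String line = value.toString().toLowerCase().replaceAll("\\W+", " ");
		for (String token : line.split("\\s+")) {
			if (token.length() > 0) {
				out.collect(new Text(token), new LongWritable(1L));
			}
		}
	}

	@Override
	public void configure(JobConf conf) { }

	@Override
	public void close() throws IOException { }
}

public static final class Counter implements Reducer<Text, LongWritable, Text, LongWritable> {

	@Override
	public void reduce(Text key, Iterator<LongWritable> values, OutputCollector<Text, LongWritable> out, Reporter reporter)
			throws IOException {
		// Sum the partial counts for one word.
		long sum = 0;
		while (values.hasNext()) {
			sum += values.next().get();
		}
		out.collect(key, new LongWritable(sum));
	}

	@Override
	public void configure(JobConf conf) { }

	@Override
	public void close() throws IOException { }
}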
@Test
public void testStandardCountingWithCombiner() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple2<IntWritable, IntWritable>> ds = HadoopTestData.getKVPairDataSet(env).
			map(new Mapper1());

	// Group by key and count with a wrapped Hadoop Reducer that is also used as the combiner.
	DataSet<Tuple2<IntWritable, IntWritable>> counts = ds.
			groupBy(0).
			reduceGroup(new HadoopReduceCombineFunction<IntWritable, IntWritable, IntWritable, IntWritable>(
					new SumReducer(), new SumReducer()));

	String resultPath = tempFolder.newFile().toURI().toString();

	counts.writeAsText(resultPath);
	env.execute();

	String expected = "(0,5)\n" + "(1,6)\n" + "(2,6)\n" + "(3,4)\n";

	compareResultsByLinesInMemory(expected, resultPath);
}
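// Mapper1, Mapper2, and Mapper3 belong to the surrounding test class and are not shown here.
// A plausible shape for such a preprocessing step is a plain Flink MapFunction that buckets
// the key into a few groups and replaces the value with 1, so that summing the values per
// group yields counts. The class below is a hypothetical illustration, not the actual test
// mapper; the real key/value manipulation may differ.
public static class BucketingMapper implements MapFunction<Tuple2<IntWritable, IntWritable>, Tuple2<IntWritable, IntWritable>> {

	@Override
	public Tuple2<IntWritable, IntWritable> map(Tuple2<IntWritable, IntWritable> v) throws Exception {
		// Fold keys into four buckets and count each record once.
		v.f0.set(v.f0.get() % 4);
		v.f1.set(1);
		return v;
	}
}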
@Test
public void testCombiner() throws Exception {
	// The combiner is only exercised when data is shipped between tasks,
	// so this test only runs in cluster execution mode.
	org.junit.Assume.assumeThat(mode, new IsEqual<TestExecutionMode>(TestExecutionMode.CLUSTER));

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple2<IntWritable, IntWritable>> ds = HadoopTestData.getKVPairDataSet(env).
			map(new Mapper3());

	// Use a key-changing Hadoop Reducer as the combiner, so the result reveals
	// whether the combiner was actually invoked.
	DataSet<Tuple2<IntWritable, IntWritable>> counts = ds.
			groupBy(0).
			reduceGroup(new HadoopReduceCombineFunction<IntWritable, IntWritable, IntWritable, IntWritable>(
					new SumReducer(), new KeyChangingReducer()));

	String resultPath = tempFolder.newFile().toURI().toString();

	counts.writeAsText(resultPath);
	env.execute();

	String expected = "(0,5)\n" + "(1,6)\n" + "(2,5)\n" + "(3,5)\n";

	compareResultsByLinesInMemory(expected, resultPath);
}