public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format.
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(
                    new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it.
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                .groupBy(0)
                .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(
                        new Counter(), new Counter()));

    // Set up the Hadoop Output Format.
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(
                    new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output and execute.
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
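The Tokenizer mapper and Counter reducer wrapped above are not shown in the snippet. A minimal sketch of what they might look like, assuming classic WordCount semantics on the old mapred API (both classes are hypothetical reconstructions, not the original source):

// Hypothetical Tokenizer/Counter implementations for the snippet above,
// assuming standard WordCount semantics.
public static final class Tokenizer implements Mapper<LongWritable, Text, Text, LongWritable> {
    public void map(LongWritable key, Text value,
                    OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        // Emit (word, 1) for every token in the line.
        for (String token : value.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Text(token), new LongWritable(1L));
            }
        }
    }
    public void configure(JobConf conf) {}
    public void close() throws IOException {}
}

public static final class Counter implements Reducer<Text, LongWritable, Text, LongWritable> {
    public void reduce(Text key, Iterator<LongWritable> values,
                       OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        // Sum all counts for the word; the same class doubles as the combiner.
        long sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        out.collect(key, new LongWritable(sum));
    }
    public void configure(JobConf conf) {}
    public void close() throws IOException {}
}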
private void task2(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(String.format(
            "ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapStIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);
}
private void task2(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Building adjacency lists...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaLinkGraph.class);
    conf.setJobName(String.format(
            "ExtractWikipediaLinkGraph:AdjacencyList[input: %s, output: %s, num_partitions: %d]",
            inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
jobConfig.setOutputFormat(TextOutputFormat.class);
TextOutputFormat.setOutputPath(jobConfig, outputPath);
TextInputFormat.addInputPath(jobConfig, inputPath);
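The fragment above shows only the format and path wiring. A minimal sketch of the surrounding old-API (mapred) job setup it would typically sit in; MyJob, MyMapper, and MyReducer are hypothetical placeholders:

// Sketch of a complete JobConf setup around the three calls above.
// MyJob, MyMapper, and MyReducer are assumed names, not from the original.
JobConf jobConfig = new JobConf(MyJob.class);
jobConfig.setJobName("text-in-text-out");

jobConfig.setInputFormat(TextInputFormat.class);
TextInputFormat.addInputPath(jobConfig, inputPath);

jobConfig.setOutputFormat(TextOutputFormat.class);
TextOutputFormat.setOutputPath(jobConfig, outputPath);

jobConfig.setMapperClass(MyMapper.class);
jobConfig.setReducerClass(MyReducer.class);
jobConfig.setOutputKeyClass(Text.class);
jobConfig.setOutputValueClass(Text.class);

JobClient.runJob(jobConfig);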
public void testNumInputs() throws Exception {
    JobConf job = new JobConf(conf);
    dfs = newDFSCluster(job);
    FileSystem fs = dfs.getFileSystem();
    System.out.println("FileSystem " + fs.getUri());

    Path inputDir = new Path("/foo/");
    final int numFiles = 10;
    String fileNameBase = "part-0000";
    for (int i = 0; i < numFiles; ++i) {
        createInputs(fs, inputDir, fileNameBase + String.valueOf(i));
    }
    // Files prefixed with '_' are treated as hidden and should be
    // skipped by the input format, so they must not affect the count.
    createInputs(fs, inputDir, "_meta");
    createInputs(fs, inputDir, "_temp");

    // Split the directory using a file input format.
    TextInputFormat.addInputPath(job, inputDir);
    TextInputFormat inFormat = new TextInputFormat();
    inFormat.configure(job);
    InputSplit[] splits = inFormat.getSplits(job, 1);

    assertEquals("Expected value of " + FileInputFormat.NUM_INPUT_FILES,
            numFiles, job.getLong(FileInputFormat.NUM_INPUT_FILES, 0));
}
TextInputFormat.addInputPath(job, inputDir);
TextInputFormat inFormat = new TextInputFormat();
inFormat.configure(job);
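Once configure(job) has been called, the format can compute splits and hand back record readers. A short sketch of consuming them, assuming the `job` and `inFormat` variables from the snippet above:

// Sketch: read every (offset, line) pair from each split.
InputSplit[] splits = inFormat.getSplits(job, 1);
for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader =
            inFormat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
        System.out.println(key.get() + "\t" + value);
    }
    reader.close();
}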
@Override
public Plan getPlan(String... args) {
    // Parse job parameters.
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
            new TextInputFormat(), new JobConf(), "Input Lines");
    TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine())
            .input(source)
            .name("Tokenize Lines")
            .build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
            .input(mapper)
            .name("Count Words")
            .build();

    HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat",
            reducer, Text.class, IntWritable.class);
    TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

    Plan plan = new Plan(out, "Hadoop OutputFormat Example");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}
TextInputFormat.addInputPath(job, shapeFile);
DistributedCache.addCacheFile(categoryFile.toUri(), job);
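Adding the file to the DistributedCache is only half the pattern; the task side would typically load it in the mapper's configure() method. A sketch under the assumption that categoryFile is a small text file of category names; the `categories` set is a hypothetical in-memory field:

// Sketch: reading the cached category file on the task side (old mapred API).
public void configure(JobConf job) {
    try {
        Path[] cached = DistributedCache.getLocalCacheFiles(job);
        if (cached != null && cached.length > 0) {
            BufferedReader reader = new BufferedReader(new FileReader(cached[0].toString()));
            String line;
            while ((line = reader.readLine()) != null) {
                categories.add(line.trim()); // hypothetical in-memory set
            }
            reader.close();
        }
    } catch (IOException e) {
        throw new RuntimeException("Failed to read cached category file", e);
    }
}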
TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));