public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format.
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(
                    new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it.
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
    DataSet<Tuple2<Text, LongWritable>> words =
            text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
                .groupBy(0)
                .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(
                        new Counter(), new Counter()));

    // Set up the Hadoop Output Format.
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(
                    new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output and execute.
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}
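The Tokenizer mapper and Counter reducer wrapped above are not shown in the snippet. A minimal sketch of what they might look like, assuming classic WordCount semantics on the old mapred API (both classes are hypothetical reconstructions, not the original source):

// Hypothetical Tokenizer/Counter implementations for the snippet above,
// assuming standard WordCount semantics.
public static final class Tokenizer implements Mapper<LongWritable, Text, Text, LongWritable> {
    public void map(LongWritable key, Text value,
                    OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        // Emit (word, 1) for every token in the line.
        for (String token : value.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Text(token), new LongWritable(1L));
            }
        }
    }
    public void configure(JobConf conf) {}
    public void close() throws IOException {}
}

public static final class Counter implements Reducer<Text, LongWritable, Text, LongWritable> {
    public void reduce(Text key, Iterator<LongWritable> values,
                       OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        // Sum all counts for the word; the same class doubles as the combiner.
        long sum = 0;
        while (values.hasNext()) {
            sum += values.next().get();
        }
        out.collect(key, new LongWritable(sum));
    }
    public void configure(JobConf conf) {}
    public void close() throws IOException {}
}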
private void task2(String inputPath, String outputPath) throws IOException {
    LOG.info("Extracting anchor text (phase 2)...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaAnchorText.class);
    conf.setJobName(String.format(
            "ExtractWikipediaAnchorText:phase2[input: %s, output: %s]", inputPath, outputPath));

    // Gathers everything together for convenience; feasible for Wikipedia.
    conf.setNumReduceTasks(1);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapStIW.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    // Clean up intermediate data.
    FileSystem.get(conf).delete(new Path(inputPath), true);
}
private void task2(String inputPath, String outputPath, int partitions) throws IOException {
    LOG.info("Building adjacency lists...");
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    JobConf conf = new JobConf(getConf(), ExtractWikipediaLinkGraph.class);
    conf.setJobName(String.format(
            "ExtractWikipediaLinkGraph:AdjacencyList[input: %s, output: %s, num_partitions: %d]",
            inputPath, outputPath, partitions));

    conf.setNumReduceTasks(partitions);

    TextInputFormat.addInputPath(conf, new Path(inputPath));
    TextOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper2.class);
    conf.setReducerClass(MyReducer2.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);
}
jobConfig.setOutputFormat(TextOutputFormat.class);
TextOutputFormat.setOutputPath(jobConfig, outputPath);
TextInputFormat.addInputPath(jobConfig, inputPath);
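The fragment above shows only the format and path wiring. A minimal sketch of the surrounding old-API (mapred) job setup it would typically sit in; MyJob, MyMapper, and MyReducer are hypothetical placeholders:

// Sketch of a complete JobConf setup around the three calls above.
// MyJob, MyMapper, and MyReducer are assumed names, not from the original.
JobConf jobConfig = new JobConf(MyJob.class);
jobConfig.setJobName("text-in-text-out");

jobConfig.setInputFormat(TextInputFormat.class);
TextInputFormat.addInputPath(jobConfig, inputPath);

jobConfig.setOutputFormat(TextOutputFormat.class);
TextOutputFormat.setOutputPath(jobConfig, outputPath);

jobConfig.setMapperClass(MyMapper.class);
jobConfig.setReducerClass(MyReducer.class);
jobConfig.setOutputKeyClass(Text.class);
jobConfig.setOutputValueClass(Text.class);

JobClient.runJob(jobConfig);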
public void testNumInputs() throws Exception {
    JobConf job = new JobConf(conf);
    dfs = newDFSCluster(job);
    FileSystem fs = dfs.getFileSystem();
    System.out.println("FileSystem " + fs.getUri());

    Path inputDir = new Path("/foo/");
    final int numFiles = 10;
    String fileNameBase = "part-0000";
    for (int i = 0; i < numFiles; ++i) {
        createInputs(fs, inputDir, fileNameBase + String.valueOf(i));
    }
    // Files prefixed with '_' are treated as hidden and should be
    // skipped by the input format, so they must not affect the count.
    createInputs(fs, inputDir, "_meta");
    createInputs(fs, inputDir, "_temp");

    // Split the directory using a file input format.
    TextInputFormat.addInputPath(job, inputDir);
    TextInputFormat inFormat = new TextInputFormat();
    inFormat.configure(job);
    InputSplit[] splits = inFormat.getSplits(job, 1);

    assertEquals("Expected value of " + FileInputFormat.NUM_INPUT_FILES,
            numFiles, job.getLong(FileInputFormat.NUM_INPUT_FILES, 0));
}
TextInputFormat.addInputPath(job, inputDir);
TextInputFormat inFormat = new TextInputFormat();
inFormat.configure(job);
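Once configure(job) has been called, the format can compute splits and hand back record readers. A short sketch of consuming them, assuming the `job` and `inFormat` variables from the snippet above:

// Sketch: read every (offset, line) pair from each split.
InputSplit[] splits = inFormat.getSplits(job, 1);
for (InputSplit split : splits) {
    RecordReader<LongWritable, Text> reader =
            inFormat.getRecordReader(split, job, Reporter.NULL);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
        System.out.println(key.get() + "\t" + value);
    }
    reader.close();
}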
@Override
public Plan getPlan(String... args) {
    // Parse job parameters.
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
            new TextInputFormat(), new JobConf(), "Input Lines");
    TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine())
            .input(source)
            .name("Tokenize Lines")
            .build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
            .input(mapper)
            .name("Count Words")
            .build();

    HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat",
            reducer, Text.class, IntWritable.class);
    TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

    Plan plan = new Plan(out, "Hadoop OutputFormat Example");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}
TextInputFormat.addInputPath(job, shapeFile);
DistributedCache.addCacheFile(categoryFile.toUri(), job);
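Adding the file to the DistributedCache is only half the pattern; the task side would typically load it in the mapper's configure() method. A sketch under the assumption that categoryFile is a small text file of category names; the `categories` set is a hypothetical in-memory field:

// Sketch: reading the cached category file on the task side (old mapred API).
public void configure(JobConf job) {
    try {
        Path[] cached = DistributedCache.getLocalCacheFiles(job);
        if (cached != null && cached.length > 0) {
            BufferedReader reader = new BufferedReader(new FileReader(cached[0].toString()));
            String line;
            while ((line = reader.readLine()) != null) {
                categories.add(line.trim()); // hypothetical in-memory set
            }
            reader.close();
        }
    } catch (IOException e) {
        throw new RuntimeException("Failed to read cached category file", e);
    }
}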
TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));