  @Override
  public RecordReader<AvroKey<GenericRecord>, NullWritable> createRecordReader(InputSplit split,
      TaskAttemptContext cx) throws IOException {
    return new CombineFileRecordReader<>((CombineFileSplit) split, cx, AvroKeyCombineFileRecordReader.class);
  }
}
private void setupMapper(Path input) throws IOException {
  FileInputFormat.setInputPaths(job, input);
  job.setMapperClass(CalculateStatsFromBaseCuboidMapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
}
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fSplit = (FileSplit) split;
    Path path = fSplit.getPath();
    Configuration conf = context.getConfiguration();
    this.in = new RCFile.Reader(path.getFileSystem(conf), path, conf);
    this.end = fSplit.getStart() + fSplit.getLength();
    if (fSplit.getStart() > in.getPosition()) {
      in.sync(fSplit.getStart());
    }
    this.start = in.getPosition();
    more = start < end;
    key = new LongWritable();
    value = new BytesRefArrayWritable();
  }
}
@Override
public CubeSegment findSourceSegment(FileSplit fileSplit, CubeInstance cube) {
  String filePath = fileSplit.getPath().toString();
  String jobID = JobBuilderSupport.extractJobIDFromPath(filePath);
  return CubeInstance.findSegmentWithJobId(jobID, cube);
}
/**
 * Creates a Flink {@link InputFormat} that wraps the given Hadoop
 * {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}.
 *
 * @return A Flink InputFormat that wraps the Hadoop FileInputFormat.
 */
public static <K, V> org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat<K, V> readHadoopFile(
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat<K, V> mapreduceInputFormat, Class<K> key, Class<V> value,
    String inputPath, Job job) throws IOException {
  // set input path in Job
  org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(inputPath));
  // return wrapping InputFormat
  return createHadoopInput(mapreduceInputFormat, key, value, job);
}
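// A minimal usage sketch, not taken from the snippet above: it assumes the standard Flink DataSet API
// (org.apache.flink.api.java.ExecutionEnvironment, DataSet, Tuple2) and Hadoop's
// org.apache.hadoop.mapreduce.lib.input.TextInputFormat; the input path is a hypothetical placeholder.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
Job job = Job.getInstance();
// Wrap the Hadoop input format and register it as a Flink data source.
HadoopInputFormat<LongWritable, Text> source =
    readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, "hdfs:///tmp/input", job);
DataSet<Tuple2<LongWritable, Text>> lines = env.createInput(source);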
@Override
public List<InputSplit> getSplits(JobContext ctx) throws IOException, InterruptedException {
  List<InputSplit> res = new ArrayList<>(BLOCK_CNT);
  for (int i = 0; i < BLOCK_CNT; i++) {
    try {
      res.add(new FileSplit(new Path(new URI("someFile")), i, i + 1, new String[] {"localhost"}));
    }
    catch (URISyntaxException e) {
      throw new IOException(e);
    }
  }
  return res;
}
private static Schema getSchema(CombineFileSplit split, TaskAttemptContext cx, Integer idx) throws IOException {
  Schema schema = AvroJob.getInputKeySchema(cx.getConfiguration());
  if (schema != null) {
    return schema;
  }
  Path path = split.getPath(idx);
  FileSystem fs = path.getFileSystem(cx.getConfiguration());
  return AvroUtils.getSchemaFromDataFile(path, fs);
}
@Override
public List<InputSplit> getSplits(JobContext cx) throws IOException {
  Job modifiedJob = Job.getInstance(cx.getConfiguration());
  setSplitSize(modifiedJob);
  FileInputFormat.setInputDirRecursive(modifiedJob, true);
  return cleanSplits(super.getSplits(modifiedJob));
}
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    HiveConf.setLongVar(job.getConfiguration(), HiveConf.ConfVars.MAPREDMINSPLITSIZE, SequenceFile.SYNC_INTERVAL);
    return super.getSplits(job);
  }
}
private static void addInputPath(Job job, Iterable<String> pathStrings, Class<? extends InputFormat> inputFormatClass) {
  Configuration conf = job.getConfiguration();
  StringBuilder inputFormats = new StringBuilder(
      StringUtils.nullToEmptyNonDruidDataString(conf.get(MultipleInputs.DIR_FORMATS))
  );
  String[] paths = Iterables.toArray(pathStrings, String.class);
  for (int i = 0; i < paths.length - 1; i++) {
    if (inputFormats.length() > 0) {
      inputFormats.append(',');
    }
    inputFormats.append(paths[i]).append(';').append(inputFormatClass.getName());
  }
  if (inputFormats.length() > 0) {
    conf.set(MultipleInputs.DIR_FORMATS, inputFormats.toString());
  }
  // add last one separately for possible initialization in MultipleInputs
  MultipleInputs.addInputPath(job, new Path(paths[paths.length - 1]), inputFormatClass);
}
  @Override
  public List<InputSplit> getSplits(JobContext ctx) throws IOException {
    List<InputSplit> res = super.getSplits(ctx);
    splitsCount.set(res.size());
    X.println("___ split of input: " + splitsCount.get());
    return res;
  }
}
private void setSplitSize(JobContext cx) {
  super.setMaxSplitSize(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE,
      DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
  super.setMinSplitSizeNode(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE,
      DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
}
@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(InputSplit inputSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
  return new OrcRecordReader(OrcFile.createReader(path, OrcFile.readerOptions(conf)),
      ShimLoader.getHadoopShims().getConfiguration(context), fileSplit.getStart(), fileSplit.getLength());
}
/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
  Path inputPath = new Path(args[0]);
  String tableName = args[1];
  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJarByClass(Uploader.class);
  FileInputFormat.setInputPaths(job, inputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(Uploader.class);
  // No reducers. Just write straight to table. Call initTableReducerJob
  // because it sets up the TableOutputFormat.
  TableMapReduceUtil.initTableReducerJob(tableName, null, job);
  job.setNumReduceTasks(0);
  return job;
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) split;
  conf = context.getConfiguration();
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(conf);
  LOG.info("Initialize HFileRecordReader for {}", path);
  this.in = HFile.createReader(fs, path, conf);
  // The file info must be loaded before the scanner can be used.
  // This seems like a bug in HBase, but it's easily worked around.
  this.in.loadFileInfo();
  this.scanner = in.getScanner(false, false);
}
private void configureInputAndOutputPaths(Job job) throws IOException {
  for (Path inputPath : getInputPaths()) {
    FileInputFormat.addInputPath(job, inputPath);
  }
  // The MR output path must not exist when the job starts, so delete it if it exists.
  this.tmpFs.delete(this.dataset.outputTmpPath(), true);
  FileOutputFormat.setOutputPath(job, this.dataset.outputTmpPath());
}
/**
 * Gets a fully configured Job instance.
 *
 * @param input Input file name.
 * @param output Output directory name.
 * @return Job instance.
 * @throws IOException If an I/O error occurs.
 */
public static Job getJob(String input, String output) throws IOException {
  Job job = Job.getInstance();
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  setTasksClasses(job, true, true, true, false);
  FileInputFormat.setInputPaths(job, new Path(input));
  FileOutputFormat.setOutputPath(job, new Path(output));
  job.setJarByClass(HadoopWordCount2.class);
  return job;
}
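// A brief usage sketch, not part of the snippet above: the HDFS paths are hypothetical placeholders;
// waitForCompletion(true) is the standard Hadoop call that submits the job and blocks until it finishes.
Job job = HadoopWordCount2.getJob("hdfs:///tmp/words-in", "hdfs:///tmp/words-out");
boolean succeeded = job.waitForCompletion(true);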
  public void checkInputFormat() throws Exception {
    Job job = new Job();
    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyValueTest");
    job.setMapperClass(Counter.class);
    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/out/*"));
    job.setInputFormatClass(AvroTrevniKeyValueInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
    total = 0;
    job.waitForCompletion(true);
    assertEquals(WordCountUtil.TOTAL, total);
  }
}