NLineInputFormat.setInputPaths(job, inputDir);
NLineInputFormat.setNumLinesPerSplit(job, 1);
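For context, a minimal driver sketch showing where these two calls typically sit. Class and path names here are hypothetical; with one line per split, each map task processes exactly one input line:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OneLinePerMapperDriver {

  /** Identity-style mapper: echoes each input line keyed by its byte offset. */
  public static class EchoMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    protected void map(LongWritable offset, Text line, Context ctx)
        throws IOException, InterruptedException {
      ctx.write(offset, line);
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "one-line-per-mapper");
    job.setJarByClass(OneLinePerMapperDriver.class);

    // NLineInputFormat still yields <offset, line> records like TextInputFormat,
    // but caps each split at N input lines, so each map task sees exactly N lines.
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setInputPaths(job, new Path(args[0]));
    NLineInputFormat.setNumLinesPerSplit(job, 1); // one line -> one map task

    job.setMapperClass(EchoMapper.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}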
/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see FileInputFormat#getSplits(JobContext)
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  List<InputSplit> splits = new ArrayList<InputSplit>();
  int numLinesPerSplit = getNumLinesPerSplit(job);
  for (FileStatus status : listStatus(job)) {
    splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
  }
  return splits;
}
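For reference, getNumLinesPerSplit(job) above simply resolves N from the job configuration, defaulting to 1 when setNumLinesPerSplit was never called. A sketch of the equivalent lookup, assuming the Hadoop 2.x constant NLineInputFormat.LINES_PER_MAP ("mapreduce.input.lineinputformat.linespermap"):

// Equivalent to getNumLinesPerSplit(job): read N from configuration, default 1.
int numLinesPerSplit = job.getConfiguration().getInt(NLineInputFormat.LINES_PER_MAP, 1);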
job2.setJobName(getClass().getName() + "/" + Utils.getShortClassName(LineRandomizerMapper.class));
job2.setInputFormatClass(NLineInputFormat.class);
NLineInputFormat.addInputPath(job2, fullInputList);
NLineInputFormat.setNumLinesPerSplit(job2, numLinesPerSplit);
job2.setMapperClass(LineRandomizerMapper.class);
job2.setReducerClass(LineRandomizerReducer.class);
public void testFormat() throws Exception {
  Job job = Job.getInstance(conf);
  Path file = new Path(workDir, "test.txt");
  int seed = new Random().nextInt();
  Random random = new Random(seed);
  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);
  int numLinesPerMap = 5;
  NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap);
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 10) + 1) {
    // create a file with length entries
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }
    checkFormat(job, numLinesPerMap);
  }
}
/**
 * Logically splits the set of input files for the job, splits N lines of
 * the input as one split.
 *
 * @see FileInputFormat#getSplits(JobContext)
 */
@Override
public final List<InputSplit> getSplits(JobContext job) throws IOException {
  boolean debug = LOGGER.isDebugEnabled();
  if (debug && FileInputFormat.getInputDirRecursive(job)) {
    LOGGER.debug("Recursive searching for input data is enabled");
  }
  List<InputSplit> splits = new ArrayList<InputSplit>();
  int numLinesPerSplit = NLineInputFormat.getNumLinesPerSplit(job);
  for (FileStatus status : listStatus(job)) {
    if (debug) {
      LOGGER.debug("Determining how to split input file/directory {}", status.getPath());
    }
    splits.addAll(NLineInputFormat.getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
  }
  return splits;
}
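The recursive-search message above only fires when the job has opted in to recursive directory listing. If that behavior is wanted, the flag can be set on the driver side; a minimal sketch, assuming the new mapreduce API on Hadoop 2.x or later and a Job instance named job:

// Opt in to recursive traversal of input directories; without this,
// FileInputFormat.getInputDirRecursive(job) returns false and only
// top-level files under each input path are listed.
FileInputFormat.setInputDirRecursive(job, true);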
/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(JobConf, int)
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
  for (FileStatus status : listStatus(job)) {
    for (org.apache.hadoop.mapreduce.lib.input.FileSplit split :
        org.apache.hadoop.mapreduce.lib.input.NLineInputFormat.getSplitsForFile(status, job, N)) {
      splits.add(new FileSplit(split));
    }
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
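This old-API (org.apache.hadoop.mapred) variant is just a bridge: it delegates the real work to the new-API getSplitsForFile and rewraps each returned split in an old-API FileSplit, with N being a field populated from the job configuration in configure(). A setup sketch for the old API, assuming the Hadoop 2.x property key (Hadoop 1.x used mapred.line.input.format.linespermap):

JobConf conf = new JobConf();
conf.setInputFormat(org.apache.hadoop.mapred.lib.NLineInputFormat.class);
// The old API exposes no setNumLinesPerSplit helper; N is read from configuration.
conf.setInt("mapreduce.input.lineinputformat.linespermap", 3);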
numLines++;
length += num;
if (numLines == numLinesPerSplit) {
  splits.add(createFileSplit(fileName, begin, length));
  begin += length;
  length = 0;
  numLines = 0; // reset the per-split line counter before starting the next split
}

// after the read loop: flush any remaining lines into a final, shorter split
if (numLines != 0) {
  splits.add(createFileSplit(fileName, begin, length));
}
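To see the resulting splits concretely, the public helper getSplitsForFile can be run against a local file. A hedged sketch (the SplitInspector class and the input file are hypothetical; for a 12-line file with N = 5, the loop above emits splits covering lines 1-5 and 6-10, and the trailing flush emits a shorter split with lines 11-12):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class SplitInspector {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem localFs = FileSystem.getLocal(conf);
    FileStatus status = localFs.getFileStatus(new Path(args[0]));
    // Compute splits with numLinesPerSplit = 5 and print each one's byte range.
    List<FileSplit> splits = NLineInputFormat.getSplitsForFile(status, conf, 5);
    for (FileSplit split : splits) {
      System.out.println("start=" + split.getStart() + " length=" + split.getLength());
    }
  }
}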
NLineInputFormat.addInputPath(job, tmp);
NLineInputFormat.setNumLinesPerSplit(job, 1);
NLineInputFormat.addInputPath(ret, mapperInputFilePath);
throw new IllegalStateException("Not same reducers: " + reducers + ", numFiles: " + numFiles);

NLineInputFormat.addInputPath(job, fullInputList);
NLineInputFormat.setNumLinesPerSplit(job, options.fanout);
FileOutputFormat.setOutputPath(job, outputTreeMergeStep);
NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be