NLineInputFormat.setInputPaths(job, inputDir);
NLineInputFormat.setNumLinesPerSplit(job, 1);
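For context, a minimal driver sketch showing where these two calls typically sit. Class and path names here are hypothetical; with one line per split, each map task processes exactly one input line:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class OneLinePerMapperDriver {

  /** Identity-style mapper: echoes each input line keyed by its byte offset. */
  public static class EchoMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    protected void map(LongWritable offset, Text line, Context ctx)
        throws IOException, InterruptedException {
      ctx.write(offset, line);
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "one-line-per-mapper");
    job.setJarByClass(OneLinePerMapperDriver.class);

    // NLineInputFormat still yields <offset, line> records like TextInputFormat,
    // but caps each split at N input lines, so each map task sees exactly N lines.
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.setInputPaths(job, new Path(args[0]));
    NLineInputFormat.setNumLinesPerSplit(job, 1); // one line -> one map task

    job.setMapperClass(EchoMapper.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}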
/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see FileInputFormat#getSplits(JobContext)
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
  List<InputSplit> splits = new ArrayList<InputSplit>();
  int numLinesPerSplit = getNumLinesPerSplit(job);
  for (FileStatus status : listStatus(job)) {
    splits.addAll(getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
  }
  return splits;
}
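For reference, getNumLinesPerSplit(job) above simply resolves N from the job configuration, defaulting to 1 when setNumLinesPerSplit was never called. A sketch of the equivalent lookup, assuming the Hadoop 2.x constant NLineInputFormat.LINES_PER_MAP ("mapreduce.input.lineinputformat.linespermap"):

// Equivalent to getNumLinesPerSplit(job): read N from configuration, default 1.
int numLinesPerSplit = job.getConfiguration().getInt(NLineInputFormat.LINES_PER_MAP, 1);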
job2.setJobName(getClass().getName() + "/" + Utils.getShortClassName(LineRandomizerMapper.class));
job2.setInputFormatClass(NLineInputFormat.class);
NLineInputFormat.addInputPath(job2, fullInputList);
NLineInputFormat.setNumLinesPerSplit(job2, numLinesPerSplit);
job2.setMapperClass(LineRandomizerMapper.class);
job2.setReducerClass(LineRandomizerReducer.class);
public void testFormat() throws Exception {
  Job job = Job.getInstance(conf);
  Path file = new Path(workDir, "test.txt");
  int seed = new Random().nextInt();
  Random random = new Random(seed);
  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);
  int numLinesPerMap = 5;
  NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap);
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 10) + 1) {
    // create a file with length entries
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }
    checkFormat(job, numLinesPerMap);
  }
}
/**
 * Logically splits the set of input files for the job, splits N lines of
 * the input as one split.
 *
 * @see FileInputFormat#getSplits(JobContext)
 */
@Override
public final List<InputSplit> getSplits(JobContext job) throws IOException {
  boolean debug = LOGGER.isDebugEnabled();
  if (debug && FileInputFormat.getInputDirRecursive(job)) {
    LOGGER.debug("Recursive searching for input data is enabled");
  }
  List<InputSplit> splits = new ArrayList<InputSplit>();
  int numLinesPerSplit = NLineInputFormat.getNumLinesPerSplit(job);
  for (FileStatus status : listStatus(job)) {
    if (debug) {
      LOGGER.debug("Determining how to split input file/directory {}", status.getPath());
    }
    splits.addAll(NLineInputFormat.getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
  }
  return splits;
}
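The recursive-search message above only fires when the job has opted in to recursive directory listing. If that behavior is wanted, the flag can be set on the driver side; a minimal sketch, assuming the new mapreduce API on Hadoop 2.x or later and a Job instance named job:

// Opt in to recursive traversal of input directories; without this,
// FileInputFormat.getInputDirRecursive(job) returns false and only
// top-level files under each input path are listed.
FileInputFormat.setInputDirRecursive(job, true);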
/**
 * Logically splits the set of input files for the job, splits N lines
 * of the input as one split.
 *
 * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(JobConf, int)
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
  for (FileStatus status : listStatus(job)) {
    for (org.apache.hadoop.mapreduce.lib.input.FileSplit split :
        org.apache.hadoop.mapreduce.lib.input.NLineInputFormat.getSplitsForFile(status, job, N)) {
      splits.add(new FileSplit(split));
    }
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
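This old-API (org.apache.hadoop.mapred) variant is just a bridge: it delegates the real work to the new-API getSplitsForFile and rewraps each returned split in an old-API FileSplit, with N being a field populated from the job configuration in configure(). A setup sketch for the old API, assuming the Hadoop 2.x property key (Hadoop 1.x used mapred.line.input.format.linespermap):

JobConf conf = new JobConf();
conf.setInputFormat(org.apache.hadoop.mapred.lib.NLineInputFormat.class);
// The old API exposes no setNumLinesPerSplit helper; N is read from configuration.
conf.setInt("mapreduce.input.lineinputformat.linespermap", 3);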
numLines++;
length += num;
if (numLines == numLinesPerSplit) {
  splits.add(createFileSplit(fileName, begin, length));
  begin += length;
  length = 0;
  numLines = 0; // reset the per-split line counter before starting the next split
}

// after the read loop: flush any remaining lines into a final, shorter split
if (numLines != 0) {
  splits.add(createFileSplit(fileName, begin, length));
}
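To see the resulting splits concretely, the public helper getSplitsForFile can be run against a local file. A hedged sketch (the SplitInspector class and the input file are hypothetical; for a 12-line file with N = 5, the loop above emits splits covering lines 1-5 and 6-10, and the trailing flush emits a shorter split with lines 11-12):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class SplitInspector {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem localFs = FileSystem.getLocal(conf);
    FileStatus status = localFs.getFileStatus(new Path(args[0]));
    // Compute splits with numLinesPerSplit = 5 and print each one's byte range.
    List<FileSplit> splits = NLineInputFormat.getSplitsForFile(status, conf, 5);
    for (FileSplit split : splits) {
      System.out.println("start=" + split.getStart() + " length=" + split.getLength());
    }
  }
}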
NLineInputFormat.addInputPath(job, tmp);
NLineInputFormat.setNumLinesPerSplit(job, 1);
NLineInputFormat.addInputPath(ret, mapperInputFilePath);
throw new IllegalStateException("Not same reducers: " + reducers + ", numFiles: " + numFiles);

NLineInputFormat.addInputPath(job, fullInputList);
NLineInputFormat.setNumLinesPerSplit(job, options.fanout);
FileOutputFormat.setOutputPath(job, outputTreeMergeStep);
NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be