InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf,
    state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY,
        HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
if (fileSplits == null || fileSplits.length == 0) {
@Override
public InputSplit[] getSplits(JobConf conf, int splits) throws IOException {
  // Cache the result keyed on the JobConf instance so repeated calls with the
  // same conf object skip recomputing the splits.
  if (conf == lastConf) {
    return lastResult;
  }
  lastConf = conf;
  lastResult = super.getSplits(conf, splits);
  return lastResult;
}
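Note that the cache is keyed on reference equality of the JobConf: if the caller mutates the same conf object (for example, changes the input paths) and calls getSplits again, the memoized lastResult is still returned, so callers that need a fresh computation have to pass a new JobConf instance.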
/**
 * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(org.apache.hadoop.mapred.JobConf,
 *      int)
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  InputSplit[] splits = super.getSplits(job, numSplits);
  log.debug("Returning {} splits", splits.length);
  return splits;
}
private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath,
    int numSplits) throws ClassNotFoundException, IOException {
  // Create a new instance of the input format
  FileInputFormat inputFormat =
      (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass), conf);
  // Set the input path for the left data set
  inputFormat.setInputPaths(conf, inputPath);
  // Get the left input splits
  return inputFormat.getSplits(conf, numSplits);
}
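A minimal sketch of how a helper like this might be invoked from the same class, assuming the stock org.apache.hadoop.mapred.TextInputFormat is on the classpath; the path and split count are illustrative only:

// Hypothetical caller: resolve the input format by class name and ask for roughly four splits.
JobConf conf = new JobConf();
InputSplit[] splits =
    getInputSplits(conf, "org.apache.hadoop.mapred.TextInputFormat", "/data/input", 4);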
@Override
public org.apache.hadoop.mapred.InputSplit[] getSplits( // MRv1
    org.apache.hadoop.mapred.JobConf arg0, int arg1) throws IOException {
  ifmtMrv1 = (ifmtMrv1 != null) ? ifmtMrv1
      : new org.apache.hadoop.mapred.FileInputFormat<LongWritable, Text>() {
        // Dummy method to satisfy interface but not meant to be called
        public org.apache.hadoop.mapred.RecordReader<LongWritable, Text> getRecordReader(
            org.apache.hadoop.mapred.InputSplit ign0,
            org.apache.hadoop.mapred.JobConf ign1,
            org.apache.hadoop.mapred.Reporter ign2) throws IOException {
          throw new UnsupportedOperationException("not meant to be called");
        }
      };
  return ifmtMrv1.getSplits(arg0, arg1);
}
@Override
public org.apache.hadoop.mapred.InputSplit[] getSplits( // MRv1
    org.apache.hadoop.mapred.JobConf arg0, int arg1) throws IOException {
  baseIfmt = (baseIfmt != null) ? baseIfmt
      : new org.apache.hadoop.mapred.FileInputFormat<LongWritable, Text>() {
        // Dummy method to satisfy interface but not meant to be called
        public org.apache.hadoop.mapred.RecordReader<LongWritable, Text> getRecordReader(
            org.apache.hadoop.mapred.InputSplit ign0,
            org.apache.hadoop.mapred.JobConf ign1,
            org.apache.hadoop.mapred.Reporter ign2) throws IOException {
          throw new UnsupportedOperationException("not meant to be called");
        }
      };
  return baseIfmt.getSplits(arg0, arg1);
}
}
@Override
public org.apache.hadoop.mapred.InputSplit[] getSplits( // MRv1
    org.apache.hadoop.mapred.JobConf arg0, int arg1) throws IOException {
  ifmtMrv1 = (ifmtMrv1 != null) ? ifmtMrv1
      : new org.apache.hadoop.mapred.FileInputFormat<LongWritable, Text>() {
        @Override
        public boolean isSplitable(FileSystem fs, Path filename) {
          return false;
        }

        // Dummy method to satisfy interface but not meant to be called
        public org.apache.hadoop.mapred.RecordReader<LongWritable, Text> getRecordReader(
            org.apache.hadoop.mapred.InputSplit ign0,
            org.apache.hadoop.mapred.JobConf ign1,
            org.apache.hadoop.mapred.Reporter ign2) throws IOException {
          throw new UnsupportedOperationException("not meant to be called");
        }
      };
  return ifmtMrv1.getSplits(arg0, arg1);
}
/**
 * {@inheritDoc}
 *
 * More exactly, the returned array type is CombineFileSplit[]. See
 * {@link #readEntries(JsonReader, JobConf)}.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplitsHint) throws IOException {
  List<InputSplit> splits = getSplitsFromManifest(job);
  if (splits == null) {
    /*
     * In the case of no manifest file, we fall back to the built-in
     * FileInputFormat.getSplits() to generate one split for each S3
     * file. Note that our record reader doesn't support byte offsets
     * into S3 files, so we need to override isSplitable(FileSystem,
     * Path) to always return false.
     */
    return super.getSplits(job, numSplitsHint);
  }
  log.info("The actual number of generated splits: " + splits.size());
  return splits.toArray(new InputSplit[splits.size()]);
}
@SuppressWarnings("unchecked") private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter) throws IOException { LOG.info(format("Opening '%s'", inputPath)); Class<? extends FileInputFormat<?, ?>> cls = (Class<? extends FileInputFormat<?, ?>>) inFormatClsList.get(idx); try { FileInputFormat.setInputPaths(job, inputPath); FileInputFormat<?, ?> instance = cls.newInstance(); if (instance instanceof JobConfigurable) { ((JobConfigurable) instance).configure(job); } InputSplit[] splits = instance.getSplits(job, 1); if (1 != splits.length) { throw new IllegalArgumentException("Could not get input splits: " + inputPath); } return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter); } catch (RuntimeException e) { throw e; } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } }
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    // With task-side metadata, defer to the default file-based split computation.
    return super.getSplits(job, numSplits);
  }
  List<Footer> footers = getFooters(job);
  List<ParquetInputSplit> splits = realInputFormat.getSplits(job, footers);
  if (splits == null) {
    return null;
  }
  // Wrap each ParquetInputSplit so it can be returned through the MRv1 InputSplit API.
  InputSplit[] resultSplits = new InputSplit[splits.size()];
  int i = 0;
  for (ParquetInputSplit split : splits) {
    resultSplits[i++] = new ParquetInputSplitWrapper(split);
  }
  return resultSplits;
}
// Compute the splits for this input file in isolation, using a per-file copy of the job conf.
JobConf temp = new JobConf(job);
setInputPaths(temp, inputFiles[i_file]);
inputSplits[i_file] = super.getSplits(temp, 1);
@SuppressWarnings("rawtypes") public void testLastInputSplitAtSplitBoundary() throws Exception { FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024, 128l * 1024 * 1024); JobConf job = new JobConf(); InputSplit[] splits = fif.getSplits(job, 8); assertEquals(8, splits.length); for (int i = 0; i < splits.length; i++) { InputSplit split = splits[i]; assertEquals(("host" + i), split.getLocations()[0]); } }
@SuppressWarnings("rawtypes") public void testLastInputSplitExceedingSplitBoundary() throws Exception { FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024, 128l * 1024 * 1024); JobConf job = new JobConf(); InputSplit[] splits = fif.getSplits(job, 8); assertEquals(8, splits.length); for (int i = 0; i < splits.length; i++) { InputSplit split = splits[i]; assertEquals(("host" + i), split.getLocations()[0]); } }
@SuppressWarnings("rawtypes") public void testLastInputSplitSingleSplit() throws Exception { FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024, 128l * 1024 * 1024); JobConf job = new JobConf(); InputSplit[] splits = fif.getSplits(job, 1); assertEquals(1, splits.length); for (int i = 0; i < splits.length; i++) { InputSplit split = splits[i]; assertEquals(("host" + i), split.getLocations()[0]); } }