InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf,
    state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY,
        HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
if (fileSplits == null || fileSplits.length == 0) {
@Override
public InputSplit[] getSplits(JobConf conf, int splits) throws IOException {
  // Cache the result keyed on the JobConf instance so repeated calls with the
  // same conf object skip recomputing the splits.
  if (conf == lastConf) {
    return lastResult;
  }
  lastConf = conf;
  lastResult = super.getSplits(conf, splits);
  return lastResult;
}
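Note that the cache is keyed on reference equality of the JobConf: if the caller mutates the same conf object (for example, changes the input paths) and calls getSplits again, the memoized lastResult is still returned, so callers that need a fresh computation have to pass a new JobConf instance.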
/**
 * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(org.apache.hadoop.mapred.JobConf,
 *      int)
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  InputSplit[] splits = super.getSplits(job, numSplits);
  log.debug("Returning {} splits", splits.length);
  return splits;
}
private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath,
    int numSplits) throws ClassNotFoundException, IOException {
  // Create a new instance of the input format
  FileInputFormat inputFormat =
      (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass), conf);
  // Set the input path for the left data set
  inputFormat.setInputPaths(conf, inputPath);
  // Get the left input splits
  return inputFormat.getSplits(conf, numSplits);
}
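A minimal sketch of how a helper like this might be invoked from the same class, assuming the stock org.apache.hadoop.mapred.TextInputFormat is on the classpath; the path and split count are illustrative only:

// Hypothetical caller: resolve the input format by class name and ask for roughly four splits.
JobConf conf = new JobConf();
InputSplit[] splits =
    getInputSplits(conf, "org.apache.hadoop.mapred.TextInputFormat", "/data/input", 4);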
@Override
public org.apache.hadoop.mapred.InputSplit[] getSplits( // MRv1
    org.apache.hadoop.mapred.JobConf arg0, int arg1) throws IOException {
  ifmtMrv1 = (ifmtMrv1 != null) ? ifmtMrv1
      : new org.apache.hadoop.mapred.FileInputFormat<LongWritable, Text>() {
        // Dummy method to satisfy interface but not meant to be called
        public org.apache.hadoop.mapred.RecordReader<LongWritable, Text> getRecordReader(
            org.apache.hadoop.mapred.InputSplit ign0,
            org.apache.hadoop.mapred.JobConf ign1,
            org.apache.hadoop.mapred.Reporter ign2) throws IOException {
          throw new UnsupportedOperationException("not meant to be called");
        }
      };
  return ifmtMrv1.getSplits(arg0, arg1);
}
@Override
public org.apache.hadoop.mapred.InputSplit[] getSplits( // MRv1
    org.apache.hadoop.mapred.JobConf arg0, int arg1) throws IOException {
  baseIfmt = (baseIfmt != null) ? baseIfmt
      : new org.apache.hadoop.mapred.FileInputFormat<LongWritable, Text>() {
        // Dummy method to satisfy interface but not meant to be called
        public org.apache.hadoop.mapred.RecordReader<LongWritable, Text> getRecordReader(
            org.apache.hadoop.mapred.InputSplit ign0,
            org.apache.hadoop.mapred.JobConf ign1,
            org.apache.hadoop.mapred.Reporter ign2) throws IOException {
          throw new UnsupportedOperationException("not meant to be called");
        }
      };
  return baseIfmt.getSplits(arg0, arg1);
}
}
@Override
public org.apache.hadoop.mapred.InputSplit[] getSplits( // MRv1
    org.apache.hadoop.mapred.JobConf arg0, int arg1) throws IOException {
  ifmtMrv1 = (ifmtMrv1 != null) ? ifmtMrv1
      : new org.apache.hadoop.mapred.FileInputFormat<LongWritable, Text>() {
        @Override
        public boolean isSplitable(FileSystem fs, Path filename) {
          return false;
        }

        // Dummy method to satisfy interface but not meant to be called
        public org.apache.hadoop.mapred.RecordReader<LongWritable, Text> getRecordReader(
            org.apache.hadoop.mapred.InputSplit ign0,
            org.apache.hadoop.mapred.JobConf ign1,
            org.apache.hadoop.mapred.Reporter ign2) throws IOException {
          throw new UnsupportedOperationException("not meant to be called");
        }
      };
  return ifmtMrv1.getSplits(arg0, arg1);
}
/**
 * {@inheritDoc}
 *
 * More exactly, the returned array type is CombineFileSplit[]. See
 * {@link #readEntries(JsonReader, JobConf)}.
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplitsHint) throws IOException {
  List<InputSplit> splits = getSplitsFromManifest(job);
  if (splits == null) {
    /*
     * In the case of no manifest file, we fall back to the built-in
     * FileInputFormat.getSplits() to generate one split for each S3
     * file. Note that our record reader doesn't support byte offsets
     * into S3 files, so we need to override isSplitable(FileSystem,
     * Path) to always return false.
     */
    return super.getSplits(job, numSplitsHint);
  }
  log.info("The actual number of generated splits: " + splits.size());
  return splits.toArray(new InputSplit[splits.size()]);
}
@SuppressWarnings("unchecked") private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter) throws IOException { LOG.info(format("Opening '%s'", inputPath)); Class<? extends FileInputFormat<?, ?>> cls = (Class<? extends FileInputFormat<?, ?>>) inFormatClsList.get(idx); try { FileInputFormat.setInputPaths(job, inputPath); FileInputFormat<?, ?> instance = cls.newInstance(); if (instance instanceof JobConfigurable) { ((JobConfigurable) instance).configure(job); } InputSplit[] splits = instance.getSplits(job, 1); if (1 != splits.length) { throw new IllegalArgumentException("Could not get input splits: " + inputPath); } return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter); } catch (RuntimeException e) { throw e; } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } }
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    // With task-side metadata, defer to the default file-based split computation.
    return super.getSplits(job, numSplits);
  }
  List<Footer> footers = getFooters(job);
  List<ParquetInputSplit> splits = realInputFormat.getSplits(job, footers);
  if (splits == null) {
    return null;
  }
  // Wrap each ParquetInputSplit so it can be returned through the MRv1 InputSplit API.
  InputSplit[] resultSplits = new InputSplit[splits.size()];
  int i = 0;
  for (ParquetInputSplit split : splits) {
    resultSplits[i++] = new ParquetInputSplitWrapper(split);
  }
  return resultSplits;
}
// Compute the splits for this input file in isolation, using a per-file copy of the job conf.
JobConf temp = new JobConf(job);
setInputPaths(temp, inputFiles[i_file]);
inputSplits[i_file] = super.getSplits(temp, 1);
@SuppressWarnings("rawtypes") public void testLastInputSplitAtSplitBoundary() throws Exception { FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024, 128l * 1024 * 1024); JobConf job = new JobConf(); InputSplit[] splits = fif.getSplits(job, 8); assertEquals(8, splits.length); for (int i = 0; i < splits.length; i++) { InputSplit split = splits[i]; assertEquals(("host" + i), split.getLocations()[0]); } }
@SuppressWarnings("rawtypes") public void testLastInputSplitExceedingSplitBoundary() throws Exception { FileInputFormat fif = new FileInputFormatForTest(1027l * 1024 * 1024, 128l * 1024 * 1024); JobConf job = new JobConf(); InputSplit[] splits = fif.getSplits(job, 8); assertEquals(8, splits.length); for (int i = 0; i < splits.length; i++) { InputSplit split = splits[i]; assertEquals(("host" + i), split.getLocations()[0]); } }
@SuppressWarnings("rawtypes") public void testLastInputSplitSingleSplit() throws Exception { FileInputFormat fif = new FileInputFormatForTest(100l * 1024 * 1024, 128l * 1024 * 1024); JobConf job = new JobConf(); InputSplit[] splits = fif.getSplits(job, 1); assertEquals(1, splits.length); for (int i = 0; i < splits.length; i++) { InputSplit split = splits[i]; assertEquals(("host" + i), split.getLocations()[0]); } }