public CustomRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index)
    throws IOException {
  path = split.getPath(index);
  fs = path.getFileSystem(context.getConfiguration());
  startOffset = split.getOffset(index);
  endOffset = startOffset + split.getLength(index);
  fileIn = fs.open(path);
  reader = new LineReader(fileIn);
  pos = startOffset;
}
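The three-argument (split, context, index) signature above is exactly what CombineFileRecordReader looks for when it instantiates one reader per file via reflection. A minimal sketch of that wiring, assuming CustomRecordReader extends RecordReader<LongWritable, Text> (the input format class name here is hypothetical):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CustomCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    // CombineFileRecordReader instantiates CustomRecordReader reflectively,
    // passing each path index of the CombineFileSplit in turn.
    return new CombineFileRecordReader<>((CombineFileSplit) split, context,
        CustomRecordReader.class);
  }
}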
/**
 * Set the number of locations in the split to SPLIT_MAX_NUM_LOCATIONS if it is larger than
 * SPLIT_MAX_NUM_LOCATIONS (MAPREDUCE-5186).
 */
private static List<InputSplit> cleanSplits(List<InputSplit> splits) throws IOException {
  // Note: this is a plain string comparison, which misorders versions such as "2.10.0";
  // for those it needlessly falls through to the cleanup path, which is harmless.
  if (VersionInfo.getVersion().compareTo("2.3.0") >= 0) {
    // The issue was fixed in 2.3.0, so on newer versions no cleanup is needed.
    return splits;
  }
  List<InputSplit> cleanedSplits = Lists.newArrayList();
  for (int i = 0; i < splits.size(); i++) {
    CombineFileSplit oldSplit = (CombineFileSplit) splits.get(i);
    String[] locations = oldSplit.getLocations();
    Preconditions.checkNotNull(locations, "CombineFileSplit.getLocations() returned null");
    if (locations.length > SPLIT_MAX_NUM_LOCATIONS) {
      locations = Arrays.copyOf(locations, SPLIT_MAX_NUM_LOCATIONS);
    }
    cleanedSplits.add(new CombineFileSplit(oldSplit.getPaths(), oldSplit.getStartOffsets(),
        oldSplit.getLengths(), locations));
  }
  return cleanedSplits;
}
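A plausible call site for cleanSplits, shown as a hedged sketch: a CombineFileInputFormat subclass that sanitizes whatever the parent produced before the framework serializes the splits. The class name is hypothetical, cleanSplits is assumed to live in (or be visible to) this class, and CustomRecordReader is borrowed from the snippet above:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CleanedCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Trim oversized location arrays on pre-2.3.0 clusters (MAPREDUCE-5186).
    return cleanSplits(super.getSplits(job));
  }

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    return new CombineFileRecordReader<>((CombineFileSplit) split, context,
        CustomRecordReader.class);
  }
}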
Configuration conf = new Configuration(); conf.set("fs.default.name", namenodeURL); FileSystem fs = FileSystem.get(conf); String fullTable = RowInputFormat.getFullyQualifiedTableName(table); String folder = HdfsRegionManager.getRegionFolder(Misc.getRegionPath(fullTable)); RemoteIterator<LocatedFileStatus> fileItr = fs.listFiles(new Path(homeDir + "/" + folder), true); conf.set(RowInputFormat.HOME_DIR, homeDir); conf.set(RowInputFormat.INPUT_TABLE, table); LocatedFileStatus file = fileItr.next(); Path path = file.getPath(); if(!path.getName().endsWith("hop")) { continue; CombineFileSplit split = new CombineFileSplit(new Path[] { path } , new long[] { file.getLen()}); RowRecordReader reader = new RowRecordReader(); reader.initialize(split, context);
String testName = "testMissingBlocks"; try { Configuration conf = new Configuration(); conf.set("fs.hdfs.impl", MissingBlockFileSystem.class.getName()); conf.setBoolean("dfs.replication.considerLoad", false); dfs = new MiniDFSCluster.Builder(conf).racks(rack1).hosts(hosts1) .build(); throw new IOException("Mkdirs failed to create " + inDir.toString()); Path file1 = new Path(dir1 + "/file1"); writeFile(conf, file1, (short)1, 1); Path file5 = new Path(dir5 + "/file5"); writeFile(conf, file5, (short)1, 1); assertEquals(2, fileSplit.getNumPaths()); assertEquals(1, fileSplit.getLocations().length); assertEquals(file1.getName(), fileSplit.getPath(0).getName()); assertEquals(0, fileSplit.getOffset(0)); assertEquals(BLOCKSIZE, fileSplit.getLength(0)); assertEquals(file5.getName(), fileSplit.getPath(1).getName()); assertEquals(0, fileSplit.getOffset(1)); assertEquals(BLOCKSIZE, fileSplit.getLength(1)); assertEquals(hosts1[0], fileSplit.getLocations()[0]);
curReader = null;
if (idx > 0) {
  progress += split.getLength(idx - 1);  // done processing so far
}
// No more paths left in this combined split: tell the caller to stop.
if (idx == split.getNumPaths()) {
  return false;
}
Configuration conf = context.getConfiguration();
conf.set(MRJobConfig.MAP_INPUT_FILE, split.getPath(idx).toString());
conf.setLong(MRJobConfig.MAP_INPUT_START, split.getOffset(idx));
// Despite its name, MRJobConfig.MAP_INPUT_PATH is the map input *length* key,
// so storing the split length here matches upstream Hadoop behavior.
conf.setLong(MRJobConfig.MAP_INPUT_PATH, split.getLength(idx));
/**
 * Test when input files are from non-default file systems.
 */
@Test
public void testForNonDefaultFileSystem() throws Throwable {
  Configuration conf = new Configuration();

  // Use a fake file system scheme as default.
  conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, DUMMY_FS_URI);

  // Default fs path.
  assertEquals(DUMMY_FS_URI, FileSystem.getDefaultUri(conf).toString());
  // Add a local file.
  Path localPath = new Path("testFile1");
  FileSystem lfs = FileSystem.getLocal(conf);
  FSDataOutputStream dos = lfs.create(localPath);
  dos.writeChars("Local file for CFIF");
  dos.close();

  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, lfs.makeQualified(localPath));
  DummyInputFormat inFormat = new DummyInputFormat();
  List<InputSplit> splits = inFormat.getSplits(job);
  assertTrue(splits.size() > 0);
  for (InputSplit s : splits) {
    CombineFileSplit cfs = (CombineFileSplit) s;
    for (Path p : cfs.getPaths()) {
      // Every combined path must resolve against the local, not the default, file system.
      assertEquals("file", p.toUri().getScheme());
    }
  }
}
/**
 * Test when the input file's length is 0.
 */
@Test
public void testForEmptyFile() throws Exception {
  Configuration conf = new Configuration();
  FileSystem fileSys = FileSystem.get(conf);
  Path file = new Path("test" + "/file");
  FSDataOutputStream out = fileSys.create(file, true,
      conf.getInt("io.file.buffer.size", 4096), (short) 1, (long) BLOCKSIZE);
  out.write(new byte[0]);
  out.close();

  // Split it using a CombinedFile input format.
  DummyInputFormat inFormat = new DummyInputFormat();
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, "test");
  List<InputSplit> splits = inFormat.getSplits(job);
  assertEquals(1, splits.size());
  CombineFileSplit fileSplit = (CombineFileSplit) splits.get(0);
  assertEquals(1, fileSplit.getNumPaths());
  assertEquals(file.getName(), fileSplit.getPath(0).getName());
  assertEquals(0, fileSplit.getOffset(0));
  assertEquals(0, fileSplit.getLength(0));

  fileSys.delete(file.getParent(), true);
}
public void initialize(InputSplit split, Configuration conf) throws IOException {
  this.conf = conf;
  String datasetName = conf.get("dataset");
  if (datasetName == null)
    throw new RuntimeException("Dataset name should be provided");
  if (split instanceof CombineFileSplit) {
    // Decompose the combined split into one FileSplit per underlying file.
    CombineFileSplit csplits = (CombineFileSplit) split;
    splits = new Vector<FileSplit>(csplits.getNumPaths());
    for (int i = 0; i < csplits.getNumPaths(); i++) {
      FileSplit fsplit = new FileSplit(csplits.getPath(i), csplits.getOffset(i),
          csplits.getLength(i), csplits.getLocations());
      splits.add(fsplit);
    }
  }
  // ...
  fs = inFile.getFileSystem(conf);
  if (fs instanceof HTTPFileSystem) {
    // HTTP-backed files are first copied locally, then deleted when the reader is done.
    inFile = new Path(FileUtil.copyFile(conf, inFile));
    fs = FileSystem.getLocal(conf);
    this.deleteOnEnd = true;
  }
  // ... if the named dataset cannot be located in the file:
  //   throw new RuntimeException("Dataset '" + datasetName + "' not found in file: " + inFile.getName());
  boolean fillValueFound = false;
  int resolution = 0;
  // ... (fill-value lookup elided; the guard below is reconstructed from this excerpt)
  if (fillValueFound) {
    skipFillValue = false;
  } else {
    skipFillValue = conf.getBoolean("skipfill", true);
  }
}
MiniDFSCluster dfs = null;
try {
  Configuration conf = new Configuration();
  dfs = new MiniDFSCluster.Builder(conf).racks(rack1).hosts(hosts1).build();
  Path dir1 = new Path("/dir1");
  Path file = new Path("/dir1/file1");
  Path dir2 = new Path("/dir1/dir2");
  if (!fileSys.mkdirs(dir1)) {
    throw new IOException("Mkdirs failed to create " + dir1.toString());
  }
  // ... (file creation and split generation elided)
  assertEquals(1, fileSplit.getNumPaths());
  assertEquals(file.getName(), fileSplit.getPath(0).getName());
  assertEquals(0, fileSplit.getOffset(0));
  assertEquals(0, fileSplit.getLength(0));
} finally {
  if (dfs != null) {
    dfs.shutdown();
  }
}
conf.set(GFInputFormat.INPUT_REGION, getName());
conf.set(GFInputFormat.HOME_DIR, testDataDir.getName());
conf.setBoolean(GFInputFormat.CHECKPOINT, false);
// ...
for (InputSplit inputSplit : splits) {
  CombineFileSplit split = (CombineFileSplit) inputSplit;
  assertEquals(1, split.getPaths().length);
  // Splits must be contiguous: each one starts where the previous one ended.
  assertEquals(lastBytePositionOfPrevious, split.getOffset(0));
  lastBytePositionOfPrevious += split.getLength();
  assertEquals(1, split.getLocations().length);
}
Path bucketPath = new Path(regionPath, "1");
Path hopPath = new Path(bucketPath,
    bucket1.getSortedOplogs().iterator().next().get().getFileName());
FileStatus status = hdfsStore.getFileSystem().getFileStatus(hopPath);
Configuration conf = new Configuration();
TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

Path[] files = { new Path("file1"), new Path("file2") };
long[] lengths = { 1, 1 };
CombineFileSplit split = new CombineFileSplit(files, lengths);

RecordReader rr = inputFormat.createRecordReader(split, context);
assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);
File dir = new File(outDir.toString());
dir.mkdir();
// Create the test files; file i is i bytes long (so the first file is empty).
for (int i = 0; i < paths.length; i++) {
  files[i] = new File(dir, "testfile" + i);
  // ... (write i bytes via fileWriter, elided)
  fileWriter.close();
  fileLength[i] = i;
  paths[i] = new Path(outDir + "/testfile" + i);
}
CombineFileSplit combineFileSplit = new CombineFileSplit(paths, fileLength);
TaskAttemptID taskAttemptID = Mockito.mock(TaskAttemptID.class);
TaskReporter reporter = Mockito.mock(TaskReporter.class);
// ... (reader construction and iteration elided)
  verify(reporter, times(3)).progress();
} finally {
  FileUtil.fullyDelete(new File(outDir.toString()));
}
/**
 * Implementation detail: This constructor is built to be called via
 * reflection from within CombineFileRecordReader.
 *
 * @param fileSplit The CombineFileSplit that this will read from.
 * @param context The context for this task.
 * @param pathToProcess The path index from the CombineFileSplit to process in this record.
 */
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context,
    Integer pathToProcess) {
  mProcessed = false;
  mFileToRead = fileSplit.getPath(pathToProcess);
  mFileLength = fileSplit.getLength(pathToProcess);
  mConf = context.getConfiguration();

  assert 0 == fileSplit.getOffset(pathToProcess);
  if (LOG.isDebugEnabled()) {
    LOG.debug("FileToRead is: " + mFileToRead.toString());
    LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());

    try {
      final FileSystem fs = mFileToRead.getFileSystem(mConf);
      assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
    } catch (IOException ioe) {
      // oh well, I was just testing.
    }
  }

  mFileName = new Text();
  mFileText = new Text();
}
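For context, a hedged sketch of the nextKeyValue() that typically pairs with this constructor, using the fields initialized above: it emits a single (file name, file contents) record on the first call and reports end-of-input on the second. The original body is not shown in this snippet, so the details here are assumptions:

@Override
public boolean nextKeyValue() throws IOException {
  if (mProcessed) {
    return false;  // the single whole-file record was already emitted
  }
  byte[] contents = new byte[(int) mFileLength];
  FileSystem fs = mFileToRead.getFileSystem(mConf);
  FSDataInputStream in = null;
  try {
    in = fs.open(mFileToRead);
    // org.apache.hadoop.io.IOUtils: read the entire file into the buffer.
    IOUtils.readFully(in, contents, 0, contents.length);
  } finally {
    IOUtils.closeStream(in);
  }
  mFileName.set(mFileToRead.toString());
  mFileText.set(contents, 0, contents.length);
  mProcessed = true;
  return true;
}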
public ParserPump(CombineFileSplit split, TaskAttemptContext context) {
  this.context = context;
  this.paths = split.getPaths();
  this.sizes = split.getLengths();
  this.offsets = split.getStartOffsets();
  this.size = split.getLength();
  Configuration conf = context.getConfiguration();
  this.skipInvalid = conf.getBoolean(SKIP_INVALID_PROPERTY, false);
  this.verifyDataTypeValues = conf.getBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, false);
  this.overrideRdfContext = conf.getBoolean(OVERRIDE_CONTEXT_PROPERTY, false);
  this.defaultRdfContextPattern = conf.get(DEFAULT_CONTEXT_PROPERTY);
  this.maxSize = MAX_SINGLE_FILE_MULTIPLIER
      * conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
}
protected CombineFileRecordReaderWrapper(FileInputFormat<K,V> inputFormat,
    CombineFileSplit split, TaskAttemptContext context, Integer idx)
    throws IOException, InterruptedException {
  fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx),
      split.getLength(idx), split.getLocations());
  delegate = inputFormat.createRecordReader(fileSplit, context);
}
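This wrapper is the standard adapter between a plain FileInputFormat and combined splits; Hadoop's CombineTextInputFormat follows the same pattern internally. A minimal sketch of a concrete use, with illustrative class names:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReaderWrapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CombinedTextInputFormat extends CombineFileInputFormat<LongWritable, Text> {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    return new CombineFileRecordReader<>((CombineFileSplit) split, context, TextWrapper.class);
  }

  // Instantiated reflectively by CombineFileRecordReader with (split, context, idx);
  // each instance delegates one file of the combined split to a line record reader.
  public static class TextWrapper extends CombineFileRecordReaderWrapper<LongWritable, Text> {
    public TextWrapper(CombineFileSplit split, TaskAttemptContext context, Integer idx)
        throws IOException, InterruptedException {
      super(new TextInputFormat(), split, context, idx);
    }
  }
}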
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  CombineFileSplit cSplit = (CombineFileSplit) split;
  Path[] path = cSplit.getPaths();
  long[] start = cSplit.getStartOffsets();
  long[] len = cSplit.getLengths();
  Configuration conf = context.getConfiguration();
  FileSystem fs = cSplit.getPath(0).getFileSystem(conf);
  this.splitIterator = HDFSSplitIterator.newInstance(fs, path, start, len, 0L, 0L);
}
conf.set(GFInputFormat.INPUT_REGION, getName());
conf.set(GFInputFormat.HOME_DIR, testDataDir.getName());
conf.setBoolean(GFInputFormat.CHECKPOINT, false);
// ...
assertEquals(bucketCount, split.getNumPaths());
BasicPartitionedFileSetInputContext(MultiInputTaggedSplit multiInputTaggedSplit) {
  super(multiInputTaggedSplit.getName());
  InputSplit inputSplit = multiInputTaggedSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
    isCombineInputFormat = false;
    Path path = ((FileSplit) inputSplit).getPath();
    inputPaths = new Path[] { path };
  } else if (inputSplit instanceof CombineFileSplit) {
    isCombineInputFormat = true;
    inputPaths = ((CombineFileSplit) inputSplit).getPaths();
  } else {
    throw new IllegalArgumentException(String.format("Expected either a '%s' or a '%s', but got '%s'.",
        FileSplit.class.getName(), CombineFileSplit.class.getName(),
        inputSplit.getClass().getName()));
  }
  this.conf = multiInputTaggedSplit.getConf();
  String mappingString = conf.get(PartitionedFileSetDataset.PATH_TO_PARTITIONING_MAPPING);
  this.pathToPartitionMapping =
      GSON.fromJson(Objects.requireNonNull(mappingString), STRING_PARTITION_KEY_MAP_TYPE);
}