public CustomRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index)
    throws IOException {
  path = split.getPath(index);
  fs = path.getFileSystem(context.getConfiguration());
  startOffset = split.getOffset(index);
  endOffset = startOffset + split.getLength(index);
  fileIn = fs.open(path);
  reader = new LineReader(fileIn);
  pos = startOffset;
}
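The three-argument (split, context, index) signature above is exactly what CombineFileRecordReader looks for when it instantiates one reader per file via reflection. A minimal sketch of that wiring, assuming CustomRecordReader extends RecordReader<LongWritable, Text> (the input format class name here is hypothetical):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CustomCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    // CombineFileRecordReader instantiates CustomRecordReader reflectively,
    // passing each path index of the CombineFileSplit in turn.
    return new CombineFileRecordReader<>((CombineFileSplit) split, context,
        CustomRecordReader.class);
  }
}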
/**
 * Set the number of locations in the split to SPLIT_MAX_NUM_LOCATIONS if it is larger than
 * SPLIT_MAX_NUM_LOCATIONS (MAPREDUCE-5186).
 */
private static List<InputSplit> cleanSplits(List<InputSplit> splits) throws IOException {
  // Note: this is a plain string comparison, which misorders versions such as "2.10.0";
  // for those it needlessly falls through to the cleanup path, which is harmless.
  if (VersionInfo.getVersion().compareTo("2.3.0") >= 0) {
    // The issue was fixed in 2.3.0, so on newer versions no cleanup is needed.
    return splits;
  }
  List<InputSplit> cleanedSplits = Lists.newArrayList();
  for (int i = 0; i < splits.size(); i++) {
    CombineFileSplit oldSplit = (CombineFileSplit) splits.get(i);
    String[] locations = oldSplit.getLocations();
    Preconditions.checkNotNull(locations, "CombineFileSplit.getLocations() returned null");
    if (locations.length > SPLIT_MAX_NUM_LOCATIONS) {
      locations = Arrays.copyOf(locations, SPLIT_MAX_NUM_LOCATIONS);
    }
    cleanedSplits.add(new CombineFileSplit(oldSplit.getPaths(), oldSplit.getStartOffsets(),
        oldSplit.getLengths(), locations));
  }
  return cleanedSplits;
}
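A plausible call site for cleanSplits, shown as a hedged sketch: a CombineFileInputFormat subclass that sanitizes whatever the parent produced before the framework serializes the splits. The class name is hypothetical, cleanSplits is assumed to live in (or be visible to) this class, and CustomRecordReader is borrowed from the snippet above:

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

public class CleanedCombineInputFormat extends CombineFileInputFormat<LongWritable, Text> {
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Trim oversized location arrays on pre-2.3.0 clusters (MAPREDUCE-5186).
    return cleanSplits(super.getSplits(job));
  }

  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    return new CombineFileRecordReader<>((CombineFileSplit) split, context,
        CustomRecordReader.class);
  }
}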
Configuration conf = new Configuration(); conf.set("fs.default.name", namenodeURL); FileSystem fs = FileSystem.get(conf); String fullTable = RowInputFormat.getFullyQualifiedTableName(table); String folder = HdfsRegionManager.getRegionFolder(Misc.getRegionPath(fullTable)); RemoteIterator<LocatedFileStatus> fileItr = fs.listFiles(new Path(homeDir + "/" + folder), true); conf.set(RowInputFormat.HOME_DIR, homeDir); conf.set(RowInputFormat.INPUT_TABLE, table); LocatedFileStatus file = fileItr.next(); Path path = file.getPath(); if(!path.getName().endsWith("hop")) { continue; CombineFileSplit split = new CombineFileSplit(new Path[] { path } , new long[] { file.getLen()}); RowRecordReader reader = new RowRecordReader(); reader.initialize(split, context);
String testName = "testMissingBlocks"; try { Configuration conf = new Configuration(); conf.set("fs.hdfs.impl", MissingBlockFileSystem.class.getName()); conf.setBoolean("dfs.replication.considerLoad", false); dfs = new MiniDFSCluster.Builder(conf).racks(rack1).hosts(hosts1) .build(); throw new IOException("Mkdirs failed to create " + inDir.toString()); Path file1 = new Path(dir1 + "/file1"); writeFile(conf, file1, (short)1, 1); Path file5 = new Path(dir5 + "/file5"); writeFile(conf, file5, (short)1, 1); assertEquals(2, fileSplit.getNumPaths()); assertEquals(1, fileSplit.getLocations().length); assertEquals(file1.getName(), fileSplit.getPath(0).getName()); assertEquals(0, fileSplit.getOffset(0)); assertEquals(BLOCKSIZE, fileSplit.getLength(0)); assertEquals(file5.getName(), fileSplit.getPath(1).getName()); assertEquals(0, fileSplit.getOffset(1)); assertEquals(BLOCKSIZE, fileSplit.getLength(1)); assertEquals(hosts1[0], fileSplit.getLocations()[0]);
curReader = null;
if (idx > 0) {
  progress += split.getLength(idx - 1);  // done processing so far
}
// No more paths left in this combined split: tell the caller to stop.
if (idx == split.getNumPaths()) {
  return false;
}
Configuration conf = context.getConfiguration();
conf.set(MRJobConfig.MAP_INPUT_FILE, split.getPath(idx).toString());
conf.setLong(MRJobConfig.MAP_INPUT_START, split.getOffset(idx));
// Despite its name, MRJobConfig.MAP_INPUT_PATH is the map input *length* key,
// so storing the split length here matches upstream Hadoop behavior.
conf.setLong(MRJobConfig.MAP_INPUT_PATH, split.getLength(idx));
/**
 * Test when input files are from non-default file systems.
 */
@Test
public void testForNonDefaultFileSystem() throws Throwable {
  Configuration conf = new Configuration();

  // Use a fake file system scheme as default.
  conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, DUMMY_FS_URI);

  // Default fs path.
  assertEquals(DUMMY_FS_URI, FileSystem.getDefaultUri(conf).toString());
  // Add a local file.
  Path localPath = new Path("testFile1");
  FileSystem lfs = FileSystem.getLocal(conf);
  FSDataOutputStream dos = lfs.create(localPath);
  dos.writeChars("Local file for CFIF");
  dos.close();

  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, lfs.makeQualified(localPath));
  DummyInputFormat inFormat = new DummyInputFormat();
  List<InputSplit> splits = inFormat.getSplits(job);
  assertTrue(splits.size() > 0);
  for (InputSplit s : splits) {
    CombineFileSplit cfs = (CombineFileSplit) s;
    for (Path p : cfs.getPaths()) {
      // Every combined path must resolve against the local, not the default, file system.
      assertEquals("file", p.toUri().getScheme());
    }
  }
}
/**
 * Test when the input file's length is 0.
 */
@Test
public void testForEmptyFile() throws Exception {
  Configuration conf = new Configuration();
  FileSystem fileSys = FileSystem.get(conf);
  Path file = new Path("test" + "/file");
  FSDataOutputStream out = fileSys.create(file, true,
      conf.getInt("io.file.buffer.size", 4096), (short) 1, (long) BLOCKSIZE);
  out.write(new byte[0]);
  out.close();

  // Split it using a CombinedFile input format.
  DummyInputFormat inFormat = new DummyInputFormat();
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, "test");
  List<InputSplit> splits = inFormat.getSplits(job);
  assertEquals(1, splits.size());
  CombineFileSplit fileSplit = (CombineFileSplit) splits.get(0);
  assertEquals(1, fileSplit.getNumPaths());
  assertEquals(file.getName(), fileSplit.getPath(0).getName());
  assertEquals(0, fileSplit.getOffset(0));
  assertEquals(0, fileSplit.getLength(0));

  fileSys.delete(file.getParent(), true);
}
public void initialize(InputSplit split, Configuration conf) throws IOException {
  this.conf = conf;
  String datasetName = conf.get("dataset");
  if (datasetName == null)
    throw new RuntimeException("Dataset name should be provided");
  if (split instanceof CombineFileSplit) {
    // Decompose the combined split into one FileSplit per underlying file.
    CombineFileSplit csplits = (CombineFileSplit) split;
    splits = new Vector<FileSplit>(csplits.getNumPaths());
    for (int i = 0; i < csplits.getNumPaths(); i++) {
      FileSplit fsplit = new FileSplit(csplits.getPath(i), csplits.getOffset(i),
          csplits.getLength(i), csplits.getLocations());
      splits.add(fsplit);
    }
  }
  // ...
  fs = inFile.getFileSystem(conf);
  if (fs instanceof HTTPFileSystem) {
    // HTTP-backed files are first copied locally, then deleted when the reader is done.
    inFile = new Path(FileUtil.copyFile(conf, inFile));
    fs = FileSystem.getLocal(conf);
    this.deleteOnEnd = true;
  }
  // ... if the named dataset cannot be located in the file:
  //   throw new RuntimeException("Dataset '" + datasetName + "' not found in file: " + inFile.getName());
  boolean fillValueFound = false;
  int resolution = 0;
  // ... (fill-value lookup elided; the guard below is reconstructed from this excerpt)
  if (fillValueFound) {
    skipFillValue = false;
  } else {
    skipFillValue = conf.getBoolean("skipfill", true);
  }
}
MiniDFSCluster dfs = null;
try {
  Configuration conf = new Configuration();
  dfs = new MiniDFSCluster.Builder(conf).racks(rack1).hosts(hosts1).build();
  Path dir1 = new Path("/dir1");
  Path file = new Path("/dir1/file1");
  Path dir2 = new Path("/dir1/dir2");
  if (!fileSys.mkdirs(dir1)) {
    throw new IOException("Mkdirs failed to create " + dir1.toString());
  }
  // ... (file creation and split generation elided)
  assertEquals(1, fileSplit.getNumPaths());
  assertEquals(file.getName(), fileSplit.getPath(0).getName());
  assertEquals(0, fileSplit.getOffset(0));
  assertEquals(0, fileSplit.getLength(0));
} finally {
  if (dfs != null) {
    dfs.shutdown();
  }
}
conf.set(GFInputFormat.INPUT_REGION, getName());
conf.set(GFInputFormat.HOME_DIR, testDataDir.getName());
conf.setBoolean(GFInputFormat.CHECKPOINT, false);
// ...
for (InputSplit inputSplit : splits) {
  CombineFileSplit split = (CombineFileSplit) inputSplit;
  assertEquals(1, split.getPaths().length);
  // Splits must be contiguous: each one starts where the previous one ended.
  assertEquals(lastBytePositionOfPrevious, split.getOffset(0));
  lastBytePositionOfPrevious += split.getLength();
  assertEquals(1, split.getLocations().length);
}
Path bucketPath = new Path(regionPath, "1");
Path hopPath = new Path(bucketPath,
    bucket1.getSortedOplogs().iterator().next().get().getFileName());
FileStatus status = hdfsStore.getFileSystem().getFileStatus(hopPath);
Configuration conf = new Configuration();
TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskId);

Path[] files = { new Path("file1"), new Path("file2") };
long[] lengths = { 1, 1 };
CombineFileSplit split = new CombineFileSplit(files, lengths);

RecordReader rr = inputFormat.createRecordReader(split, context);
assertTrue("Unexpected RR type!", rr instanceof CombineFileRecordReader);
File dir = new File(outDir.toString());
dir.mkdir();
// Create the test files; file i is i bytes long (so the first file is empty).
for (int i = 0; i < paths.length; i++) {
  files[i] = new File(dir, "testfile" + i);
  // ... (write i bytes via fileWriter, elided)
  fileWriter.close();
  fileLength[i] = i;
  paths[i] = new Path(outDir + "/testfile" + i);
}
CombineFileSplit combineFileSplit = new CombineFileSplit(paths, fileLength);
TaskAttemptID taskAttemptID = Mockito.mock(TaskAttemptID.class);
TaskReporter reporter = Mockito.mock(TaskReporter.class);
// ... (reader construction and iteration elided)
  verify(reporter, times(3)).progress();
} finally {
  FileUtil.fullyDelete(new File(outDir.toString()));
}
/**
 * Implementation detail: This constructor is built to be called via
 * reflection from within CombineFileRecordReader.
 *
 * @param fileSplit The CombineFileSplit that this will read from.
 * @param context The context for this task.
 * @param pathToProcess The path index from the CombineFileSplit to process in this record.
 */
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context,
    Integer pathToProcess) {
  mProcessed = false;
  mFileToRead = fileSplit.getPath(pathToProcess);
  mFileLength = fileSplit.getLength(pathToProcess);
  mConf = context.getConfiguration();

  assert 0 == fileSplit.getOffset(pathToProcess);
  if (LOG.isDebugEnabled()) {
    LOG.debug("FileToRead is: " + mFileToRead.toString());
    LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths());

    try {
      final FileSystem fs = mFileToRead.getFileSystem(mConf);
      assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
    } catch (IOException ioe) {
      // oh well, I was just testing.
    }
  }

  mFileName = new Text();
  mFileText = new Text();
}
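For context, a hedged sketch of the nextKeyValue() that typically pairs with this constructor, using the fields initialized above: it emits a single (file name, file contents) record on the first call and reports end-of-input on the second. The original body is not shown in this snippet, so the details here are assumptions:

@Override
public boolean nextKeyValue() throws IOException {
  if (mProcessed) {
    return false;  // the single whole-file record was already emitted
  }
  byte[] contents = new byte[(int) mFileLength];
  FileSystem fs = mFileToRead.getFileSystem(mConf);
  FSDataInputStream in = null;
  try {
    in = fs.open(mFileToRead);
    // org.apache.hadoop.io.IOUtils: read the entire file into the buffer.
    IOUtils.readFully(in, contents, 0, contents.length);
  } finally {
    IOUtils.closeStream(in);
  }
  mFileName.set(mFileToRead.toString());
  mFileText.set(contents, 0, contents.length);
  mProcessed = true;
  return true;
}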
public ParserPump(CombineFileSplit split, TaskAttemptContext context) {
  this.context = context;
  this.paths = split.getPaths();
  this.sizes = split.getLengths();
  this.offsets = split.getStartOffsets();
  this.size = split.getLength();
  Configuration conf = context.getConfiguration();
  this.skipInvalid = conf.getBoolean(SKIP_INVALID_PROPERTY, false);
  this.verifyDataTypeValues = conf.getBoolean(VERIFY_DATATYPE_VALUES_PROPERTY, false);
  this.overrideRdfContext = conf.getBoolean(OVERRIDE_CONTEXT_PROPERTY, false);
  this.defaultRdfContextPattern = conf.get(DEFAULT_CONTEXT_PROPERTY);
  this.maxSize = MAX_SINGLE_FILE_MULTIPLIER
      * conf.getLong("mapreduce.input.fileinputformat.split.maxsize", 0);
}
protected CombineFileRecordReaderWrapper(FileInputFormat<K,V> inputFormat,
    CombineFileSplit split, TaskAttemptContext context, Integer idx)
    throws IOException, InterruptedException {
  fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx),
      split.getLength(idx), split.getLocations());
  delegate = inputFormat.createRecordReader(fileSplit, context);
}
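This wrapper is the standard adapter between a plain FileInputFormat and combined splits; Hadoop's CombineTextInputFormat follows the same pattern internally. A minimal sketch of a concrete use, with illustrative class names:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReaderWrapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CombinedTextInputFormat extends CombineFileInputFormat<LongWritable, Text> {
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext context) throws IOException {
    return new CombineFileRecordReader<>((CombineFileSplit) split, context, TextWrapper.class);
  }

  // Instantiated reflectively by CombineFileRecordReader with (split, context, idx);
  // each instance delegates one file of the combined split to a line record reader.
  public static class TextWrapper extends CombineFileRecordReaderWrapper<LongWritable, Text> {
    public TextWrapper(CombineFileSplit split, TaskAttemptContext context, Integer idx)
        throws IOException, InterruptedException {
      super(new TextInputFormat(), split, context, idx);
    }
  }
}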
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {
  CombineFileSplit cSplit = (CombineFileSplit) split;
  Path[] path = cSplit.getPaths();
  long[] start = cSplit.getStartOffsets();
  long[] len = cSplit.getLengths();
  Configuration conf = context.getConfiguration();
  FileSystem fs = cSplit.getPath(0).getFileSystem(conf);
  this.splitIterator = HDFSSplitIterator.newInstance(fs, path, start, len, 0L, 0L);
}
conf.set(GFInputFormat.INPUT_REGION, getName());
conf.set(GFInputFormat.HOME_DIR, testDataDir.getName());
conf.setBoolean(GFInputFormat.CHECKPOINT, false);
// ...
assertEquals(bucketCount, split.getNumPaths());
BasicPartitionedFileSetInputContext(MultiInputTaggedSplit multiInputTaggedSplit) {
  super(multiInputTaggedSplit.getName());
  InputSplit inputSplit = multiInputTaggedSplit.getInputSplit();
  if (inputSplit instanceof FileSplit) {
    isCombineInputFormat = false;
    Path path = ((FileSplit) inputSplit).getPath();
    inputPaths = new Path[] { path };
  } else if (inputSplit instanceof CombineFileSplit) {
    isCombineInputFormat = true;
    inputPaths = ((CombineFileSplit) inputSplit).getPaths();
  } else {
    throw new IllegalArgumentException(String.format("Expected either a '%s' or a '%s', but got '%s'.",
        FileSplit.class.getName(), CombineFileSplit.class.getName(),
        inputSplit.getClass().getName()));
  }
  this.conf = multiInputTaggedSplit.getConf();
  String mappingString = conf.get(PartitionedFileSetDataset.PATH_TO_PARTITIONING_MAPPING);
  this.pathToPartitionMapping =
      GSON.fromJson(Objects.requireNonNull(mappingString), STRING_PARTITION_KEY_MAP_TYPE);
}