/**
 * Opens this output format for one task and stacks the block-based output
 * and the data output stream on top of the underlying stream.
 *
 * @param taskNumber
 *        forwarded unchanged to {@code super.open}
 * @param numTasks
 *        forwarded unchanged to {@code super.open}
 * @throws IOException
 *         if any of the underlying calls throws it
 */
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    super.open(taskNumber, numTasks);

    // NATIVE_BLOCK_SIZE acts as a marker: use the default block size of the output file system instead.
    final long effectiveBlockSize;
    if (this.blockSize == NATIVE_BLOCK_SIZE) {
        effectiveBlockSize = this.outputFilePath.getFileSystem().getDefaultBlockSize();
    } else {
        effectiveBlockSize = this.blockSize;
    }

    // NOTE(review): the narrowing cast truncates block sizes above Integer.MAX_VALUE without any warning.
    // The field is called blockBasedInput although it holds a BlockBasedOutput.
    this.blockBasedInput = new BlockBasedOutput(this.stream, (int) effectiveBlockSize);
    this.dataOutputStream = new DataOutputStream(this.blockBasedInput);
}
/**
 * Prepares the writers of this output format: delegates to the parent
 * {@code open}, resolves the block size and wraps the underlying stream.
 *
 * @param taskNumber
 *        handed to {@code super.open} as is
 * @param numTasks
 *        handed to {@code super.open} as is
 * @throws IOException
 *         if one of the invoked operations throws it
 */
@Override
public void open(int taskNumber, int numTasks) throws IOException {
    super.open(taskNumber, numTasks);

    final boolean useNativeBlockSize = this.blockSize == NATIVE_BLOCK_SIZE;
    long resolvedBlockSize = this.blockSize;
    if (useNativeBlockSize) {
        // the configured value is only a marker; ask the target file system for its default
        resolvedBlockSize = this.outputFilePath.getFileSystem().getDefaultBlockSize();
    }

    // NOTE(review): values beyond the int range are silently truncated by this cast.
    final int blockSizeAsInt = (int) resolvedBlockSize;
    this.blockBasedInput = new BlockBasedOutput(this.stream, blockSizeAsInt);
    this.dataOutputStream = new DataOutputStream(this.blockBasedInput);
}
private List<FileStatus> getFiles() throws IOException { // get all the files that are involved in the splits final List<FileStatus> files = new ArrayList<FileStatus>(); final FileSystem fs = this.filePath.getFileSystem(); final FileStatus pathFile = fs.getFileStatus(this.filePath); if (pathFile.isDir()) { // input is directory. list all contained files final FileStatus[] partials = fs.listStatus(this.filePath); for (int i = 0; i < partials.length; i++) if (!partials[i].isDir()) files.add(partials[i]); } else files.add(pathFile); return files; }
protected List<FileStatus> getFiles() throws IOException { // get all the files that are involved in the splits List<FileStatus> files = new ArrayList<FileStatus>(); final FileSystem fs = this.filePath.getFileSystem(); final FileStatus pathFile = fs.getFileStatus(this.filePath); if (pathFile.isDir()) { // input is directory. list all contained files final FileStatus[] partials = fs.listStatus(this.filePath); for (int i = 0; i < partials.length; i++) { if (!partials[i].isDir()) { files.add(partials[i]); } } } else { files.add(pathFile); } return files; }
/**
 * Limits the number of subtasks to the number of blocks of the input path.
 *
 * @param invokable
 *        the invokable whose own maximum is combined with the block count
 * @return {@code -1} if no path is set or the file system access fails with
 *         an {@link IOException}; otherwise the smaller of the block count
 *         and {@code invokable.getMaximumNumberOfSubtasks()}
 */
@Override
public int getMaximumNumberOfSubtasks(final AbstractInvokable invokable) {
    if (this.path == null) {
        return -1;
    }

    int blockCount;
    try {
        final FileSystem fileSystem = this.path.getFileSystem();
        final FileStatus status = fileSystem.getFileStatus(this.path);
        blockCount = fileSystem.getNumberOfBlocks(status);
    } catch (IOException e) {
        // the block count cannot be determined
        return -1;
    }

    final int invokableMaximum = invokable.getMaximumNumberOfSubtasks();
    return (int) Math.min(blockCount, invokableMaximum);
}
} // closes an enclosing scope whose opening lies outside this chunk
/**
 * Computes the upper bound for the number of subtasks from the number of
 * blocks reported by the file system for {@code path}.
 *
 * @param invokable
 *        supplies its own maximum via {@code getMaximumNumberOfSubtasks()}
 * @return the minimum of the block count and the invokable's maximum, or
 *         {@code -1} when {@code path} is {@code null} or an
 *         {@link IOException} occurs while querying the file system
 */
@Override
public int getMaximumNumberOfSubtasks(final AbstractInvokable invokable) {
    if (this.path != null) {
        try {
            final FileSystem fs = this.path.getFileSystem();
            final int numberOfBlocks = fs.getNumberOfBlocks(fs.getFileStatus(this.path));
            return (int) Math.min(numberOfBlocks, invokable.getMaximumNumberOfSubtasks());
        } catch (IOException e) {
            // fall through to the "unknown" result below
        }
    }
    return -1;
}
} // closes an enclosing scope whose opening lies outside this chunk
protected List<FileStatus> getFiles() throws IOException { // get all the files that are involved in the splits List<FileStatus> files = new ArrayList<FileStatus>(); final FileSystem fs = this.filePath.getFileSystem(); final FileStatus pathFile = fs.getFileStatus(this.filePath); if (pathFile.isDir()) { // input is directory. list all contained files final FileStatus[] partials = fs.listStatus(this.filePath); for (int i = 0; i < partials.length; i++) { if (!partials[i].isDir()) { files.add(partials[i]); } } } else { files.add(pathFile); } return files; }
/**
 * Opens this output format for the given task and wraps the underlying
 * stream into a block-based output and a data output view stream.
 *
 * @param taskNumber
 *        forwarded unchanged to {@code super.open}
 * @throws IOException
 *         if any of the underlying calls throws it
 */
@Override
public void open(final int taskNumber) throws IOException {
    super.open(taskNumber);

    // NATIVE_BLOCK_SIZE is a marker value: fall back to the output file system's default block size.
    final long effectiveBlockSize;
    if (this.blockSize == NATIVE_BLOCK_SIZE) {
        effectiveBlockSize = this.outputFilePath.getFileSystem().getDefaultBlockSize();
    } else {
        effectiveBlockSize = this.blockSize;
    }

    // NOTE(review): the cast truncates block sizes larger than Integer.MAX_VALUE.
    // The field is called blockBasedInput although it holds a BlockBasedOutput.
    this.blockBasedInput = new BlockBasedOutput(this.stream, (int) effectiveBlockSize);
    this.dataOutputStream = new DataOutputViewStream(this.blockBasedInput);
}
@Override public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException { List<FileStatus> files = this.getFiles(); final FileSystem fs = this.filePath.getFileSystem(); final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize; final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits); for (FileStatus file : files) { long splitSize = blockSize; for (long pos = 0, length = file.getLen(); pos < length; pos += splitSize) { long remainingLength = Math.min(pos + splitSize, length) - pos; // get the block locations and make sure they are in order with respect to their offset final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength); Arrays.sort(blocks); inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, blocks[0].getHosts())); } } if (inputSplits.size() < minNumSplits) { LOG.warn(String.format( "With the given block size %d, the file %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, this.filePath, minNumSplits)); FileStatus last = files.get(files.size() - 1); final BlockLocation[] blocks = fs.getFileBlockLocations(last, 0, last.getLen()); for (int index = files.size(); index < minNumSplits; index++) { inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, blocks[0].getHosts())); } } return inputSplits.toArray(new FileInputSplit[0]); }
/**
 * Copies the file denoted by {@code filePath} to the temp location returned
 * by {@code getTempDir(jobID, name)} on {@code lfs}, unless that location
 * already exists.
 *
 * @return the temp path, whether or not a copy was performed
 * @throws RuntimeException
 *         wrapping any {@link IOException} raised while checking, opening,
 *         copying or closing
 */
public Path call() {
    final Path tmp = getTempDir(jobID, name);
    try {
        if (!lfs.exists(tmp)) {
            final Path distributedPath = new Path(filePath);
            final FileSystem fs = distributedPath.getFileSystem();

            // Open the source first: if it cannot be opened, no empty target file is left behind
            // that a later call would mistake for a finished copy.
            final FSDataInputStream fsInput = fs.open(distributedPath);
            try {
                final FSDataOutputStream lfsOutput = lfs.create(tmp, false);
                try {
                    IOUtils.copyBytes(fsInput, lfsOutput);
                } finally {
                    // NOTE(review): copyBytes may already close the streams; closing again is
                    // expected to be a no-op per the Closeable contract - confirm for these stream types.
                    lfsOutput.close();
                }
            } finally {
                fsInput.close();
            }
        }
    } catch (IOException e1) {
        throw new RuntimeException("Error copying a file from hdfs to the local fs", e1);
    }
    return tmp;
}
} // closes an enclosing scope whose opening lies outside this chunk
@Override public void open(FileInputSplit split) throws IOException { super.open(split); final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? this.filePath.getFileSystem().getDefaultBlockSize() : this.blockSize; this.blockInfo = this.createBlockInfo(); if (this.splitLength > this.blockInfo.getInfoSize()) { // TODO: seek not supported by compressed streams. Will throw exception this.stream.seek(this.splitStart + this.splitLength - this.blockInfo.getInfoSize()); DataInputStream infoStream = new DataInputStream(this.stream); this.blockInfo.read(infoStream); } this.stream.seek(this.splitStart + this.blockInfo.getFirstRecordStart()); this.blockBasedInput = new BlockBasedInput(this.stream, (int) blockSize); this.dataInputStream = new DataInputStream(this.blockBasedInput); this.readRecords = 0; }
@Override public void open(FileInputSplit split) throws IOException { super.open(split); final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? this.filePath.getFileSystem().getDefaultBlockSize() : this.blockSize; this.blockInfo = this.createBlockInfo(); if (this.splitLength > this.blockInfo.getInfoSize()) { // TODO: seek not supported by compressed streams. Will throw exception this.stream.seek(this.splitStart + this.splitLength - this.blockInfo.getInfoSize()); DataInputStream infoStream = new DataInputStream(this.stream); this.blockInfo.read(infoStream); } this.stream.seek(this.splitStart + this.blockInfo.getFirstRecordStart()); this.blockBasedInput = new BlockBasedInput(this.stream, (int) blockSize); this.dataInputStream = new DataInputStream(this.blockBasedInput); this.readRecords = 0; }
/** * Set minNumSplits to number of files. */ @Override public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException { int numAvroFiles = 0; final Path path = this.filePath; // get all the files that are involved in the splits final FileSystem fs = path.getFileSystem(); final FileStatus pathFile = fs.getFileStatus(path); if (!acceptFile(pathFile)) { throw new IOException("The given file does not pass the file-filter"); } if (pathFile.isDir()) { // input is directory. list all contained files final FileStatus[] dir = fs.listStatus(path); for (int i = 0; i < dir.length; i++) { if (!dir[i].isDir() && acceptFile(dir[i])) { numAvroFiles++; } } } else { numAvroFiles = 1; } return super.createInputSplits(numAvroFiles); }
/**
 * Assigns a degree of parallelism to every visited operator that has none
 * yet (value {@code -1}).
 * <p>
 * Generic data sources get degree 1, all other operators get the plan's
 * degree. When the plan's degree is greater than 1 and the operator is a
 * file sink whose path is not a directory, that path is deleted and
 * re-created as a directory.
 *
 * @param visitable
 *        the operator being visited
 * @return always {@code true}
 */
@Override
public boolean preVisit(final Operator visitable) {
    final int planDegree = GenericTestPlan.this.getDegreeOfParallelism();
    final boolean isSource = visitable instanceof GenericDataSource<?>;

    if (!isSource && planDegree > 1 && visitable instanceof FileDataSink) {
        final FileDataSink sink = (FileDataSink) visitable;
        try {
            final Path sinkPath = new Path(sink.getFilePath());
            final FileSystem fileSystem = sinkPath.getFileSystem();
            final FileStatus status = fileSystem.getFileStatus(sinkPath);
            if (!status.isDir()) {
                // replace the existing non-directory entry by a directory
                fileSystem.delete(sinkPath, false);
                fileSystem.mkdirs(sinkPath);
            }
        } catch (final IOException e) {
            // failures are only printed; the visit continues
            e.printStackTrace();
        }
    }

    final int degree = isSource ? 1 : planDegree;
    if (visitable.getDegreeOfParallelism() == -1) {
        visitable.setDegreeOfParallelism(degree);
    }
    return true;
}
}); // closes the enclosing anonymous class and call, both opened outside this chunk
@Override public void checkConfiguration(final AbstractInvokable invokable) throws IllegalConfigurationException { // Check if the user has specified a path if (this.path == null) { throw new IllegalConfigurationException(this.getName() + " does not specify an input path"); } // Check if the path is valid try { final FileSystem fs = this.path.getFileSystem(); final FileStatus f = fs.getFileStatus(this.path); if (f == null) { throw new IOException(this.path.toString() + " led to a null object"); } } catch (IOException e) { throw new IllegalConfigurationException("Cannot access file or directory: " + StringUtils.stringifyException(e)); } // register the path in the configuration invokable.getTaskConfiguration() .setString(AbstractFileInputTask.INPUT_PATH_CONFIG_KEY, this.path.toString()); // Finally, see if the task itself has a valid configuration super.checkConfiguration(invokable); }
/**
 * Opens the given split: loads the block info from the end of the split when
 * the split is longer than the info size, moves the stream to the first
 * record start and creates the block-based input and the data input view
 * stream. The record counter is reset to zero.
 *
 * @param split
 *        the split to open, forwarded to {@code super.open}
 * @throws IOException
 *         if any of the underlying calls throws it
 */
@Override
public void open(final FileInputSplit split) throws IOException {
    super.open(split);

    // NATIVE_BLOCK_SIZE is a marker: use the default block size of the input file system
    final long effectiveBlockSize;
    if (this.blockSize == NATIVE_BLOCK_SIZE) {
        effectiveBlockSize = this.filePath.getFileSystem().getDefaultBlockSize();
    } else {
        effectiveBlockSize = this.blockSize;
    }

    this.blockInfo = this.createBlockInfo();
    if (this.splitLength > this.blockInfo.getInfoSize()) {
        final long infoStart = this.splitStart + this.splitLength - this.blockInfo.getInfoSize();
        this.stream.seek(infoStart);
        this.blockInfo.read(new DataInputStream(this.stream));
    }

    final long firstRecord = this.splitStart + this.blockInfo.getFirstRecordStart();
    this.stream.seek(firstRecord);

    this.blockBasedInput = new BlockBasedInput(this.stream, (int) effectiveBlockSize);
    this.dataInputStream = new DataInputViewStream(this.blockBasedInput);
    this.readRecords = 0;
}
@Override public void checkConfiguration(final AbstractInvokable invokable) throws IllegalConfigurationException { // Check if the user has specified a path if (this.path == null) { throw new IllegalConfigurationException(this.getName() + " does not specify an input path"); } // Check if the path is valid try { final FileSystem fs = this.path.getFileSystem(); final FileStatus f = fs.getFileStatus(this.path); if (f == null) { throw new IOException(this.path.toString() + " led to a null object"); } } catch (IOException e) { throw new IllegalConfigurationException("Cannot access file or directory: " + StringUtils.stringifyException(e)); } // register the path in the configuration invokable.getTaskConfiguration() .setString(AbstractFileInputTask.INPUT_PATH_CONFIG_KEY, this.path.toString()); // Finally, see if the task itself has a valid configuration super.checkConfiguration(invokable); }
/** * Fill in the statistics. The last modification time and the total input size are prefilled. * * @param files * The files that are associated with this block input format. * @param stats * The pre-filled statistics. */ protected SequentialStatistics createStatistics(final List<FileStatus> files, final FileBaseStatistics stats) throws IOException { if (files.isEmpty()) return null; final BlockInfo blockInfo = this.createBlockInfo(); long totalCount = 0; for (final FileStatus file : files) { // invalid file if (file.getLen() < blockInfo.getInfoSize()) continue; final FSDataInputStream fdis = file.getPath().getFileSystem().open(file.getPath(), blockInfo.getInfoSize()); fdis.seek(file.getLen() - blockInfo.getInfoSize()); final DataInputStream input = new DataInputStream(fdis); blockInfo.read(input); totalCount += blockInfo.getAccumulatedRecordCount(); } final float avgWidth = totalCount == 0 ? 0 : (float) stats.getTotalInputSize() / totalCount; return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), avgWidth, totalCount); }
/** * Fill in the statistics. The last modification time and the total input size are prefilled. * * @param files * The files that are associated with this block input format. * @param stats * The pre-filled statistics. */ protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats) throws IOException { if (files.isEmpty()) { return null; } BlockInfo blockInfo = this.createBlockInfo(); long totalCount = 0; for (FileStatus file : files) { // invalid file if (file.getLen() < blockInfo.getInfoSize()) { continue; } FSDataInputStream fdis = file.getPath().getFileSystem().open(file.getPath(), blockInfo.getInfoSize()); fdis.seek(file.getLen() - blockInfo.getInfoSize()); DataInputStream input = new DataInputStream(fdis); blockInfo.read(input); totalCount += blockInfo.getAccumulatedRecordCount(); } final float avgWidth = totalCount == 0 ? 0 : ((float) stats.getTotalInputSize() / totalCount); return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), avgWidth, totalCount); }
/** * Fill in the statistics. The last modification time and the total input size are prefilled. * * @param files * The files that are associated with this block input format. * @param stats * The pre-filled statistics. */ protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats) throws IOException { if (files.isEmpty()) { return null; } BlockInfo blockInfo = this.createBlockInfo(); long totalCount = 0; for (FileStatus file : files) { // invalid file if (file.getLen() < blockInfo.getInfoSize()) { continue; } FSDataInputStream fdis = file.getPath().getFileSystem().open(file.getPath(), blockInfo.getInfoSize()); fdis.seek(file.getLen() - blockInfo.getInfoSize()); DataInputStream input = new DataInputStream(fdis); blockInfo.read(input); totalCount += blockInfo.getAccumulatedRecordCount(); } final float avgWidth = totalCount == 0 ? 0 : ((float) stats.getTotalInputSize() / totalCount); return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), avgWidth, totalCount); }