latestModTime = Math.max(latestModTime, file.getModificationTime()); if (file.isDir()) { FileStatus[] fss = fs.listStatus(filePath); files.ensureCapacity(files.size() + fss.length); if (!s.isDir()) { files.add(s); latestModTime = Math.max(s.getModificationTime(), latestModTime); len += s.getLen();
/**
 * Recursively copies all entries under {@code sourcePath} into {@code targetPath}.
 *
 * @param sourcePath directory on the source file system to copy from
 * @param targetPath directory on the target file system to copy into (created if absent)
 * @param executable whether copied files should be marked executable
 * @param sFS source file system
 * @param tFS target file system
 * @throws IOException if listing or copying fails
 */
private static void internalCopyDirectory(Path sourcePath, Path targetPath, boolean executable, FileSystem sFS, FileSystem tFS) throws IOException {
    // Ensure the destination directory exists before copying children into it.
    tFS.mkdirs(targetPath);

    final FileStatus[] children = sFS.listStatus(sourcePath);
    for (FileStatus child : children) {
        String srcUri = child.getPath().toString();
        // Directories may carry a trailing '/'; strip it so the last-segment
        // extraction below yields the real entry name.
        if (child.isDir() && srcUri.endsWith("/")) {
            srcUri = srcUri.substring(0, srcUri.length() - 1);
        }
        // Destination = target dir + last path segment of the source entry.
        final String childTarget = targetPath + srcUri.substring(srcUri.lastIndexOf("/"));
        copy(child.getPath(), new Path(childTarget), executable);
    }
}
protected FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, Path filePath, FileSystem fs, ArrayList<FileStatus> files) throws IOException { // get the file info and check whether the cached statistics are still valid. final FileStatus file = fs.getFileStatus(filePath); long totalLength = 0; // enumerate all files if (file.isDir()) { totalLength += addFilesInDir(file.getPath(), files, false); } else { files.add(file); testForUnsplittable(file); totalLength += file.getLen(); } // check the modification time stamp long latestModTime = 0; for (FileStatus f : files) { latestModTime = Math.max(f.getModificationTime(), latestModTime); } // check whether the cached statistics are still valid, if we have any if (cachedStats != null && latestModTime <= cachedStats.getLastModificationTime()) { return cachedStats; } // sanity check if (totalLength <= 0) { totalLength = BaseStatistics.SIZE_UNKNOWN; } return new FileBaseStatistics(latestModTime, totalLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN); }
if (!status.isDir()) { Path filePath = status.getPath(); long modificationTime = status.getModificationTime(); if (!shouldIgnore(filePath, modificationTime)) { files.put(filePath, status); files.putAll(listEligibleFiles(fileSystem, status.getPath()));
@Override public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException { final List<FileStatus> files = this.getFiles(); final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits); for (FileStatus file : files) { final FileSystem fs = file.getPath().getFileSystem(); final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize; for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) { long remainingLength = Math.min(pos + blockSize, length) - pos; // get the block locations and make sure they are in order with respect to their offset final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength); Arrays.sort(blocks); inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength, blocks[0].getHosts())); } } if (inputSplits.size() < minNumSplits) { LOG.warn(String.format( "With the given block size %d, the files %s cannot be split into %d blocks. Filling up with empty splits...", blockSize, Arrays.toString(getFilePaths()), minNumSplits)); FileStatus last = files.get(files.size() - 1); final BlockLocation[] blocks = last.getPath().getFileSystem().getFileBlockLocations(last, 0, last.getLen()); for (int index = files.size(); index < minNumSplits; index++) { inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, blocks[0].getHosts())); } } return inputSplits.toArray(new FileInputSplit[inputSplits.size()]); }
if (pathFile.isDir()) { totalLength += addFilesInDir(path, files, true); } else { totalLength += pathFile.getLen(); int splitNum = 0; for (final FileStatus file : files) { final FileSystem fs = file.getPath().getFileSystem(); final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen()); Set<String> hosts = new HashSet<String>(); for(BlockLocation block : blocks) { hosts.addAll(Arrays.asList(block.getHosts())); long len = file.getLen(); if(testForUnsplittable(file)) { len = READ_WHOLE_SPLIT_FLAG; FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len, hosts.toArray(new String[hosts.size()])); inputSplits.add(fis); final FileSystem fs = file.getPath().getFileSystem(); final long len = file.getLen(); final long blockSize = file.getBlockSize(); FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize, blocks[blockIndex].getHosts()); inputSplits.add(fis);
if (dir.isDir()) { if (acceptFile(dir) && enumerateNestedFiles) { length += addFilesInDir(dir.getPath(), files, logExcludedFiles); } else { if (logExcludedFiles && LOG.isDebugEnabled()) { LOG.debug("Directory "+dir.getPath().toString()+" did not pass the file-filter and is excluded."); if(acceptFile(dir)) { files.add(dir); length += dir.getLen(); testForUnsplittable(dir); } else { if (logExcludedFiles && LOG.isDebugEnabled()) { LOG.debug("Directory "+dir.getPath().toString()+" did not pass the file-filter and is excluded.");
protected List<FileStatus> getFiles() throws IOException { // get all the files that are involved in the splits List<FileStatus> files = new ArrayList<>(); for (Path filePath: getFilePaths()) { final FileSystem fs = filePath.getFileSystem(); final FileStatus pathFile = fs.getFileStatus(filePath); if (pathFile.isDir()) { // input is directory. list all contained files final FileStatus[] partials = fs.listStatus(filePath); for (FileStatus partial : partials) { if (!partial.isDir()) { files.add(partial); } } } else { files.add(pathFile); } } return files; }
/**
 * Reports block locations for a local file: a single block covering the whole
 * file, hosted on this machine. The {@code start}/{@code len} arguments are
 * ignored because local files are not block-partitioned.
 */
@Override
public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len) throws IOException {
    final BlockLocation wholeFile = new LocalBlockLocation(hostName, file.getLen());
    return new BlockLocation[] { wholeFile };
}
/**
 * Checks whether the given file is compressed (i.e. an inflater factory is
 * registered for its path) and, if so, marks this format as unsplittable.
 *
 * @param pathFile the file to check
 * @return {@code true} if the file is compressed and therefore unsplittable
 */
protected boolean testForUnsplittable(FileStatus pathFile) {
    final boolean compressed = getInflaterInputStreamFactory(pathFile.getPath()) != null;
    if (compressed) {
        unsplittable = true;
    }
    return compressed;
}
/**
 * Lists the files under the monitored {@code path} that pass the filter, and
 * records their modification times in {@code modificationTimes}.
 *
 * @param fileSystem the file system to list from
 * @return the string paths of all newly accepted files (empty if the path is missing)
 * @throws IOException if the listing fails
 */
private List<String> listNewFiles(FileSystem fileSystem) throws IOException {
    final List<String> newFiles = new ArrayList<String>();

    final FileStatus[] entries = fileSystem.listStatus(new Path(path));
    if (entries == null) {
        LOG.warn("Path does not exist: {}", path);
        return newFiles;
    }

    for (FileStatus entry : entries) {
        final Path entryPath = entry.getPath();
        final String name = entryPath.getName();
        final long modTime = entry.getModificationTime();

        if (isFiltered(name, modTime)) {
            continue;
        }
        newFiles.add(entryPath.toString());
        modificationTimes.put(name, modTime);
    }
    return newFiles;
}
partitionDirStatus = null; if (partitionDirStatus == null || !partitionDirStatus.isDir()) { continue; true, true, fileStatus.getModificationTime(), System.currentTimeMillis(), shuffleServiceConfiguration); if (prevFileInfo != null) { if (!prevFileInfo.isReadyToBeConsumed()) { prevFileInfo.setReadyToBeConsumed(fileStatus.getModificationTime());
/**
 * Creates the input splits to be forwarded to the downstream tasks of the
 * {@link ContinuousFileReaderOperator}, grouped and sorted <b>by modification time</b>
 * (the returned {@link TreeMap} iterates keys in ascending order). Only splits
 * belonging to files present in {@code eligibleFiles} are kept.
 *
 * @param eligibleFiles The files to process.
 * @return splits keyed by their file's modification time, in ascending key order
 * @throws IOException if split creation fails
 */
private Map<Long, List<TimestampedFileInputSplit>> getInputSplitsSortedByModTime(
        Map<Path, FileStatus> eligibleFiles) throws IOException {

    final Map<Long, List<TimestampedFileInputSplit>> splitsByModTime = new TreeMap<>();
    if (eligibleFiles.isEmpty()) {
        return splitsByModTime;
    }

    for (FileInputSplit split : format.createInputSplits(readerParallelism)) {
        final FileStatus fileStatus = eligibleFiles.get(split.getPath());
        if (fileStatus == null) {
            // Split belongs to a file that is not eligible; drop it.
            continue;
        }
        final Long modTime = fileStatus.getModificationTime();
        splitsByModTime
            .computeIfAbsent(modTime, unused -> new ArrayList<>())
            .add(new TimestampedFileInputSplit(
                modTime,
                split.getSplitNumber(),
                split.getPath(),
                split.getStart(),
                split.getLength(),
                split.getHostnames()));
    }
    return splitsByModTime;
}
protected FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, Path filePath, FileSystem fs, ArrayList<FileStatus> files) throws IOException { // get the file info and check whether the cached statistics are still valid. final FileStatus file = fs.getFileStatus(filePath); long totalLength = 0; // enumerate all files if (file.isDir()) { totalLength += addFilesInDir(file.getPath(), files, false); } else { files.add(file); testForUnsplittable(file); totalLength += file.getLen(); } // check the modification time stamp long latestModTime = 0; for (FileStatus f : files) { latestModTime = Math.max(f.getModificationTime(), latestModTime); } // check whether the cached statistics are still valid, if we have any if (cachedStats != null && latestModTime <= cachedStats.getLastModificationTime()) { return cachedStats; } // sanity check if (totalLength <= 0) { totalLength = BaseStatistics.SIZE_UNKNOWN; } return new FileBaseStatistics(latestModTime, totalLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN); }
/**
 * Reads every file directly under {@code directory} and returns its full
 * contents, decoded as UTF-8, keyed by file path.
 *
 * <p>Fixes two defects in the previous version: the input stream was never
 * closed (resource leak), and a single {@code read(byte[])} call was assumed to
 * fill the whole buffer, which {@code InputStream.read} does not guarantee.
 *
 * @param directory the directory whose files are read
 * @return map from file path to the file's UTF-8 content
 * @throws Exception if listing or reading fails
 */
private Map<Path, String> getFileContentByPath(Path directory) throws Exception {
    Map<Path, String> contents = new HashMap<>();

    final FileStatus[] filesInBucket = getFileSystem().listStatus(directory);
    for (FileStatus file : filesInBucket) {
        final int fileLength = (int) file.getLen();
        final byte[] serContents = new byte[fileLength];

        // try-with-resources guarantees the stream is closed; loop because a
        // single read() may return fewer bytes than requested.
        try (FSDataInputStream stream = getFileSystem().open(file.getPath())) {
            int offset = 0;
            while (offset < fileLength) {
                final int read = stream.read(serContents, offset, fileLength - offset);
                if (read == -1) {
                    throw new IllegalStateException("Unexpected end of stream: " + file.getPath());
                }
                offset += read;
            }
        }
        contents.put(file.getPath(), new String(serContents, StandardCharsets.UTF_8));
    }
    return contents;
}
if (!status.isDir()) { Path filePath = status.getPath(); long modificationTime = status.getModificationTime(); if (!shouldIgnore(filePath, modificationTime)) { files.put(filePath, status); files.putAll(listEligibleFiles(fileSystem, status.getPath()));
if (dir.isDir()) { if (acceptFile(dir) && enumerateNestedFiles) { length += addFilesInDir(dir.getPath(), files, logExcludedFiles); } else { if (logExcludedFiles && LOG.isDebugEnabled()) { LOG.debug("Directory "+dir.getPath().toString()+" did not pass the file-filter and is excluded."); if(acceptFile(dir)) { files.add(dir); length += dir.getLen(); testForUnsplittable(dir); } else { if (logExcludedFiles && LOG.isDebugEnabled()) { LOG.debug("Directory "+dir.getPath().toString()+" did not pass the file-filter and is excluded.");
public VirtualFileServerHandler(Path path) throws IOException { this.path = path; if (!path.isAbsolute()) { throw new IllegalArgumentException("path must be absolute: " + path.toString()); } this.fs = path.getFileSystem(); if (!fs.exists(path) || fs.getFileStatus(path).isDir()) { throw new IllegalArgumentException("no such file: " + path.toString()); } }
/**
 * Polls the watched path every {@code interval} milliseconds and emits
 * {@code (path, startOffset, length)} triples for each new file, depending on
 * the configured {@code watchType}:
 * <ul>
 *   <li>ONLY_NEW_FILES / REPROCESS_WITH_APPENDED: emit the whole file
 *       (offset 0, length -1 meaning "to the end").</li>
 *   <li>PROCESS_ONLY_APPENDED: emit only the range appended since the last poll.</li>
 * </ul>
 */
@Override
public void run(SourceContext<Tuple3<String, Long, Long>> ctx) throws Exception {
    final FileSystem fileSystem = FileSystem.get(new URI(path));

    while (isRunning) {
        final List<String> newFiles = listNewFiles(fileSystem);

        for (String newFile : newFiles) {
            final boolean emitWholeFile =
                watchType == WatchType.ONLY_NEW_FILES
                    || watchType == WatchType.REPROCESS_WITH_APPENDED;

            if (emitWholeFile) {
                ctx.collect(new Tuple3<String, Long, Long>(newFile, 0L, -1L));
                offsetOfFiles.put(newFile, -1L);
            } else if (watchType == WatchType.PROCESS_ONLY_APPENDED) {
                final long fileSize = fileSystem.getFileStatus(new Path(newFile)).getLen();
                // Resume from the previously recorded offset, or from the start.
                final long offset = offsetOfFiles.containsKey(newFile) ? offsetOfFiles.get(newFile) : 0L;

                ctx.collect(new Tuple3<String, Long, Long>(newFile, offset, fileSize));
                offsetOfFiles.put(newFile, fileSize);

                LOG.info("File processed: {}, {}, {}", newFile, offset, fileSize);
            }
        }

        Thread.sleep(interval);
    }
}