/**
 * Create the given dir as well as all missing ancestor dirs. All created dirs will have the given permission.
 * This should be used instead of {@link FileSystem#mkdirs(Path, FsPermission)}, since that method only sets
 * the permission for the given dir, and not recursively for the ancestor dirs.
 *
 * @param fs FileSystem
 * @param path The dir to be created
 * @param perm The permission to be set
 * @throws IOException if failing to create dir or set permission.
 */
public static void mkdirsWithRecursivePermission(final FileSystem fs, final Path path, FsPermission perm)
    throws IOException {
  mkdirsWithRecursivePermissionWithRetry(fs, path, perm, NO_RETRY_CONFIG);
}
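// Hedged usage sketch (not from the original code): how a caller might create a nested staging
// directory with one permission applied to every dir created along the way. The path and the
// octal permission below are hypothetical placeholders.
public static void exampleCreateStagingDir(FileSystem fs) throws IOException {
  Path stagingDir = new Path("/data/staging/2015/04/08");  // hypothetical target dir
  FsPermission perm = new FsPermission("750");             // octal: rwxr-x---
  // Creates /data, /data/staging, ... as needed, setting 750 on each dir it creates.
  WriterUtils.mkdirsWithRecursivePermission(fs, stagingDir, perm);
}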
/**
 * Update destination path to put db and table name in format "dbname.tablename" using {@link #getDbTableName(String)}
 * and include timestamp.
 *
 * Input dst format: {finaldir}/{schemaName}
 * Output dst format: {finaldir}/{dbname.tablename}/{currenttimestamp}
 */
@Override
protected void movePath(ParallelRunner parallelRunner, State state, Path src, Path dst, int branchId)
    throws IOException {
  String outputDir = dst.getParent().toString();
  String schemaName = dst.getName();
  Path newDst = new Path(new Path(outputDir, getDbTableName(schemaName)), timestamp);

  if (!this.publisherFileSystemByBranches.get(branchId).exists(newDst)) {
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
        newDst.getParent(), this.permissions.get(branchId), this.retrierConfig);
  }

  super.movePath(parallelRunner, state, src, newDst, branchId);
}
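// Illustrative sketch of the destination rewrite above, with assumed values: "mydb.mytable" stands
// in for whatever getDbTableName(schemaName) returns, and "20150408153000" for the timestamp field.
// Input  dst: /final/mydb_mytable                  ({finaldir}/{schemaName})
// Output dst: /final/mydb.mytable/20150408153000   ({finaldir}/{dbname.tablename}/{currenttimestamp})
Path dst = new Path("/final/mydb_mytable");
Path newDst = new Path(new Path(dst.getParent().toString(), "mydb.mytable"), "20150408153000");
// newDst.toString() -> "/final/mydb.mytable/20150408153000"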
/**
 * Make sure directory exists before running {@link BaseDataPublisher#publishData(WorkUnitState, int, boolean, Set)}
 * so that tables will be moved one at a time rather than all at once.
 */
@Override
protected void publishData(WorkUnitState state, int branchId, boolean publishSingleTaskData,
    Set<Path> writerOutputPathsMoved) throws IOException {
  Path publisherOutputDir = getPublisherOutputDir(state, branchId);

  if (!this.publisherFileSystemByBranches.get(branchId).exists(publisherOutputDir)) {
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
        publisherOutputDir, this.permissions.get(branchId), this.retrierConfig);
  }

  super.publishData(state, branchId, publishSingleTaskData, writerOutputPathsMoved);
}
/**
 * This method needs to be overridden for TimePartitionedDataPublisher: since the output folder structure
 * contains the timestamp, the files have to be moved recursively.
 *
 * For example, move {writerOutput}/2015/04/08/15/output.avro to {publisherOutput}/2015/04/08/15/output.avro
 */
@Override
protected void addWriterOutputToExistingDir(Path writerOutput, Path publisherOutput, WorkUnitState workUnitState,
    int branchId, ParallelRunner parallelRunner) throws IOException {
  for (FileStatus status : FileListUtils.listFilesRecursively(this.writerFileSystemByBranches.get(branchId),
      writerOutput)) {
    String filePathStr = status.getPath().toString();
    String pathSuffix = filePathStr
        .substring(filePathStr.indexOf(writerOutput.toString()) + writerOutput.toString().length() + 1);
    Path outputPath = new Path(publisherOutput, pathSuffix);

    WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
        outputPath.getParent(), this.permissions.get(branchId), this.retrierConfig);

    movePath(parallelRunner, workUnitState, status.getPath(), outputPath, branchId);
  }
}
}
/**
 * Publish metadata to a set of paths.
 */
private void publishMetadata(String metadataValue, int branchId, Path metadataOutputPath) throws IOException {
  try {
    if (metadataOutputPath == null) {
      LOG.info("Metadata output path not set for branch " + String.valueOf(branchId) + ", not publishing.");
      return;
    }

    if (metadataValue == null) {
      LOG.info("No metadata collected for branch " + String.valueOf(branchId) + ", not publishing.");
      return;
    }

    FileSystem fs = this.metaDataWriterFileSystemByBranches.get(branchId);

    if (!fs.exists(metadataOutputPath.getParent())) {
      WriterUtils.mkdirsWithRecursivePermissionWithRetry(fs, metadataOutputPath, this.permissions.get(branchId),
          retrierConfig);
    }

    // Delete the file if metadata already exists
    if (fs.exists(metadataOutputPath)) {
      HadoopUtils.deletePath(fs, metadataOutputPath, false);
    }

    LOG.info("Writing metadata for branch " + String.valueOf(branchId) + " to " + metadataOutputPath.toString());
    try (FSDataOutputStream outputStream = fs.create(metadataOutputPath)) {
      outputStream.write(metadataValue.getBytes(StandardCharsets.UTF_8));
    }
  } catch (IOException e) {
    LOG.error("Metadata file is not generated: " + e, e);
  }
}
protected void addSingleTaskWriterOutputToExistingDir(Path writerOutputDir, Path publisherOutputDir,
    WorkUnitState workUnitState, int branchId, ParallelRunner parallelRunner) throws IOException {
  String outputFilePropName = ForkOperatorUtils
      .getPropertyNameForBranch(ConfigurationKeys.WRITER_FINAL_OUTPUT_FILE_PATHS, this.numBranches, branchId);

  if (!workUnitState.contains(outputFilePropName)) {
    LOG.warn("Missing property " + outputFilePropName + ". This task may have pulled no data.");
    return;
  }

  Iterable<String> taskOutputFiles = workUnitState.getPropAsSet(outputFilePropName);
  for (String taskOutputFile : taskOutputFiles) {
    Path taskOutputPath = new Path(taskOutputFile);
    if (!this.writerFileSystemByBranches.get(branchId).exists(taskOutputPath)) {
      LOG.warn("Task output file " + taskOutputFile + " doesn't exist.");
      continue;
    }
    String pathSuffix = taskOutputFile
        .substring(taskOutputFile.indexOf(writerOutputDir.toString()) + writerOutputDir.toString().length() + 1);
    Path publisherOutputPath = new Path(publisherOutputDir, pathSuffix);
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
        publisherOutputPath.getParent(), this.permissions.get(branchId), retrierConfig);

    movePath(parallelRunner, workUnitState, taskOutputPath, publisherOutputPath, branchId);
  }
}
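// Sketch of the suffix computation used above, with hypothetical paths: everything after the writer
// output dir is kept as a relative suffix and re-rooted under the publisher output dir.
Path writerOutputDir = new Path("/tmp/writer-output/job_123");
String taskOutputFile = "/tmp/writer-output/job_123/part/part-0001.avro";
String pathSuffix = taskOutputFile
    .substring(taskOutputFile.indexOf(writerOutputDir.toString()) + writerOutputDir.toString().length() + 1);
// pathSuffix == "part/part-0001.avro"
Path publisherOutputPath = new Path(new Path("/final/output"), pathSuffix);
// publisherOutputPath == "/final/output/part/part-0001.avro"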
public static void mkdirsWithRecursivePermissionWithRetry(final FileSystem fs, final Path path, FsPermission perm,
    Config retrierConfig) throws IOException {

  if (fs.exists(path)) {
    return;
  }

  if (path.getParent() != null && !fs.exists(path.getParent())) {
    mkdirsWithRecursivePermissionWithRetry(fs, path.getParent(), perm, retrierConfig);
  }

  if (!fs.mkdirs(path, perm)) {
    throw new IOException(String.format("Unable to mkdir %s with permission %s", path, perm));
  }

  if (retrierConfig != NO_RETRY_CONFIG) {
    // Wait until the dir is visible, since on eventually consistent filesystems like Amazon S3 a newly
    // created path may not appear right away.
    Retryer<Void> retryer = RetryerFactory.newInstance(retrierConfig);
    try {
      retryer.call(() -> {
        if (!fs.exists(path)) {
          throw new IOException("Path " + path + " does not exist however it should. Will wait more.");
        }
        return null;
      });
    } catch (Exception e) {
      throw new IOException("Path " + path + " does not exist however it should. Giving up... " + e);
    }
  }

  // Double check the permission, since fs.mkdirs() may not guarantee to set the permission correctly
  if (!fs.getFileStatus(path).getPermission().equals(perm)) {
    fs.setPermission(path, perm);
  }
}
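// Hedged usage sketch: a caller passes a typesafe Config describing the retry policy, built here with
// ConfigFactory.parseMap. The key names and values below are hypothetical placeholders; the real keys
// are whatever RetryerFactory.newInstance(Config) expects, which is not shown in this snippet.
Config retrierConfig = ConfigFactory.parseMap(ImmutableMap.of(
    "retry.times", "5",                // hypothetical key/value
    "retry.interval.ms", "1000"));     // hypothetical key/value
WriterUtils.mkdirsWithRecursivePermissionWithRetry(fs, new Path("/data/output/2015/04/08"),
    new FsPermission("755"), retrierConfig);
// Passing NO_RETRY_CONFIG skips the existence re-check entirely, as mkdirsWithRecursivePermission does above.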
private void moveTmpPathToOutputPath() throws IOException {
  Retryer<Void> retryer = RetryerFactory.newInstance(this.retrierConfig);

  LOG.info(String.format("Moving %s to %s", this.dataset.outputTmpPath(), this.dataset.outputPath()));
  this.fs.delete(this.dataset.outputPath(), true);

  if (this.isRetryEnabled) {
    try {
      retryer.call(() -> {
        if (fs.exists(this.dataset.outputPath())) {
          throw new IOException("Path " + this.dataset.outputPath() + " exists however it should not. Will wait more.");
        }
        return null;
      });
    } catch (Exception e) {
      throw new IOException(e);
    }
  }

  WriterUtils.mkdirsWithRecursivePermissionWithRetry(MRCompactorJobRunner.this.fs,
      this.dataset.outputPath().getParent(), this.perm, this.retrierConfig);

  LOG.info("Moving from fs: (" + MRCompactorJobRunner.this.tmpFs.getUri() + ") path: " + this.dataset.outputTmpPath()
      + " to fs: (" + FileSystem.get(this.dataset.outputPath().getParent().toUri(), this.fs.getConf()).getUri()
      + ") output path: " + this.dataset.outputPath());
  HadoopUtils.movePath(MRCompactorJobRunner.this.tmpFs, this.dataset.outputTmpPath(),
      FileSystem.get(this.dataset.outputPath().getParent().toUri(), this.fs.getConf()), this.dataset.outputPath(),
      false, this.fs.getConf());
}
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
        publisherOutputDir, this.permissions.get(branchId), retrierConfig);

    addSingleTaskWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId, parallelRunner);
  } else {
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
        publisherOutputDir.getParent(), this.permissions.get(branchId), retrierConfig);