private void moveDirectory(String sourceDir, String targetDir) throws IOException { // If targetDir exists, delete it if (this.fs.exists(new Path(targetDir))) { deleteDirectory(targetDir); } // Create parent directories of targetDir WriterUtils.mkdirsWithRecursivePermission(this.fs, new Path(targetDir).getParent(), FsPermission.getCachePoolDefault()); // Move directory log.info("Moving directory: " + sourceDir + " to: " + targetDir); if (!this.fs.rename(new Path(sourceDir), new Path(targetDir))) { throw new IOException(String.format("Unable to move %s to %s", sourceDir, targetDir)); } }
public static FileSystem getWriterFS(State state, int numBranches, int branchId) throws IOException { URI uri = URI.create(state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId), ConfigurationKeys.LOCAL_FS_URI)); Configuration hadoopConf = getFsConfiguration(state); if (state.getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER, ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) { // Initialize file system for a proxy user. String authMethod = state.getProp(ConfigurationKeys.FS_PROXY_AUTH_METHOD, ConfigurationKeys.DEFAULT_FS_PROXY_AUTH_METHOD); if (authMethod.equalsIgnoreCase(ConfigurationKeys.TOKEN_AUTH)) { return getWriterFsUsingToken(state, uri); } else if (authMethod.equalsIgnoreCase(ConfigurationKeys.KERBEROS_AUTH)) { return getWriterFsUsingKeytab(state, uri); } } // Initialize file system as the current user. return FileSystem.get(uri, hadoopConf); }
/**
 * Creates the {@link FileSystem} for a single-branch writer, i.e., delegates to
 * {@link #getWriterFS(State, int, int)} with {@code numBranches = 1} and {@code branchId = 0}.
 *
 * @param state a {@link State} instance holding the writer configuration
 * @return the {@link FileSystem} the writer should write to
 * @throws IOException if the file system cannot be initialized
 */
public static FileSystem getWriterFs(State state) throws IOException { return getWriterFS(state, 1, 0); }
/**
 * Cleanup staging data of a Gobblin task.
 *
 * <p>For each fork branch, recursively deletes both the writer staging directory and the writer
 * output directory on that branch's writer {@link FileSystem}, if they exist.
 *
 * @param state a {@link State} instance storing task configuration properties
 * @param logger a {@link Logger} used for logging
 * @throws IOException if any existing staging or output directory cannot be deleted
 */
public static void cleanTaskStagingData(State state, Logger logger) throws IOException {
  int numBranches = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);

  for (int branchId = 0; branchId < numBranches; branchId++) {
    String writerFsUri = state.getProp(
        ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId),
        ConfigurationKeys.LOCAL_FS_URI);
    FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));

    // Staging and output directories are cleaned identically; only the log label differs.
    deleteDirIfExists(fs, WriterUtils.getWriterStagingDir(state, numBranches, branchId), "staging", logger);
    deleteDirIfExists(fs, WriterUtils.getWriterOutputDir(state, numBranches, branchId), "output", logger);
  }
}

/** Recursively deletes {@code dir} on {@code fs} if it exists; fails loudly if the delete reports failure. */
private static void deleteDirIfExists(FileSystem fs, Path dir, String dirType, Logger logger) throws IOException {
  if (fs.exists(dir)) {
    logger.info("Cleaning up " + dirType + " directory " + dir.toUri().getPath());
    if (!fs.delete(dir, true)) {
      throw new IOException("Clean up " + dirType + " directory " + dir.toUri().getPath() + " failed");
    }
  }
}
/**
 * Constructs a distcp data writer that copies file-aware input streams to the writer file system.
 *
 * <p>Constraints enforced here: exactly one branch, and the supplied {@link State} must actually
 * be a {@link WorkUnitState} (needed for the task broker and the serialized dataset metadata).
 *
 * @param state task-level configuration; must be a {@link WorkUnitState}
 * @param numBranches total number of fork branches; must be 1
 * @param branchId id of this branch
 * @param writerAttemptId optional attempt id used to isolate this attempt's staging directory; may be null
 * @throws IOException if more than one branch is requested or the file system cannot be created
 */
public FileAwareInputStreamDataWriter(State state, int numBranches, int branchId, String writerAttemptId)
    throws IOException {
  super(state);

  // Distcp does not support forking into multiple branches.
  if (numBranches > 1) {
    throw new IOException("Distcp can only operate with one branch.");
  }

  if (!(state instanceof WorkUnitState)) {
    throw new RuntimeException(String.format("Distcp requires a %s on construction.",
        WorkUnitState.class.getSimpleName()));
  }
  this.state = (WorkUnitState) state;
  this.taskBroker = this.state.getTaskBroker();
  this.writerAttemptIdOptional = Optional.fromNullable(writerAttemptId);

  // Resolve the branch-scoped writer FS URI (defaults to the local FS).
  String uri = this.state.getProp(
      ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId),
      ConfigurationKeys.LOCAL_FS_URI);

  this.fs = FileSystem.get(URI.create(uri), WriterUtils.getFsConfiguration(state));

  // With an attempt id present, stage under an attempt-specific subdirectory so that concurrent
  // attempts do not clobber each other's staging data.
  this.stagingDir = this.writerAttemptIdOptional.isPresent() ? WriterUtils
      .getWriterStagingDir(state, numBranches, branchId, this.writerAttemptIdOptional.get())
      : WriterUtils.getWriterStagingDir(state, numBranches, branchId);

  this.outputDir = getOutputDir(state);
  this.copyableDatasetMetadata =
      CopyableDatasetMetadata.deserialize(state.getProp(CopySource.SERIALIZED_COPYABLE_DATASET));
  this.recoveryHelper = new RecoveryHelper(this.fs, state);
  this.actualProcessedCopyableFile = Optional.absent();
  this.copySpeedMeter = getMetricContext().meter(GOBBLIN_COPY_BYTES_COPIED_METER);
  this.bufferSize = state.getPropAsInt(CopyConfiguration.BUFFER_SIZE, StreamCopier.DEFAULT_BUFFER_SIZE);
  this.encryptionConfig = EncryptionConfigParser
      .getConfigForBranch(EncryptionConfigParser.EntityType.WRITER, this.state, numBranches, branchId);
}
/**
 * Get the {@link Path} corresponding to the relative file path for a given
 * {@link gobblin.writer.DataWriter}.
 * This method retrieves the value of {@link ConfigurationKeys#WRITER_FILE_PATH} from the given
 * {@link State}. It also constructs the default value of the
 * {@link ConfigurationKeys#WRITER_FILE_PATH} if it is not specified in the given {@link State}.
 *
 * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
 * @param numBranches is the total number of branches for the given {@link State}.
 * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
 * @return a {@link Path} specifying the relative directory where the {@link gobblin.writer.DataWriter} will write to.
 */
public static Path getWriterFilePath(State state, int numBranches, int branchId) {
  // Compute the branch-scoped property name once instead of building the same key twice.
  String writerFilePathKey =
      ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PATH, numBranches, branchId);
  if (state.contains(writerFilePathKey)) {
    return new Path(state.getProp(writerFilePathKey));
  }

  // No explicit path configured: derive one according to the configured path type.
  switch (getWriterFilePathType(state)) {
    case TABLENAME:
      return WriterUtils.getTableNameWriterFilePath(state);
    default:
      return WriterUtils.getDefaultWriterFilePath(state, numBranches, branchId);
  }
}
// Open the writer FS for this branch; presumably getFsWithProxy proxies as a configured
// user when proxying is enabled — TODO confirm against its definition.
FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));
/**
 * Get the staging {@link Path} for a {@link gobblin.writer.DataWriter} that includes the given
 * attemptId as the final path component.
 *
 * @param state writer configuration
 * @param numBranches total number of fork branches
 * @param branchId id of this branch
 * @param attemptId non-null, non-empty attempt identifier appended to the branch staging dir
 * @return the attempt-scoped staging {@link Path}
 */
public static Path getWriterStagingDir(State state, int numBranches, int branchId, String attemptId) {
  boolean hasAttemptId = attemptId != null && !attemptId.isEmpty();
  Preconditions.checkArgument(hasAttemptId, "AttemptId cannot be null or empty: " + attemptId);

  Path branchStagingDir = getWriterStagingDir(state, numBranches, branchId);
  return new Path(branchStagingDir, attemptId);
}
/**
 * Get the {@link Path} corresponding to the directory a given {@link gobblin.writer.DataWriter}
 * should be writing its output data. The output data directory is determined by combining the
 * {@link ConfigurationKeys#WRITER_OUTPUT_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
 *
 * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
 * @param numBranches is the total number of branches for the given {@link State}.
 * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
 * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
 */
public static Path getWriterOutputDir(State state, int numBranches, int branchId) {
  String writerOutputDirKey =
      ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR, numBranches, branchId);
  Preconditions.checkArgument(state.contains(writerOutputDirKey), "Missing required property " + writerOutputDirKey);

  // Output dir = configured branch output root + writer-relative file path.
  Path relativeFilePath = WriterUtils.getWriterFilePath(state, numBranches, branchId);
  return new Path(state.getProp(writerOutputDirKey), relativeFilePath);
}
ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId), ConfigurationKeys.LOCAL_FS_URI); FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state)); ParallelRunner parallelRunner = getParallelRunner(fs, closer, parallelRunnerThreads, parallelRunners); Path stagingPath = WriterUtils.getWriterStagingDir(state, numBranches, branchId); if (fs.exists(stagingPath)) { logger.info("Cleaning up staging directory " + stagingPath.toUri().getPath()); Path outputPath = WriterUtils.getWriterOutputDir(state, numBranches, branchId); if (fs.exists(outputPath)) { logger.info("Cleaning up output directory " + outputPath.toUri().getPath());
/**
 * Build a new {@link CopyDataPublisher} from {@link State}. The constructor expects the following to be set in the
 * {@link State},
 * <ul>
 * <li>{@link ConfigurationKeys#WRITER_OUTPUT_DIR}
 * <li>{@link ConfigurationKeys#WRITER_FILE_SYSTEM_URI}
 * </ul>
 *
 * @param state job-level configuration for the publisher
 * @throws IOException if the writer {@link FileSystem} cannot be created
 */
public CopyDataPublisher(State state) throws IOException {
  super(state);
  String uri = this.state.getProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, ConfigurationKeys.LOCAL_FS_URI);

  this.fs = FileSystem.get(URI.create(uri), WriterUtils.getFsConfiguration(state));

  // NOTE(review): setJobSpecificOutputPaths is called before WRITER_OUTPUT_DIR is read —
  // presumably it populates/adjusts that property; keep this ordering.
  FileAwareInputStreamDataWriterBuilder.setJobSpecificOutputPaths(state);

  this.writerOutputDir = new Path(state.getProp(ConfigurationKeys.WRITER_OUTPUT_DIR));

  // Metrics/event plumbing for publish events.
  MetricContext metricContext =
      Instrumented.getMetricContext(state, CopyDataPublisher.class, GobblinMetrics.getCustomTagsFromState(state));

  this.eventSubmitter = new EventSubmitter.Builder(metricContext, "gobblin.copy.CopyDataPublisher").build();

  this.recoveryHelper = new RecoveryHelper(this.fs, state);
  this.recoveryHelper.purgeOldPersistedFile();
}
/**
 * Get the {@link Path} corresponding to the directory a given {@link gobblin.writer.DataWriter}
 * should be writing its staging data. The staging data directory is determined by combining the
 * {@link ConfigurationKeys#WRITER_STAGING_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
 *
 * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
 * @param numBranches is the total number of branches for the given {@link State}.
 * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to.
 * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to.
 */
public static Path getWriterStagingDir(State state, int numBranches, int branchId) {
  String writerStagingDirKey =
      ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_STAGING_DIR, numBranches, branchId);
  Preconditions.checkArgument(state.contains(writerStagingDirKey),
      "Missing required property " + writerStagingDirKey);

  // Reuse the already-computed key instead of recomputing the branch-scoped property name.
  return new Path(state.getProp(writerStagingDirKey),
      WriterUtils.getWriterFilePath(state, numBranches, branchId));
}
/** * Create the given dir as well as all missing ancestor dirs. All created dirs will have the given permission. * This should be used instead of {@link FileSystem#mkdirs(Path, FsPermission)}, since that method only sets * the permission for the given dir, and not recursively for the ancestor dirs. * * @param fs FileSystem * @param path The dir to be created * @param perm The permission to be set * @throws IOException if failing to create dir or set permission. */ public static void mkdirsWithRecursivePermission(FileSystem fs, Path path, FsPermission perm) throws IOException { if (fs.exists(path)) { return; } if (path.getParent() != null && !fs.exists(path.getParent())) { mkdirsWithRecursivePermission(fs, path.getParent(), perm); } if (!fs.mkdirs(path, perm)) { throw new IOException(String.format("Unable to mkdir %s with permission %s", path, perm)); } // Double check permission, since fs.mkdirs() may not guarantee to set the permission correctly if (!fs.getFileStatus(path).getPermission().equals(perm)) { fs.setPermission(path, perm); } }
/**
 * Builds the destination {@link FileSystem} for the copy, wrapping the single-branch writer FS
 * with optional throttling.
 *
 * @param state configuration used to resolve the writer FS and throttling settings
 * @return the (optionally throttled) target {@link FileSystem}
 * @throws IOException if the writer file system cannot be created
 */
private static FileSystem getTargetFileSystem(State state) throws IOException {
  // Use the single-branch convenience overload instead of hard-coding (numBranches = 1, branchId = 0).
  return HadoopUtils.getOptionallyThrottledFileSystem(WriterUtils.getWriterFs(state), state);
}
/**
 * Get the {@link Path} corresponding to the directory a given {@link gobblin.publisher.BaseDataPublisher} should
 * commit its output data. The final output data directory is determined by combining the
 * {@link ConfigurationKeys#DATA_PUBLISHER_FINAL_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}.
 *
 * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}.
 * @param numBranches is the total number of branches for the given {@link State}.
 * @param branchId is the id for the specific branch that the {@link gobblin.publisher.BaseDataPublisher} will publish.
 * @return a {@link Path} specifying the directory where the {@link gobblin.publisher.BaseDataPublisher} will publish.
 */
public static Path getDataPublisherFinalDir(State state, int numBranches, int branchId) {
  String dataPublisherFinalDirKey =
      ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches, branchId);
  Preconditions.checkArgument(state.contains(dataPublisherFinalDirKey),
      "Missing required property " + dataPublisherFinalDirKey);

  // Reuse the already-computed key instead of rebuilding the branch-scoped property name inline.
  String finalDir = state.getProp(dataPublisherFinalDirKey);

  // Optionally append the writer-relative file path (extract info) under the final dir.
  if (state.getPropAsBoolean(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR,
      ConfigurationKeys.DEFAULT_DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR)) {
    return new Path(finalDir, WriterUtils.getWriterFilePath(state, numBranches, branchId));
  }
  return new Path(finalDir);
}
/**
 * Replaces the dataset's final output path with the contents of its temporary output path.
 *
 * <p>The existing output (if any) is deleted, missing ancestors of the output path are created
 * with the configured permission, and the tmp path is then renamed into place.
 *
 * @throws IOException if the rename fails
 */
private void moveTmpPathToOutputPath() throws IOException {
  Path tmpPath = this.dataset.outputTmpPath();
  Path outputPath = this.dataset.outputPath();

  LOG.info(String.format("Moving %s to %s", tmpPath, outputPath));

  // Remove any previous output so the rename lands at outputPath rather than nesting under it.
  this.fs.delete(outputPath, true);
  WriterUtils.mkdirsWithRecursivePermission(this.fs, outputPath.getParent(), this.perm);

  if (!this.fs.rename(tmpPath, outputPath)) {
    throw new IOException(String.format("Unable to move %s to %s", tmpPath, outputPath));
  }
}
MRCompactorJobRunner.COMPACTION_JOB_OUTPUT_DIR_PERMISSION, FsPermission.getDefault()); WriterUtils.mkdirsWithRecursivePermission(this.fs, dstPath, permission); FsPermission.getDefault()); WriterUtils.mkdirsWithRecursivePermission(this.fs, dstPath.getParent(), permission); if (!this.fs.rename(tmpPath, dstPath)) { throw new IOException(