/** * Given a {@link FsPermission} objects, set a key, value pair in the given {@link State} for the writer to * use when creating files. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}. */ public static void serializeWriterDirPermissions(State state, int numBranches, int branchId, FsPermission fsPermissions) { serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), fsPermissions); }
/** * Given a {@link FsPermission} objects, set a key, value pair in the given {@link State} for the writer to * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}. */ public static void serializeWriterFilePermissions(State state, int numBranches, int branchId, FsPermission fsPermissions) { serializeFsPermissions(state, ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), fsPermissions); }
/** * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to * use when creating directories. This method should be used in conjunction with {@link #deserializeWriterDirPermissions(State, int, int)}. */ public static void setWriterDirOctalPermissions(State state, int numBranches, int branchId, String octalPermissions) { state.setProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), octalPermissions); }
/** * Given a {@link String} in octal notation, set a key, value pair in the given {@link State} for the writer to * use when creating files. This method should be used in conjunction with {@link #deserializeWriterFilePermissions(State, int, int)}. */ public static void setWriterFileOctalPermissions(State state, int numBranches, int branchId, String octalPermissions) { state.setProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), octalPermissions); }
public static Path getOutputDir(State state) { return new Path( state.getProp(ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR, 1, 0))); }
/** * Get the value of {@link ConfigurationKeys#WRITER_FILE_NAME} for the a given {@link gobblin.writer.DataWriter}. The * method also constructs the default value of the {@link ConfigurationKeys#WRITER_FILE_NAME} if it is not set in the * {@link State} * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}. * @param numBranches is the total number of branches for the given {@link State}. * @param branchId is the id for the specific branch that the {{@link gobblin.writer.DataWriter} will write to. * @param writerId is the id for a specific {@link gobblin.writer.DataWriter}. * @param formatExtension is the format extension for the file (e.g. ".avro"). * @return a {@link String} representation of the file name. */ public static String getWriterFileName(State state, int numBranches, int branchId, String writerId, String formatExtension) { String defaultFileName = Strings.isNullOrEmpty(formatExtension) ? String.format("%s.%s", ConfigurationKeys.DEFAULT_WRITER_FILE_BASE_NAME, writerId) : String.format("%s.%s.%s", ConfigurationKeys.DEFAULT_WRITER_FILE_BASE_NAME, writerId, formatExtension); return state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_NAME, numBranches, branchId), defaultFileName); }
/** * Get the {@link Path} corresponding the to the directory a given {@link gobblin.writer.DataWriter} should be writing * its staging data. The staging data directory is determined by combining the * {@link ConfigurationKeys#WRITER_STAGING_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}. * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}. * @param numBranches is the total number of branches for the given {@link State}. * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to. * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to. */ public static Path getWriterStagingDir(State state, int numBranches, int branchId) { String writerStagingDirKey = ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_STAGING_DIR, numBranches, branchId); Preconditions.checkArgument(state.contains(writerStagingDirKey), "Missing required property " + writerStagingDirKey); return new Path( state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_STAGING_DIR, numBranches, branchId)), WriterUtils.getWriterFilePath(state, numBranches, branchId)); }
/** * Get a {@link DataWriterBuilder} for building a {@link gobblin.writer.DataWriter}. * * @param branches number of forked branches * @param index branch index * @return a {@link DataWriterBuilder} */ public DataWriterBuilder getDataWriterBuilder(int branches, int index) { String writerBuilderPropertyName = ForkOperatorUtils .getPropertyNameForBranch(ConfigurationKeys.WRITER_BUILDER_CLASS, branches, index); log.debug("Using property {} to get a writer builder for branches:{}, index:{}", writerBuilderPropertyName, branches, index); String dataWriterBuilderClassName = this.taskState.getProp(writerBuilderPropertyName, null); if (dataWriterBuilderClassName == null) { dataWriterBuilderClassName = ConfigurationKeys.DEFAULT_WRITER_BUILDER_CLASS; log.info("No configured writer builder found, using {} as the default builder", dataWriterBuilderClassName); } else { log.info("Found configured writer builder as {}", dataWriterBuilderClassName); } try { return DataWriterBuilder.class.cast(Class.forName(dataWriterBuilderClassName).newInstance()); } catch (ClassNotFoundException cnfe) { throw new RuntimeException(cnfe); } catch (InstantiationException ie) { throw new RuntimeException(ie); } catch (IllegalAccessException iae) { throw new RuntimeException(iae); } }
/** * Get the output format of the writer of type {@link WriterOutputFormat}. * * @param branches number of forked branches * @param index branch index * @return output format of the writer */ public WriterOutputFormat getWriterOutputFormat(int branches, int index) { String writerOutputFormatValue = this.taskState.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_FORMAT_KEY, branches, index), WriterOutputFormat.OTHER.name()); log.debug("Found writer output format value = {}", writerOutputFormatValue); WriterOutputFormat wof = Enums.getIfPresent(WriterOutputFormat.class, writerOutputFormatValue.toUpperCase()) .or(WriterOutputFormat.OTHER); log.debug("Returning writer output format = {}", wof); return wof; }
@Override public void close() throws IOException { // Tell this fork that the parent task is done. This is a second chance call if the parent // task failed and didn't do so through the normal way of calling markParentTaskDone(). this.parentTaskDone = true; // Record the fork state into the task state that will be persisted into the state store this.taskState.setProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.FORK_STATE_KEY, this.branches, this.index), this.forkState.get().name()); try { this.closer.close(); } finally { if (this.writer.isPresent()) { this.writer.get().cleanup(); } } }
/** * Get the {@link Path} corresponding the to the directory a given {@link gobblin.publisher.BaseDataPublisher} should * commits its output data. The final output data directory is determined by combining the * {@link ConfigurationKeys#DATA_PUBLISHER_FINAL_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}. * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}. * @param numBranches is the total number of branches for the given {@link State}. * @param branchId is the id for the specific branch that the {@link gobblin.publisher.BaseDataPublisher} will publish. * @return a {@link Path} specifying the directory where the {@link gobblin.publisher.BaseDataPublisher} will publish. */ public static Path getDataPublisherFinalDir(State state, int numBranches, int branchId) { String dataPublisherFinalDirKey = ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches, branchId); Preconditions.checkArgument(state.contains(dataPublisherFinalDirKey), "Missing required property " + dataPublisherFinalDirKey); if (state.getPropAsBoolean(ConfigurationKeys.DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR, ConfigurationKeys.DEFAULT_DATA_PUBLISHER_APPEND_EXTRACT_TO_FINAL_DIR)) { return new Path(state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches, branchId)), WriterUtils.getWriterFilePath(state, numBranches, branchId)); } else { return new Path(state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, numBranches, branchId))); } }
/** * Deserializes a {@link FsPermission}s object that should be used when a {@link DataWriter} is writing a file. */ public static FsPermission deserializeWriterFilePermissions(State state, int numBranches, int branchId) { return new FsPermission(state.getPropAsShortWithRadix( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PERMISSIONS, numBranches, branchId), FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX)); }
/** * Get the {@link Path} corresponding the to the directory a given {@link gobblin.writer.DataWriter} should be writing * its output data. The output data directory is determined by combining the * {@link ConfigurationKeys#WRITER_OUTPUT_DIR} and the {@link ConfigurationKeys#WRITER_FILE_PATH}. * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}. * @param numBranches is the total number of branches for the given {@link State}. * @param branchId is the id for the specific branch that the {@link gobblin.writer.DataWriter} will write to. * @return a {@link Path} specifying the directory where the {@link gobblin.writer.DataWriter} will write to. */ public static Path getWriterOutputDir(State state, int numBranches, int branchId) { String writerOutputDirKey = ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_OUTPUT_DIR, numBranches, branchId); Preconditions.checkArgument(state.contains(writerOutputDirKey), "Missing required property " + writerOutputDirKey); return new Path(state.getProp(writerOutputDirKey), WriterUtils.getWriterFilePath(state, numBranches, branchId)); }
/** * Deserializes a {@link FsPermission}s object that should be used when a {@link DataWriter} is creating directories. */ public static FsPermission deserializeWriterDirPermissions(State state, int numBranches, int branchId) { return new FsPermission(state.getPropAsShortWithRadix( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DIR_PERMISSIONS, numBranches, branchId), FsPermission.getDefault().toShort(), ConfigurationKeys.PERMISSION_PARSING_RADIX)); }
/** * Cleanup staging data of a Gobblin task. * * @param state a {@link State} instance storing task configuration properties * @param logger a {@link Logger} used for logging */ public static void cleanTaskStagingData(State state, Logger logger) throws IOException { int numBranches = state.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1); for (int branchId = 0; branchId < numBranches; branchId++) { String writerFsUri = state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId), ConfigurationKeys.LOCAL_FS_URI); FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state)); Path stagingPath = WriterUtils.getWriterStagingDir(state, numBranches, branchId); if (fs.exists(stagingPath)) { logger.info("Cleaning up staging directory " + stagingPath.toUri().getPath()); if (!fs.delete(stagingPath, true)) { throw new IOException("Clean up staging directory " + stagingPath.toUri().getPath() + " failed"); } } Path outputPath = WriterUtils.getWriterOutputDir(state, numBranches, branchId); if (fs.exists(outputPath)) { logger.info("Cleaning up output directory " + outputPath.toUri().getPath()); if (!fs.delete(outputPath, true)) { throw new IOException("Clean up output directory " + outputPath.toUri().getPath() + " failed"); } } } }
/** * Get the writer {@link Destination.DestinationType}. * * @param branches number of forked branches * @param index branch index * @return writer {@link Destination.DestinationType} */ public Destination.DestinationType getDestinationType(int branches, int index) { return Destination.DestinationType.valueOf(this.taskState.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_DESTINATION_TYPE_KEY, branches, index), Destination.DestinationType.HDFS.name())); }
/** * Get the {@link Path} corresponding the the relative file path for a given {@link gobblin.writer.DataWriter}. * This method retrieves the value of {@link ConfigurationKeys#WRITER_FILE_PATH} from the given {@link State}. It also * constructs the default value of the {@link ConfigurationKeys#WRITER_FILE_PATH} if not is not specified in the given * {@link State}. * @param state is the {@link State} corresponding to a specific {@link gobblin.writer.DataWriter}. * @param numBranches is the total number of branches for the given {@link State}. * @param branchId is the id for the specific branch that the {{@link gobblin.writer.DataWriter} will write to. * @return a {@link Path} specifying the relative directory where the {@link gobblin.writer.DataWriter} will write to. */ public static Path getWriterFilePath(State state, int numBranches, int branchId) { if (state.contains( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PATH, numBranches, branchId))) { return new Path(state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_PATH, numBranches, branchId))); } switch (getWriterFilePathType(state)) { case TABLENAME: return WriterUtils.getTableNameWriterFilePath(state); default: return WriterUtils.getDefaultWriterFilePath(state, numBranches, branchId); } }
public static FileSystem getWriterFS(State state, int numBranches, int branchId) throws IOException { URI uri = URI.create(state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId), ConfigurationKeys.LOCAL_FS_URI)); Configuration hadoopConf = getFsConfiguration(state); if (state.getPropAsBoolean(ConfigurationKeys.SHOULD_FS_PROXY_AS_USER, ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER)) { // Initialize file system for a proxy user. String authMethod = state.getProp(ConfigurationKeys.FS_PROXY_AUTH_METHOD, ConfigurationKeys.DEFAULT_FS_PROXY_AUTH_METHOD); if (authMethod.equalsIgnoreCase(ConfigurationKeys.TOKEN_AUTH)) { return getWriterFsUsingToken(state, uri); } else if (authMethod.equalsIgnoreCase(ConfigurationKeys.KERBEROS_AUTH)) { return getWriterFsUsingKeytab(state, uri); } } // Initialize file system as the current user. return FileSystem.get(uri, hadoopConf); }
ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId), ConfigurationKeys.LOCAL_FS_URI); FileSystem fs = getFsWithProxy(state, writerFsUri, WriterUtils.getFsConfiguration(state));
public FileAwareInputStreamDataWriter(State state, int numBranches, int branchId, String writerAttemptId) throws IOException { super(state); if (numBranches > 1) { throw new IOException("Distcp can only operate with one branch."); } if (!(state instanceof WorkUnitState)) { throw new RuntimeException(String.format("Distcp requires a %s on construction.", WorkUnitState.class.getSimpleName())); } this.state = (WorkUnitState) state; this.taskBroker = this.state.getTaskBroker(); this.writerAttemptIdOptional = Optional.fromNullable(writerAttemptId); String uri = this.state.getProp( ForkOperatorUtils.getPropertyNameForBranch(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, numBranches, branchId), ConfigurationKeys.LOCAL_FS_URI); this.fs = FileSystem.get(URI.create(uri), WriterUtils.getFsConfiguration(state)); this.stagingDir = this.writerAttemptIdOptional.isPresent() ? WriterUtils .getWriterStagingDir(state, numBranches, branchId, this.writerAttemptIdOptional.get()) : WriterUtils.getWriterStagingDir(state, numBranches, branchId); this.outputDir = getOutputDir(state); this.copyableDatasetMetadata = CopyableDatasetMetadata.deserialize(state.getProp(CopySource.SERIALIZED_COPYABLE_DATASET)); this.recoveryHelper = new RecoveryHelper(this.fs, state); this.actualProcessedCopyableFile = Optional.absent(); this.copySpeedMeter = getMetricContext().meter(GOBBLIN_COPY_BYTES_COPIED_METER); this.bufferSize = state.getPropAsInt(CopyConfiguration.BUFFER_SIZE, StreamCopier.DEFAULT_BUFFER_SIZE); this.encryptionConfig = EncryptionConfigParser .getConfigForBranch(EncryptionConfigParser.EntityType.WRITER, this.state, numBranches, branchId); }