@Override
public String getSchema() {
  return this.workUnitState.getProp(ConfigurationKeys.SOURCE_SCHEMA);
}
/**
 * Builds a {@link State} containing all properties added with {@link #addFinalConstructState}
 * to this {@link gobblin.configuration.WorkUnitState}. All such properties will be stripped of
 * {@link #FINAL_CONSTRUCT_STATE_PREFIX} but not of any infixes.
 *
 * <p>
 * For example, if state={sample.property: sampleValue}
 * then
 * <pre>
 * {@code
 *   this.addFinalConstructState("infix", state);
 *   this.getFinalConstructStates();
 * }
 * </pre>
 * will return state={infix.sample.property: sampleValue}
 * </p>
 *
 * @return {@link State} containing all properties added with {@link #addFinalConstructState}.
 */
public State getFinalConstructStates() {
  State constructState = new State();
  for (String property : getPropertyNames()) {
    if (property.startsWith(FINAL_CONSTRUCT_STATE_PREFIX)) {
      constructState.setProp(property.substring(FINAL_CONSTRUCT_STATE_PREFIX.length()), getProp(property));
    }
  }
  return constructState;
}
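// Illustrative usage sketch (not from the original source): shows the round trip the
// javadoc above describes. The helper name and the use of the public no-arg WorkUnitState
// constructor (intended for deserialization) are assumptions made for this example.
private static State finalConstructStateExample() {
  WorkUnitState workUnitState = new WorkUnitState();

  State state = new State();
  state.setProp("sample.property", "sampleValue");
  // Stored internally under FINAL_CONSTRUCT_STATE_PREFIX + "infix." + key
  workUnitState.addFinalConstructState("infix", state);

  // The prefix is stripped but the "infix" segment is kept:
  // the returned State contains {infix.sample.property: sampleValue}
  return workUnitState.getFinalConstructStates();
}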
public HiveMetadataForCompactionExtractor(WorkUnitState state, FileSystem fs)
    throws IOException, TException, HiveException {
  super(state);

  if (state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    log.info("Ignoring Watermark workunit for {}", state.getProp(ConfigurationKeys.DATASET_URN_KEY));
    return;
  }

  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    Table table = client.get().getTable(this.dbName, this.tableName);

    String primaryKeyString = table.getParameters().get(state.getProp(COMPACTION_PRIMARY_KEY));
    List<String> primaryKeyList = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(primaryKeyString);

    String deltaString = table.getParameters().get(state.getProp(COMPACTION_DELTA));
    List<String> deltaList = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(deltaString);

    Path dataFilesPath = new Path(table.getSd().getLocation());
    compactionEntity = new MRCompactionEntity(primaryKeyList, deltaList, dataFilesPath, state.getProperties());
  }
}
public SimpleJsonExtractor(WorkUnitState workUnitState) throws FileSystemException {
  this.workUnitState = workUnitState;

  // Resolve the file to pull
  if (workUnitState.getPropAsBoolean(ConfigurationKeys.SOURCE_CONN_USE_AUTHENTICATION, false)) {
    // Add authentication credentials if authentication is needed
    UserAuthenticator auth =
        new StaticUserAuthenticator(workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_DOMAIN, ""),
            workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USERNAME),
            PasswordManager.getInstance(workUnitState)
                .readPassword(workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_PASSWORD)));
    FileSystemOptions opts = new FileSystemOptions();
    DefaultFileSystemConfigBuilder.getInstance().setUserAuthenticator(opts, auth);
    this.fileObject = VFS.getManager().resolveFile(workUnitState.getProp(SOURCE_FILE_KEY), opts);
  } else {
    this.fileObject = VFS.getManager().resolveFile(workUnitState.getProp(SOURCE_FILE_KEY));
  }

  // Open the file for reading
  LOGGER.info("Opening file " + this.fileObject.getURL().toString());
  this.bufferedReader = this.closer.register(new BufferedReader(
      new InputStreamReader(this.fileObject.getContent().getInputStream(),
          ConfigurationKeys.DEFAULT_CHARSET_ENCODING)));
}
/**
 * @deprecated Use {@link #getProp(String)}
 */
@Deprecated
@Override
protected String getProperty(String key) {
  return getProp(key);
}
public HiveConvertExtractor(WorkUnitState state, FileSystem fs) throws IOException, TException, HiveException {
  super(state);

  if (state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    log.info("Ignoring Watermark workunit for {}", state.getProp(ConfigurationKeys.DATASET_URN_KEY));
    return;
  }

  if (!(this.hiveDataset instanceof ConvertibleHiveDataset)) {
    throw new IllegalStateException("HiveConvertExtractor is only compatible with ConvertibleHiveDataset");
  }

  ConvertibleHiveDataset convertibleHiveDataset = (ConvertibleHiveDataset) this.hiveDataset;

  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    Table table = client.get().getTable(this.dbName, this.tableName);
    SchemaAwareHiveTable schemaAwareHiveTable = new SchemaAwareHiveTable(table,
        AvroSchemaManager.getSchemaFromUrl(this.hiveWorkUnit.getTableSchemaUrl(), fs));

    SchemaAwareHivePartition schemaAwareHivePartition = null;
    if (this.hiveWorkUnit.getPartitionName().isPresent() && this.hiveWorkUnit.getPartitionSchemaUrl().isPresent()) {
      Partition partition =
          client.get().getPartition(this.dbName, this.tableName, this.hiveWorkUnit.getPartitionName().get());
      schemaAwareHivePartition = new SchemaAwareHivePartition(table, partition,
          AvroSchemaManager.getSchemaFromUrl(this.hiveWorkUnit.getPartitionSchemaUrl().get(), fs));
    }

    QueryBasedHiveConversionEntity entity = new QueryBasedHiveConversionEntity(convertibleHiveDataset,
        schemaAwareHiveTable, Optional.fromNullable(schemaAwareHivePartition));
    this.conversionEntities.add(entity);
  }
}
/**
 * @deprecated Use {@link #getProp(String, String)}
 */
@Deprecated
@Override
protected String getProperty(String key, String def) {
  return getProp(key, def);
}
/**
 * Get the actual high {@link Watermark} as a {@link JsonElement}.
 *
 * @return a {@link JsonElement} representing the actual high {@link Watermark},
 *         or {@code null} if the actual high {@link Watermark} is not set.
 */
public JsonElement getActualHighWatermark() {
  if (!contains(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY)) {
    return null;
  }
  return JSON_PARSER.parse(getProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY));
}
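// Minimal caller sketch (an assumption, not from the original file): since
// getActualHighWatermark() returns null when no actual high watermark was ever set,
// callers should branch on the null case before using the JsonElement.
private static String describeActualHighWatermark(WorkUnitState workUnitState) {
  JsonElement actualHighWatermark = workUnitState.getActualHighWatermark();
  if (actualHighWatermark == null) {
    return "no actual high watermark set";
  }
  return actualHighWatermark.toString();
}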
public TaskState(WorkUnitState workUnitState) {
  // Since getWorkunit() returns an immutable WorkUnit object,
  // the WorkUnit object in this object is also immutable.
  super(workUnitState.getWorkunit(), workUnitState.getJobState(), workUnitState.getTaskBrokerNullable());
  addAll(workUnitState);
  this.jobId = workUnitState.getProp(ConfigurationKeys.JOB_ID_KEY);
  this.taskId = workUnitState.getProp(ConfigurationKeys.TASK_ID_KEY);
  this.taskKey = workUnitState.getProp(ConfigurationKeys.TASK_KEY_KEY, "unknown_task_key");
  this.taskAttemptId = Optional.fromNullable(workUnitState.getProp(ConfigurationKeys.TASK_ATTEMPT_ID_KEY));
  this.setId(this.taskId);
}
/**
 * Get a {@link Map} from dataset URNs (as specified by {@link ConfigurationKeys#DATASET_URN_KEY})
 * to the {@link WorkUnitState}s with those dataset URNs.
 *
 * <p>
 * {@link WorkUnitState}s that do not have {@link ConfigurationKeys#DATASET_URN_KEY} set will be added
 * to the dataset state belonging to {@link ConfigurationKeys#DEFAULT_DATASET_URN}.
 * </p>
 *
 * @return a {@link Map} from dataset URNs to the {@link WorkUnitState}s with those dataset URNs
 */
public Map<String, Iterable<WorkUnitState>> getPreviousWorkUnitStatesByDatasetUrns() {
  Map<String, Iterable<WorkUnitState>> previousWorkUnitStatesByDatasetUrns = Maps.newHashMap();

  for (WorkUnitState workUnitState : this.previousWorkUnitStates) {
    String datasetUrn =
        workUnitState.getProp(ConfigurationKeys.DATASET_URN_KEY, ConfigurationKeys.DEFAULT_DATASET_URN);
    if (!previousWorkUnitStatesByDatasetUrns.containsKey(datasetUrn)) {
      previousWorkUnitStatesByDatasetUrns.put(datasetUrn, Lists.<WorkUnitState> newArrayList());
    }
    ((List<WorkUnitState>) previousWorkUnitStatesByDatasetUrns.get(datasetUrn)).add(workUnitState);
  }

  return ImmutableMap.copyOf(previousWorkUnitStatesByDatasetUrns);
}
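// Hypothetical caller sketch: iterates the per-dataset grouping returned above, e.g. to
// inspect the previous run's states one dataset URN at a time. Assumes sourceState is a
// reference to the enclosing SourceState instance.
private static void logPreviousStatesByDataset(SourceState sourceState) {
  for (Map.Entry<String, Iterable<WorkUnitState>> entry
      : sourceState.getPreviousWorkUnitStatesByDatasetUrns().entrySet()) {
    for (WorkUnitState workUnitState : entry.getValue()) {
      System.out.println(entry.getKey() + " -> " + workUnitState.getWorkingState());
    }
  }
}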
@Override
public Extractor getExtractor(WorkUnitState state) throws IOException {
  try {
    return classAliasResolver
        .resolveClass(state.getProp(HIVE_SOURCE_EXTRACTOR_TYPE, DEFAULT_HIVE_SOURCE_EXTRACTOR_TYPE))
        .newInstance().createExtractor(state, getSourceFs(state));
  } catch (Exception e) {
    throw new IOException(e);
  }
}
/**
 * Get the Hive view registration whitelist / blacklist from the workunit state.
 *
 * @param workUnit workunit possibly containing the view whitelist / blacklist properties
 * @return Optional {@link WhitelistBlacklist} if the workunit contains the properties
 */
@VisibleForTesting
public static Optional<WhitelistBlacklist> getViewWhiteBackListFromWorkUnit(WorkUnitState workUnit) {
  Optional<WhitelistBlacklist> optionalViewWhiteBlacklist = Optional.absent();

  if (workUnit == null) {
    return optionalViewWhiteBlacklist;
  }
  if (workUnit.contains(HIVE_CONVERSION_VIEW_REGISTRATION_WHITELIST)
      || workUnit.contains(HIVE_CONVERSION_VIEW_REGISTRATION_BLACKLIST)) {
    String viewWhiteList = workUnit.getProp(HIVE_CONVERSION_VIEW_REGISTRATION_WHITELIST, StringUtils.EMPTY);
    String viewBlackList = workUnit.getProp(HIVE_CONVERSION_VIEW_REGISTRATION_BLACKLIST, StringUtils.EMPTY);
    try {
      optionalViewWhiteBlacklist = Optional.of(new WhitelistBlacklist(viewWhiteList, viewBlackList));
    } catch (IOException e) {
      throw Throwables.propagate(e);
    }
  }
  return optionalViewWhiteBlacklist;
}
private long createLowWatermarkForBootstrap(WorkUnitState state) throws IOException {
  String bootstrapPeriodString = state.getProp(BOOTSTRAP_PERIOD, DEFAULT_BOOTSTRAP_PERIOD);
  Period period = Period.parse(bootstrapPeriodString);
  DateTime startTime = DateTime.now().minus(period);

  try {
    Queue<JsonElement> firstRevision = retrievePageRevisions(ImmutableMap.<String, String>builder()
        .putAll(this.baseQuery)
        .put("rvprop", "ids")
        .put("titles", this.requestedTitle)
        .put("rvlimit", "1")
        .put("rvstart", WIKIPEDIA_TIMESTAMP_FORMAT.print(startTime))
        .put("rvdir", "newer")
        .build());
    if (firstRevision.isEmpty()) {
      throw new IOException("Could not retrieve oldest revision, returned empty revisions list.");
    }
    return parseRevision(firstRevision.poll());
  } catch (URISyntaxException use) {
    throw new IOException(use);
  }
}
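// Small Joda-Time sketch (illustrative helper, not from the original source): because the
// bootstrap period above is parsed with Period.parse, the property value is expected to be
// an ISO-8601 period string, e.g. "P10D" for "start ten days before now".
private static DateTime bootstrapStartFor(String isoPeriod) {
  // e.g. isoPeriod = "P10D" yields a start time ten days before now
  return DateTime.now().minus(Period.parse(isoPeriod));
}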
/**
 * Get the current runtime state of the {@link WorkUnit}.
 *
 * @return {@link WorkingState} of the {@link WorkUnit}
 */
public WorkingState getWorkingState() {
  return WorkingState
      .valueOf(getProp(ConfigurationKeys.WORK_UNIT_WORKING_STATE_KEY, WorkingState.PENDING.toString()));
}
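// Minimal sketch (assumes the public no-arg WorkUnitState constructor intended for
// deserialization): a state with no WORK_UNIT_WORKING_STATE_KEY set reports PENDING,
// per the default in getWorkingState() above.
private static boolean isPendingByDefault() {
  WorkUnitState workUnitState = new WorkUnitState();
  return workUnitState.getWorkingState() == WorkUnitState.WorkingState.PENDING;
}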
@Override
public void publishData(Collection<? extends WorkUnitState> states) throws IOException {
  super.publishData(states);

  for (WorkUnitState state : states) {
    if (state.getWorkingState() == WorkingState.COMMITTED) {
      try {
        deleteFilesOnSource(state);
      } catch (Throwable t) {
        log.warn(String.format("Failed to delete one or more files on source in %s",
            state.getProp(CopySource.SERIALIZED_COPYABLE_FILE)), t);
      }
    } else {
      log.info(String.format("Not deleting files %s on source fileSystem as the workunit state is %s.",
          state.getProp(CopySource.SERIALIZED_COPYABLE_FILE), state.getWorkingState()));
    }
  }
}
@VisibleForTesting
public void preservePartitionParams(Collection<? extends WorkUnitState> states) {
  for (WorkUnitState wus : states) {
    if (wus.getWorkingState() != WorkingState.COMMITTED) {
      continue;
    }
    if (!wus.contains(COMPLETE_SOURCE_PARTITION_NAME)) {
      continue;
    }
    if (!wus.contains(COMPLETE_DEST_PARTITION_NAME)) {
      continue;
    }
    if (!(wus.contains(PARTITION_PARAMETERS_WHITELIST) || wus.contains(PARTITION_PARAMETERS_BLACKLIST))) {
      continue;
    }

    List<String> whitelist = COMMA_SPLITTER.splitToList(wus.getProp(PARTITION_PARAMETERS_WHITELIST, StringUtils.EMPTY));
    List<String> blacklist = COMMA_SPLITTER.splitToList(wus.getProp(PARTITION_PARAMETERS_BLACKLIST, StringUtils.EMPTY));
    String completeSourcePartitionName = wus.getProp(COMPLETE_SOURCE_PARTITION_NAME);
    String completeDestPartitionName = wus.getProp(COMPLETE_DEST_PARTITION_NAME);

    if (!copyPartitionParams(completeSourcePartitionName, completeDestPartitionName, whitelist, blacklist)) {
      log.warn("Unable to copy partition parameters from " + completeSourcePartitionName + " to "
          + completeDestPartitionName);
    }
  }
}
/**
 * Read a property, checking in order: the {@link WorkUnit}, then the {@link WorkUnitState}
 * itself, then the {@link JobState}; the first non-blank value wins.
 */
private String readProp(String key, WorkUnitState workUnitState) {
  String value = workUnitState.getWorkunit().getProp(key);
  if (StringUtils.isBlank(value)) {
    value = workUnitState.getProp(key);
  }
  if (StringUtils.isBlank(value)) {
    value = workUnitState.getJobState().getProp(key);
  }
  return value;
}
/**
 * Create a {@link Multimap} that maps a {@link CopyableDataset} to all {@link WorkUnitState}s that belong to this
 * {@link CopyableDataset}. This mapping is used to set the {@link WorkUnitState.WorkingState} of all
 * {@link WorkUnitState}s to {@link WorkUnitState.WorkingState#COMMITTED} after a {@link CopyableDataset} is
 * successfully published.
 */
private static Multimap<CopyEntity.DatasetAndPartition, WorkUnitState> groupByFileSet(
    Collection<? extends WorkUnitState> states) {
  Multimap<CopyEntity.DatasetAndPartition, WorkUnitState> datasetRoots = ArrayListMultimap.create();

  for (WorkUnitState workUnitState : states) {
    CopyEntity file = CopySource.deserializeCopyEntity(workUnitState);
    CopyEntity.DatasetAndPartition datasetAndPartition = file.getDatasetAndPartition(
        CopyableDatasetMetadata.deserialize(workUnitState.getProp(CopySource.SERIALIZED_COPYABLE_DATASET)));
    datasetRoots.put(datasetAndPartition, workUnitState);
  }
  return datasetRoots;
}
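// Hypothetical publisher sketch: illustrates the intent stated in the javadoc above by
// marking every WorkUnitState grouped under a successfully published dataset as COMMITTED.
// Assumes it lives in the same class, so it can call the private groupByFileSet.
private static void markFileSetsCommitted(Collection<? extends WorkUnitState> states) {
  Multimap<CopyEntity.DatasetAndPartition, WorkUnitState> datasetRoots = groupByFileSet(states);
  for (CopyEntity.DatasetAndPartition datasetAndPartition : datasetRoots.keySet()) {
    // ... publish the dataset's file set here ...
    for (WorkUnitState workUnitState : datasetRoots.get(datasetAndPartition)) {
      workUnitState.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
    }
  }
}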
@Override
public Converter<String, String, FileAwareInputStream, FileAwareInputStream> init(WorkUnitState workUnit) {
  Map<String, Object> config =
      EncryptionConfigParser.getConfigForBranch(EncryptionConfigParser.EntityType.CONVERTER, workUnit);
  if (config == null) {
    // Backwards compatibility check: if no config was passed in via the standard config, revert back to GPG
    // with the passphrase in DECRYPTION_PASSPHRASE_KEY.
    log.info("Assuming GPG decryption since no other config parameters are set");
    config = Maps.newHashMap();
    config.put(EncryptionConfigParser.ENCRYPTION_ALGORITHM_KEY, DEFAULT_ALGORITHM);

    Preconditions.checkArgument(workUnit.contains(DECRYPTION_PASSPHRASE_KEY),
        "Passphrase is required while using DecryptConverter. Please specify " + DECRYPTION_PASSPHRASE_KEY);
    String passphrase =
        PasswordManager.getInstance(workUnit).readPassword(workUnit.getProp(DECRYPTION_PASSPHRASE_KEY));
    config.put(EncryptionConfigParser.ENCRYPTION_KEYSTORE_PASSWORD_KEY, passphrase);
  }

  decryptor = EncryptionFactory.buildStreamCryptoProvider(config);
  return super.init(workUnit);
}
/**
 * Submit an SLA event when a {@link gobblin.data.management.copy.CopyableFile} is published. The
 * <code>workUnitState</code> passed should have the required {@link SlaEventKeys} set.
 *
 * @see SlaEventSubmitter#submit()
 *
 * @param eventSubmitter the {@link EventSubmitter} used to submit the event
 * @param cf the published {@link CopyableFile}
 * @param workUnitState the {@link WorkUnitState} carrying the {@link SlaEventKeys} properties
 */
static void submitSuccessfulFilePublish(EventSubmitter eventSubmitter, CopyableFile cf, WorkUnitState workUnitState) {
  String datasetUrn = workUnitState.getProp(SlaEventKeys.DATASET_URN_KEY);
  String partition = workUnitState.getProp(SlaEventKeys.PARTITION_KEY);
  String completenessPercentage = workUnitState.getProp(SlaEventKeys.COMPLETENESS_PERCENTAGE_KEY);
  String recordCount = workUnitState.getProp(SlaEventKeys.RECORD_COUNT_KEY);
  String previousPublishTimestamp = workUnitState.getProp(SlaEventKeys.PREVIOUS_PUBLISH_TS_IN_MILLI_SECS_KEY);
  String dedupeStatus = workUnitState.getProp(SlaEventKeys.DEDUPE_STATUS_KEY);

  SlaEventSubmitter.builder().eventSubmitter(eventSubmitter).eventName(FILE_PUBLISHED_EVENT_NAME)
      .datasetUrn(datasetUrn).partition(partition).originTimestamp(Long.toString(cf.getOriginTimestamp()))
      .upstreamTimestamp(Long.toString(cf.getUpstreamTimestamp())).completenessPercentage(completenessPercentage)
      .recordCount(recordCount).previousPublishTimestamp(previousPublishTimestamp).dedupeStatus(dedupeStatus)
      .additionalMetadata(TARGET_PATH, cf.getDestination().toString())
      .additionalMetadata(SOURCE_PATH, cf.getOrigin().getPath().toString())
      .additionalMetadata(SIZE_IN_BYTES, Long.toString(cf.getOrigin().getLen())).build().submit();
}