/**
 * True if both watermark (delta) columns and a watermark type are provided.
 *
 * @return true if a watermark exists
 */
private boolean isWatermarkExists() {
  return !Strings.isNullOrEmpty(this.state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY))
      && !Strings.isNullOrEmpty(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE));
}
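// Illustrative usage, not from the source: a minimal sketch of the state
// configuration that makes isWatermarkExists() return true. The column name
// and watermark type values below are hypothetical.
private static SourceState stateWithWatermark() {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, "last_modified_ts");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "timestamp");
  return state; // both properties are non-empty, so isWatermarkExists() is true
}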
@Override
public void init(SourceState state) {
  String regexPattern = state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);
  Preconditions.checkNotNull(regexPattern,
      "Must specify a regex pattern in " + PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);

  this.leadTime = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(state);
  this.pattern = Pattern.compile(regexPattern);
  this.helper = new HadoopFsHelper(state);
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
  this.schemaInSourceDir = state.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
      ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR);
  this.schemaFile = this.schemaInSourceDir
      ? state.getProp(ConfigurationKeys.SCHEMA_FILENAME, ConfigurationKeys.DEFAULT_SCHEMA_FILENAME)
      : "";
}
@VisibleForTesting
public void initialize(SourceState state) throws IOException {
  this.updateProvider = UpdateProviderFactory.create(state);
  this.metricContext = Instrumented.getMetricContext(state, HiveSource.class);
  this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE).build();
  this.avroSchemaManager = new AvroSchemaManager(getSourceFs(state), state);
  this.workunits = Lists.newArrayList();
  this.watermarker = GobblinConstructorUtils
      .invokeConstructor(HiveSourceWatermarkerFactory.class,
          state.getProp(HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, DEFAULT_HIVE_SOURCE_WATERMARKER_FACTORY_CLASS))
      .createFromState(state);

  EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_SETUP_EVENT);

  this.datasetFinder = GobblinConstructorUtils.invokeConstructor(HiveDatasetFinder.class,
      state.getProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, DEFAULT_HIVE_SOURCE_DATASET_FINDER_CLASS),
      getSourceFs(state), state.getProperties(), this.eventSubmitter);

  int maxLookBackDays = state.getPropAsInt(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY,
      DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS);
  this.maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis();
  this.ignoreDataPathIdentifierList = COMMA_BASED_SPLITTER.splitToList(
      state.getProp(HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER_KEY, DEFAULT_HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER));

  silenceHiveLoggers();
}
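// Illustrative only: a hedged sketch of the lookback setting read by
// initialize(...) above. The 7-day value is hypothetical; when the property is
// absent, DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS applies.
private static void configureLookback(SourceState state) {
  state.setProp(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, 7);
  // initialize(state) would then set maxLookBackTime to now minus 7 days
}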
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
  int numTasks = state.getPropAsInt(NUM_TASKS_KEY);
  String eventBusId = state.getProp(EventBusPublishingTaskFactory.EVENTBUS_ID_KEY);
  EventBus eventBus = TestingEventBuses.getEventBus(eventBusId);
  return new BasicWorkUnitStream.Builder(new WorkUnitIterator(eventBus, eventBusId, numTasks)).build();
}
/**
 * If full dump is true, the low watermark will be based on {@link ConfigurationKeys#SOURCE_QUERYBASED_START_VALUE};
 * otherwise it will be based on the previous watermark. Please refer to
 * {@link Partitioner#getLowWatermark(ExtractType, WatermarkType, long, int)}.
 *
 * @return true if this is a full dump
 */
public boolean isFullDump() {
  return Boolean.valueOf(this.state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY));
}
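// Illustrative only: requesting a full dump. Per the Javadoc above, this makes
// the low watermark start from SOURCE_QUERYBASED_START_VALUE instead of the
// previous watermark.
private static void requestFullDump(SourceState state) {
  state.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, "true");
  // isFullDump() now returns true
}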
@Override
public void init(SourceState state) {
  DateTimeZone.setDefault(DateTimeZone
      .forID(state.getProp(ConfigurationKeys.SOURCE_TIMEZONE, ConfigurationKeys.DEFAULT_SOURCE_TIMEZONE)));
  initDatePartition(state);
  this.sourcePartitionPrefix =
      state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PREFIX, StringUtils.EMPTY);
  this.sourcePartitionSuffix =
      state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_SUFFIX, StringUtils.EMPTY);
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
  this.leadTimeDuration = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(state);
  this.helper = new HadoopFsHelper(state);
  this.schemaInSourceDir = state.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
      ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR);
  this.schemaFile = this.schemaInSourceDir
      ? state.getProp(ConfigurationKeys.SCHEMA_FILENAME, ConfigurationKeys.DEFAULT_SCHEMA_FILENAME)
      : "";
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  int numTasks = state.getPropAsInt(NUM_TASKS_KEY);
  String eventBusId = state.getProp(EVENTBUS_ID_KEY);
  EventBus eventBus = TestingEventBuses.getEventBus(eventBusId);

  Map<String, SourceState> previousStates = state.getPreviousDatasetStatesByUrns();
  for (Map.Entry<String, SourceState> entry : previousStates.entrySet()) {
    JobState.DatasetState datasetState = (JobState.DatasetState) entry.getValue();
    for (TaskState taskState : datasetState.getTaskStates()) {
      if (taskState.contains(Task.PERSISTENT_STATE) && eventBus != null) {
        eventBus.post(new Event(PREVIOUS_STATE_EVENT, taskState.getPropAsInt(Task.PERSISTENT_STATE)));
      }
    }
  }

  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < numTasks; i++) {
    workUnits.add(createWorkUnit(i, eventBusId));
  }
  return workUnits;
}
/**
 * @return true if the watermark override is enabled
 */
public boolean isWatermarkOverride() {
  return Boolean.valueOf(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE));
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  String nameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  Extract extract1 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable1");
  Extract extract2 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable2");

  String sourceFileList = state.getProp(SOURCE_FILE_LIST_KEY);
  List<String> list = SPLITTER.splitToList(sourceFileList);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < list.size(); i++) {
    WorkUnit workUnit = WorkUnit.create(i % 2 == 0 ? extract1 : extract2);
    workUnit.setProp(SOURCE_FILE_KEY, list.get(i));
    workUnits.add(workUnit);
  }

  if (state.getPropAsBoolean("use.multiworkunit", false)) {
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    multiWorkUnit.addWorkUnits(workUnits);
    workUnits.clear();
    workUnits.add(multiWorkUnit);
  }

  return workUnits;
}
@VisibleForTesting
public void initBackfillHiveSource(SourceState state) {
  this.partitionsWhitelist = Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults()
      .split(state.getProp(BACKFILL_SOURCE_PARTITION_WHITELIST_KEY, StringUtils.EMPTY)));
}
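// Illustrative only: a hypothetical whitelist limiting a backfill to two
// partitions. The partition-name format shown is made up; the method above
// simply splits the configured value on commas.
private static void configureBackfillWhitelist(SourceState state) {
  state.setProp(BACKFILL_SOURCE_PARTITION_WHITELIST_KEY,
      "datepartition=2020-01-01-00,datepartition=2020-01-02-00");
  // initBackfillHiveSource(state) would then whitelist exactly these two entries
}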
/**
 * Initialize the logger.
 *
 * @param state Source state
 */
protected void initLogger(SourceState state) {
  StringBuilder sb = new StringBuilder();
  sb.append("[");
  sb.append(Strings.nullToEmpty(state.getProp(ConfigurationKeys.SOURCE_ENTITY)));
  sb.append("]");
  MDC.put("sourceInfo", sb.toString());
}
/**
 * Initialize the logger.
 *
 * @param state Source state
 */
private static void initLogger(SourceState state) {
  StringBuilder sb = new StringBuilder();
  sb.append("[");
  sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA)));
  sb.append("_");
  sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_ENTITY)));
  sb.append("]");
  MDC.put("sourceInfo", sb.toString());
}
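// Illustrative only: both initLogger variants above publish a "sourceInfo" MDC
// key, which a logging layout can surface. A log4j properties sketch (the
// appender name is hypothetical):
//   log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p %X{sourceInfo} %c - %m%n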
protected void addLineageSourceInfo(SourceState sourceState, SourceEntity entity, WorkUnit workUnit) {
  String host = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME);
  String port = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_PORT);
  String database = sourceState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
  String connectionUrl = "jdbc:mysql://" + host.trim() + ":" + port + "/" + database.trim();
  DatasetDescriptor source =
      new DatasetDescriptor(DatasetConstants.PLATFORM_MYSQL, database + "." + entity.getSourceEntityName());
  source.addMetadata(DatasetConstants.CONNECTION_URL, connectionUrl);
  if (lineageInfo.isPresent()) {
    lineageInfo.get().setSource(source, workUnit);
  }
}
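// Illustrative only: the connection properties addLineageSourceInfo(...) reads.
// Host, port, and schema values are hypothetical; with these settings the
// connection URL becomes jdbc:mysql://db.example.com:3306/sales.
private static void configureMysqlSource(SourceState state) {
  state.setProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME, "db.example.com");
  state.setProp(ConfigurationKeys.SOURCE_CONN_PORT, "3306");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA, "sales");
}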
/**
 * A topic can be configured to move to the latest offset in {@link #TOPICS_MOVE_TO_LATEST_OFFSET}.
 *
 * Needs to be synchronized as it is accessed by multiple threads.
 */
private synchronized boolean shouldMoveToLatestOffset(KafkaPartition partition, SourceState state) {
  if (!state.contains(TOPICS_MOVE_TO_LATEST_OFFSET)) {
    return false;
  }
  if (this.moveToLatestTopics.isEmpty()) {
    this.moveToLatestTopics.addAll(
        Splitter.on(',').trimResults().omitEmptyStrings().splitToList(state.getProp(TOPICS_MOVE_TO_LATEST_OFFSET)));
  }
  return this.moveToLatestTopics.contains(partition.getTopicName()) || this.moveToLatestTopics.contains(ALL_TOPICS);
}
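// Illustrative only: pinning two topics to the latest offset. The topic names
// are hypothetical; configuring the ALL_TOPICS token instead would apply the
// behavior to every topic.
private static void configureMoveToLatest(SourceState state) {
  state.setProp(TOPICS_MOVE_TO_LATEST_OFFSET, "PageViewEvent, ClickEvent");
  // shouldMoveToLatestOffset(...) now returns true for partitions of either
  // topic (the splitter above trims whitespace around each name)
}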
/**
 * Create a temporary job directory based on the job id or (if not available) a random UUID.
 */
private void initJobDir(SourceState state) throws IOException {
  String tmpBase = state.getProp(MRCompactor.COMPACTION_TMP_DEST_DIR, MRCompactor.DEFAULT_COMPACTION_TMP_DEST_DIR);
  String jobId;
  if (state instanceof JobState) {
    jobId = ((JobState) state).getJobId();
  } else {
    jobId = UUID.randomUUID().toString();
  }
  this.tmpJobDir = new Path(tmpBase, jobId);
  this.fs.mkdirs(this.tmpJobDir);
  state.setProp(MRCompactor.COMPACTION_JOB_DIR, tmpJobDir.toString());
  log.info("Job dir is created under {}", this.tmpJobDir);
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  if (!state.contains(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL)) {
    return workUnits;
  }

  // Create a single snapshot-type extract for all files
  Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "ExampleNamespace"), "ExampleTable");

  String filesToPull = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL);
  for (String file : Splitter.on(',').omitEmptyStrings().split(filesToPull)) {
    // Create one work unit for each file to pull
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(SOURCE_FILE_KEY, file);
    workUnits.add(workUnit);
  }
  return workUnits;
}
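// Illustrative only: driving the getWorkunits(...) method above. With two
// comma-separated paths (hypothetical), the source emits one work unit per
// file, both sharing the single snapshot extract.
private static SourceState stateWithFilesToPull() {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
      "/data/input/part-0.txt,/data/input/part-1.txt");
  return state; // getWorkunits(state) would return two work units
}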
public static KafkaWorkUnitPacker getInstance(AbstractSource<?, ?> source, SourceState state) {
  if (state.contains(KAFKA_WORKUNIT_PACKER_TYPE)) {
    String packerTypeStr = state.getProp(KAFKA_WORKUNIT_PACKER_TYPE);
    Optional<PackerType> packerType = Enums.getIfPresent(PackerType.class, packerTypeStr);
    if (packerType.isPresent()) {
      return getInstance(packerType.get(), source, state);
    }
    throw new IllegalArgumentException("WorkUnit packer type " + packerTypeStr + " not found");
  }
  return getInstance(DEFAULT_PACKER_TYPE, source, state);
}
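// Illustrative only: selecting a packer explicitly. The enum constant name
// shown is an assumption about PackerType; when the property is absent,
// DEFAULT_PACKER_TYPE is used instead.
private static void selectPacker(SourceState state) {
  state.setProp(KAFKA_WORKUNIT_PACKER_TYPE, "SINGLE_LEVEL");
  // getInstance(source, state) resolves the name via Enums.getIfPresent; an
  // unknown name triggers the IllegalArgumentException above
}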
private KafkaWorkUnitSizeEstimator getWorkUnitSizeEstimator() {
  if (this.state.contains(KAFKA_WORKUNIT_SIZE_ESTIMATOR_TYPE)) {
    String sizeEstimatorTypeString = this.state.getProp(KAFKA_WORKUNIT_SIZE_ESTIMATOR_TYPE);
    Optional<SizeEstimatorType> sizeEstimatorType =
        Enums.getIfPresent(SizeEstimatorType.class, sizeEstimatorTypeString);
    if (sizeEstimatorType.isPresent()) {
      return getWorkUnitSizeEstimator(sizeEstimatorType.get());
    }
    // Report the raw configured string, not the absent Optional
    throw new IllegalArgumentException("WorkUnit size estimator type " + sizeEstimatorTypeString + " not found");
  }
  return getWorkUnitSizeEstimator(DEFAULT_SIZE_ESTIMATOR_TYPE);
}
/**
 * Get the snapshot high watermark.
 *
 * @param watermarkType Watermark type
 * @return snapshot high watermark
 */
private long getSnapshotHighWatermark(WatermarkType watermarkType) {
  LOG.debug("Getting snapshot high water mark");
  if (isSimpleWatermark(watermarkType)) {
    return ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
  }
  String timeZone = this.state.getProp(ConfigurationKeys.SOURCE_TIMEZONE);
  return Long.parseLong(Utils.dateTimeToString(getCurrentTime(timeZone), WATERMARKTIMEFORMAT, timeZone));
}
/**
 * Get the global partition of the whole data set, which has the global low and high watermarks.
 *
 * @param previousWatermark previous watermark for computing the low watermark of the current run
 * @return a Partition instance
 */
public Partition getGlobalPartition(long previousWatermark) {
  ExtractType extractType =
      ExtractType.valueOf(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE).toUpperCase());
  WatermarkType watermarkType = WatermarkType.valueOf(
      state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE)
          .toUpperCase());

  WatermarkPredicate watermark = new WatermarkPredicate(null, watermarkType);
  int deltaForNextWatermark = watermark.getDeltaNumForNextWatermark();
  long lowWatermark = getLowWatermark(extractType, watermarkType, previousWatermark, deltaForNextWatermark);
  long highWatermark = getHighWatermark(extractType, watermarkType);
  return new Partition(lowWatermark, highWatermark, true, hasUserSpecifiedHighWatermark);
}
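// Illustrative only: the two properties getGlobalPartition(...) reads before
// computing watermarks. The values shown assume "SNAPSHOT" and "TIMESTAMP" are
// members of the ExtractType and WatermarkType enums; the method upper-cases
// whatever is configured.
private static void configurePartitioner(SourceState state) {
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "snapshot");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "timestamp");
}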