/**
 * True if both watermark (delta) columns and a watermark type are provided.
 *
 * @return true if a watermark exists
 */
private boolean isWatermarkExists() {
  return !Strings.isNullOrEmpty(this.state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY))
      && !Strings.isNullOrEmpty(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE));
}
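// Illustrative usage, not from the source: a minimal sketch of the state
// configuration that makes isWatermarkExists() return true. The column name
// and watermark type values below are hypothetical.
private static SourceState stateWithWatermark() {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY, "last_modified_ts");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "timestamp");
  return state; // both properties are non-empty, so isWatermarkExists() is true
}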
@Override
public void init(SourceState state) {
  String regexPattern = state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);
  Preconditions.checkNotNull(regexPattern,
      "Must specify a regex pattern in " + PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);

  this.leadTime = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(state);
  this.pattern = Pattern.compile(regexPattern);
  this.helper = new HadoopFsHelper(state);
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
  this.schemaInSourceDir = state.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
      ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR);
  this.schemaFile = this.schemaInSourceDir
      ? state.getProp(ConfigurationKeys.SCHEMA_FILENAME, ConfigurationKeys.DEFAULT_SCHEMA_FILENAME)
      : "";
}
@VisibleForTesting
public void initialize(SourceState state) throws IOException {
  this.updateProvider = UpdateProviderFactory.create(state);
  this.metricContext = Instrumented.getMetricContext(state, HiveSource.class);
  this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE).build();
  this.avroSchemaManager = new AvroSchemaManager(getSourceFs(state), state);
  this.workunits = Lists.newArrayList();
  this.watermarker = GobblinConstructorUtils
      .invokeConstructor(HiveSourceWatermarkerFactory.class,
          state.getProp(HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, DEFAULT_HIVE_SOURCE_WATERMARKER_FACTORY_CLASS))
      .createFromState(state);

  EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_SETUP_EVENT);

  this.datasetFinder = GobblinConstructorUtils.invokeConstructor(HiveDatasetFinder.class,
      state.getProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, DEFAULT_HIVE_SOURCE_DATASET_FINDER_CLASS),
      getSourceFs(state), state.getProperties(), this.eventSubmitter);

  int maxLookBackDays = state.getPropAsInt(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY,
      DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS);
  this.maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis();
  this.ignoreDataPathIdentifierList = COMMA_BASED_SPLITTER.splitToList(
      state.getProp(HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER_KEY, DEFAULT_HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER));

  silenceHiveLoggers();
}
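// Illustrative only: a hedged sketch of the lookback setting read by
// initialize(...) above. The 7-day value is hypothetical; when the property is
// absent, DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS applies.
private static void configureLookback(SourceState state) {
  state.setProp(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, 7);
  // initialize(state) would then set maxLookBackTime to now minus 7 days
}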
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
  int numTasks = state.getPropAsInt(NUM_TASKS_KEY);
  String eventBusId = state.getProp(EventBusPublishingTaskFactory.EVENTBUS_ID_KEY);
  EventBus eventBus = TestingEventBuses.getEventBus(eventBusId);
  return new BasicWorkUnitStream.Builder(new WorkUnitIterator(eventBus, eventBusId, numTasks)).build();
}
/**
 * If full dump is true, the low watermark will be based on {@link ConfigurationKeys#SOURCE_QUERYBASED_START_VALUE};
 * otherwise it will be based on the previous watermark. Please refer to
 * {@link Partitioner#getLowWatermark(ExtractType, WatermarkType, long, int)}.
 *
 * @return true if this is a full dump
 */
public boolean isFullDump() {
  return Boolean.valueOf(this.state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY));
}
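// Illustrative only: requesting a full dump. Per the Javadoc above, this makes
// the low watermark start from SOURCE_QUERYBASED_START_VALUE instead of the
// previous watermark.
private static void requestFullDump(SourceState state) {
  state.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, "true");
  // isFullDump() now returns true
}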
@Override
public void init(SourceState state) {
  DateTimeZone.setDefault(DateTimeZone
      .forID(state.getProp(ConfigurationKeys.SOURCE_TIMEZONE, ConfigurationKeys.DEFAULT_SOURCE_TIMEZONE)));
  initDatePartition(state);
  this.sourcePartitionPrefix =
      state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PREFIX, StringUtils.EMPTY);
  this.sourcePartitionSuffix =
      state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_SUFFIX, StringUtils.EMPTY);
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
  this.leadTimeDuration = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(state);
  this.helper = new HadoopFsHelper(state);
  this.schemaInSourceDir = state.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
      ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR);
  this.schemaFile = this.schemaInSourceDir
      ? state.getProp(ConfigurationKeys.SCHEMA_FILENAME, ConfigurationKeys.DEFAULT_SCHEMA_FILENAME)
      : "";
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  int numTasks = state.getPropAsInt(NUM_TASKS_KEY);
  String eventBusId = state.getProp(EVENTBUS_ID_KEY);
  EventBus eventBus = TestingEventBuses.getEventBus(eventBusId);

  Map<String, SourceState> previousStates = state.getPreviousDatasetStatesByUrns();
  for (Map.Entry<String, SourceState> entry : previousStates.entrySet()) {
    JobState.DatasetState datasetState = (JobState.DatasetState) entry.getValue();
    for (TaskState taskState : datasetState.getTaskStates()) {
      if (taskState.contains(Task.PERSISTENT_STATE) && eventBus != null) {
        eventBus.post(new Event(PREVIOUS_STATE_EVENT, taskState.getPropAsInt(Task.PERSISTENT_STATE)));
      }
    }
  }

  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < numTasks; i++) {
    workUnits.add(createWorkUnit(i, eventBusId));
  }
  return workUnits;
}
/**
 * @return true if the watermark override is enabled
 */
public boolean isWatermarkOverride() {
  return Boolean.valueOf(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE));
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  String nameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  Extract extract1 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable1");
  Extract extract2 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable2");

  String sourceFileList = state.getProp(SOURCE_FILE_LIST_KEY);
  List<String> list = SPLITTER.splitToList(sourceFileList);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < list.size(); i++) {
    WorkUnit workUnit = WorkUnit.create(i % 2 == 0 ? extract1 : extract2);
    workUnit.setProp(SOURCE_FILE_KEY, list.get(i));
    workUnits.add(workUnit);
  }

  if (state.getPropAsBoolean("use.multiworkunit", false)) {
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    multiWorkUnit.addWorkUnits(workUnits);
    workUnits.clear();
    workUnits.add(multiWorkUnit);
  }

  return workUnits;
}
@VisibleForTesting
public void initBackfillHiveSource(SourceState state) {
  this.partitionsWhitelist = Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults()
      .split(state.getProp(BACKFILL_SOURCE_PARTITION_WHITELIST_KEY, StringUtils.EMPTY)));
}
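// Illustrative only: a hypothetical whitelist limiting a backfill to two
// partitions. The partition-name format shown is made up; the method above
// simply splits the configured value on commas.
private static void configureBackfillWhitelist(SourceState state) {
  state.setProp(BACKFILL_SOURCE_PARTITION_WHITELIST_KEY,
      "datepartition=2020-01-01-00,datepartition=2020-01-02-00");
  // initBackfillHiveSource(state) would then whitelist exactly these two entries
}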
/**
 * Initialize the logger.
 *
 * @param state Source state
 */
protected void initLogger(SourceState state) {
  StringBuilder sb = new StringBuilder();
  sb.append("[");
  sb.append(Strings.nullToEmpty(state.getProp(ConfigurationKeys.SOURCE_ENTITY)));
  sb.append("]");
  MDC.put("sourceInfo", sb.toString());
}
/**
 * Initialize the logger.
 *
 * @param state Source state
 */
private static void initLogger(SourceState state) {
  StringBuilder sb = new StringBuilder();
  sb.append("[");
  sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA)));
  sb.append("_");
  sb.append(StringUtils.stripToEmpty(state.getProp(ConfigurationKeys.SOURCE_ENTITY)));
  sb.append("]");
  MDC.put("sourceInfo", sb.toString());
}
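// Illustrative only: both initLogger variants above publish a "sourceInfo" MDC
// key, which a logging layout can surface. A log4j properties sketch (the
// appender name is hypothetical):
//   log4j.appender.console.layout.ConversionPattern=%d{ISO8601} %-5p %X{sourceInfo} %c - %m%n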
protected void addLineageSourceInfo(SourceState sourceState, SourceEntity entity, WorkUnit workUnit) {
  String host = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME);
  String port = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_PORT);
  String database = sourceState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
  String connectionUrl = "jdbc:mysql://" + host.trim() + ":" + port + "/" + database.trim();
  DatasetDescriptor source =
      new DatasetDescriptor(DatasetConstants.PLATFORM_MYSQL, database + "." + entity.getSourceEntityName());
  source.addMetadata(DatasetConstants.CONNECTION_URL, connectionUrl);
  if (lineageInfo.isPresent()) {
    lineageInfo.get().setSource(source, workUnit);
  }
}
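// Illustrative only: the connection properties addLineageSourceInfo(...) reads.
// Host, port, and schema values are hypothetical; with these settings the
// connection URL becomes jdbc:mysql://db.example.com:3306/sales.
private static void configureMysqlSource(SourceState state) {
  state.setProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME, "db.example.com");
  state.setProp(ConfigurationKeys.SOURCE_CONN_PORT, "3306");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA, "sales");
}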
/**
 * A topic can be configured to move to the latest offset in {@link #TOPICS_MOVE_TO_LATEST_OFFSET}.
 *
 * Needs to be synchronized as it is accessed by multiple threads.
 */
private synchronized boolean shouldMoveToLatestOffset(KafkaPartition partition, SourceState state) {
  if (!state.contains(TOPICS_MOVE_TO_LATEST_OFFSET)) {
    return false;
  }
  if (this.moveToLatestTopics.isEmpty()) {
    this.moveToLatestTopics.addAll(
        Splitter.on(',').trimResults().omitEmptyStrings().splitToList(state.getProp(TOPICS_MOVE_TO_LATEST_OFFSET)));
  }
  return this.moveToLatestTopics.contains(partition.getTopicName()) || this.moveToLatestTopics.contains(ALL_TOPICS);
}
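// Illustrative only: pinning two topics to the latest offset. The topic names
// are hypothetical; configuring the ALL_TOPICS token instead would apply the
// behavior to every topic.
private static void configureMoveToLatest(SourceState state) {
  state.setProp(TOPICS_MOVE_TO_LATEST_OFFSET, "PageViewEvent, ClickEvent");
  // shouldMoveToLatestOffset(...) now returns true for partitions of either
  // topic (the splitter above trims whitespace around each name)
}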
/**
 * Create a temporary job directory based on the job id or (if not available) a random UUID.
 */
private void initJobDir(SourceState state) throws IOException {
  String tmpBase = state.getProp(MRCompactor.COMPACTION_TMP_DEST_DIR, MRCompactor.DEFAULT_COMPACTION_TMP_DEST_DIR);
  String jobId;
  if (state instanceof JobState) {
    jobId = ((JobState) state).getJobId();
  } else {
    jobId = UUID.randomUUID().toString();
  }
  this.tmpJobDir = new Path(tmpBase, jobId);
  this.fs.mkdirs(this.tmpJobDir);
  state.setProp(MRCompactor.COMPACTION_JOB_DIR, tmpJobDir.toString());
  log.info("Job dir is created under {}", this.tmpJobDir);
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  if (!state.contains(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL)) {
    return workUnits;
  }

  // Create a single snapshot-type extract for all files
  Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "ExampleNamespace"), "ExampleTable");

  String filesToPull = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL);
  for (String file : Splitter.on(',').omitEmptyStrings().split(filesToPull)) {
    // Create one work unit for each file to pull
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(SOURCE_FILE_KEY, file);
    workUnits.add(workUnit);
  }
  return workUnits;
}
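// Illustrative only: driving the getWorkunits(...) method above. With two
// comma-separated paths (hypothetical), the source emits one work unit per
// file, both sharing the single snapshot extract.
private static SourceState stateWithFilesToPull() {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
      "/data/input/part-0.txt,/data/input/part-1.txt");
  return state; // getWorkunits(state) would return two work units
}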
public static KafkaWorkUnitPacker getInstance(AbstractSource<?, ?> source, SourceState state) {
  if (state.contains(KAFKA_WORKUNIT_PACKER_TYPE)) {
    String packerTypeStr = state.getProp(KAFKA_WORKUNIT_PACKER_TYPE);
    Optional<PackerType> packerType = Enums.getIfPresent(PackerType.class, packerTypeStr);
    if (packerType.isPresent()) {
      return getInstance(packerType.get(), source, state);
    }
    throw new IllegalArgumentException("WorkUnit packer type " + packerTypeStr + " not found");
  }
  return getInstance(DEFAULT_PACKER_TYPE, source, state);
}
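// Illustrative only: selecting a packer explicitly. The enum constant name
// shown is an assumption about PackerType; when the property is absent,
// DEFAULT_PACKER_TYPE is used instead.
private static void selectPacker(SourceState state) {
  state.setProp(KAFKA_WORKUNIT_PACKER_TYPE, "SINGLE_LEVEL");
  // getInstance(source, state) resolves the name via Enums.getIfPresent; an
  // unknown name triggers the IllegalArgumentException above
}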
private KafkaWorkUnitSizeEstimator getWorkUnitSizeEstimator() {
  if (this.state.contains(KAFKA_WORKUNIT_SIZE_ESTIMATOR_TYPE)) {
    String sizeEstimatorTypeString = this.state.getProp(KAFKA_WORKUNIT_SIZE_ESTIMATOR_TYPE);
    Optional<SizeEstimatorType> sizeEstimatorType =
        Enums.getIfPresent(SizeEstimatorType.class, sizeEstimatorTypeString);
    if (sizeEstimatorType.isPresent()) {
      return getWorkUnitSizeEstimator(sizeEstimatorType.get());
    }
    // Report the raw configured string, not the absent Optional
    throw new IllegalArgumentException("WorkUnit size estimator type " + sizeEstimatorTypeString + " not found");
  }
  return getWorkUnitSizeEstimator(DEFAULT_SIZE_ESTIMATOR_TYPE);
}
/**
 * Get the snapshot high watermark.
 *
 * @param watermarkType Watermark type
 * @return snapshot high watermark
 */
private long getSnapshotHighWatermark(WatermarkType watermarkType) {
  LOG.debug("Getting snapshot high water mark");
  if (isSimpleWatermark(watermarkType)) {
    return ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
  }
  String timeZone = this.state.getProp(ConfigurationKeys.SOURCE_TIMEZONE);
  return Long.parseLong(Utils.dateTimeToString(getCurrentTime(timeZone), WATERMARKTIMEFORMAT, timeZone));
}
/**
 * Get the global partition of the whole data set, which has the global low and high watermarks.
 *
 * @param previousWatermark previous watermark for computing the low watermark of the current run
 * @return a Partition instance
 */
public Partition getGlobalPartition(long previousWatermark) {
  ExtractType extractType =
      ExtractType.valueOf(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE).toUpperCase());
  WatermarkType watermarkType = WatermarkType.valueOf(
      state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE)
          .toUpperCase());

  WatermarkPredicate watermark = new WatermarkPredicate(null, watermarkType);
  int deltaForNextWatermark = watermark.getDeltaNumForNextWatermark();
  long lowWatermark = getLowWatermark(extractType, watermarkType, previousWatermark, deltaForNextWatermark);
  long highWatermark = getHighWatermark(extractType, watermarkType);
  return new Partition(lowWatermark, highWatermark, true, hasUserSpecifiedHighWatermark);
}
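// Illustrative only: the two properties getGlobalPartition(...) reads before
// computing watermarks. The values shown assume "SNAPSHOT" and "TIMESTAMP" are
// members of the ExtractType and WatermarkType enums; the method upper-cases
// whatever is configured.
private static void configurePartitioner(SourceState state) {
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "snapshot");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "timestamp");
}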