/**
 * A topic can be configured to move to the latest offset in {@link #TOPICS_MOVE_TO_LATEST_OFFSET}.
 *
 * Needs to be synchronized as it is accessed by multiple threads.
 */
private synchronized boolean shouldMoveToLatestOffset(KafkaPartition partition, SourceState state) {
  if (!state.contains(TOPICS_MOVE_TO_LATEST_OFFSET)) {
    return false;
  }
  if (this.moveToLatestTopics.isEmpty()) {
    this.moveToLatestTopics.addAll(
        Splitter.on(',').trimResults().omitEmptyStrings()
            .splitToList(state.getProp(TOPICS_MOVE_TO_LATEST_OFFSET)));
  }
  return this.moveToLatestTopics.contains(partition.getTopicName())
      || this.moveToLatestTopics.contains(ALL_TOPICS);
}
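A minimal usage sketch (not from the original class; topic names are hypothetical) of opting topics into the move-to-latest behavior checked above. It assumes the same class scope, so TOPICS_MOVE_TO_LATEST_OFFSET and ALL_TOPICS refer to the constants used in the method.

// Hypothetical job setup: move two topics to the latest offset.
SourceState state = new SourceState();
state.setProp(TOPICS_MOVE_TO_LATEST_OFFSET, "clickstream, page_views");
// Or opt in every topic via the wildcard token the method compares against:
// state.setProp(TOPICS_MOVE_TO_LATEST_OFFSET, ALL_TOPICS);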
private static SourceState getTestState(String dbName) {
  SourceState testState = new SourceState();
  testState.setProp("hive.dataset.database", dbName);
  testState.setProp("hive.dataset.table.pattern", "*");
  testState.setProp(ConfigurationKeys.JOB_ID_KEY, "testJobId");
  return testState;
}
/**
 * Gobblin calls the {@link Source#getWorkunits(SourceState)} method after creating a
 * {@link Source} object with a blank constructor, so any custom initialization of the
 * object needs to be done here.
 */
protected void init(SourceState state) {
  retriever.init(state);
  try {
    initFileSystemHelper(state);
  } catch (FileBasedHelperException e) {
    Throwables.propagate(e);
  }

  AvroFsHelper fsHelper = (AvroFsHelper) this.fsHelper;
  this.fs = fsHelper.getFileSystem();
  this.sourceState = state;

  this.lowWaterMark = getLowWaterMark(state.getPreviousWorkUnitStates(),
      state.getProp(DATE_PARTITIONED_SOURCE_MIN_WATERMARK_VALUE,
          String.valueOf(DEFAULT_DATE_PARTITIONED_SOURCE_MIN_WATERMARK_VALUE)));

  this.maxFilesPerJob = state.getPropAsInt(DATE_PARTITIONED_SOURCE_MAX_FILES_PER_JOB,
      DEFAULT_DATE_PARTITIONED_SOURCE_MAX_FILES_PER_JOB);
  this.maxWorkUnitsPerJob = state.getPropAsInt(DATE_PARTITIONED_SOURCE_MAX_WORKUNITS_PER_JOB,
      DEFAULT_DATE_PARTITIONED_SOURCE_MAX_WORKUNITS_PER_JOB);

  this.tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());
  this.fileCount = 0;
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
}
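A configuration sketch with hypothetical values for the knobs read by init() above; the DATE_PARTITIONED_SOURCE_* constants are the same ones referenced in the method, assumed accessible in scope.

// Hypothetical job setup for a date-partitioned source.
SourceState state = new SourceState();
state.setProp(DATE_PARTITIONED_SOURCE_MIN_WATERMARK_VALUE, "20240101000000"); // hypothetical; format follows the source's watermark encoding
state.setProp(DATE_PARTITIONED_SOURCE_MAX_FILES_PER_JOB, 500);                // cap files pulled per run
state.setProp(DATE_PARTITIONED_SOURCE_MAX_WORKUNITS_PER_JOB, 50);             // cap work units per run
state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "snapshot_only");     // upper-cased into TableType
state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, "/data/tracking/events");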
private static SourceState getCombinedState(SourceState state, State tableSpecificState) {
  if (tableSpecificState == null) {
    return state;
  }
  SourceState combinedState =
      new SourceState(state, state.getPreviousDatasetStatesByUrns(), state.getPreviousWorkUnitStates());
  combinedState.addAll(tableSpecificState);
  return combinedState;
}
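A behavior sketch (hypothetical keys and values, assuming the same class scope since the helper is private): per-table properties shadow the job-wide ones because the table-specific state is copied in last via addAll, overwriting duplicate keys.

SourceState jobState = new SourceState();
jobState.setProp("writer.partition.columns", "ds");        // job-wide default

State tableState = new State();
tableState.setProp("writer.partition.columns", "ds,hour"); // per-table override

SourceState combined = getCombinedState(jobState, tableState);
// combined.getProp("writer.partition.columns") now returns "ds,hour".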
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  if (!state.contains(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY)) {
    state.setProp(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY,
        KafkaGsonDeserializer.class.getName());
  }
  return super.getWorkunits(state);
}
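Since the default is applied only when the key is absent, a job can still supply its own value deserializer. A sketch with a hypothetical deserializer class name:

SourceState state = new SourceState();
// This pre-set value wins; the KafkaGsonDeserializer default above is then skipped.
state.setProp(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY,
    "com.example.kafka.MyAvroDeserializer");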
@Override
public void init(SourceState state) {
  String regexPattern = state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);
  Preconditions.checkNotNull(regexPattern,
      "Must specify a regex pattern in " + PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);

  this.leadTime = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(state);
  this.pattern = Pattern.compile(regexPattern);
  this.helper = new HadoopFsHelper(state);
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));

  this.schemaInSourceDir = state.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
      ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR);
  this.schemaFile = this.schemaInSourceDir
      ? state.getProp(ConfigurationKeys.SCHEMA_FILENAME, ConfigurationKeys.DEFAULT_SCHEMA_FILENAME)
      : "";
}
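A configuration sketch (hypothetical path and pattern) for the regex-driven init() above; the capture group is assumed to isolate the date portion the retriever parses.

SourceState state = new SourceState();
state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, "/data/src");
// Matches partition directories such as /data/src/2024-01-15/...
state.setProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN, "(\\d{4}-\\d{2}-\\d{2})");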
// Excerpt from a distcp-style copy source's getWorkunits(); unrelated context elided.
this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());

state.setProp(SlaEventKeys.SOURCE_URI, sourceFs.getUri());
state.setProp(SlaEventKeys.DESTINATION_URI, targetFs.getUri());

// Bin-packing knobs: each bin holds at most maxSizePerBin bytes, and the weight floor
// ensures a bin never holds more than maxWorkUnitsPerMultiWorkUnit work units.
long maxSizePerBin = state.getPropAsLong(MAX_SIZE_MULTI_WORKUNITS, 0);
long maxWorkUnitsPerMultiWorkUnit = state.getPropAsLong(MAX_WORK_UNITS_PER_BIN, 50);
final long minWorkUnitWeight = Math.max(1, maxSizePerBin / maxWorkUnitsPerMultiWorkUnit);
final Optional<CopyableFileWatermarkGenerator> watermarkGenerator =
    CopyableFileWatermarkHelper.getCopyableFileWatermarkGenerator(state);
int maxThreads = state.getPropAsInt(MAX_CONCURRENT_LISTING_SERVICES, DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);

final CopyConfiguration copyConfiguration = CopyConfiguration.builder(targetFs, state.getProperties()).build();

// The assignment target was elided in the original excerpt; a DatasetUtils receiver
// is assumed here to keep the fragment parseable.
DatasetsFinder<CopyableDatasetBase> datasetFinder = DatasetUtils
    .instantiateDatasetFinder(state.getProperties(), sourceFs, DEFAULT_DATASET_PROFILE_CLASS_KEY,
        this.eventSubmitter, state);

String filesetWuGeneratorAlias = state.getProp(ConfigurationKeys.COPY_SOURCE_FILESET_WU_GENERATOR_CLASS,
    FileSetWorkUnitGenerator.class.getName());

Iterator<Callable<Void>> callableIterator =
    Iterators.transform(prioritizedFileSets, new Function<FileSet<CopyEntity>, Callable<Void>>() {
      // ... (per-fileset work-unit generation elided)
    });

if (state.contains(SIMULATE) && state.getPropAsBoolean(SIMULATE)) {
  log.info("Simulate mode enabled. Will not execute the copy.");
  for (Map.Entry<FileSet<CopyEntity>, Collection<WorkUnit>> entry : workUnitsMap.asMap().entrySet()) {
    // ... (logging of simulated work units elided)
  }
}
// Excerpt: selecting previous work units to retry, based on the configured retry policy.
if (Iterables.isEmpty(state.getPreviousWorkUnitStates())) {
  return ImmutableList.of();
}

// The declaration was elided in the original excerpt and is assumed here.
WorkUnitRetryPolicy workUnitRetryPolicy;
if (state.contains(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY)) {
  workUnitRetryPolicy = WorkUnitRetryPolicy.forName(state.getProp(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY));
} else {
  boolean retryFailedWorkUnits = state.getPropAsBoolean(ConfigurationKeys.WORK_UNIT_RETRY_ENABLED_KEY, true);
  workUnitRetryPolicy = retryFailedWorkUnits ? WorkUnitRetryPolicy.ALWAYS : WorkUnitRetryPolicy.NEVER;
}

// Collect previous work units that did not commit successfully.
for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
  if (workUnitState.getWorkingState() != WorkUnitState.WorkingState.COMMITTED) {
    if (state.getPropAsBoolean(ConfigurationKeys.OVERWRITE_CONFIGS_IN_STATESTORE,
        ConfigurationKeys.DEFAULT_OVERWRITE_CONFIGS_IN_STATESTORE)) {
      // ... (refreshing the work unit with current job config elided)
    }
    // ... (accumulation of the retryable work unit elided)
  }
}

// Retry applies only when the retry policy is compatible with the job commit policy.
// The receiver of forName was elided in the original excerpt; JobCommitPolicy is
// assumed from the comparison below.
JobCommitPolicy jobCommitPolicy = JobCommitPolicy
    .forName(state.getProp(ConfigurationKeys.JOB_COMMIT_POLICY_KEY, ConfigurationKeys.DEFAULT_JOB_COMMIT_POLICY));
if ((workUnitRetryPolicy == WorkUnitRetryPolicy.ON_COMMIT_ON_PARTIAL_SUCCESS
    && jobCommitPolicy == JobCommitPolicy.COMMIT_ON_PARTIAL_SUCCESS)
    /* ... remaining policy combinations elided in the original excerpt ... */) {
  // ...
}
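A configuration sketch: disabling retries entirely, which the fallback branch above maps to WorkUnitRetryPolicy.NEVER when no explicit policy key is set.

SourceState state = new SourceState();
// No WORK_UNIT_RETRY_POLICY_KEY set, so the boolean fallback applies:
state.setProp(ConfigurationKeys.WORK_UNIT_RETRY_ENABLED_KEY, Boolean.toString(false));
// Uncommitted work units from previous runs are then not re-planned (policy NEVER).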
@Override
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
  WatermarkType watermarkType = WatermarkType.valueOf(
      state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, ConfigurationKeys.DEFAULT_WATERMARK_TYPE)
          .toUpperCase());
  String watermarkColumn = state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
  int maxPartitions = state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS,
      ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);
  int minTargetPartitionSize = state.getPropAsInt(MIN_TARGET_PARTITION_SIZE, DEFAULT_MIN_TARGET_PARTITION_SIZE);

  // Dynamic partitioning needs a time-based watermark column; otherwise fall back
  // to the default work-unit generation.
  if (watermarkType == WatermarkType.SIMPLE || Strings.isNullOrEmpty(watermarkColumn)
      || !state.getPropAsBoolean(ENABLE_DYNAMIC_PARTITIONING) || maxPartitions <= 1) {
    return super.generateWorkUnits(sourceEntity, state, previousWatermark);
  }

  // ... (histogram construction elided in the original excerpt; the enclosing loop
  // below is assumed so the add/break fragment parses)
  for (HistogramGroup group : histogram.getGroups()) {
    histogramAdjust.add(group);
    if (histogramAdjust.getTotalRecordCount() > state
        .getPropAsLong(EARLY_STOP_TOTAL_RECORDS_LIMIT, DEFAULT_EARLY_STOP_TOTAL_RECORDS_LIMIT)) {
      break;
    }
  }

  // The branch condition was elided in the original excerpt; early stop is assumed
  // to apply when the adjusted histogram was trimmed.
  if (histogramAdjust.getGroups().size() < histogram.getGroups().size()) {
    HistogramGroup lastPlusOne = histogram.get(histogramAdjust.getGroups().size());
    long earlyStopHighWatermark =
        Long.parseLong(Utils.toDateTimeFormat(lastPlusOne.getKey(), SECONDS_FORMAT, Partitioner.WATERMARKTIMEFORMAT));
    log.info("Job {} will be stopped earlier. [LW : {}, early-stop HW : {}, expected HW : {}]",
        state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), earlyStopHighWatermark,
        expectedHighWatermark);
    this.isEarlyStopped = true;
    expectedHighWatermark = earlyStopHighWatermark;
  } else {
    log.info("Job {} will be finished in a single run. [LW : {}, expected HW : {}]",
        state.getProp(ConfigurationKeys.JOB_NAME_KEY), partition.getLowWatermark(), expectedHighWatermark);
  }

  // ... (partition point computation elided; specifiedPartitions comes from elided context)
  state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, specifiedPartitions);
  // ... (remainder of the method elided)
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  this.metricContext = Instrumented.getMetricContext(state, KafkaSource.class);
  this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());

  if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
    String tableTypeStr =
        state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, KafkaSource.DEFAULT_TABLE_TYPE.toString());
    tableType = Extract.TableType.valueOf(tableTypeStr);
    extractNamespace =
        state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
  } else {
    // Else-body elided in the original excerpt; the defaults referenced above are
    // assumed here.
    tableType = KafkaSource.DEFAULT_TABLE_TYPE;
    extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
  }
  isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
  kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
  this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE,
      DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);

  Config config = ConfigUtils.propertiesToConfig(state.getProperties());
  GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver
      .resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS,
          DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS)).newInstance();

  int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS,
      ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
  ExecutorService threadPool =
      Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));

  // The original excerpt ends mid-statement here:
  if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT,
      // ... (default value and the remainder of the method elided)
@VisibleForTesting
public void initialize(SourceState state) throws IOException {
  this.updateProvider = UpdateProviderFactory.create(state);
  this.metricContext = Instrumented.getMetricContext(state, HiveSource.class);
  this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE).build();
  this.avroSchemaManager = new AvroSchemaManager(getSourceFs(state), state);
  this.workunits = Lists.newArrayList();

  this.watermarker = GobblinConstructorUtils.invokeConstructor(HiveSourceWatermarkerFactory.class,
      state.getProp(HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, DEFAULT_HIVE_SOURCE_WATERMARKER_FACTORY_CLASS))
      .createFromState(state);

  EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_SETUP_EVENT);

  this.datasetFinder = GobblinConstructorUtils.invokeConstructor(HiveDatasetFinder.class,
      state.getProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, DEFAULT_HIVE_SOURCE_DATASET_FINDER_CLASS),
      getSourceFs(state), state.getProperties(), this.eventSubmitter);

  int maxLookBackDays = state.getPropAsInt(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY,
      DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS);
  this.maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis();

  this.ignoreDataPathIdentifierList = COMMA_BASED_SPLITTER.splitToList(
      state.getProp(HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER_KEY, DEFAULT_HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER));

  silenceHiveLoggers();
}
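A configuration sketch (hypothetical value) for the lookback cap computed above; data modified earlier than the resulting maxLookBackTime falls outside the window this source considers.

SourceState state = new SourceState();
// Consider only data modified within the last 3 days (hypothetical value).
state.setProp(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, 3);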
SourceState sourceState = new SourceState();
sourceState.setProp(LoopingDatasetFinderSource.MAX_WORK_UNITS_PER_RUN_KEY, 3);

// Spy construction was elided in the original excerpt; assumed here so the
// stubbing below has a target.
SourceState sourceStateSpy = Mockito.spy(sourceState);
Mockito.doReturn(workUnitStates).when(sourceStateSpy).getPreviousWorkUnitStates();
@Test
public void testSubmitUnfulfilledRequestEvents()
    throws IOException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
  state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
  state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
  state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY,
      TestCopyablePartitionableDatasedFinder.class.getCanonicalName());
  state.setProp(CopySource.MAX_CONCURRENT_LISTING_SERVICES, 2);
  state.setProp(CopyConfiguration.MAX_COPY_PREFIX + ".size", "50");
  state.setProp(CopyConfiguration.MAX_COPY_PREFIX + ".copyEntities", 2);
  state.setProp(CopyConfiguration.STORE_REJECTED_REQUESTS_KEY,
      RequestAllocatorConfig.StoreRejectedRequestsConfig.ALL.name().toLowerCase());
  state.setProp(ConfigurationKeys.METRICS_CUSTOM_BUILDERS,
      "org.apache.gobblin.metrics.ConsoleEventReporterFactory");

  // ... (FileSystem and EventSubmitter setup elided in the original excerpt; the
  // assignment targets below are assumed to keep the fragments parseable)
  int maxThreads = state
      .getPropAsInt(CopySource.MAX_CONCURRENT_LISTING_SERVICES, CopySource.DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);

  final CopyConfiguration copyConfiguration = CopyConfiguration.builder(targetFs, state.getProperties()).build();

  DatasetsFinder<CopyableDatasetBase> datasetFinder = DatasetUtils
      .instantiateDatasetFinder(state.getProperties(), sourceFs, CopySource.DEFAULT_DATASET_PROFILE_CLASS_KEY,
          eventSubmitter, state);
  // ... (remainder of the test elided)
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  JobConf jobConf = new JobConf(new Configuration());
  for (String key : state.getPropertyNames()) {
    jobConf.set(key, state.getProp(key));
  }

  if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
    for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
      FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    }
  }

  // ... (construction of fileInputFormat elided in the original excerpt)
  InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf, state.getPropAsInt(
      HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
  if (fileSplits == null || fileSplits.length == 0) {
    // ... (early return elided)
  }

  Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
      ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
      : null;
  String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
  // ... (work-unit creation per split elided)
}
/**
 * Get low water mark.
 *
 * @param watermarkType watermark type
 * @param previousWatermark previous watermark
 * @param deltaForNextWatermark delta number for the next watermark
 * @return previous watermark (falls back to {@link ConfigurationKeys#SOURCE_QUERYBASED_START_VALUE}
 *         iff the previous watermark is unavailable)
 */
private long getSnapshotLowWatermark(WatermarkType watermarkType, long previousWatermark, int deltaForNextWatermark) {
  LOG.debug("Getting snapshot low water mark");
  String timeZone = this.state.getProp(ConfigurationKeys.SOURCE_TIMEZONE, ConfigurationKeys.DEFAULT_SOURCE_TIMEZONE);
  if (isPreviousWatermarkExists(previousWatermark)) {
    if (isSimpleWatermark(watermarkType)) {
      return previousWatermark + deltaForNextWatermark - this.state
          .getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0);
    }
    DateTime wm = Utils.toDateTime(previousWatermark, WATERMARKTIMEFORMAT, timeZone).plusSeconds(
        (deltaForNextWatermark - this.state
            .getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0)));
    return Long.parseLong(Utils.dateTimeToString(wm, WATERMARKTIMEFORMAT, timeZone));
  }

  // If the previous watermark is not found, override it with the start value
  // (irrespective of the source.is.watermark.override flag).
  long startValue =
      Utils.getLongWithCurrentDate(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE), timeZone);
  LOG.info("Overriding low water mark with the given start value: " + startValue);
  return startValue;
}
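A worked example of the timestamp branch above, with hypothetical numbers; WATERMARKTIMEFORMAT is the same constant passed to Utils.toDateTime in the method, and the time zone is assumed to be UTC.

long previousWatermark = 20240101120000L; // 2024-01-01 12:00:00
int deltaForNextWatermark = 1;            // advance one second past the last pull
int backupSecs = 30;                      // SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS
DateTime wm = Utils.toDateTime(previousWatermark, WATERMARKTIMEFORMAT, "UTC")
    .plusSeconds(deltaForNextWatermark - backupSecs);
// wm is 2024-01-01T11:59:31, i.e. the next pull deliberately re-reads ~29 seconds
// of already-extracted data to catch late-arriving rows.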
/**
 * If full dump is true, the low watermark will be based on
 * {@link ConfigurationKeys#SOURCE_QUERYBASED_START_VALUE}; otherwise it will be based on the
 * previous watermark. Please refer to {@link Partitioner#getLowWatermark(ExtractType, WatermarkType, long, int)}.
 *
 * @return whether this run is a full dump
 */
public boolean isFullDump() {
  return Boolean.valueOf(this.state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY));
}
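A usage sketch: the flag is an ordinary boolean job property, so a full dump can be forced per run.

SourceState state = new SourceState();
state.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, Boolean.toString(true));
// isFullDump() now returns true and the low watermark is taken from
// ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE instead of the previous run.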
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    FileSystem fs = HadoopUtils.getSourceFileSystem(state);
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());

    if (state.contains(COPY_TABLE_KEY)) {
      HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
      WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset,
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_VIEW)) {
      HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
      WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_QUERY)) {
      String query = state.getProp(MATERIALIZE_QUERY);
      WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  throw new RuntimeException(
      String.format("Must specify either %s, %s, or %s.", COPY_TABLE_KEY, MATERIALIZE_QUERY, MATERIALIZE_VIEW));
}
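A job-setup sketch (hypothetical query) for the query-materialization branch above; it assumes the unqualified constants shown in the method are accessible in scope.

SourceState state = new SourceState();
state.setProp(MATERIALIZE_QUERY, "SELECT id, ts FROM db.events WHERE ds = '2024-01-01'");
// The destination table and location for StageableTableMetadata are read from the
// config subtree under HIVE_MATERIALIZER_SOURCE_PREFIX (exact keys depend on that class).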
// Excerpt from a compliance purge source's setup; preceding context elided.
submitCycleCompletionEvent();

this.maxWorkUnits = state
    .getPropAsInt(ComplianceConfigurationKeys.MAX_WORKUNITS_KEY, ComplianceConfigurationKeys.DEFAULT_MAX_WORKUNITS);
this.maxWorkUnitExecutionAttempts = state
    .getPropAsInt(ComplianceConfigurationKeys.MAX_WORKUNIT_EXECUTION_ATTEMPTS_KEY,
        ComplianceConfigurationKeys.DEFAULT_MAX_WORKUNIT_EXECUTION_ATTEMPTS);

String datasetFinderClass = state.getProp(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS,
    HivePartitionFinder.class.getName());
this.datasetFinder = GobblinConstructorUtils.invokeConstructor(DatasetsFinder.class, datasetFinderClass, state);
populateDatasets();

String policyClass =
    state.getProp(ComplianceConfigurationKeys.PURGE_POLICY_CLASS, HivePurgerPolicy.class.getName());
this.policy = GobblinConstructorUtils.invokeConstructor(PurgePolicy.class, policyClass, this.lowWatermark);

this.shouldProxy = state.getPropAsBoolean(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_SHOULD_PROXY,
    ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_DEFAULT_SHOULD_PROXY);
if (!this.shouldProxy) {
  // ... (remainder elided in the original excerpt)
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  if (!state.contains(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY)) {
    state.setProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, ConvertibleHiveDatasetFinder.class.getName());
  }
  if (!state.contains(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY)) {
    state.setProp(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, "hive.conversion.avro");
  }
  this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());

  List<WorkUnit> workunits = super.getWorkunits(state);
  for (WorkUnit workUnit : workunits) {
    if (LineageUtils.shouldSetLineageInfo(workUnit)) {
      setSourceLineageInfo(workUnit, this.lineageInfo);
    }
  }
  return workunits;
}
// Excerpt from an Extract constructor; surrounding context elided.
super.setProp(ConfigurationKeys.EXTRACT_EXTRACT_ID_KEY, extractId);

// Find previous extracts for the same namespace and table.
for (WorkUnitState pre : state.getPreviousWorkUnitStates()) {
  Extract previousExtract = pre.getWorkunit().getExtract();
  if (previousExtract.getNamespace().equals(namespace) && previousExtract.getTable().equals(table)) {
    // ... (handling of the matching previous extract elided)
  }
}

// Record the full-run time once, if this is a full extract and no time is set yet.
if (state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY)
    && !state.contains(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY)) {
  super.setProp(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY, System.currentTimeMillis());
}