/**
 * A topic can be configured to move to the latest offset in {@link #TOPICS_MOVE_TO_LATEST_OFFSET}.
 *
 * Needs to be synchronized as it is accessed by multiple threads.
 */
private synchronized boolean shouldMoveToLatestOffset(KafkaPartition partition, SourceState state) {
  if (!state.contains(TOPICS_MOVE_TO_LATEST_OFFSET)) {
    return false;
  }
  if (this.moveToLatestTopics.isEmpty()) {
    this.moveToLatestTopics.addAll(
        Splitter.on(',').trimResults().omitEmptyStrings().splitToList(state.getProp(TOPICS_MOVE_TO_LATEST_OFFSET)));
  }
  return this.moveToLatestTopics.contains(partition.getTopicName()) || this.moveToLatestTopics.contains(ALL_TOPICS);
}
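Below is a minimal, self-contained sketch (assuming only Guava on the classpath) of the same Splitter configuration used above, showing how a raw comma-separated property value is normalized into the topic whitelist; the sample value is hypothetical.

import java.util.List;
import com.google.common.base.Splitter;

public class TopicListParseSketch {
  public static void main(String[] args) {
    // Hypothetical raw value of TOPICS_MOVE_TO_LATEST_OFFSET, with stray whitespace and commas.
    String rawValue = " topicA , ,topicB,";
    List<String> topics =
        Splitter.on(',').trimResults().omitEmptyStrings().splitToList(rawValue);
    // Prints [topicA, topicB]: entries are trimmed and empty segments dropped,
    // so sloppy config values do not produce phantom topic names.
    System.out.println(topics);
  }
}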
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  if (!state.contains(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY)) {
    state.setProp(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY,
        KafkaGsonDeserializer.class.getName());
  }
  return super.getWorkunits(state);
}
public static KafkaWorkUnitPacker getInstance(AbstractSource<?, ?> source, SourceState state) {
  if (state.contains(KAFKA_WORKUNIT_PACKER_TYPE)) {
    String packerTypeStr = state.getProp(KAFKA_WORKUNIT_PACKER_TYPE);
    Optional<PackerType> packerType = Enums.getIfPresent(PackerType.class, packerTypeStr);
    if (packerType.isPresent()) {
      return getInstance(packerType.get(), source, state);
    }
    throw new IllegalArgumentException("WorkUnit packer type " + packerTypeStr + " not found");
  }
  return getInstance(DEFAULT_PACKER_TYPE, source, state);
}
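The Enums.getIfPresent call above is Guava's exception-free alternative to Enum.valueOf. A small standalone sketch of that lookup pattern (the enum constants here are illustrative stand-ins, not the real PackerType values):

import com.google.common.base.Enums;
import com.google.common.base.Optional;

public class EnumLookupSketch {
  // Illustrative stand-in for the real PackerType enum.
  enum PackerType { SINGLE_LEVEL, BI_LEVEL }

  public static void main(String[] args) {
    Optional<PackerType> valid = Enums.getIfPresent(PackerType.class, "BI_LEVEL");
    Optional<PackerType> invalid = Enums.getIfPresent(PackerType.class, "NO_SUCH_TYPE");
    System.out.println(valid.isPresent());   // true
    // false rather than the IllegalArgumentException Enum.valueOf would throw,
    // which lets the caller raise its own, more descriptive error.
    System.out.println(invalid.isPresent());
  }
}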
private KafkaWorkUnitSizeEstimator getWorkUnitSizeEstimator() {
  if (this.state.contains(KAFKA_WORKUNIT_SIZE_ESTIMATOR_TYPE)) {
    String sizeEstimatorTypeString = this.state.getProp(KAFKA_WORKUNIT_SIZE_ESTIMATOR_TYPE);
    Optional<SizeEstimatorType> sizeEstimatorType =
        Enums.getIfPresent(SizeEstimatorType.class, sizeEstimatorTypeString);
    if (sizeEstimatorType.isPresent()) {
      return getWorkUnitSizeEstimator(sizeEstimatorType.get());
    }
    // Report the raw config string, not the (absent) Optional, in the error message.
    throw new IllegalArgumentException("WorkUnit size estimator type " + sizeEstimatorTypeString + " not found");
  }
  return getWorkUnitSizeEstimator(DEFAULT_SIZE_ESTIMATOR_TYPE);
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  if (!state.contains(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL)) {
    return workUnits;
  }

  // Create a single snapshot-type extract for all files
  Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "ExampleNamespace"), "ExampleTable");

  String filesToPull = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL);
  for (String file : Splitter.on(',').omitEmptyStrings().split(filesToPull)) {
    // Create one work unit for each file to pull
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(SOURCE_FILE_KEY, file);
    workUnits.add(workUnit);
  }
  return workUnits;
}
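To make the fan-out concrete, here is a minimal sketch (file paths hypothetical, Guava assumed) of how a comma-separated SOURCE_FILEBASED_FILES_TO_PULL value maps to one work unit per file:

import com.google.common.base.Splitter;

public class FilesToPullSketch {
  public static void main(String[] args) {
    // Hypothetical property value; note the trailing comma.
    String filesToPull = "/data/part-0.csv,/data/part-1.csv,";
    int count = 0;
    for (String file : Splitter.on(',').omitEmptyStrings().split(filesToPull)) {
      // In the source above, each iteration creates a WorkUnit carrying the
      // path under SOURCE_FILE_KEY; here we just show the fan-out.
      System.out.println("work unit " + count++ + " -> " + file);
    }
    // Prints exactly two lines: omitEmptyStrings() keeps the trailing comma
    // from producing an empty work unit.
  }
}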
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  if (!state.contains(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY)) {
    state.setProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, ConvertibleHiveDatasetFinder.class.getName());
  }
  if (!state.contains(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY)) {
    state.setProp(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, "hive.conversion.avro");
  }
  this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
  List<WorkUnit> workunits = super.getWorkunits(state);
  for (WorkUnit workUnit : workunits) {
    if (LineageUtils.shouldSetLineageInfo(workUnit)) {
      setSourceLineageInfo(workUnit, this.lineageInfo);
    }
  }
  return workunits;
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  if (!state.contains(HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY)) {
    state.setProp(HadoopFileInputSource.FILE_INPUT_FORMAT_CLASS_KEY,
        HiveSerDeWrapper.getDeserializer(state).getInputFormatClassName());
  }
  return super.getWorkunits(state);
}
Job job = Job.getInstance(new Configuration());
if (state.contains(FILE_INPUT_PATHS_KEY)) {
  for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
    FileInputFormat.addInputPath(job, new Path(inputPath));
  }
}

Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
    ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
    : null;
String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
logFilesToPull(filesToPull);

int numPartitions = state.contains(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS)
    && state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) <= filesToPull.size()
    ? state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS)
    : filesToPull.size();
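The ternary above effectively clamps the partition count to min(configured cap, number of files), with an absent cap treated as unbounded. A worked standalone sketch of the same arithmetic:

public class PartitionCountSketch {
  // Mirrors the calculation above; a null cap stands in for an unset
  // SOURCE_MAX_NUMBER_OF_PARTITIONS property.
  static int numPartitions(Integer configuredMax, int fileCount) {
    return (configuredMax != null && configuredMax <= fileCount) ? configuredMax : fileCount;
  }

  public static void main(String[] args) {
    System.out.println(numPartitions(4, 10));    // 4: the cap applies
    System.out.println(numPartitions(20, 10));   // 10: cap exceeds the file count
    System.out.println(numPartitions(null, 10)); // 10: no cap configured
  }
}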
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    FileSystem fs = HadoopUtils.getSourceFileSystem(state);
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());
    if (state.contains(COPY_TABLE_KEY)) {
      // Copy an existing table as-is.
      HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
      WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset,
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_VIEW)) {
      // Materialize a view into a table.
      HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
      WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_QUERY)) {
      // Materialize the result of an arbitrary query.
      String query = state.getProp(MATERIALIZE_QUERY);
      WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  throw new RuntimeException(
      String.format("Must specify either %s, %s, or %s.", COPY_TABLE_KEY, MATERIALIZE_QUERY, MATERIALIZE_VIEW));
}
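A hedged sketch of selecting among the three mutually exclusive modes. The property keys below are illustrative placeholders for COPY_TABLE_KEY, MATERIALIZE_VIEW, and MATERIALIZE_QUERY (the real constants live in the source class); only the first matching key wins, mirroring the if/else-if chain above:

import java.util.Properties;

public class MaterializerModeSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("hive.materializer.copy.table", "db.source_table"); // pick copy-table mode

    if (props.containsKey("hive.materializer.copy.table")) {
      System.out.println("copy-table mode: " + props.getProperty("hive.materializer.copy.table"));
    } else if (props.containsKey("hive.materializer.materialize.view")) {
      System.out.println("view-materialization mode");
    } else if (props.containsKey("hive.materializer.materialize.query")) {
      System.out.println("query-materialization mode");
    } else {
      // Same contract as the method above: one of the three must be set.
      throw new RuntimeException("Must specify exactly one materialization property.");
    }
  }
}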
if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
  for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
  }
}

Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
    ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
    : null;
String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = new ArrayList<>();
  if (state.contains(TEST_WORKUNIT_PERSISTENCE)) {
    testSkipWorkUnitPersistence(state);
    return workUnits;
  }
  for (int i = 0; i < NUMBER_OF_WORKUNITS; i++) {
    WorkUnit workUnit = WorkUnit.createEmpty();
    if (i < NUMBER_OF_SKIP_WORKUNITS) {
      workUnit.skip();
    }
    workUnits.add(workUnit);
  }
  return workUnits;
}
    && !state.contains(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY)) {
  super.setProp(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY, System.currentTimeMillis());
/**
 * Creates the list of all work units needed for the current execution.
 * Fresh work units are created for each partition starting from the watermark, and failed
 * work units from the previous run are added to the list.
 */
protected void createWorkUnits(SourceState state) throws IOException {
  createWorkunitsFromPreviousState(state);
  if (this.datasets.isEmpty()) {
    return;
  }
  for (HivePartitionDataset dataset : this.datasets) {
    Optional<String> owner = dataset.getOwner();
    if (workUnitsExceeded()) {
      log.info("Workunits exceeded");
      // Record where this run stopped so the next run can resume from here.
      setJobWatermark(state, dataset.datasetURN());
      return;
    }
    if (!this.policy.shouldPurge(dataset)) {
      continue;
    }
    WorkUnit workUnit = createNewWorkUnit(dataset);
    log.info("Created new work unit with partition " + workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME));
    this.workUnitMap.put(workUnit.getProp(ComplianceConfigurationKeys.PARTITION_NAME), workUnit);
    this.workUnitsCreatedCount++;
  }
  if (!state.contains(ComplianceConfigurationKeys.HIVE_PURGER_WATERMARK)) {
    this.setJobWatermark(state, ComplianceConfigurationKeys.NO_PREVIOUS_WATERMARK);
  }
}
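The bookkeeping above boils down to two rules: stop once the per-run work unit cap is hit, recording where the run stopped, and write an explicit sentinel when no watermark has ever been set, so later runs can distinguish "never ran" from "ran and found nothing to purge". A rough standalone approximation, with all names hypothetical:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class WatermarkCapSketch {
  static final String NO_PREVIOUS_WATERMARK = "NO_PREVIOUS_WATERMARK"; // hypothetical sentinel
  static final int MAX_WORKUNITS_PER_RUN = 2;                          // hypothetical cap

  public static void main(String[] args) {
    List<String> datasetUrns = Arrays.asList("db@table/p=1", "db@table/p=2", "db@table/p=3");
    List<String> created = new ArrayList<>();
    String watermark = null;
    for (String urn : datasetUrns) {
      if (created.size() >= MAX_WORKUNITS_PER_RUN) {
        watermark = urn; // record the stopping point so a later run resumes here
        break;
      }
      created.add(urn);
    }
    if (watermark == null) {
      watermark = NO_PREVIOUS_WATERMARK; // completed pass with no prior watermark: record the sentinel
    }
    System.out.println("created=" + created + ", watermark=" + watermark);
  }
}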
if (state.contains(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY)) {
if (state.contains(SIMULATE) && state.getPropAsBoolean(SIMULATE)) {
  log.info("Simulate mode enabled. Will not execute the copy.");
  for (Map.Entry<FileSet<CopyEntity>, Collection<WorkUnit>> entry : workUnitsMap.asMap().entrySet()) {
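The contains-then-get pattern above is worth noting: checking state.contains(SIMULATE) before reading the boolean distinguishes "flag absent" from "flag explicitly false". A minimal sketch of the same guard over plain java.util.Properties (the key name is hypothetical):

import java.util.Properties;

public class SimulateFlagSketch {
  static boolean isSimulate(Properties props, String key) {
    // Absent flag => false; present flag => its parsed boolean value.
    return props.containsKey(key) && Boolean.parseBoolean(props.getProperty(key));
  }

  public static void main(String[] args) {
    Properties props = new Properties();
    System.out.println(isSimulate(props, "copy.simulate")); // false: flag not set
    props.setProperty("copy.simulate", "true");
    System.out.println(isSimulate(props, "copy.simulate")); // true
  }
}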