@Override public List<WorkUnit> getWorkunits(SourceState state) { List<WorkUnit> workUnits = Lists.newArrayList(); for (int i = 0; i < state.getPropAsInt(NUM_WORK_UNITS, 1); i++) { workUnits.add(new WorkUnit()); } return workUnits; }
@Override public List<WorkUnit> getWorkunits(SourceState state) { int numWorkUnits = state.getPropAsInt(NUM_WORK_UNITS_KEY, DEFAULT_NUM_WORK_UNITS); Extract extract = new Extract(TableType.APPEND_ONLY, StressTestingSource.class.getPackage().getName(), StressTestingSource.class.getSimpleName()); List<WorkUnit> wus = new ArrayList<>(numWorkUnits); for (int i = 1; i <= numWorkUnits; ++i) { WorkUnit wu = new WorkUnit(extract); wus.add(wu); } return wus; }
/** * Gobblin calls the {@link Source#getWorkunits(SourceState)} method after creating a {@link Source} object with a * blank constructor, so any custom initialization of the object needs to be done here. */ protected void init(SourceState state) { retriever.init(state); try { initFileSystemHelper(state); } catch (FileBasedHelperException e) { Throwables.propagate(e); } AvroFsHelper fsHelper = (AvroFsHelper) this.fsHelper; this.fs = fsHelper.getFileSystem(); this.sourceState = state; this.lowWaterMark = getLowWaterMark(state.getPreviousWorkUnitStates(), state.getProp(DATE_PARTITIONED_SOURCE_MIN_WATERMARK_VALUE, String.valueOf(DEFAULT_DATE_PARTITIONED_SOURCE_MIN_WATERMARK_VALUE))); this.maxFilesPerJob = state.getPropAsInt(DATE_PARTITIONED_SOURCE_MAX_FILES_PER_JOB, DEFAULT_DATE_PARTITIONED_SOURCE_MAX_FILES_PER_JOB); this.maxWorkUnitsPerJob = state.getPropAsInt(DATE_PARTITIONED_SOURCE_MAX_WORKUNITS_PER_JOB, DEFAULT_DATE_PARTITIONED_SOURCE_MAX_WORKUNITS_PER_JOB); this.tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()); this.fileCount = 0; this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY)); }
state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
/** * Get low water mark * * @param watermarkType Watermark type * @param previousWatermark Previous water mark * @param deltaForNextWatermark delta number for next water mark * @return Previous watermark (fallback to {@link ConfigurationKeys#SOURCE_QUERYBASED_START_VALUE} iff previous watermark is unavailable) */ private long getSnapshotLowWatermark(WatermarkType watermarkType, long previousWatermark, int deltaForNextWatermark) { LOG.debug("Getting snapshot low water mark"); String timeZone = this.state.getProp(ConfigurationKeys.SOURCE_TIMEZONE, ConfigurationKeys.DEFAULT_SOURCE_TIMEZONE); if (isPreviousWatermarkExists(previousWatermark)) { if (isSimpleWatermark(watermarkType)) { return previousWatermark + deltaForNextWatermark - this.state .getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0); } DateTime wm = Utils.toDateTime(previousWatermark, WATERMARKTIMEFORMAT, timeZone).plusSeconds( (deltaForNextWatermark - this.state .getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, 0))); return Long.parseLong(Utils.dateTimeToString(wm, WATERMARKTIMEFORMAT, timeZone)); } // If previous watermark is not found, override with the start value // (irrespective of source.is.watermark.override flag) long startValue = Utils.getLongWithCurrentDate(this.state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE), timeZone); LOG.info("Overriding low water mark with the given start value: " + startValue); return startValue; }
.toUpperCase()); int interval = getUpdatedInterval(this.state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_PARTITION_INTERVAL, 0), extractType, watermarkType); int sourceMaxAllowedPartitions = this.state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 0); int maxPartitions = (sourceMaxAllowedPartitions != 0 ? sourceMaxAllowedPartitions : ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS);
List<String> filesToPull = Lists.newArrayList(); int maxFilesToPull = state.getPropAsInt(ConfigurationKeys.SOURCE_FILEBASED_MAX_FILES_PER_RUN, Integer.MAX_VALUE); int filesSelectedForPull = 0; if (currentFsSnapshot.size() > maxFilesToPull) { && state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) <= filesToPull.size() ? state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS) : filesToPull.size(); if (numPartitions <= 0) { throw new IllegalArgumentException("The number of partitions should be positive");
final int maxPartitions = state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS); final int probeLimit = state.getPropAsInt(DYNAMIC_PROBING_LIMIT, DEFAULT_DYNAMIC_PROBING_LIMIT); final int minTargetPartitionSize = state.getPropAsInt(MIN_TARGET_PARTITION_SIZE, DEFAULT_MIN_TARGET_PARTITION_SIZE); final Histogram outputHistogram = new Histogram(); final double probeTargetRatio = state.getPropAsDouble(PROBE_TARGET_RATIO, DEFAULT_PROBE_TARGET_RATIO);
public void run () { try { Stopwatch stopwatch = Stopwatch.createStarted(); int threads = this.state.getPropAsInt(CompactionVerifier.COMPACTION_VERIFICATION_THREADS, 5); long timeOutInMinute = this.state.getPropAsLong(CompactionVerifier.COMPACTION_VERIFICATION_TIMEOUT_MINUTES, 30); long iterationCountLimit = this.state.getPropAsLong(CompactionVerifier.COMPACTION_VERIFICATION_ITERATION_COUNT_LIMIT, Integer.MAX_VALUE);
public WorkUnitStream getWorkunitStream(SourceState state, boolean isDatasetStateStoreEnabled) { this.isDatasetStateStoreEnabled = isDatasetStateStoreEnabled; try { int maximumWorkUnits = state.getPropAsInt(MAX_WORK_UNITS_PER_RUN_KEY, MAX_WORK_UNITS_PER_RUN); Preconditions.checkArgument(maximumWorkUnits > 0, "Max work units must be greater than 0!");
@VisibleForTesting public void initialize(SourceState state) throws IOException { this.updateProvider = UpdateProviderFactory.create(state); this.metricContext = Instrumented.getMetricContext(state, HiveSource.class); this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE).build(); this.avroSchemaManager = new AvroSchemaManager(getSourceFs(state), state); this.workunits = Lists.newArrayList(); this.watermarker = GobblinConstructorUtils.invokeConstructor(HiveSourceWatermarkerFactory.class, state.getProp(HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, DEFAULT_HIVE_SOURCE_WATERMARKER_FACTORY_CLASS)) .createFromState(state); EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_SETUP_EVENT); this.datasetFinder = GobblinConstructorUtils.invokeConstructor(HiveDatasetFinder.class, state.getProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, DEFAULT_HIVE_SOURCE_DATASET_FINDER_CLASS), getSourceFs(state), state.getProperties(), this.eventSubmitter); int maxLookBackDays = state.getPropAsInt(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS); this.maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis(); this.ignoreDataPathIdentifierList = COMMA_BASED_SPLITTER.splitToList(state.getProp(HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER_KEY, DEFAULT_HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER)); silenceHiveLoggers(); }
InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf, state.getPropAsInt( HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED)); if (fileSplits == null || fileSplits.length == 0) {
submitCycleCompletionEvent(); this.maxWorkUnits = state .getPropAsInt(ComplianceConfigurationKeys.MAX_WORKUNITS_KEY, ComplianceConfigurationKeys.DEFAULT_MAX_WORKUNITS); this.maxWorkUnitExecutionAttempts = state .getPropAsInt(ComplianceConfigurationKeys.MAX_WORKUNIT_EXECUTION_ATTEMPTS_KEY, ComplianceConfigurationKeys.DEFAULT_MAX_WORKUNIT_EXECUTION_ATTEMPTS);
int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS, ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT); ExecutorService threadPool = state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS); List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits); addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
final Optional<CopyableFileWatermarkGenerator> watermarkGenerator = CopyableFileWatermarkHelper.getCopyableFileWatermarkGenerator(state); int maxThreads = state.getPropAsInt(MAX_CONCURRENT_LISTING_SERVICES, DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);
@Override public List<WorkUnit> getWorkunits(SourceState state) { int numTasks = state.getPropAsInt(NUM_TASKS_KEY); String eventBusId = state.getProp(EVENTBUS_ID_KEY); EventBus eventBus = TestingEventBuses.getEventBus(eventBusId); Map<String, SourceState> previousStates = state.getPreviousDatasetStatesByUrns(); for (Map.Entry<String, SourceState> entry : previousStates.entrySet()) { JobState.DatasetState datasetState = (JobState.DatasetState) entry.getValue(); for (TaskState taskState : datasetState.getTaskStates()) { if (taskState.contains(Task.PERSISTENT_STATE) && eventBus != null) { eventBus.post(new Event(PREVIOUS_STATE_EVENT, taskState.getPropAsInt(Task.PERSISTENT_STATE))); } } } List<WorkUnit> workUnits = Lists.newArrayList(); for (int i = 0; i < numTasks; i++) { workUnits.add(createWorkUnit(i, eventBusId)); } return workUnits; }
String watermarkColumn = state.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY); int maxPartitions = state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS); int minTargetPartitionSize = state.getPropAsInt(MIN_TARGET_PARTITION_SIZE, DEFAULT_MIN_TARGET_PARTITION_SIZE);
@Override public WorkUnitStream getWorkunitStream(SourceState state) { int numTasks = state.getPropAsInt(NUM_TASKS_KEY); String eventBusId = state.getProp(EventBusPublishingTaskFactory.EVENTBUS_ID_KEY); EventBus eventBus = TestingEventBuses.getEventBus(eventBusId); return new BasicWorkUnitStream.Builder(new WorkUnitIterator(eventBus, eventBusId, numTasks)).build(); }
.getPropAsInt(CopySource.MAX_CONCURRENT_LISTING_SERVICES, CopySource.DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);
@Override public List<WorkUnit> getWorkunits(SourceState state) { List<WorkUnit> workUnits = Lists.newArrayList(); for (int i = 0; i < state.getPropAsInt(NUM_WORK_UNITS, 1); i++) { workUnits.add(new WorkUnit()); } return workUnits; }