@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  if (!state.contains(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL)) {
    return workUnits;
  }

  // Create a single snapshot-type extract for all files
  Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "ExampleNamespace"), "ExampleTable");

  String filesToPull = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL);
  for (String file : Splitter.on(',').omitEmptyStrings().split(filesToPull)) {
    // Create one work unit for each file to pull
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(SOURCE_FILE_KEY, file);
    workUnits.add(workUnit);
  }
  return workUnits;
}
// Excerpt (begins mid-statement in the original): a work unit for a dataset seen before,
// bounded by its previous watermark and tagged with its dataset URN
new LongWatermark(-1);
prevHighWatermarks.remove(title);
WorkUnit workUnit = WorkUnit.create(extract,
    new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
workUnits.add(workUnit);

// Separate excerpt from the same source: a work unit for a dataset left over from the previous run,
// with an interval whose low and expected-high watermarks are both the dataset's previous value
WorkUnit workUnit = WorkUnit.create(extract,
    new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
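Because the excerpt above is torn out of a larger source, the surrounding declarations are missing. The following is a minimal, hypothetical sketch of the same overload, WorkUnit.create(Extract, WatermarkInterval), with placeholder namespace, dataset, and watermark values; the variable names here are illustrative and not taken from the original source:

// Hypothetical sketch: create a work unit bounded by an explicit watermark interval.
Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "example.namespace", "ExampleDataset");
LongWatermark lowWatermark = new LongWatermark(0L);           // e.g. the previous run's high watermark
LongWatermark expectedHighWatermark = new LongWatermark(-1L); // -1 when the end is not known up front
WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(lowWatermark, expectedHighWatermark));
workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, "exampleDatasetUrn");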
private List<WorkUnit> initialWorkUnits() {
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < num_parallelism; i++) {
    WorkUnit workUnit = WorkUnit.create(newExtract(Extract.TableType.APPEND_ONLY, namespace, table));
    LongWatermark lowWatermark = new LongWatermark(i * numRecordsPerExtract + 1);
    LongWatermark expectedHighWatermark = new LongWatermark((i + 1) * numRecordsPerExtract);
    workUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedHighWatermark));
    workUnit.setProp(WORK_UNIT_INDEX, i);
    workUnits.add(workUnit);
  }
  return workUnits;
}
WorkUnit workUnit = WorkUnit.create(extract);
workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Config config = ConfigUtils.propertiesToConfig(state.getProperties());
  Consumer<String, byte[]> consumer = getKafkaConsumer(config);
  LOG.debug("Consumer is {}", consumer);

  String topic = ConfigUtils.getString(config, TOPIC_WHITELIST, StringUtils.EMPTY);

  // TODO: fix this to use the new API when KafkaWrapper is fixed
  List<WorkUnit> workUnits = new ArrayList<>();
  List<PartitionInfo> topicPartitions = consumer.partitionsFor(topic);
  LOG.info("Partition count is {}", topicPartitions.size());

  for (PartitionInfo topicPartition : topicPartitions) {
    Extract extract = this.createExtract(DEFAULT_TABLE_TYPE, DEFAULT_NAMESPACE_NAME, topicPartition.topic());
    LOG.info("Partition info is {}", topicPartition);
    WorkUnit workUnit = WorkUnit.create(extract);
    setTopicNameInState(workUnit, topicPartition.topic());
    workUnit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, topicPartition.topic());
    setPartitionId(workUnit, topicPartition.partition());
    workUnits.add(workUnit);
  }
  return workUnits;
}
private List<WorkUnit> initializeWorkUnits() {
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < NUM_WORK_UNITS; i++) {
    WorkUnit workUnit = WorkUnit.create(createExtract(Extract.TableType.SNAPSHOT_ONLY, NAMESPACE, TABLE));
    workUnit.setLowWaterMark(i * NUM_RECORDS_TO_EXTRACT_PER_EXTRACTOR + 1);
    workUnit.setHighWaterMark((i + 1) * NUM_RECORDS_TO_EXTRACT_PER_EXTRACTOR);
    workUnit.setProp(WORK_UNIT_INDEX_KEY, i);
    workUnits.add(workUnit);
  }
  return workUnits;
}
WorkUnit workUnit = WorkUnit.create(extract);
workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);

  TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase());

  List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark);
  Collections.sort(partitions, Partitioner.ascendingComparator);

  // {@link ConfigurationKeys.EXTRACT_TABLE_NAME_KEY} specifies the output path for the Extract
  String outputTableName = sourceEntity.getDestTableName();
  log.info("Creating extract output with table name " + outputTableName);
  Extract extract = createExtract(tableType, nameSpaceName, outputTableName);

  // Set the current time on the extract when it is a full extract
  if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) {
    extract.setFullTrue(System.currentTimeMillis());
  }

  for (Partition partition : partitions) {
    WorkUnit workunit = WorkUnit.create(extract);
    workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName());
    workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName());
    workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION);
    addLineageSourceInfo(state, sourceEntity, workunit);
    partition.serialize(workunit);
    workUnits.add(workunit);
  }
  return workUnits;
}
@Test
public void testGetDefaultWriterFilePath() {
  String namespace = "gobblin.test";
  String tableName = "test-table";

  SourceState sourceState = new SourceState();
  WorkUnit state = WorkUnit.create(new Extract(sourceState, TableType.APPEND_ONLY, namespace, tableName));
  Assert.assertEquals(WriterUtils.getWriterFilePath(state, 0, 0), new Path(state.getExtract().getOutputFilePath()));
  Assert.assertEquals(WriterUtils.getWriterFilePath(state, 2, 0),
      new Path(state.getExtract().getOutputFilePath(), ConfigurationKeys.DEFAULT_FORK_BRANCH_NAME + "0"));
}
TaskState getEmptyTestTaskState(String taskId) {
  // Create a TaskState
  WorkUnit workUnit = WorkUnit.create(
      new Extract(Extract.TableType.SNAPSHOT_ONLY, this.getClass().getName(), this.getClass().getSimpleName()));
  workUnit.setProp(ConfigurationKeys.TASK_KEY_KEY, "taskKey");
  TaskState taskState = new TaskState(new WorkUnitState(workUnit));
  taskState.setProp(ConfigurationKeys.METRICS_ENABLED_KEY, Boolean.toString(false));
  taskState.setTaskId(taskId);
  taskState.setJobId("1234");
  return taskState;
}
WorkUnit singleWorkUnit = WorkUnit.create(extract);
singleWorkUnit.setProp(ConfigurationKeys.SOURCE_ENTITY, topicName);
singleWorkUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, file.getFilePath());
@Test
public void testGetDefaultWriterFilePathWithWorkUnitState() {
  String namespace = "gobblin.test";
  String tableName = "test-table";

  SourceState sourceState = new SourceState();
  WorkUnit workUnit = WorkUnit.create(new Extract(sourceState, TableType.APPEND_ONLY, namespace, tableName));
  WorkUnitState workUnitState = new WorkUnitState(workUnit);
  Assert.assertEquals(WriterUtils.getWriterFilePath(workUnitState, 0, 0),
      new Path(workUnitState.getExtract().getOutputFilePath()));
  Assert.assertEquals(WriterUtils.getWriterFilePath(workUnitState, 2, 0),
      new Path(workUnitState.getExtract().getOutputFilePath(), ConfigurationKeys.DEFAULT_FORK_BRANCH_NAME + "0"));
}
@Test
public void testGetWriterFilePath() {
  Extract extract = new Extract(TableType.SNAPSHOT_ONLY, "org.apache.gobblin.dbNamespace", "tableName");
  WorkUnit state = WorkUnit.create(extract);

  state.setProp(ConfigurationKeys.WRITER_FILE_PATH, TEST_WRITER_FILE_PATH);
  Assert.assertEquals(WriterUtils.getWriterFilePath(state, 0, 0), TEST_WRITER_FILE_PATH);

  state.setProp(ConfigurationKeys.WRITER_FILE_PATH + ".0", TEST_WRITER_FILE_PATH);
  Assert.assertEquals(WriterUtils.getWriterFilePath(state, 1, 1), TEST_WRITER_FILE_PATH);

  state.removeProp(ConfigurationKeys.WRITER_FILE_PATH);
  state.setProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE, "tablename");
  Assert.assertEquals(WriterUtils.getWriterFilePath(state, 0, 0), new Path("tableName"));

  state.setProp(ConfigurationKeys.WRITER_FILE_PATH_TYPE, "namespace_table");
  Assert.assertEquals(WriterUtils.getWriterFilePath(state, 0, 0),
      new Path("org/apache/gobblin/dbNamespace/tableName"));
}
private TaskState getStreamingTaskState() {
  WorkUnitState workUnitState = new WorkUnitState(WorkUnit.create(
      new Extract(Extract.TableType.SNAPSHOT_ONLY, this.getClass().getName(), this.getClass().getSimpleName())));
  workUnitState.setProp(ConfigurationKeys.TASK_KEY_KEY, "1234");

  TaskState taskState = new TaskState(workUnitState);
  taskState.setProp(ConfigurationKeys.METRICS_ENABLED_KEY, Boolean.toString(false));
  taskState.setProp(TaskConfigurationKeys.TASK_EXECUTION_MODE, ExecutionModel.STREAMING.name());
  taskState.setJobId("1234");
  taskState.setTaskId("testContinuousTaskId");
  return taskState;
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  String nameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  Extract extract1 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable1");
  Extract extract2 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable2");

  String sourceFileList = state.getProp(SOURCE_FILE_LIST_KEY);
  List<String> list = SPLITTER.splitToList(sourceFileList);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < list.size(); i++) {
    WorkUnit workUnit = WorkUnit.create(i % 2 == 0 ? extract1 : extract2);
    workUnit.setProp(SOURCE_FILE_KEY, list.get(i));
    workUnits.add(workUnit);
  }

  if (state.getPropAsBoolean("use.multiworkunit", false)) {
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    multiWorkUnit.addWorkUnits(workUnits);
    workUnits.clear();
    workUnits.add(multiWorkUnit);
  }
  return workUnits;
}
LongWatermark expectedWatermark = new LongWatermark(watermark.getValue() + numRecordsPerExtract);
WatermarkInterval watermarkInterval = new WatermarkInterval(watermark, expectedWatermark);
workUnit = WorkUnit.create(newExtract(tableType, namespace, table), watermarkInterval);
log.debug("Will be setting watermark interval to " + watermarkInterval.toJson());
workUnit.setProp(WORK_UNIT_INDEX, workUnitState.getWorkunit().getProp(WORK_UNIT_INDEX));
SourceState sourceState = new SourceState();
WorkUnitState state =
    new WorkUnitState(WorkUnit.create(new Extract(sourceState, TableType.APPEND_ONLY, namespace, tableName)));
@Override
public List<WorkUnit> getWorkunits(SourceState sourceState) {
  sourceState.setProp(FOO, BAR);
  if (Iterables.isEmpty(sourceState.getPreviousWorkUnitStates())) {
    return initializeWorkUnits();
  }

  List<WorkUnit> workUnits = Lists.newArrayList();
  for (WorkUnitState workUnitState : sourceState.getPreviousWorkUnitStates()) {
    WorkUnit workUnit = WorkUnit.create(createExtract(Extract.TableType.SNAPSHOT_ONLY, NAMESPACE, TABLE));
    workUnit.setLowWaterMark(workUnitState.getPropAsInt(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY)
        + NUM_WORK_UNITS * NUM_RECORDS_TO_EXTRACT_PER_EXTRACTOR);
    workUnit.setHighWaterMark(workUnitState.getPropAsInt(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY)
        + NUM_WORK_UNITS * NUM_RECORDS_TO_EXTRACT_PER_EXTRACTOR);
    workUnit.setProp(WORK_UNIT_INDEX_KEY, workUnitState.getPropAsInt(WORK_UNIT_INDEX_KEY));
    workUnits.add(workUnit);
  }
  return workUnits;
}
WorkUnit workUnit = WorkUnit.create(extract);
workUnit.setProp(TOPIC_NAME, partition.getTopicName());
addDatasetUrnOptionally(workUnit);
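Taken together, the examples above follow one basic pattern: build an Extract, create a WorkUnit from it (plain or watermark-bounded), set source- or test-specific properties, and let the framework wrap the WorkUnit in a WorkUnitState before the extractor runs. Below is a minimal, self-contained sketch of that pattern under the org.apache.gobblin packages; the namespace, table, file path, and class name are placeholders, not taken from any snippet above.

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.workunit.Extract;
import org.apache.gobblin.source.workunit.WorkUnit;

public class WorkUnitCreationSketch {
  public static void main(String[] args) {
    // Placeholder namespace and table; real sources usually read these from the SourceState.
    Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "example.namespace", "ExampleTable");

    // One work unit per unit of work (here, a single file path), mirroring the file-based snippets above.
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, "/tmp/example-file.txt");

    // The framework wraps each WorkUnit in a WorkUnitState before handing it to the extractor;
    // properties set on the work unit are readable through the state.
    WorkUnitState workUnitState = new WorkUnitState(workUnit);
    System.out.println(workUnitState.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
  }
}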