/**
 * Generates a {@link WatermarkInterval} for the given {@link CopyableFile}, using the file's
 * full path string as both endpoints (a degenerate low == expected-high interval).
 *
 * @param copyableFile the file whose path seeds the watermark.
 * @return an {@link Optional} holding the single-point interval.
 * @throws IOException declared by the interface contract; not thrown by this implementation.
 */
@Override
public Optional<WatermarkInterval> generateWatermarkIntervalForCopyableFile(CopyableFile copyableFile)
    throws IOException {
  String filePath = copyableFile.getFileStatus().getPath().toString();
  StringWatermark pathWatermark = new StringWatermark(filePath);
  return Optional.of(new WatermarkInterval(pathWatermark, pathWatermark));
}
// NOTE(review): the four statements below appear twice verbatim; the second
// `WatermarkInterval watermarkInterval` declaration would not compile in the same scope —
// this looks like a duplicated paste, confirm against the original source.
// Build the [low, expectedHigh] interval and create a WorkUnit carrying it.
WatermarkInterval watermarkInterval = new WatermarkInterval(watermark, expectedWatermark);
workUnit = WorkUnit.create(newExtract(tableType, namespace, table), watermarkInterval);
log.debug("Will be setting watermark interval to " + watermarkInterval.toJson());
// Carry the work-unit index forward from the previous run's work unit.
workUnit.setProp(WORK_UNIT_INDEX, workUnitState.getWorkunit().getProp(WORK_UNIT_INDEX));
WatermarkInterval watermarkInterval = new WatermarkInterval(watermark, expectedWatermark);
workUnit = WorkUnit.create(newExtract(tableType, namespace, table), watermarkInterval);
log.debug("Will be setting watermark interval to " + watermarkInterval.toJson());
workUnit.setProp(WORK_UNIT_INDEX, workUnitState.getWorkunit().getProp(WORK_UNIT_INDEX));
/**
 * Persists the given {@link WatermarkInterval} on this {@link WorkUnit} by serializing it to
 * JSON under {@link ConfigurationKeys#WATERMARK_INTERVAL_VALUE_KEY}.
 *
 * @param watermarkInterval the interval to store on this work unit.
 */
public void setWatermarkInterval(WatermarkInterval watermarkInterval) {
  String serializedInterval = watermarkInterval.toJson().toString();
  setProp(ConfigurationKeys.WATERMARK_INTERVAL_VALUE_KEY, serializedInterval);
}
/** * Constructor for a {@link WorkUnit} given a {@link SourceState}, {@link Extract}, and a {@link WatermarkInterval}. * * @param state a {@link org.apache.gobblin.configuration.SourceState} the properties of which will be copied into this {@link WorkUnit} instance. * @param extract an {@link Extract}. * @param watermarkInterval a {@link WatermarkInterval} which defines the range of data this {@link WorkUnit} will process. * * @deprecated Properties in {@link SourceState} should not be added to a {@link WorkUnit}. Having each * {@link WorkUnit} contain a copy of {@link SourceState} is a waste of memory. Use {@link #create(Extract, WatermarkInterval)}. */ @Deprecated public WorkUnit(SourceState state, Extract extract, WatermarkInterval watermarkInterval) { this(state, extract); /** * TODO * * Hack that stores a {@link WatermarkInterval} by using its {@link WatermarkInterval#toJson()} method. Until a * state-store migration, or a new state-store format is chosen, this hack will be the way that the * {@link WatermarkInterval} is serialized / de-serialized. Once a state-store migration can be done, the * {@link Watermark} can be stored as Binary JSON. */ setProp(ConfigurationKeys.WATERMARK_INTERVAL_VALUE_KEY, watermarkInterval.toJson().toString()); }
/**
 * Collects the low and expected-high watermark values of every {@link WorkUnit} inside the given
 * {@link MultiWorkUnit} into one {@link WatermarkInterval} of {@link MultiLongWatermark}s.
 *
 * @param multiWorkUnit the container whose child work units supply the watermark values.
 * @return the combined interval; list positions correspond to child work-unit order.
 */
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  List<Long> lows = Lists.newArrayList();
  List<Long> highs = Lists.newArrayList();
  for (WorkUnit childUnit : multiWorkUnit.getWorkUnits()) {
    lows.add(childUnit.getLowWaterMark());
    highs.add(childUnit.getHighWaterMark());
  }
  return new WatermarkInterval(new MultiLongWatermark(lows), new MultiLongWatermark(highs));
}
// NOTE(review): the four statements below appear twice verbatim; the second
// `WatermarkInterval watermarkInterval` declaration would not compile in the same scope —
// this looks like a duplicated paste, confirm against the original source.
// Build the [low, expectedHigh] interval and create a WorkUnit carrying it.
WatermarkInterval watermarkInterval = new WatermarkInterval(watermark, expectedWatermark);
workUnit = WorkUnit.create(newExtract(tableType, namespace, table), watermarkInterval);
log.debug("Will be setting watermark interval to " + watermarkInterval.toJson());
// Carry the work-unit index forward from the previous run's work unit.
workUnit.setProp(WORK_UNIT_INDEX, workUnitState.getWorkunit().getProp(WORK_UNIT_INDEX));
WatermarkInterval watermarkInterval = new WatermarkInterval(watermark, expectedWatermark);
workUnit = WorkUnit.create(newExtract(tableType, namespace, table), watermarkInterval);
log.debug("Will be setting watermark interval to " + watermarkInterval.toJson());
workUnit.setProp(WORK_UNIT_INDEX, workUnitState.getWorkunit().getProp(WORK_UNIT_INDEX));
/**
 * Stores the JSON form of the given {@link WatermarkInterval} on this {@link WorkUnit} under
 * {@link ConfigurationKeys#WATERMARK_INTERVAL_VALUE_KEY}.
 *
 * @param watermarkInterval the interval whose serialized form is recorded.
 */
public void setWatermarkInterval(WatermarkInterval watermarkInterval) {
  String intervalJson = watermarkInterval.toJson().toString();
  setProp(ConfigurationKeys.WATERMARK_INTERVAL_VALUE_KEY, intervalJson);
}
// NOTE(review): the result of this expression statement is discarded — dead code unless the
// LongWatermark constructor has side effects; confirm against the original source.
new LongWatermark(-1);
prevHighWatermarks.remove(title);
// Work unit for a previously-seen dataset: low = its previous watermark, expected high = -1.
WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
workUnits.add(workUnit);
// NOTE(review): second `WorkUnit workUnit` declaration below — these statements appear to come
// from different scopes of the original file; as pasted they would not compile together.
// Work unit for a non-processed dataset: low == expected high == its previous watermark value.
WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
/** * Constructor for a {@link WorkUnit} given a {@link SourceState}, {@link Extract}, and a {@link WatermarkInterval}. * * @param state a {@link org.apache.gobblin.configuration.SourceState} the properties of which will be copied into this {@link WorkUnit} instance. * @param extract an {@link Extract}. * @param watermarkInterval a {@link WatermarkInterval} which defines the range of data this {@link WorkUnit} will process. * * @deprecated Properties in {@link SourceState} should not be added to a {@link WorkUnit}. Having each * {@link WorkUnit} contain a copy of {@link SourceState} is a waste of memory. Use {@link #create(Extract, WatermarkInterval)}. */ @Deprecated public WorkUnit(SourceState state, Extract extract, WatermarkInterval watermarkInterval) { this(state, extract); /** * TODO * * Hack that stores a {@link WatermarkInterval} by using its {@link WatermarkInterval#toJson()} method. Until a * state-store migration, or a new state-store format is chosen, this hack will be the way that the * {@link WatermarkInterval} is serialized / de-serialized. Once a state-store migration can be done, the * {@link Watermark} can be stored as Binary JSON. */ setProp(ConfigurationKeys.WATERMARK_INTERVAL_VALUE_KEY, watermarkInterval.toJson().toString()); }
/**
 * Builds a {@link WatermarkInterval} of {@link MultiLongWatermark}s from a single
 * {@link WorkUnit}; a {@link MultiWorkUnit} is delegated to the multi-work-unit variant.
 *
 * @param workUnit the work unit (or multi-work-unit) supplying the watermark values.
 * @return the interval; a plain work unit yields singleton multi-watermarks for a uniform type.
 */
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromWorkUnit(WorkUnit workUnit) {
  if (workUnit instanceof MultiWorkUnit) {
    return getWatermarkIntervalFromMultiWorkUnit((MultiWorkUnit) workUnit);
  }
  // Wrap the single unit's watermarks as one-element lists so the return type stays uniform.
  List<Long> lows = Lists.newArrayList(workUnit.getLowWaterMark());
  List<Long> highs = Lists.newArrayList(workUnit.getHighWaterMark());
  return new WatermarkInterval(new MultiLongWatermark(lows), new MultiLongWatermark(highs));
}
/**
 * Creates the initial set of {@link WorkUnit}s, one per parallel extract, each covering a
 * contiguous, non-overlapping 1-based record range of size numRecordsPerExtract.
 *
 * @return the list of freshly-created work units, indexed via WORK_UNIT_INDEX.
 */
private List<WorkUnit> initialWorkUnits() {
  List<WorkUnit> units = Lists.newArrayList();
  for (int index = 0; index < num_parallelism; index++) {
    WorkUnit unit = WorkUnit.create(newExtract(Extract.TableType.APPEND_ONLY, namespace, table));
    // Unit i covers records [i * n + 1, (i + 1) * n] where n = numRecordsPerExtract.
    LongWatermark rangeStart = new LongWatermark(index * numRecordsPerExtract + 1);
    LongWatermark rangeEnd = new LongWatermark((index + 1) * numRecordsPerExtract);
    unit.setWatermarkInterval(new WatermarkInterval(rangeStart, rangeEnd));
    unit.setProp(WORK_UNIT_INDEX, index);
    units.add(unit);
  }
  return units;
}
// Attach the [lowWatermark, expectedPartitionHighWatermark] interval to this Hive work unit.
hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
/**
 * Serializes this partition's state onto the given {@link WorkUnit}: the watermark range plus
 * optional flags for a user-specified high watermark and for being the last partition.
 *
 * @param workUnit the work unit receiving the watermark interval and flags.
 */
public void serialize(WorkUnit workUnit) {
  WatermarkInterval interval =
      new WatermarkInterval(new LongWatermark(lowWatermark), new LongWatermark(highWatermark));
  workUnit.setWatermarkInterval(interval);
  // Flags are only written when set; an absent property means false to the reader.
  if (hasUserSpecifiedHighWatermark) {
    workUnit.setProp(Partition.HAS_USER_SPECIFIED_HIGH_WATERMARK, true);
  }
  if (isLastPartition) {
    // NOTE(review): constant name carries a typo ("PARTIITON"); it is declared elsewhere, so it
    // is referenced as-is here.
    workUnit.setProp(Partition.IS_LAST_PARTIITON, true);
  }
}
// Tag the watermark-carrier work unit with the table's URN, then attach an interval spanning
// the previously-recorded multi-key watermarks to the expected per-partition watermarks.
watermarkWorkunit.setProp(ConfigurationKeys.DATASET_URN_KEY, tableKey);
watermarkWorkunit.setWatermarkInterval(new WatermarkInterval(new MultiKeyValueLongWatermark(
    this.previousWatermarks.get(tableKey)), new MultiKeyValueLongWatermark(expectedPartitionWatermarks)));
// Attach the dataset-level [lowWatermark, expectedDatasetHighWatermark] interval to this Hive work unit.
hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
/**
 * Round-trips a {@link WatermarkInterval} through {@link WorkUnit} serialization and verifies
 * that the deserialized low and expected-high watermarks retain their original values.
 */
@Test
public void testWatermarkWorkUnitSerialization() {
  long expectedLow = 0;
  long expectedHigh = 100;
  TestWatermark low = new TestWatermark();
  low.setLongWatermark(expectedLow);
  TestWatermark high = new TestWatermark();
  high.setLongWatermark(expectedHigh);

  // Deprecated constructor is used intentionally: it persists the interval onto the work unit.
  WorkUnit workUnit = new WorkUnit(null, null, new WatermarkInterval(low, high));

  TestWatermark roundTrippedLow =
      WatermarkSerializerHelper.convertJsonToWatermark(workUnit.getLowWatermark(), TestWatermark.class);
  TestWatermark roundTrippedHigh =
      WatermarkSerializerHelper.convertJsonToWatermark(workUnit.getExpectedHighWatermark(), TestWatermark.class);

  Assert.assertEquals(roundTrippedLow.getLongWatermark(), expectedLow);
  Assert.assertEquals(roundTrippedHigh.getLongWatermark(), expectedHigh);
}
/**
 * Creates a {@link WorkUnit} for the given dataset. When the dataset state store is enabled,
 * the unit's watermark interval runs from the dataset's last recorded actual high watermark
 * (or 0 when no prior state exists) up to LAST_PROCESSED_TS.
 *
 * @param dataset the dataset to build a work unit for.
 * @return the work unit; without a state store it carries no watermark interval.
 */
@Override
protected WorkUnit workUnitForDataset(Dataset dataset) {
  WorkUnit workUnit = new WorkUnit();
  if (!isDatasetStateStoreEnabled) {
    return workUnit;
  }
  JobState.DatasetState datasetState;
  try {
    datasetState =
        (JobState.DatasetState) this.fsDatasetStateStore.getLatestDatasetState(this.jobName, dataset.getUrn());
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  // Fall back to watermark 0 when this dataset has never been processed before.
  LongWatermark previousWatermark = (datasetState == null)
      ? new LongWatermark(0)
      : datasetState.getTaskStatesAsWorkUnitStates().get(0).getActualHighWatermark(LongWatermark.class);
  workUnit.setWatermarkInterval(new WatermarkInterval(previousWatermark, new LongWatermark(LAST_PROCESSED_TS)));
  return workUnit;
}
/**
 * Creates a {@link WorkUnit} for the given dataset partition. When the dataset state store is
 * enabled, state is looked up under the composite URN "datasetUrn@partitionUrn" and the unit's
 * watermark interval runs from the last recorded actual high watermark (or 0 when no prior
 * state exists) up to LAST_PROCESSED_TS.
 *
 * @param partition the dataset partition to build a work unit for.
 * @return the work unit; without a state store it carries no watermark interval.
 */
@Override
protected WorkUnit workUnitForDatasetPartition(PartitionableDataset.DatasetPartition partition) {
  WorkUnit workUnit = new WorkUnit();
  if (!isDatasetStateStoreEnabled) {
    return workUnit;
  }
  // Partition state is keyed by "<datasetUrn>@<partitionUrn>".
  String datasetUrn = partition.getDataset().getUrn() + "@" + partition.getUrn();
  JobState.DatasetState datasetState;
  try {
    datasetState =
        (JobState.DatasetState) this.fsDatasetStateStore.getLatestDatasetState(this.jobName, datasetUrn);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  // Fall back to watermark 0 when this partition has never been processed before.
  LongWatermark previousWatermark = (datasetState == null)
      ? new LongWatermark(0)
      : datasetState.getTaskStatesAsWorkUnitStates().get(0).getActualHighWatermark(LongWatermark.class);
  workUnit.setWatermarkInterval(new WatermarkInterval(previousWatermark, new LongWatermark(LAST_PROCESSED_TS)));
  return workUnit;
}
/**
 * Builds a {@link WorkUnitState} backed by an APPEND_ONLY extract whose watermark interval
 * spans 20160101235959L .. 20160102235959L, with an empty job state.
 *
 * @return the prepared work-unit state fixture.
 */
public static WorkUnitState getWorkUnitState1() {
  WorkUnit workUnit = new WorkUnit(new Extract(Extract.TableType.APPEND_ONLY, "namespace", "table"));
  workUnit.setWatermarkInterval(
      new WatermarkInterval(new LongWatermark(20160101235959L), new LongWatermark(20160102235959L)));
  return new WorkUnitState(workUnit, new State());
}
}
/**
 * Builds a mock {@link WorkUnitState} for a single Kafka topic-partition whose multi-watermark
 * interval contains exactly one low and one high value.
 *
 * @param lowWaterMark the single low watermark value.
 * @param highWaterMark the single expected high watermark value.
 * @return a {@link WorkUnitState} pre-populated with topic, partition, broker, and schema
 *         registry properties.
 */
private WorkUnitState getMockWorkUnitState(Long lowWaterMark, Long highWaterMark) {
  WorkUnit mockWorkUnit = WorkUnit.createEmpty();
  // Plain ArrayLists instead of double-brace initialization: the anonymous-subclass idiom pins
  // a reference to the enclosing test instance and mints an extra class per call site.
  ArrayList<Long> lowWatermarks = new ArrayList<>();
  lowWatermarks.add(lowWaterMark);
  ArrayList<Long> highWatermarks = new ArrayList<>();
  highWatermarks.add(highWaterMark);
  mockWorkUnit.setWatermarkInterval(
      new WatermarkInterval(new MultiLongWatermark(lowWatermarks), new MultiLongWatermark(highWatermarks)));
  WorkUnitState mockWorkUnitState = new WorkUnitState(mockWorkUnit, new State());
  mockWorkUnitState.setProp(KafkaSource.TOPIC_NAME, TEST_TOPIC_NAME);
  mockWorkUnitState.setProp(KafkaSource.PARTITION_ID, "1");
  mockWorkUnitState.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:8080");
  mockWorkUnitState.setProp(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL, TEST_URL);
  return mockWorkUnitState;
}