/**
 * Get the actual high {@link Watermark} as a {@link JsonElement}.
 *
 * @return a {@link JsonElement} representing the actual high {@link Watermark},
 *         or {@code null} if the actual high {@link Watermark} is not set.
 */
public JsonElement getActualHighWatermark() {
  if (!contains(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY)) {
    return null;
  }
  return JSON_PARSER.parse(getProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY));
}
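// A minimal usage sketch of the method above: round-trip the JSON form back into a
// typed Watermark. Assumes a LongWatermark was set earlier; WatermarkSerializerHelper
// is the same helper exercised in the serialization test further down.
WorkUnitState state = new WorkUnitState();
state.setActualHighWatermark(new LongWatermark(100L));

JsonElement json = state.getActualHighWatermark();
if (json != null) {
  LongWatermark restored =
      WatermarkSerializerHelper.convertJsonToWatermark(json, LongWatermark.class);
  // restored.getValue() == 100L
}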
/**
 * Copy the {@link WorkUnitState} so that the work unit also contains the job state.
 * FileBasedExtractor needs properties from the job state (mostly source.* properties),
 * which have already been removed from the work unit by the time execution reaches here.
 *
 * @param src the {@link WorkUnitState} to copy
 * @return a copy of {@code src} whose work unit also carries the job state properties
 */
private WorkUnitState copyOf(WorkUnitState src) {
  WorkUnit copiedWorkUnit = WorkUnit.copyOf(src.getWorkunit());
  copiedWorkUnit.addAllIfNotExist(src.getJobState());
  WorkUnitState workUnitState = new WorkUnitState(copiedWorkUnit, src.getJobState());
  workUnitState.addAll(src);
  return workUnitState;
}
/**
 * Sets metadata to indicate whether this is the first time this table or partition is being published.
 *
 * @param wus the {@link WorkUnitState} on which to set the first-publish flag for this table or partition
 */
public static void setIsFirstPublishMetadata(WorkUnitState wus) {
  if (!wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY)) {
    LongWatermark previousWatermark = wus.getWorkunit().getLowWatermark(LongWatermark.class);
    wus.setProp(SlaEventKeys.IS_FIRST_PUBLISH, (null == previousWatermark || previousWatermark.getValue() == 0));
  }
}
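// Hedged downstream sketch: how a publisher-side hook might consult the flag set above.
// The explicit "false" default is an assumption for illustration; the snippet above
// only shows the flag being written.
boolean isFirstPublish = wus.getPropAsBoolean(SlaEventKeys.IS_FIRST_PUBLISH, false);
if (isFirstPublish) {
  // e.g. emit a first-publish SLA event for this table or partition
}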
public ImmutableWorkUnitState(WorkUnitState workUnitState) {
  super(workUnitState.getWorkunit(), workUnitState.getJobState());
  super.addAll(workUnitState.getSpecProperties());
}
public TaskState(WorkUnitState workUnitState) {
  // Since getWorkunit() returns an immutable WorkUnit object,
  // the WorkUnit object in this object is also immutable.
  super(workUnitState.getWorkunit(), workUnitState.getJobState(), workUnitState.getTaskBrokerNullable());
  addAll(workUnitState);
  this.jobId = workUnitState.getProp(ConfigurationKeys.JOB_ID_KEY);
  this.taskId = workUnitState.getProp(ConfigurationKeys.TASK_ID_KEY);
  this.taskKey = workUnitState.getProp(ConfigurationKeys.TASK_KEY_KEY, "unknown_task_key");
  this.taskAttemptId = Optional.fromNullable(workUnitState.getProp(ConfigurationKeys.TASK_ATTEMPT_ID_KEY));
  this.setId(this.taskId);
}
public void testConversionWithJsonTemplate() throws DataConversionException, IOException, JSONException {
  JsonParser parser = new JsonParser();
  String expectedResourceKey = "/sobject/user/John";
  String expectedJsonStr = "{ \"name\" : \"John\", \"favoriteNumber\" : 9, \"city\" : \"Mountain view\" }";
  RestEntry<JsonObject> expected =
      new RestEntry<JsonObject>(expectedResourceKey, parser.parse(expectedJsonStr).getAsJsonObject());

  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setProp(AvroToRestJsonEntryConverter.CONVERTER_AVRO_REST_ENTRY_RESOURCE_KEY, "/sobject/user/${name}");
  String template = "name=${name},favoriteNumber=${favorite_number},city=${address.city}";
  workUnitState.setProp(AvroToRestJsonEntryConverter.CONVERTER_AVRO_REST_JSON_ENTRY_TEMPLATE, template);

  testConversion(expected, workUnitState);
}
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
  if (!workUnitState.contains(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY)) {
    throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
  }

  JobConf jobConf = new JobConf(new Configuration());
  for (String key : workUnitState.getPropertyNames()) {
    jobConf.set(key, workUnitState.getProp(key));
  }

  String fileSplitBytesStr = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY);
  FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
  FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, jobConf);
  RecordReader<K, V> recordReader = fileInputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
  boolean readKeys = workUnitState.getPropAsBoolean(
      HadoopFileInputSource.FILE_INPUT_READ_KEYS_KEY, HadoopFileInputSource.DEFAULT_FILE_INPUT_READ_KEYS);
  return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
}
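// Source-side sketch: how the serialized split checked for above might be attached to a
// work unit. HadoopUtils.serializeToString is assumed to be the inverse of the
// deserializeFromString call used in the extractor; fileSplit stands for a FileSplit
// computed by the source when it partitions the input.
WorkUnit workUnit = WorkUnit.createEmpty();
workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY,
    HadoopUtils.serializeToString(fileSplit));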
@Override
public Converter<String, String, String, List<String>> init(WorkUnitState workUnit) {
  String stringSplitterDelimiterKey = ForkOperatorUtils.getPropertyNameForBranch(workUnit,
      ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);

  Preconditions.checkArgument(workUnit.contains(stringSplitterDelimiterKey),
      "Cannot use " + this.getClass().getName() + " without specifying "
          + ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);

  this.splitter = Splitter.on(workUnit.getProp(stringSplitterDelimiterKey));

  this.shouldTrimResults = workUnit.getPropAsBoolean(
      ConfigurationKeys.CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS,
      ConfigurationKeys.DEFAULT_CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS);

  return this;
}
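// Usage sketch for the init method above. The converter class name
// StringSplitterConverter is an assumption (the snippet does not show the enclosing
// class); in an unbranched job, ForkOperatorUtils resolves the branch-specific
// property name to the plain key set here.
WorkUnitState workUnitState = new WorkUnitState();
workUnitState.setProp(ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER, "\t");
Converter<String, String, String, List<String>> converter =
    new StringSplitterConverter().init(workUnitState);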
private String readProp(String key, WorkUnitState workUnitState) {
  String value = workUnitState.getWorkunit().getProp(key);
  if (StringUtils.isBlank(value)) {
    value = workUnitState.getProp(key);
  }
  if (StringUtils.isBlank(value)) {
    value = workUnitState.getJobState().getProp(key);
  }
  return value;
}
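// Illustration of the lookup order above: the work unit wins, then the state itself,
// then the job state. WorkUnit.createEmpty() and the WorkUnitState(WorkUnit)
// constructor are assumed here purely to keep the sketch self-contained.
WorkUnit workUnit = WorkUnit.createEmpty();
workUnit.setProp("my.key", "fromWorkUnit");
WorkUnitState wus = new WorkUnitState(workUnit);
wus.setProp("other.key", "fromState");
// readProp("my.key", wus)    -> "fromWorkUnit" (found on the work unit first)
// readProp("other.key", wus) -> "fromState"    (falls through to the state itself)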
@Test
public void testAppendsMetadataWithNormalRecord() throws IOException {
  state = new WorkUnitState();
  dummyWriter = new MetadataDummyWriter();
  writer = new MetadataWriterWrapper<>(dummyWriter, byte[].class, 1, 0, state.getJobState());

  byte[] recordBytes = new byte[]{'a', 'b', 'c', 'd'};
  writer.write(recordBytes);
  writer.commit();

  String writerMetadata = state.getProp(ConfigurationKeys.WRITER_METADATA_KEY);
  Assert.assertNotNull(writerMetadata, "Expected there to be metadata");
  Assert.assertNotEquals(writerMetadata.indexOf("\"default-encoding\""), -1,
      "Expected to find default metadata in metadata");
}
public HiveMetadataForCompactionExtractor(WorkUnitState state, FileSystem fs)
    throws IOException, TException, HiveException {
  super(state);

  if (state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    log.info("Ignoring Watermark workunit for {}", state.getProp(ConfigurationKeys.DATASET_URN_KEY));
    return;
  }

  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    Table table = client.get().getTable(this.dbName, this.tableName);

    String primaryKeyString = table.getParameters().get(state.getProp(COMPACTION_PRIMARY_KEY));
    List<String> primaryKeyList = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(primaryKeyString);
    String deltaString = table.getParameters().get(state.getProp(COMPACTION_DELTA));
    List<String> deltaList = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(deltaString);

    Path dataFilesPath = new Path(table.getSd().getLocation());
    compactionEntity = new MRCompactionEntity(primaryKeyList, deltaList, dataFilesPath, state.getProperties());
  }
}
@Override
public String getSchema() throws IOException {
  // The source is responsible for setting SOURCE_SCHEMA
  ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
  IOUtils.copyBytes(fs.open(new Path(workUnitState.getProp(ConfigurationKeys.SOURCE_SCHEMA))),
      outputStream, 4096, false);
  String schema = new String(outputStream.toByteArray(), StandardCharsets.UTF_8);
  workUnitState.setProp(ConfigurationKeys.CONVERTER_AVRO_SCHEMA_KEY, schema);
  return schema;
}
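// Source-side sketch: SOURCE_SCHEMA is read above as a filesystem path, so the source
// would set something like this when building the work unit (the schema path is
// illustrative only):
workUnit.setProp(ConfigurationKeys.SOURCE_SCHEMA, "/data/schemas/user.avsc");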
@Test
public void testConvertRecord() throws DataConversionException {
  TextToStringConverter textToStringConverter =
      (TextToStringConverter) new TextToStringConverter().init(new WorkUnitState());
  Text text = new Text("test");

  Iterator<String> iterator = textToStringConverter.convertRecord(null, text, new WorkUnitState()).iterator();
  Assert.assertTrue(iterator.hasNext());

  String textString = iterator.next();
  Assert.assertEquals(textString, text.toString());
  Assert.assertFalse(iterator.hasNext());
}
/**
 * Gets an iterator from the protocol-specific API if is.specific.api.active is false,
 * or from the source-specific API if is.specific.api.active is true.
 *
 * @return iterator over the extracted records
 */
private Iterator<D> getIterator() throws DataRecordException, IOException {
  if (Boolean.valueOf(this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_SPECIFIC_API_ACTIVE))) {
    return this.getRecordSetFromSourceApi(this.schema, this.entity, this.workUnit, this.predicateList);
  }
  return this.getRecordSet(this.schema, this.entity, this.workUnit, this.predicateList);
}
public QueryBasedExtractor(WorkUnitState workUnitState) {
  this.workUnitState = workUnitState;
  this.workUnit = this.workUnitState.getWorkunit();
  this.schema = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
  this.entity = this.workUnitState.getProp(ConfigurationKeys.SOURCE_ENTITY);
  partition = Partition.deserialize(workUnit);
  MDC.put("tableName", getWorkUnitName());
}
/**
 * Sets the actual high watermark by reading the expected high watermark.
 *
 * {@inheritDoc}
 * @see org.apache.gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarker#setActualHighWatermark(org.apache.gobblin.configuration.WorkUnitState)
 */
@Override
public void setActualHighWatermark(WorkUnitState wus) {
  if (wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY)) {
    wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(MultiKeyValueLongWatermark.class));
  } else {
    wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(LongWatermark.class));
  }
}
@Override
public Extractor<String, Object> getExtractor(WorkUnitState state) throws IOException {
  Config config = ConfigFactory.parseProperties(state.getProperties());
  configureIfNeeded(config);

  final LongWatermark lowWatermark = state.getWorkunit().getLowWatermark(LongWatermark.class);
  final WorkUnitState workUnitState = state;
  final int index = state.getPropAsInt(WORK_UNIT_INDEX);
  final TestBatchExtractor extractor =
      new TestBatchExtractor(index, lowWatermark, numRecordsPerExtract, sleepTimePerRecord, workUnitState);

  if (!streaming) {
    return extractor;
  }
  return (Extractor) new TestStreamingExtractor(extractor);
}
@Test
public void testWatermarkWorkUnitStateSerialization() {
  long actualHighWatermarkValue = 50;
  TestWatermark actualHighWatermark = new TestWatermark();
  actualHighWatermark.setLongWatermark(actualHighWatermarkValue);

  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setActualHighWatermark(actualHighWatermark);

  TestWatermark deserializedActualHighWatermark =
      WatermarkSerializerHelper.convertJsonToWatermark(workUnitState.getActualHighWatermark(), TestWatermark.class);
  Assert.assertEquals(deserializedActualHighWatermark.getLongWatermark(), actualHighWatermarkValue);
}
@BeforeMethod
public void setUp() {
  state = new WorkUnitState();
  dummyWriter = new DummyWriter();
  writer = new MetadataWriterWrapper<>(dummyWriter, byte[].class, 1, 0, state.getJobState());
}
protected void addWriterOutputToExistingDir(Path writerOutputDir, Path publisherOutputDir,
    WorkUnitState workUnitState, int branchId, ParallelRunner parallelRunner) throws IOException {
  boolean preserveFileName = workUnitState.getPropAsBoolean(ForkOperatorUtils.getPropertyNameForBranch(
      ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, this.numBranches, branchId), false);

  // Go through each file in writerOutputDir and move it into publisherOutputDir
  for (FileStatus status : this.writerFileSystemByBranches.get(branchId).listStatus(writerOutputDir)) {
    // If preserving the original file name, use the name carried in DATA_PUBLISHER_FINAL_NAME;
    // otherwise keep the writer's output file name
    Path finalOutputPath = preserveFileName
        ? new Path(publisherOutputDir, workUnitState.getProp(ForkOperatorUtils.getPropertyNameForBranch(
            ConfigurationKeys.DATA_PUBLISHER_FINAL_NAME, this.numBranches, branchId)))
        : new Path(publisherOutputDir, status.getPath().getName());

    movePath(parallelRunner, workUnitState, status.getPath(), finalOutputPath, branchId);
  }
}
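// Configuration sketch for the method above: enabling original-file-name preservation
// in a single-branch job. With one branch, ForkOperatorUtils resolves the branch key to
// the plain key used here; the file name carried under DATA_PUBLISHER_FINAL_NAME is an
// illustrative assumption (it would normally be set by the source, not by hand).
state.setProp(ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, "true");
state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_NAME, "events_2024-01-01.avro");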