@Override
public boolean apply(@Nonnull WorkUnitState input) {
  return input.contains(IS_WATERMARK_WORKUNIT_KEY);
}
};
/**
 * Get the actual high {@link Watermark} as a {@link JsonElement}.
 *
 * @return a {@link JsonElement} representing the actual high {@link Watermark},
 *         or {@code null} if the actual high {@link Watermark} is not set.
 */
public JsonElement getActualHighWatermark() {
  if (!contains(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY)) {
    return null;
  }
  return JSON_PARSER.parse(getProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY));
}
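A minimal caller-side sketch (not from the source) of handling the nullable return value; the workUnitState variable and the logger are assumed to be in scope:

// Read the actual high watermark committed on this work unit; the method
// returns null when no watermark has been recorded yet.
JsonElement actualHighWatermark = workUnitState.getActualHighWatermark();
if (actualHighWatermark != null) {
  // How the JSON is interpreted depends on the Watermark implementation the job uses.
  log.info("Actual high watermark: {}", actualHighWatermark);
}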
/**
 * Ignore the input schema and parse the Avro schema from config.
 */
@Override
public Schema convertSchema(SI inputSchema, WorkUnitState workUnit) throws SchemaConversionException {
  Preconditions.checkArgument(workUnit.contains(ConfigurationKeys.CONVERTER_AVRO_SCHEMA_KEY));
  this.schema = new Schema.Parser().parse(workUnit.getProp(ConfigurationKeys.CONVERTER_AVRO_SCHEMA_KEY));
  return this.schema;
}
@Override
public ObjectStoreDeleteConverter init(WorkUnitState workUnit) {
  Preconditions.checkArgument(workUnit.contains(OBJECT_ID_FIELD),
      String.format("%s is a required property.", OBJECT_ID_FIELD));
  this.objectIdField = workUnit.getProp(OBJECT_ID_FIELD);
  return this;
}
@Override
protected Schema getLatestSchemaByTopic(String topic) {
  String key = STATIC_SCHEMA_ROOT_KEY + "." + topic;
  Preconditions.checkArgument(this.workUnitState.contains(key),
      String.format("Could not find schema for topic %s. Looking for key %s.", topic, key));
  return new Schema.Parser().parse(this.workUnitState.getProp(key));
}
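A hedged configuration sketch (not from the source) of how a static schema might be registered so the lookup above succeeds; the topic name "myTopic", the workUnitState variable, and the schemaJson string are illustrative assumptions:

// Store the Avro schema JSON under STATIC_SCHEMA_ROOT_KEY.<topic>, which is the
// key that getLatestSchemaByTopic(topic) resolves.
workUnitState.setProp(STATIC_SCHEMA_ROOT_KEY + ".myTopic", schemaJson);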
@Override
public Converter<Class<String>, Class<String>, String, String> init(WorkUnitState workUnit) {
  String stringSplitterDelimiterKey = ForkOperatorUtils.getPropertyNameForBranch(
      workUnit, ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);
  Preconditions.checkArgument(workUnit.contains(stringSplitterDelimiterKey),
      "Cannot use " + this.getClass().getName() + " without specifying "
          + ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);
  this.splitter = Splitter.on(workUnit.getProp(stringSplitterDelimiterKey)).omitEmptyStrings();
  return this;
}
@Override
public Converter<Schema, Schema, GenericRecord, Object> init(WorkUnitState workUnit) {
  String fieldPathKey = ForkOperatorUtils.getPropertyNameForBranch(workUnit,
      ConfigurationKeys.CONVERTER_AVRO_EXTRACTOR_FIELD_PATH);
  Preconditions.checkArgument(workUnit.contains(fieldPathKey),
      "The converter " + this.getClass().getName() + " cannot be used without setting the property "
          + ConfigurationKeys.CONVERTER_AVRO_EXTRACTOR_FIELD_PATH);
  this.fieldLocation = workUnit.getProp(fieldPathKey);
  return this;
}
@Override
public Converter<String, String, Object, RawJsonDocument> init(WorkUnitState workUnit) {
  String keyFieldPath = ForkOperatorUtils.getPropertyNameForBranch(workUnit, KEY_FIELD_CONFIG);
  if (!workUnit.contains(keyFieldPath)) {
    log.warn("No configuration for which field to use as the key. Using the default {}", this.keyField);
  } else {
    this.keyField = workUnit.getProp(keyFieldPath);
    log.info("Using the field {} from config for writing converter", this.keyField);
  }
  return this;
}
public KafkaAvroExtractor(WorkUnitState state) {
  super(state);
  this.schemaRegistry = state.contains(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS)
      ? Optional.of(KafkaSchemaRegistry.<K, Schema>get(state.getProperties()))
      : Optional.<KafkaSchemaRegistry<K, Schema>>absent();
  this.schema = getExtractorSchema();
  if (this.schema.isPresent()) {
    this.reader = Optional.of(new GenericDatumReader<Record>(this.schema.get()));
  } else {
    log.error(String.format("Cannot find latest schema for topic %s. This topic will be skipped", this.topicName));
    this.reader = Optional.absent();
  }
}
@Override
public Converter<String, String, String, List<String>> init(WorkUnitState workUnit) {
  String stringSplitterDelimiterKey = ForkOperatorUtils.getPropertyNameForBranch(workUnit,
      ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);
  Preconditions.checkArgument(workUnit.contains(stringSplitterDelimiterKey),
      "Cannot use " + this.getClass().getName() + " without specifying "
          + ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);
  this.splitter = Splitter.on(workUnit.getProp(stringSplitterDelimiterKey));
  this.shouldTrimResults = workUnit.getPropAsBoolean(
      ConfigurationKeys.CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS,
      ConfigurationKeys.DEFAULT_CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS);
  return this;
}
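A hedged setup sketch (not from the source) showing how the two splitter properties might be supplied when constructing a WorkUnitState directly, for instance in a test; the "," delimiter is illustrative:

// init() requires the delimiter property and fails its precondition check otherwise;
// the trim flag is optional and falls back to the documented default.
WorkUnitState workUnit = new WorkUnitState();
workUnit.setProp(ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER, ",");
workUnit.setProp(ConfigurationKeys.CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS, true);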
@Override
public Converter<SI, String, DI, DI> init(WorkUnitState workUnit) {
  super.init(workUnit);
  Preconditions.checkArgument(workUnit.contains(SCHEMA_KEY));
  this.schema = workUnit.getProp(SCHEMA_KEY);
  return this;
}
/**
 * Update the watermark column property if there is an alias defined in the query.
 *
 * @param srcColumnName source column name
 * @param tgtColumnName target column name
 */
private void updateDeltaFieldConfig(String srcColumnName, String tgtColumnName) {
  if (this.workUnitState.contains(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY)) {
    String watermarkCol = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY);
    // Note: replaceAll treats srcColumnName as a regular expression.
    this.workUnitState.setProp(ConfigurationKeys.EXTRACT_DELTA_FIELDS_KEY,
        watermarkCol.replaceAll(srcColumnName, tgtColumnName));
  }
}
/**
 * Update the primary key column property if there is an alias defined in the query.
 *
 * @param srcColumnName source column name
 * @param tgtColumnName target column name
 */
private void updatePrimaryKeyConfig(String srcColumnName, String tgtColumnName) {
  if (this.workUnitState.contains(ConfigurationKeys.EXTRACT_PRIMARY_KEY_FIELDS_KEY)) {
    String primaryKey = this.workUnitState.getProp(ConfigurationKeys.EXTRACT_PRIMARY_KEY_FIELDS_KEY);
    // Note: replaceAll treats srcColumnName as a regular expression.
    this.workUnitState.setProp(ConfigurationKeys.EXTRACT_PRIMARY_KEY_FIELDS_KEY,
        primaryKey.replaceAll(srcColumnName, tgtColumnName));
  }
}
@Override
public Schema convertSchema(S schemaIn, WorkUnitState workUnit) throws SchemaConversionException {
  Preconditions.checkArgument(workUnit.contains(KafkaSource.TOPIC_NAME), "Must specify topic name.");
  String topic = workUnit.getProp(KafkaSource.TOPIC_NAME);
  try {
    return (Schema) this.schemaRegistry.getLatestSchema(topic);
  } catch (IOException | SchemaRegistryException e) {
    throw new SchemaConversionException(e);
  }
}
@Override
public Extractor<S, D> getExtractor(WorkUnitState state) throws IOException {
  Preconditions.checkArgument(state.contains(EXTRACTOR_TYPE), "Missing key " + EXTRACTOR_TYPE);
  try {
    ClassAliasResolver<KafkaExtractor> aliasResolver = new ClassAliasResolver<>(KafkaExtractor.class);
    Class<? extends KafkaExtractor> klazz = aliasResolver.resolveClass(state.getProp(EXTRACTOR_TYPE));
    return GobblinConstructorUtils.invokeLongestConstructor(klazz, state);
  } catch (ReflectiveOperationException e) {
    throw new RuntimeException(e);
  }
}
}
/**
 * Get a property as a long from a work unit that may or may not come from a MultiWorkUnit.
 * This method is needed because the SingleLevelWorkUnitPacker does not squeeze work units
 * into a MultiWorkUnit, and thus does not append the partitionId to property keys, while
 * the BiLevelWorkUnitPacker does.
 * Returns 0 as the default if the key is not found in either form.
 */
public static long getPropAsLongFromSingleOrMultiWorkUnitState(WorkUnitState workUnitState,
    String key, int partitionId) {
  return Long.parseLong(workUnitState.contains(key) ? workUnitState.getProp(key)
      : workUnitState.getProp(KafkaUtils.getPartitionPropName(key, partitionId), "0"));
}
}
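A minimal usage sketch (not from the source); the workUnitState variable, the property key, and the partition id are illustrative placeholders:

// Resolves the property under the plain key (single-level packing) or under the
// partition-scoped key produced by KafkaUtils.getPartitionPropName (bi-level packing),
// falling back to 0 when neither is present.
long avgRecordSize = KafkaUtils.getPropAsLongFromSingleOrMultiWorkUnitState(
    workUnitState, "topic.avg.record.size", 0); // hypothetical key name and partition id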
@Override
public Extractor<String, String> getExtractor(WorkUnitState state) throws IOException {
  if (!state.contains(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS)) {
    state.setProp(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS, TokenizedFileDownloader.class.getName());
  }
  return new FileBasedExtractor<>(state, new HadoopFsHelper(state));
}
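A hedged override sketch (not from the source); MyCustomDownloader is a hypothetical downloader implementation supplied by the job:

// Pre-setting the optional downloader class keeps getExtractor() from falling back
// to TokenizedFileDownloader.
state.setProp(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS, MyCustomDownloader.class.getName());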
public KafkaSimpleStreamingExtractor(WorkUnitState state) {
  super(state);
  _consumer = KafkaSimpleStreamingSource.getKafkaConsumer(ConfigUtils.propertiesToConfig(state.getProperties()));
  closer.register(_consumer);
  _partition = new TopicPartition(KafkaSimpleStreamingSource.getTopicNameFromState(state),
      KafkaSimpleStreamingSource.getPartitionIdFromState(state));
  _consumer.assign(Collections.singletonList(_partition));
  this._schemaRegistry = state.contains(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS)
      ? Optional.of(KafkaSchemaRegistry.<String, S>get(state.getProperties()))
      : Optional.<KafkaSchemaRegistry<String, S>>absent();
  this.fetchTimeOut = state.getPropAsLong(AbstractBaseKafkaConsumerClient.CONFIG_KAFKA_FETCH_TIMEOUT_VALUE,
      AbstractBaseKafkaConsumerClient.CONFIG_KAFKA_FETCH_TIMEOUT_VALUE_DEFAULT);
}
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
  if (!workUnitState.contains(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY)) {
    throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
  }
  JobConf jobConf = new JobConf(new Configuration());
  for (String key : workUnitState.getPropertyNames()) {
    jobConf.set(key, workUnitState.getProp(key));
  }
  String fileSplitBytesStr = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY);
  FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
  FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, jobConf);
  RecordReader<K, V> recordReader = fileInputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
  boolean readKeys = workUnitState.getPropAsBoolean(
      HadoopFileInputSource.FILE_INPUT_READ_KEYS_KEY, HadoopFileInputSource.DEFAULT_FILE_INPUT_READ_KEYS);
  return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
}
/**
 * Test that lineage info is set when publishing a single task.
 */
@Test
public void testPublishSingleTask() throws IOException {
  WorkUnitState state = buildTaskState(1);
  LineageInfo lineageInfo = LineageInfo.getLineageInfo(state.getTaskBroker()).get();
  DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.setSource(source, state);
  BaseDataPublisher publisher = new BaseDataPublisher(state);
  publisher.publishData(state);
  Assert.assertTrue(state.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertFalse(state.contains("gobblin.event.lineage.branch.1.destination"));
}