/**
 * Takes in an input schema of type String; the schema must be in JSON format.
 * @return a JsonArray representation of the schema
 */
@Override
public JsonArray convertSchema(String inputSchema, WorkUnitState workUnit) throws SchemaConversionException {
  this.unpackComplexSchemas =
      workUnit.getPropAsBoolean(UNPACK_COMPLEX_SCHEMAS_KEY, DEFAULT_UNPACK_COMPLEX_SCHEMAS_KEY);
  JsonParser jsonParser = new JsonParser();
  log.info("Schema: " + inputSchema);
  JsonElement jsonSchema = jsonParser.parse(inputSchema);
  return jsonSchema.getAsJsonArray();
}
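// Usage sketch (illustrative, not from the source): assuming the input schema string is a JSON
// array of column definitions, the converter simply parses it into a Gson JsonArray. The example
// schema below is hypothetical.
String exampleSchema =
    "[{\"columnName\":\"id\",\"dataType\":{\"type\":\"int\"}},"
        + "{\"columnName\":\"name\",\"dataType\":{\"type\":\"string\"}}]";
JsonArray parsed = new JsonParser().parse(exampleSchema).getAsJsonArray();
// parsed.size() == 2; each element describes one column of the intermediate schema.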
/**
 * If the partition already exists then the new partition location will be a separate timestamp dir.
 * If the partition location is /a/b/c/<oldTimeStamp> then the new partition location is /a/b/c/<currentTimeStamp>.
 * If the partition location is /a/b/c/ then the new partition location is /a/b/c/<currentTimeStamp>.
 */
public static String updatePartitionLocation(String outputDataPartitionLocation, WorkUnitState workUnitState,
    Optional<Path> destPartitionLocation) throws DataConversionException {
  if (workUnitState.getPropAsBoolean(HIVE_DATASET_PARTITION_OVERWRITE, DEFAULT_HIVE_DATASET_PARTITION_OVERWRITE)) {
    return outputDataPartitionLocation;
  }
  if (!destPartitionLocation.isPresent()) {
    return outputDataPartitionLocation;
  }
  long timeStamp = System.currentTimeMillis();
  return StringUtils.join(Arrays.asList(outputDataPartitionLocation, timeStamp), '/');
}
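// Minimal behavior sketch (schematic; the state setup and the enclosing utility class name
// "HiveConverterUtils" are assumptions, and the literal paths are placeholders): with overwrite
// disabled and an existing destination partition, a current-timestamp suffix is appended.
WorkUnitState state = new WorkUnitState();
state.setProp(HIVE_DATASET_PARTITION_OVERWRITE, false);
String withTimestamp = HiveConverterUtils.updatePartitionLocation(
    "/a/b/c", state, Optional.of(new Path("/a/b/c/1500000000000")));
// withTimestamp looks like "/a/b/c/<currentTimeStamp>"
String unchanged = HiveConverterUtils.updatePartitionLocation(
    "/a/b/c", state, Optional.<Path>absent());
// unchanged == "/a/b/c" because no destination partition exists yet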
public SalesforceExtractor(WorkUnitState state) {
  super(state);
  this.sfConnector = (SalesforceConnector) this.connector;

  // Don't allow PK chunking if max partitions is too high or the user has specified partitions
  if (state.getPropAsBoolean(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, false)
      || state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS,
          ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS) > PK_CHUNKING_MAX_PARTITIONS_LIMIT) {
    if (state.getPropAsBoolean(ENABLE_PK_CHUNKING_KEY, false)) {
      log.warn("Max partitions too high, so PK chunking is not enabled");
    }
    this.pkChunking = false;
  } else {
    this.pkChunking = state.getPropAsBoolean(ENABLE_PK_CHUNKING_KEY, false);
  }

  this.pkChunkingSize = Math.max(MIN_PK_CHUNKING_SIZE,
      Math.min(MAX_PK_CHUNKING_SIZE, state.getPropAsInt(PK_CHUNKING_SIZE_KEY, DEFAULT_PK_CHUNKING_SIZE)));

  this.pkChunkingSkipCountCheck = state.getPropAsBoolean(PK_CHUNKING_SKIP_COUNT_CHECK, DEFAULT_PK_CHUNKING_SKIP_COUNT_CHECK);
  this.bulkApiUseQueryAll = state.getPropAsBoolean(BULK_API_USE_QUERY_ALL, DEFAULT_BULK_API_USE_QUERY_ALL);

  // Get batch size from the .pull file
  int tmpBatchSize = state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_FETCH_SIZE,
      ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE);
  this.batchSize = tmpBatchSize == 0 ? ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE : tmpBatchSize;

  this.fetchRetryLimit = state.getPropAsInt(FETCH_RETRY_LIMIT_KEY, DEFAULT_FETCH_RETRY_LIMIT);
}
public TestExtractor(WorkUnitState workUnitState) {
  super(workUnitState);
  if (workUnitState.getPropAsBoolean(RAISE_ERROR, false)) {
    throw new RuntimeException(EXCEPTION_MESSAGE);
  }
}
@Override
public Converter<String, String, String, List<String>> init(WorkUnitState workUnit) {
  String stringSplitterDelimiterKey =
      ForkOperatorUtils.getPropertyNameForBranch(workUnit, ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);

  Preconditions.checkArgument(workUnit.contains(stringSplitterDelimiterKey),
      "Cannot use " + this.getClass().getName() + " without specifying "
          + ConfigurationKeys.CONVERTER_STRING_SPLITTER_DELIMITER);

  this.splitter = Splitter.on(workUnit.getProp(stringSplitterDelimiterKey));

  this.shouldTrimResults = workUnit.getPropAsBoolean(
      ConfigurationKeys.CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS,
      ConfigurationKeys.DEFAULT_CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS);

  return this;
}
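// Sketch of the underlying Guava Splitter behavior this converter builds on. The input string is
// illustrative only, and whether the converter also drops empty tokens when trimming is enabled is
// an assumption here, not something shown in the snippet above.
Splitter splitter = Splitter.on(",");
List<String> raw = Lists.newArrayList(splitter.split("a, b ,c"));
// raw == ["a", " b ", "c"]  (no trimming)
List<String> trimmed = Lists.newArrayList(splitter.trimResults().omitEmptyStrings().split("a, b ,c"));
// trimmed == ["a", "b", "c"]  (what enabling CONVERTER_STRING_SPLITTER_SHOULD_TRIM_RESULTS aims for)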
protected void addWriterOutputToExistingDir(Path writerOutputDir, Path publisherOutputDir,
    WorkUnitState workUnitState, int branchId, ParallelRunner parallelRunner) throws IOException {
  boolean preserveFileName = workUnitState.getPropAsBoolean(ForkOperatorUtils.getPropertyNameForBranch(
      ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, this.numBranches, branchId), false);

  // Go through each file in writerOutputDir and move it into publisherOutputDir
  for (FileStatus status : this.writerFileSystemByBranches.get(branchId).listStatus(writerOutputDir)) {
    // Preserve the file name if configured, use specified name otherwise
    Path finalOutputPath = preserveFileName
        ? new Path(publisherOutputDir, workUnitState.getProp(ForkOperatorUtils.getPropertyNameForBranch(
            ConfigurationKeys.DATA_PUBLISHER_FINAL_NAME, this.numBranches, branchId)))
        : new Path(publisherOutputDir, status.getPath().getName());

    movePath(parallelRunner, workUnitState, status.getPath(), finalOutputPath, branchId);
  }
}
@Override
public List<Command> getSchemaMetadata(String schema, String entity) throws SchemaException {
  log.debug("Build query to get schema");
  List<Command> commands = new ArrayList<>();

  boolean promoteUnsignedInt = this.workUnitState.getPropAsBoolean(
      ConfigurationKeys.SOURCE_QUERYBASED_PROMOTE_UNSIGNED_INT_TO_BIGINT,
      ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_PROMOTE_UNSIGNED_INT_TO_BIGINT);
  String promoteUnsignedIntQueryParam = promoteUnsignedInt ? "% unsigned" : "dummy";
  List<String> queryParams = Arrays.asList(promoteUnsignedIntQueryParam, entity, schema);

  String metadataSql = "select "
      + " col.column_name, "
      + " case when col.column_type like (?) and col.data_type = 'int' then 'bigint' else col.data_type end"
      + " as data_type,"
      + " case when CHARACTER_OCTET_LENGTH is null then 0 else 0 end as length, "
      + " case when NUMERIC_PRECISION is null then 0 else NUMERIC_PRECISION end as precesion, "
      + " case when NUMERIC_SCALE is null then 0 else NUMERIC_SCALE end as scale, "
      + " case when is_nullable='NO' then 'false' else 'true' end as nullable, "
      + " '' as format, "
      + " case when col.column_comment is null then '' else col.column_comment end as comment "
      + " from information_schema.COLUMNS col "
      + " WHERE upper(col.table_name)=upper(?) AND upper(col.table_schema)=upper(?) "
      + " order by col.ORDINAL_POSITION ";

  commands.add(getCommand(metadataSql, JdbcCommand.JdbcCommandType.QUERY));
  commands.add(getCommand(queryParams, JdbcCommand.JdbcCommandType.QUERYPARAMS));

  return commands;
}
public HiveMetadataForCompactionExtractor(WorkUnitState state, FileSystem fs)
    throws IOException, TException, HiveException {
  super(state);

  if (Boolean.valueOf(state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY))) {
    log.info("Ignoring Watermark workunit for {}", state.getProp(ConfigurationKeys.DATASET_URN_KEY));
    return;
  }

  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    Table table = client.get().getTable(this.dbName, this.tableName);

    String primaryKeyString = table.getParameters().get(state.getProp(COMPACTION_PRIMARY_KEY));
    List<String> primaryKeyList = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(primaryKeyString);

    String deltaString = table.getParameters().get(state.getProp(COMPACTION_DELTA));
    List<String> deltaList = Splitter.on(',').omitEmptyStrings().trimResults().splitToList(deltaString);

    Path dataFilesPath = new Path(table.getSd().getLocation());

    compactionEntity = new MRCompactionEntity(primaryKeyList, deltaList, dataFilesPath, state.getProperties());
  }
}
/**
 * Sets metadata to indicate whether this is the first time this table or partition is being published.
 * @param wus the WorkUnitState to set if this is the first publish for this table or partition
 */
public static void setIsFirstPublishMetadata(WorkUnitState wus) {
  if (!Boolean.valueOf(wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY))) {
    LongWatermark previousWatermark = wus.getWorkunit().getLowWatermark(LongWatermark.class);
    wus.setProp(SlaEventKeys.IS_FIRST_PUBLISH, (null == previousWatermark || previousWatermark.getValue() == 0));
  }
}
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
  if (!workUnitState.contains(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY)) {
    throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
  }

  JobConf jobConf = new JobConf(new Configuration());
  for (String key : workUnitState.getPropertyNames()) {
    jobConf.set(key, workUnitState.getProp(key));
  }

  String fileSplitBytesStr = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY);
  FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);

  FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, jobConf);
  RecordReader<K, V> recordReader = fileInputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);

  boolean readKeys = workUnitState.getPropAsBoolean(
      HadoopFileInputSource.FILE_INPUT_READ_KEYS_KEY, HadoopFileInputSource.DEFAULT_FILE_INPUT_READ_KEYS);

  return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
}
// Fragment: selecting the work unit that carries the global watermark (inside a loop over work unit states)
if (workUnitState.getPropAsBoolean(GLOBAL_WATERMARK_DATASET_KEY, false)) {
  maxWorkUnit = Optional.of(workUnitState);
  break;
}

// Fragment: reading the previous dataset/partition URNs when the selected work unit is not the end of datasets
if (maxWorkUnit.isPresent() && !maxWorkUnit.get().getPropAsBoolean(END_OF_DATASETS_KEY, false)) {
  previousDatasetUrnWatermark = maxWorkUnit.get().getProp(DATASET_URN);
  previousPartitionUrnWatermark = maxWorkUnit.get().getProp(PARTITION_URN);
}
/**
 * Check whether it's appropriate to remove the data pull upper bounds in the last work unit, so that as much data
 * as possible is fetched from the source. Between the time the data query was created and the time it is executed,
 * new data may have been generated in the source; removing the upper bounds helps pick up that new data.
 *
 * Note: some duplicate data between runs is expected as a result of removing the upper bounds.
 *
 * @return whether the upper bounds should be removed
 */
private boolean shouldRemoveDataPullUpperBounds() {
  if (!this.workUnitState.getPropAsBoolean(ConfigurationKeys.SOURCE_QUERYBASED_ALLOW_REMOVE_UPPER_BOUNDS, true)) {
    return false;
  }

  // Only consider the last work unit
  if (!partition.isLastPartition()) {
    return false;
  }

  // Don't remove the upper bounds if the user specified one or it was recorded in a previous run
  if (partition.getHasUserSpecifiedHighWatermark()
      || this.workUnitState.getProp(ConfigurationKeys.WORK_UNIT_STATE_ACTUAL_HIGH_WATER_MARK_KEY) != null) {
    return false;
  }

  return true;
}
/**
 * Sets the actual high watermark by reading the expected high watermark.
 * {@inheritDoc}
 * @see org.apache.gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarker#setActualHighWatermark(org.apache.gobblin.configuration.WorkUnitState)
 */
@Override
public void setActualHighWatermark(WorkUnitState wus) {
  if (Boolean.valueOf(wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY))) {
    wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(MultiKeyValueLongWatermark.class));
  } else {
    wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(LongWatermark.class));
  }
}
public HiveConvertExtractor(WorkUnitState state, FileSystem fs) throws IOException, TException, HiveException {
  super(state);

  if (Boolean.valueOf(state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY))) {
    log.info("Ignoring Watermark workunit for {}", state.getProp(ConfigurationKeys.DATASET_URN_KEY));
    return;
  }

  if (!(this.hiveDataset instanceof ConvertibleHiveDataset)) {
    throw new IllegalStateException("HiveConvertExtractor is only compatible with ConvertibleHiveDataset");
  }

  ConvertibleHiveDataset convertibleHiveDataset = (ConvertibleHiveDataset) this.hiveDataset;

  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    Table table = client.get().getTable(this.dbName, this.tableName);
    SchemaAwareHiveTable schemaAwareHiveTable = new SchemaAwareHiveTable(table,
        AvroSchemaManager.getSchemaFromUrl(this.hiveWorkUnit.getTableSchemaUrl(), fs));

    SchemaAwareHivePartition schemaAwareHivePartition = null;
    if (this.hiveWorkUnit.getPartitionName().isPresent() && this.hiveWorkUnit.getPartitionSchemaUrl().isPresent()) {
      Partition partition = client.get().getPartition(this.dbName, this.tableName,
          this.hiveWorkUnit.getPartitionName().get());
      schemaAwareHivePartition = new SchemaAwareHivePartition(table, partition,
          AvroSchemaManager.getSchemaFromUrl(this.hiveWorkUnit.getPartitionSchemaUrl().get(), fs));
    }

    QueryBasedHiveConversionEntity entity = new QueryBasedHiveConversionEntity(convertibleHiveDataset,
        schemaAwareHiveTable, Optional.fromNullable(schemaAwareHivePartition));
    this.conversionEntities.add(entity);
  }
}
public HiveBaseExtractor(WorkUnitState state) throws IOException {
  if (Boolean.valueOf(state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY))) {
    return;
  }

  this.hiveWorkUnit = new HiveWorkUnit(state.getWorkunit());
  this.hiveDataset = hiveWorkUnit.getHiveDataset();
  this.dbName = hiveDataset.getDbAndTable().getDb();
  this.tableName = hiveDataset.getDbAndTable().getTable();
  this.pool = HiveMetastoreClientPool.get(state.getJobState().getProperties(),
      Optional.fromNullable(state.getJobState().getProp(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
}
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
  if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
    throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
  }

  Configuration configuration = new Configuration();
  FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, configuration);

  String fileSplitBytesStr = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
  FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
  TaskAttemptContext taskAttemptContext =
      getTaskAttemptContext(configuration, DummyTaskAttemptIDFactory.newTaskAttemptID());
  try {
    RecordReader<K, V> recordReader = fileInputFormat.createRecordReader(fileSplit, taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
  } catch (InterruptedException ie) {
    throw new IOException(ie);
  }
}
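// Complementary sketch of how the corresponding Source side might stash the split in the work unit
// before this extractor runs. It assumes HadoopUtils exposes a serializeToString helper matching
// the deserializeFromString call above (treat that as an assumption, not a confirmed API), and the
// path and sizes are placeholders.
FileSplit fileSplit = new FileSplit(new Path("/data/input/part-00000"), 0L, 1024L, (String[]) null);
workUnitState.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));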
this.statusCount = this.workUnit.getPropAsInt(ConfigurationKeys.FILEBASED_REPORT_STATUS_ON_COUNT,
    ConfigurationKeys.DEFAULT_FILEBASED_REPORT_STATUS_ON_COUNT);
this.shouldSkipFirstRecord = this.workUnitState.getPropAsBoolean(ConfigurationKeys.SOURCE_SKIP_FIRST_RECORD, false);
public AsyncHttpJoinConverter init(WorkUnitState workUnitState) {
  super.init(workUnitState);
  Config config = ConfigBuilder.create().loadProps(workUnitState.getProperties(), CONF_PREFIX).build();
  config = config.withFallback(DEFAULT_FALLBACK);

  skipFailedRecord = workUnitState.getPropAsBoolean(ConfigurationKeys.CONVERTER_SKIP_FAILED_RECORD, false);

  httpClient = createHttpClient(config, workUnitState.getTaskBroker());
  responseHandler = createResponseHandler(config);
  requestBuilder = createRequestBuilder(config);
  return this;
}
public SimpleJsonExtractor(WorkUnitState workUnitState) throws FileSystemException {
  this.workUnitState = workUnitState;

  // Resolve the file to pull
  if (workUnitState.getPropAsBoolean(ConfigurationKeys.SOURCE_CONN_USE_AUTHENTICATION, false)) {
    // Add authentication credential if authentication is needed
    UserAuthenticator auth = new StaticUserAuthenticator(
        workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_DOMAIN, ""),
        workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USERNAME),
        PasswordManager.getInstance(workUnitState)
            .readPassword(workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_PASSWORD)));
    FileSystemOptions opts = new FileSystemOptions();
    DefaultFileSystemConfigBuilder.getInstance().setUserAuthenticator(opts, auth);
    this.fileObject = VFS.getManager().resolveFile(workUnitState.getProp(SOURCE_FILE_KEY), opts);
  } else {
    this.fileObject = VFS.getManager().resolveFile(workUnitState.getProp(SOURCE_FILE_KEY));
  }

  // Open the file for reading
  LOGGER.info("Opening file " + this.fileObject.getURL().toString());
  this.bufferedReader = this.closer.register(new BufferedReader(
      new InputStreamReader(this.fileObject.getContent().getInputStream(),
          ConfigurationKeys.DEFAULT_CHARSET_ENCODING)));
}
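// Hypothetical configuration sketch: properties one might set on the WorkUnitState before
// constructing this extractor so the VFS file is resolved with static credentials. The key
// constants are the ones referenced above; the URL, domain, and credential values are placeholders.
workUnitState.setProp(SOURCE_FILE_KEY, "ftp://host/path/data.json");
workUnitState.setProp(ConfigurationKeys.SOURCE_CONN_USE_AUTHENTICATION, true);
workUnitState.setProp(ConfigurationKeys.SOURCE_CONN_DOMAIN, "example-domain");
workUnitState.setProp(ConfigurationKeys.SOURCE_CONN_USERNAME, "reader");
workUnitState.setProp(ConfigurationKeys.SOURCE_CONN_PASSWORD, "encrypted-or-plain-password");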
@Override
public Schema convertSchema(JsonArray schema, WorkUnitState workUnit) throws SchemaConversionException {
  try {
    JsonSchema jsonSchema = new JsonSchema(schema);
    jsonSchema.setColumnName(workUnit.getExtract().getTable());
    recordConverter = new RecordConverter(jsonSchema, workUnit, workUnit.getExtract().getNamespace());
  } catch (UnsupportedDateTypeException e) {
    throw new SchemaConversionException(e);
  }
  Schema recordSchema = recordConverter.schema();

  if (workUnit.getPropAsBoolean(CONVERTER_AVRO_NULLIFY_FIELDS_ENABLED, DEFAULT_CONVERTER_AVRO_NULLIFY_FIELDS_ENABLED)) {
    return this.generateSchemaWithNullifiedField(workUnit, recordSchema);
  }
  return recordSchema;
}