/**
 * Returns the number of fork branches configured for this work unit,
 * defaulting to a single branch when the property is absent.
 */
@Override
public int getBranches(WorkUnitState workUnitState) {
  return workUnitState.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);
}
/**
 * Creates an extractor whose run is bounded either by a configured duration
 * (seconds) or, when no duration is set, by a fixed number of records.
 *
 * @param state work unit state carrying the extractor configuration
 */
public ExtractorImpl(WorkUnitState state) {
  this.random = new Random();
  this.startTime = System.currentTimeMillis();
  int runDuration = state.getPropAsInt(RUN_DURATION_KEY, DEFAULT_RUN_DURATION);
  // Set the end time based on the configured duration. Use a long literal so the
  // seconds-to-millis conversion cannot overflow int arithmetic before widening.
  if (runDuration > 0) {
    this.endTime = this.startTime + runDuration * 1000L;
  } else {
    this.endTime = INVALID_TIME;
  }
  // NOTE(review): micro-to-nano conversion below is int arithmetic; could overflow
  // for very large configured compute times — confirm the field's width.
  this.computeTimeNano = state.getPropAsInt(COMPUTE_TIME_MICRO_KEY, DEFAULT_COMPUTE_TIME_MICRO) * 1000;
  this.sleepTimeMicro = state.getPropAsInt(SLEEP_TIME_MICRO_KEY, DEFAULT_SLEEP_TIME);
  // num records only takes effect if the duration is not specified
  this.numRecords = this.endTime == INVALID_TIME
      ? state.getPropAsInt(NUM_RECORDS_KEY, DEFAULT_NUM_RECORDS) : 0;
  this.memAllocBytes = state.getPropAsInt(MEM_ALLOC_BYTES_KEY, DEFAULT_MEM_ALLOC_BYTES);
}
/**
 * Returns the number of fork branches configured for this work unit.
 *
 * <p>Defaults to a single branch when {@link ConfigurationKeys#FORK_BRANCHES_KEY}
 * is not set, matching the behavior of the other implementations of this method
 * (the no-default overload fails when the property is absent).
 */
@Override
public int getBranches(WorkUnitState workUnitState) {
  return workUnitState.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1);
}
/**
 * Get a new property key from an original one based on the branch id. The method assumes the branch id specified by
 * the {@link ConfigurationKeys#FORK_BRANCH_ID_KEY} parameter in the given WorkUnitState. The fork id key specifies
 * which fork this parameter belongs to. Note this method will only provide the aforementioned functionality for
 * {@link org.apache.gobblin.converter.Converter}s. To get the same functionality in {@link org.apache.gobblin.writer.DataWriter}s use
 * the {@link org.apache.gobblin.writer.DataWriterBuilder#forBranch(int)} to construct a writer with a specific branch id.
 *
 * @param workUnitState contains the fork id key
 * @param key property key
 * @return a new property key
 */
public static String getPropertyNameForBranch(WorkUnitState workUnitState, String key) {
  Preconditions.checkNotNull(workUnitState, "Cannot get a property from a null WorkUnit");
  Preconditions.checkNotNull(key, "Cannot get the value for a null key");
  if (!workUnitState.contains(ConfigurationKeys.FORK_BRANCH_ID_KEY)) {
    return key;
  }
  // Parse the branch id once rather than reading and converting the property twice.
  int branchId = workUnitState.getPropAsInt(ConfigurationKeys.FORK_BRANCH_ID_KEY);
  return branchId >= 0 ? key + "." + branchId : key;
}
/**
 * Builds an {@link ExtractorImpl} configured with the hello id read from the
 * work unit state.
 */
@Override
public Extractor<String, String> getExtractor(WorkUnitState state) {
  return new ExtractorImpl(state.getPropAsInt(HELLO_ID_FULL_KEY));
}
public SalesforceExtractor(WorkUnitState state) { super(state); this.sfConnector = (SalesforceConnector) this.connector; // don't allow pk chunking if max partitions too high or have user specified partitions if (state.getPropAsBoolean(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, false) || state.getPropAsInt(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, ConfigurationKeys.DEFAULT_MAX_NUMBER_OF_PARTITIONS) > PK_CHUNKING_MAX_PARTITIONS_LIMIT) { if (state.getPropAsBoolean(ENABLE_PK_CHUNKING_KEY, false)) { log.warn("Max partitions too high, so PK chunking is not enabled"); } this.pkChunking = false; } else { this.pkChunking = state.getPropAsBoolean(ENABLE_PK_CHUNKING_KEY, false); } this.pkChunkingSize = Math.max(MIN_PK_CHUNKING_SIZE, Math.min(MAX_PK_CHUNKING_SIZE, state.getPropAsInt(PK_CHUNKING_SIZE_KEY, DEFAULT_PK_CHUNKING_SIZE))); this.pkChunkingSkipCountCheck = state.getPropAsBoolean(PK_CHUNKING_SKIP_COUNT_CHECK, DEFAULT_PK_CHUNKING_SKIP_COUNT_CHECK); this.bulkApiUseQueryAll = state.getPropAsBoolean(BULK_API_USE_QUERY_ALL, DEFAULT_BULK_API_USE_QUERY_ALL); // Get batch size from .pull file int tmpBatchSize = state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_FETCH_SIZE, ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE); this.batchSize = tmpBatchSize == 0 ? ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE : tmpBatchSize; this.fetchRetryLimit = state.getPropAsInt(FETCH_RETRY_LIMIT_KEY, DEFAULT_FETCH_RETRY_LIMIT); }
/**
 * Builds the extraction commands: the query with watermark predicates and sample
 * clause applied, followed by the JDBC result-set fetch size.
 *
 * @param predicateList watermark predicates concatenated into the query
 * @return the query command and the fetch-size command, in that order
 * @throws DataRecordException on extraction failure
 */
@Override
public List<Command> getDataMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList)
    throws DataRecordException {
  log.debug("Build query to extract data");

  int fetchSize = this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE,
      ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE);
  log.info("Setting jdbc resultset fetch size as " + fetchSize);

  // Fall back to EMPTY_CONDITION when no predicates were supplied.
  String watermarkFilter = StringUtils.defaultIfBlank(this.concatPredicates(predicateList), EMPTY_CONDITION);
  String query = addSampleQueryPart(this.getExtractSql()
      .replace(ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_WATERMARK_PREDICATE_SYMBOL, watermarkFilter));

  List<Command> commands = new ArrayList<>();
  commands.add(getCommand(query, JdbcCommand.JdbcCommandType.QUERY));
  commands.add(getCommand(fetchSize, JdbcCommand.JdbcCommandType.FETCHSIZE));
  return commands;
}
/**
 * Builds the extraction commands: the query with watermark predicates substituted
 * and the sample clause appended, followed by the JDBC result-set fetch size.
 *
 * @param predicateList watermark predicates concatenated into the query
 * @return the query command and the fetch-size command, in that order
 * @throws DataRecordException on extraction failure
 */
@Override
public List<Command> getDataMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList)
    throws DataRecordException {
  log.debug("Build query to extract data");
  List<Command> commands = new ArrayList<>();
  int fetchSize = this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE,
      ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE);
  // Logged for parity with the sibling extractor implementations.
  log.info("Setting jdbc resultset fetch size as " + fetchSize);
  String watermarkFilter = this.concatPredicates(predicateList);
  String query = this.getExtractSql();
  // No predicates: substitute an always-true condition so the query stays valid.
  if (isNullOrEmpty(watermarkFilter)) {
    watermarkFilter = "1=1";
  }
  query = query.replace(ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_WATERMARK_PREDICATE_SYMBOL, watermarkFilter);
  query = query + this.constructSampleClause();
  commands.add(JdbcExtractor.getCommand(query, JdbcCommand.JdbcCommandType.QUERY));
  commands.add(JdbcExtractor.getCommand(fetchSize, JdbcCommand.JdbcCommandType.FETCHSIZE));
  return commands;
}
/**
 * Builds the extraction commands: the query with watermark predicates substituted
 * and the sample clause appended, followed by the JDBC result-set fetch size.
 *
 * @param predicateList watermark predicates concatenated into the query
 * @return the query command and the fetch-size command, in that order
 * @throws DataRecordException on extraction failure
 */
@Override
public List<Command> getDataMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList)
    throws DataRecordException {
  log.debug("Build query to extract data");
  List<Command> commands = new ArrayList<>();
  int fetchSize = this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE,
      ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE);
  // Logged for parity with the sibling extractor implementations.
  log.info("Setting jdbc resultset fetch size as " + fetchSize);
  String watermarkFilter = this.concatPredicates(predicateList);
  String query = this.getExtractSql();
  // No predicates: substitute an always-true condition so the query stays valid.
  if (StringUtils.isBlank(watermarkFilter)) {
    watermarkFilter = "1=1";
  }
  query = query.replace(ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_WATERMARK_PREDICATE_SYMBOL, watermarkFilter);
  query = query + this.constructSampleClause();
  commands.add(getCommand(query, JdbcCommand.JdbcCommandType.QUERY));
  commands.add(getCommand(fetchSize, JdbcCommand.JdbcCommandType.FETCHSIZE));
  return commands;
}
/**
 * Builds the extraction commands: the query with watermark predicates substituted
 * and the sampling hint injected before the column projection, followed by the
 * JDBC result-set fetch size.
 *
 * @param predicateList watermark predicates concatenated into the query
 * @return the query command and the fetch-size command, in that order
 * @throws DataRecordException on extraction failure
 */
@Override
public List<Command> getDataMetadata(String schema, String entity, WorkUnit workUnit, List<Predicate> predicateList)
    throws DataRecordException {
  log.debug("Build query to extract data");

  int fetchSize = this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE,
      ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_JDBC_RESULTSET_FETCH_SIZE);
  log.info("Setting jdbc resultset fetch size as " + fetchSize);

  // No predicates: substitute an always-true condition so the query stays valid.
  String watermarkFilter = this.concatPredicates(predicateList);
  if (StringUtils.isBlank(watermarkFilter)) {
    watermarkFilter = "1=1";
  }
  String query = this.getExtractSql()
      .replace(ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_WATERMARK_PREDICATE_SYMBOL, watermarkFilter);

  // Sampling is expressed by prefixing the sample clause to the column projection.
  String sampleFilter = this.constructSampleClause();
  if (!StringUtils.isEmpty(sampleFilter)) {
    String projection = this.getOutputColumnProjection();
    query = query.replace(projection, sampleFilter + " " + projection);
  }

  List<Command> commands = new ArrayList<>();
  commands.add(JdbcExtractor.getCommand(query, JdbcCommand.JdbcCommandType.QUERY));
  commands.add(JdbcExtractor.getCommand(fetchSize, JdbcCommand.JdbcCommandType.FETCHSIZE));
  return commands;
}
/**
 * Returns the next record, advancing the counter, or {@code null} once the
 * high watermark has been passed. Persists the last value read into the
 * work unit state.
 */
@Override
public Integer readRecord(Integer reuse) throws DataRecordException, IOException {
  int highWatermark = this.workUnitState.getPropAsInt(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
  if (this.current > highWatermark) {
    return null;
  }
  this.workUnitState.setProp(LAST_READ_RECORD_KEY, this.current);
  return this.current++;
}
Map<GoogleWebmasterFilter.Dimension, ApiDimensionFilter> filterMap, WorkUnitState wuState) { super(wuState.getPropAsInt(GoggleIngestionConfigurationKeys.SOURCE_ASYNC_ITERATOR_BLOCKING_QUEUE_SIZE, 2000), wuState.getPropAsInt(GoggleIngestionConfigurationKeys.SOURCE_ASYNC_ITERATOR_POLL_BLOCKING_TIME, 1)); _wuState = wuState; Preconditions.checkArgument(!filterMap.containsKey(GoogleWebmasterFilter.Dimension.PAGE), wuState.getPropAsInt(GoogleWebMasterSource.KEY_REQUEST_PAGE_LIMIT, GoogleWebmasterClient.API_ROW_LIMIT); Preconditions.checkArgument(PAGE_LIMIT >= 1, "Page limit must be at least 1."); wuState.getPropAsInt(GoogleWebMasterSource.KEY_REQUEST_QUERY_LIMIT, GoogleWebmasterClient.API_ROW_LIMIT); Preconditions.checkArgument(QUERY_LIMIT >= 1, "Query limit must be at least 1."); ROUND_TIME_OUT = wuState.getPropAsInt(GoogleWebMasterSource.KEY_QUERIES_TUNING_TIME_OUT, 120); Preconditions.checkArgument(ROUND_TIME_OUT > 0, "Time out must be positive."); MAX_RETRY_ROUNDS = wuState.getPropAsInt(GoogleWebMasterSource.KEY_QUERIES_TUNING_RETRIES, 40); Preconditions.checkArgument(MAX_RETRY_ROUNDS >= 0, "Retry rounds cannot be negative."); ROUND_COOL_DOWN = wuState.getPropAsInt(GoogleWebMasterSource.KEY_QUERIES_TUNING_COOL_DOWN, 250); Preconditions.checkArgument(ROUND_COOL_DOWN >= 0, "Initial cool down time cannot be negative."); BATCH_SIZE = wuState.getPropAsInt(GoogleWebMasterSource.KEY_QUERIES_TUNING_BATCH_SIZE, 2); Preconditions.checkArgument(BATCH_SIZE >= 1, "Batch size must be at least 1."); TRIE_GROUP_SIZE = wuState.getPropAsInt(GoogleWebMasterSource.KEY_QUERIES_TUNING_GROUP_SIZE, 500); Preconditions.checkArgument(TRIE_GROUP_SIZE >= 1, "Group size must be at least 1.");
+ "is not specified. Trying to get the orignal schema from previous avro files."); originalSchemaPath = WriterUtils .getDataPublisherFinalDir(workUnitState, workUnitState.getPropAsInt(ConfigurationKeys.FORK_BRANCHES_KEY, 1), workUnitState.getPropAsInt(ConfigurationKeys.FORK_BRANCH_ID_KEY, 0)).getParent();
this.lastRevisionId, this.requestedTitle)); this.maxRevisionsPulled = workUnitState.getPropAsInt(MAX_REVISION_PER_PAGE, DEFAULT_MAX_REVISIONS_PER_PAGE);
final ResultSetMetaData resultsetMetadata = resultset.getMetaData(); int batchSize = this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_FETCH_SIZE, 0); batchSize = (batchSize == 0 ? ConfigurationKeys.DEFAULT_SOURCE_FETCH_SIZE : batchSize);
/**
 * Creates a {@link GoogleWebmasterExtractor} for an hourly-partitioned work unit,
 * adjusting the partition's high watermark to the date-inclusive semantics of the
 * Google Search Console API.
 *
 * @throws IOException if the client credential cannot be obtained
 */
@Override
GoogleWebmasterExtractor createExtractor(WorkUnitState state, Map<String, Integer> columnPositionMap,
    List<GoogleWebmasterFilter.Dimension> requestedDimensions,
    List<GoogleWebmasterDataFetcher.Metric> requestedMetrics, JsonArray schemaJson) throws IOException {
  // Null-safe comparison: the original compareToIgnoreCase(...) == 0 would throw NPE
  // when the watermark-type property is missing; equalsIgnoreCase fails the
  // precondition cleanly instead.
  Preconditions.checkArgument(
      "Hour".equalsIgnoreCase(state.getProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE)));
  Preconditions.checkArgument(state.getPropAsInt(ConfigurationKeys.SOURCE_QUERYBASED_PARTITION_INTERVAL) == 24);

  Partition partition = Partition.deserialize(state.getWorkunit());
  long lowWatermark = partition.getLowWatermark();
  long expectedHighWatermark = partition.getHighWatermark();
  /*
   * This change is needed because
   * 1. The partition behavior changed due to commit 7d730fcb0263b8ca820af0366818160d638d1336
   *    [7d730fc] by zxcware <zxcware@gmail.com> on April 3, 2017 at 11:47:41 AM PDT
   * 2. Google Search Console API only cares about Dates, and are both side inclusive.
   * Therefore, do the following processing.
   */
  int dateDiff = partition.isHighWatermarkInclusive() ? 1 : 0;
  long highWatermarkDate = DateWatermark.adjustWatermark(Long.toString(expectedHighWatermark), dateDiff);
  long updatedExpectedHighWatermark = TimestampWatermark.adjustWatermark(Long.toString(highWatermarkDate), -1);
  // Never let the adjusted high watermark drop below the low watermark.
  updatedExpectedHighWatermark = Math.max(lowWatermark, updatedExpectedHighWatermark);

  GoogleWebmasterClientImpl gscClient =
      new GoogleWebmasterClientImpl(getCredential(state), state.getProp(ConfigurationKeys.SOURCE_ENTITY));
  return new GoogleWebmasterExtractor(gscClient, state, lowWatermark, updatedExpectedHighWatermark,
      columnPositionMap, requestedDimensions, requestedMetrics, schemaJson);
}
/**
 * Creates the work units for this run. On the first run the initial set is built;
 * afterwards each previous work unit's watermark window is advanced by one full
 * pass of records.
 */
@Override
public List<WorkUnit> getWorkunits(SourceState sourceState) {
  sourceState.setProp(FOO, BAR);
  if (Iterables.isEmpty(sourceState.getPreviousWorkUnitStates())) {
    return initializeWorkUnits();
  }
  // Shift every watermark forward by the number of records a full pass extracts.
  int advance = NUM_WORK_UNITS * NUM_RECORDS_TO_EXTRACT_PER_EXTRACTOR;
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (WorkUnitState previous : sourceState.getPreviousWorkUnitStates()) {
    WorkUnit workUnit = WorkUnit.create(createExtract(Extract.TableType.SNAPSHOT_ONLY, NAMESPACE, TABLE));
    workUnit.setLowWaterMark(previous.getPropAsInt(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY) + advance);
    workUnit.setHighWaterMark(previous.getPropAsInt(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY) + advance);
    workUnit.setProp(WORK_UNIT_INDEX_KEY, previous.getPropAsInt(WORK_UNIT_INDEX_KEY));
    workUnits.add(workUnit);
  }
  return workUnits;
}
/**
 * Builds a batch extractor for this work unit, wrapping it in a streaming
 * extractor when streaming mode is enabled.
 */
@Override
public Extractor<String, Object> getExtractor(WorkUnitState state) throws IOException {
  configureIfNeeded(ConfigFactory.parseProperties(state.getProperties()));
  final LongWatermark lowWatermark = state.getWorkunit().getLowWatermark(LongWatermark.class);
  final int workUnitIndex = state.getPropAsInt(WORK_UNIT_INDEX);
  final TestBatchExtractor batchExtractor =
      new TestBatchExtractor(workUnitIndex, lowWatermark, numRecordsPerExtract, sleepTimePerRecord, state);
  if (streaming) {
    return (Extractor) new TestStreamingExtractor(batchExtractor);
  }
  return batchExtractor;
}
/**
 * Return a {@link RecordStreamWithMetadata} with the appropriate modifications.
 * @param inputStream
 * @param workUnitState
 * @return
 * @throws SchemaConversionException
 * @implNote this processStream does not handle {@link org.apache.gobblin.stream.MetadataUpdateControlMessage}s
 */
@Override
public RecordStreamWithMetadata<DO, SO> processStream(RecordStreamWithMetadata<DI, SI> inputStream,
    WorkUnitState workUnitState) throws SchemaConversionException {
  // Caps how many record conversions may be in flight at once in the flatMap below.
  int maxConcurrentAsyncConversions =
      workUnitState.getPropAsInt(MAX_CONCURRENT_ASYNC_CONVERSIONS_KEY, DEFAULT_MAX_CONCURRENT_ASYNC_CONVERSIONS);
  // The schema is converted once, up front, and reused for every record.
  SO outputSchema = convertSchema(inputStream.getGlobalMetadata().getSchema(), workUnitState);
  Flowable<StreamEntity<DO>> outputStream =
      inputStream.getRecordStream()
          .flatMapSingle(in -> {
            if (in instanceof ControlMessage) {
              // Control messages are handed to the handler and passed through unchanged.
              getMessageHandler().handleMessage((ControlMessage) in);
              return Single.just((ControlMessage<DO>) in);
            } else if (in instanceof RecordEnvelope) {
              // Records are converted asynchronously; SingleAsync ties the conversion
              // future back to the originating envelope.
              RecordEnvelope<DI> recordEnvelope = (RecordEnvelope<DI>) in;
              return new SingleAsync(recordEnvelope,
                  convertRecordAsync(outputSchema, recordEnvelope.getRecord(), workUnitState));
            } else {
              throw new IllegalStateException("Expected ControlMessage or RecordEnvelope.");
            }
            // delayErrors=false: fail fast; concurrency limited to the configured cap.
          }, false, maxConcurrentAsyncConversions);
  // Rebuild the stream metadata around the converted schema.
  return inputStream.withRecordStream(outputStream,
      GlobalMetadata.<SI, SO>builderWithInput(inputStream.getGlobalMetadata(),
          Optional.fromNullable(outputSchema)).build());
}
/**
 * Create JDBC source to get connection. Reuses the existing source when it is
 * still open; otherwise builds a new one from the work unit's connection settings.
 *
 * @return JDBCSource
 */
protected JdbcProvider createJdbcSource() {
  String driver = this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_DRIVER);
  String userName = this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USERNAME);
  String password = PasswordManager.getInstance(this.workUnitState)
      .readPassword(this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_PASSWORD));
  String connectionUrl = this.getConnectionUrl();
  String proxyHost = this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_URL);
  // -1 signals "no proxy" when the proxy port property is not set.
  int proxyPort = this.workUnitState.getProp(ConfigurationKeys.SOURCE_CONN_USE_PROXY_PORT) != null
      ? this.workUnitState.getPropAsInt(ConfigurationKeys.SOURCE_CONN_USE_PROXY_PORT) : -1;
  if (this.jdbcSource == null || this.jdbcSource.isClosed()) {
    this.jdbcSource = new JdbcProvider(driver, connectionUrl, userName, password, 1, this.getTimeOut(),
        "DEFAULT", proxyHost, proxyPort);
  }
  return this.jdbcSource;
}