public static boolean shouldSetLineageInfo(WorkUnitState workUnitState) {
  return shouldSetLineageInfo(workUnitState.getWorkunit());
}
  public String apply(@Nonnull WorkUnitState wus) {
    return new HiveWorkUnit(wus.getWorkunit()).getPartitionName().orNull();
  }
});
@VisibleForTesting
public static void setDestLineageInfo(WorkUnitState wus, Optional<LineageInfo> lineageInfo) {
  HiveWorkUnit hiveWorkUnit = new HiveWorkUnit(wus.getWorkunit());
  ConvertibleHiveDataset convertibleHiveDataset = (ConvertibleHiveDataset) hiveWorkUnit.getHiveDataset();
  List<DatasetDescriptor> destDatasets = convertibleHiveDataset.getDestDatasets();
  if (!lineageInfo.isPresent()) {
    return;
  }
  for (int i = 0; i < destDatasets.size(); i++) {
    lineageInfo.get().putDestination(destDatasets.get(i), i + 1, wus);
  }
}
/**
 * Get a list of previous {@link WorkUnit}s that are subject to retry.
 *
 * <p>
 * This method uses {@link AbstractSource#getPreviousWorkUnitStatesForRetry(SourceState)}.
 * </p>
 *
 * @param state Source state
 * @return list of previous {@link WorkUnit}s that are subject to retry
 */
protected List<WorkUnit> getPreviousWorkUnitsForRetry(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (WorkUnitState workUnitState : getPreviousWorkUnitStatesForRetry(state)) {
    // Make a copy here, as getWorkunit() below returns an ImmutableWorkUnit
    workUnits.add(WorkUnit.copyOf(workUnitState.getWorkunit()));
  }
  return workUnits;
}
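A minimal usage sketch for the helper above, assuming a subclass of AbstractSource; the getWorkunits body and the example.retry.attempt property key are illustrative assumptions, not taken from the original source.

// Hypothetical Source#getWorkunits implementation (sketch only).
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  // The copies returned by getPreviousWorkUnitsForRetry are mutable,
  // so per-run properties can still be set on them before re-scheduling.
  for (WorkUnit retryWorkUnit : getPreviousWorkUnitsForRetry(state)) {
    int attempt = retryWorkUnit.getPropAsInt("example.retry.attempt", 0); // hypothetical key
    retryWorkUnit.setProp("example.retry.attempt", attempt + 1);
    workUnits.add(retryWorkUnit);
  }
  // ... append newly discovered work units here ...
  return workUnits;
}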
@Override
public void setActualHighWatermark(WorkUnitState wus) {
  wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(LongWatermark.class));
}
/**
 * Sets the actual high watermark by reading the expected high watermark.
 *
 * {@inheritDoc}
 * @see org.apache.gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarker#setActualHighWatermark(org.apache.gobblin.configuration.WorkUnitState)
 */
@Override
public void setActualHighWatermark(WorkUnitState wus) {
  if (wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY)) {
    wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(MultiKeyValueLongWatermark.class));
  } else {
    wus.setActualHighWatermark(wus.getWorkunit().getExpectedHighWatermark(LongWatermark.class));
  }
}
private String readProp(String key, WorkUnitState workUnitState) {
  // Look up the property on the work unit first, then fall back to the
  // work unit state, and finally to the job state.
  String value = workUnitState.getWorkunit().getProp(key);
  if (StringUtils.isBlank(value)) {
    value = workUnitState.getProp(key);
  }
  if (StringUtils.isBlank(value)) {
    value = workUnitState.getJobState().getProp(key);
  }
  return value;
}
/**
 * Sets metadata to indicate whether this is the first time this table or partition is being published.
 * @param wus {@link WorkUnitState} on which to record whether this is the first publish for this table or partition
 */
public static void setIsFirstPublishMetadata(WorkUnitState wus) {
  if (!wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY)) {
    LongWatermark previousWatermark = wus.getWorkunit().getLowWatermark(LongWatermark.class);
    wus.setProp(SlaEventKeys.IS_FIRST_PUBLISH, (null == previousWatermark || previousWatermark.getValue() == 0));
  }
}
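A hedged sketch of how the flag set above might be consumed later, for example when assembling SLA event metadata; the helper method and the metadata map are assumptions, only SlaEventKeys.IS_FIRST_PUBLISH comes from the snippet above.

// Sketch only: hypothetical consumer of the IS_FIRST_PUBLISH property.
private static void addFirstPublishMetadata(WorkUnitState wus, Map<String, String> eventMetadata) {
  boolean isFirstPublish = wus.getPropAsBoolean(SlaEventKeys.IS_FIRST_PUBLISH, false);
  eventMetadata.put(SlaEventKeys.IS_FIRST_PUBLISH, Boolean.toString(isFirstPublish));
}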
public QueryBasedExtractor(WorkUnitState workUnitState) {
  this.workUnitState = workUnitState;
  this.workUnit = this.workUnitState.getWorkunit();
  this.schema = this.workUnitState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
  this.entity = this.workUnitState.getProp(ConfigurationKeys.SOURCE_ENTITY);
  this.partition = Partition.deserialize(this.workUnit);
  MDC.put("tableName", getWorkUnitName());
}
public ImmutableWorkUnitState(WorkUnitState workUnitState) {
  super(workUnitState.getWorkunit(), workUnitState.getJobState());
  super.addAll(workUnitState.getSpecProperties());
}
@Override
public Extractor<String, String[]> getExtractor(WorkUnitState state) throws IOException {
  List<GoogleWebmasterFilter.Dimension> requestedDimensions = getRequestedDimensions(state);
  List<GoogleWebmasterDataFetcher.Metric> requestedMetrics = getRequestedMetrics(state);

  WorkUnit workunit = state.getWorkunit();
  String schema = workunit.getProp(ConfigurationKeys.SOURCE_SCHEMA);
  JsonArray schemaJson = new JsonParser().parse(schema).getAsJsonArray();
  Map<String, Integer> columnPositionMap = new HashMap<>();
  for (int i = 0; i < schemaJson.size(); ++i) {
    JsonElement jsonElement = schemaJson.get(i);
    String columnName = jsonElement.getAsJsonObject().get("columnName").getAsString().toUpperCase();
    columnPositionMap.put(columnName, i);
  }

  if (workunit.getPropAsBoolean(GoogleWebMasterSource.KEY_INCLUDE_SOURCE_PROPERTY, DEFAULT_INCLUDE_SOURCE_PROPERTY)) {
    String columnName = workunit.getProp(KEY_SOURCE_PROPERTY_COLUMN_NAME, DEFAULT_SOURCE_PROPERTY_COLUMN_NAME);
    schemaJson.add(SchemaUtil.createColumnJson(columnName, false, JsonElementConversionFactory.Type.STRING));
  }

  validateFilters(state.getProp(GoogleWebMasterSource.KEY_REQUEST_FILTERS));
  validateRequests(columnPositionMap, requestedDimensions, requestedMetrics);
  return createExtractor(state, columnPositionMap, requestedDimensions, requestedMetrics, schemaJson);
}
public HiveTask(TaskContext taskContext) {
  super(taskContext);
  this.taskContext = taskContext;
  this.workUnitState = taskContext.getTaskState();
  this.workUnit = new HiveWorkUnit(this.workUnitState.getWorkunit());
  this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, "gobblin.HiveTask").build();
  this.hiveExecutionQueries = Lists.newArrayList();
  this.publishEntity = new QueryBasedHivePublishEntity();
  try {
    this.hiveJdbcConnector = HiveJdbcConnector.newConnectorWithProps(this.workUnitState.getProperties());
  } catch (SQLException se) {
    throw new RuntimeException("Error in creating JDBC Connector", se);
  }
  this.addFiles = this.workUnitState.getPropAsList(ADD_FILES, "");
  this.addJars = this.workUnitState.getPropAsList(ADD_JARS, "");
  this.setupQueries = Splitter.on(";").trimResults().omitEmptyStrings()
      .splitToList(this.workUnitState.getProp(SETUP_QUERIES, ""));
}
@Override
public String toString() {
  return super.toString() + "\nWorkUnit: " + getWorkunit().toString() + "\nExtract: " + getExtract().toString()
      + "\nJobState: " + this.jobState.toString();
}
public TestExtractor(WorkUnitState workUnitState) {
  //super(workUnitState);
  Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
  Path sourceFile = new Path(workUnitState.getWorkunit().getProp(TestSource.SOURCE_FILE_KEY));
  LOG.info("Reading from source file " + sourceFile);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
  try {
    FileSystem fs = FileSystem.get(
        URI.create(workUnitState.getProp(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI)),
        new Configuration());
    // makeQualified returns a new Path rather than mutating its argument, so keep the result
    sourceFile = fs.makeQualified(sourceFile);
    this.dataFileReader = new DataFileReader<GenericRecord>(new FsInput(sourceFile, new Configuration()), datumReader);
  } catch (IOException ioe) {
    LOG.error("Failed to read the source file " + sourceFile, ioe);
  }
}
public TaskState(WorkUnitState workUnitState) {
  // Since getWorkunit() returns an immutable WorkUnit object,
  // the WorkUnit object in this object is also immutable.
  super(workUnitState.getWorkunit(), workUnitState.getJobState(), workUnitState.getTaskBrokerNullable());
  addAll(workUnitState);
  this.jobId = workUnitState.getProp(ConfigurationKeys.JOB_ID_KEY);
  this.taskId = workUnitState.getProp(ConfigurationKeys.TASK_ID_KEY);
  this.taskKey = workUnitState.getProp(ConfigurationKeys.TASK_KEY_KEY, "unknown_task_key");
  this.taskAttemptId = Optional.fromNullable(workUnitState.getProp(ConfigurationKeys.TASK_ATTEMPT_ID_KEY));
  this.setId(this.taskId);
}
public HiveBaseExtractor(WorkUnitState state) throws IOException {
  if (state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    return;
  }
  this.hiveWorkUnit = new HiveWorkUnit(state.getWorkunit());
  this.hiveDataset = hiveWorkUnit.getHiveDataset();
  this.dbName = hiveDataset.getDbAndTable().getDb();
  this.tableName = hiveDataset.getDbAndTable().getTable();
  this.pool = HiveMetastoreClientPool.get(state.getJobState().getProperties(),
      Optional.fromNullable(state.getJobState().getProp(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
}
/**
 * Copy a {@link WorkUnitState} so that its work unit also contains the job-state properties.
 * FileBasedExtractor needs properties from the job state (mostly source.* properties), which
 * have already been removed from the work unit by the time execution reaches this point.
 *
 * @param src the original {@link WorkUnitState}
 * @return a copy of {@code src} whose work unit includes the job-state properties
 */
private WorkUnitState copyOf(WorkUnitState src) {
  WorkUnit copiedWorkUnit = WorkUnit.copyOf(src.getWorkunit());
  copiedWorkUnit.addAllIfNotExist(src.getJobState());

  WorkUnitState workUnitState = new WorkUnitState(copiedWorkUnit, src.getJobState());
  workUnitState.addAll(src);
  return workUnitState;
}
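A short sketch of the effect of copyOf: a source.* property that lives only in the job state becomes visible through the work unit after the copy. The property key below is hypothetical, used only for illustration.

// Sketch only: "source.example.timeout" is a hypothetical key.
WorkUnitState enriched = copyOf(originalWorkUnitState);
// Before the copy the key may be absent from the work unit (it lives only in the job state);
// after the copy it is present, because addAllIfNotExist merged the job-state properties in.
String timeout = enriched.getWorkunit().getProp("source.example.timeout");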
/**
 * Gets the LWM for this job run. The new LWM is the HWM of the previous run + 1 unit (day, hour, minute, etc.).
 * If there was no previous execution, it is set to the given lowWaterMark + 1 unit.
 */
private long getLowWaterMark(Iterable<WorkUnitState> previousStates, String lowWaterMark) {
  long lowWaterMarkValue = retriever.getWatermarkFromString(lowWaterMark);

  // Find the max HWM from the previous states; this becomes the new current LWM
  for (WorkUnitState previousState : previousStates) {
    if (previousState.getWorkingState().equals(WorkUnitState.WorkingState.COMMITTED)) {
      long previousHighWaterMark = previousState.getWorkunit().getHighWaterMark();
      if (previousHighWaterMark > lowWaterMarkValue) {
        lowWaterMarkValue = previousHighWaterMark;
      }
    }
  }
  return lowWaterMarkValue + getRetriever().getWatermarkIncrementMs();
}
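A worked example of the arithmetic above, assuming the retriever produces epoch-millisecond watermarks and a one-day increment; the concrete values are illustrative only.

// Assumed: epoch-millisecond watermarks, one-day increment (86400000 ms).
//   configured lowWaterMark string      -> 1699920000000  (2023-11-14 00:00:00 UTC)
//   max committed HWM of previous runs  =  1700006400000  (2023-11-15 00:00:00 UTC)
//   getWatermarkIncrementMs()           =       86400000
//   returned LWM = 1700006400000 + 86400000 = 1700092800000  (2023-11-16 00:00:00 UTC)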
@Override
public Extractor<String, Object> getExtractor(WorkUnitState state) throws IOException {
  Config config = ConfigFactory.parseProperties(state.getProperties());
  configureIfNeeded(config);
  final LongWatermark lowWatermark = state.getWorkunit().getLowWatermark(LongWatermark.class);
  final WorkUnitState workUnitState = state;
  final int index = state.getPropAsInt(WORK_UNIT_INDEX);
  final TestBatchExtractor extractor =
      new TestBatchExtractor(index, lowWatermark, numRecordsPerExtract, sleepTimePerRecord, workUnitState);
  if (!streaming) {
    return extractor;
  }
  return (Extractor) new TestStreamingExtractor(extractor);
}
public HiveMaterializerQueryGenerator(WorkUnitState workUnitState) throws IOException {
  this.fs = HiveSource.getSourceFs(workUnitState);
  this.pool = HiveMetastoreClientPool.get(workUnitState.getJobState().getProperties(),
      Optional.fromNullable(workUnitState.getJobState().getProp(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));

  this.workUnitState = workUnitState;
  this.workUnit = new HiveWorkUnit(workUnitState.getWorkunit());

  this.outputTableMetadata = HiveMaterializer.parseStageableTableMetadata(this.workUnit);
  this.outputDatabaseName = outputTableMetadata.getDestinationDbName();
  this.outputTableName = outputTableMetadata.getDestinationTableName();
  this.outputDataLocation = HiveConverterUtils.getOutputDataLocation(outputTableMetadata.getDestinationDataPath());

  this.destinationTableMeta = HiveConverterUtils.getDestinationTableMeta(this.outputTableMetadata.getDestinationDbName(),
      this.outputTableMetadata.getDestinationTableName(), workUnitState.getProperties()).getLeft();

  this.stagingTableName = HiveConverterUtils.getStagingTableName(this.outputTableMetadata.getDestinationStagingTableName());
  this.stagingDataLocation =
      HiveConverterUtils.getStagingDataLocation(this.outputTableMetadata.getDestinationDataPath(), this.stagingTableName);
}