protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) { List<WorkUnit> workUnits = Lists.newArrayList(); String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY); TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()); List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark); Collections.sort(partitions, Partitioner.ascendingComparator); // {@link ConfigurationKeys.EXTRACT_TABLE_NAME_KEY} specify the output path for Extract String outputTableName = sourceEntity.getDestTableName(); log.info("Create extract output with table name is " + outputTableName); Extract extract = createExtract(tableType, nameSpaceName, outputTableName); // Setting current time for the full extract if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) { extract.setFullTrue(System.currentTimeMillis()); } for (Partition partition : partitions) { WorkUnit workunit = WorkUnit.create(extract); workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName()); workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName()); workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION); addLineageSourceInfo(state, sourceEntity, workunit); partition.serialize(workunit); workUnits.add(workunit); } return workUnits; }
static Set<SourceEntity> getSourceEntitiesHelper(State state) { if (state.contains(ConfigurationKeys.SOURCE_ENTITIES)) { log.info("Using entity names in " + ConfigurationKeys.SOURCE_ENTITIES); HashSet<SourceEntity> res = new HashSet<>(); for (String sourceEntityName: state.getPropAsList(ConfigurationKeys.SOURCE_ENTITIES)) { res.add(SourceEntity.fromSourceEntityName(sourceEntityName)); } return res; } else if (state.contains(ConfigurationKeys.SOURCE_ENTITY) || state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)) { Optional<SourceEntity> sourceEntity = SourceEntity.fromState(state); // Guaranteed to be present log.info("Using entity name in " + sourceEntity.get()); return ImmutableSet.of(sourceEntity.get()); } throw new IllegalStateException(String.format("One of the following properties must be specified: %s, %s.", ConfigurationKeys.SOURCE_ENTITIES, ConfigurationKeys.SOURCE_ENTITY)); }
@Test public void testGetTableSpecificPropsFromState() { SourceState state = new SourceState(); state.setProp(DatasetUtils.DATASET_SPECIFIC_PROPS, "[{\"dataset\":\"Entity1\", \"value\": 1}, {\"dataset\":\"Table2\", \"value\":2}]"); // We should look in the dataset specific properties using the entity name, not table name SourceEntity se1 = new SourceEntity("Entity1", "Table2"); SourceEntity se3 = new SourceEntity("Entity3", "Table3"); Set<SourceEntity> entities = ImmutableSet.of(se1, se3); Map<SourceEntity, State> datasetProps = QueryBasedSource.getTableSpecificPropsFromState(entities, state); // Value 1 should be returned for se1, no prpos should be returned for se3 Assert.assertEquals(datasetProps.size(), 1); Assert.assertTrue(datasetProps.containsKey(se1)); State se1Props = datasetProps.get(se1); Assert.assertEquals(se1Props.getProp("value"), "1"); }
// Derive the source entity from the previous work unit state; if none can be derived,
// a warning that includes the state is logged.
Optional<SourceEntity> sourceEntity = SourceEntity.fromState(previousWus); if (!sourceEntity.isPresent()) { log.warn("Missing source entity for WorkUnit state: " + previousWus);
protected List<WorkUnit> generateWorkUnits(SourceEntity sourceEntity, SourceState state, long previousWatermark) { List<WorkUnit> workUnits = Lists.newArrayList(); String nameSpaceName = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY); TableType tableType = TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()); List<Partition> partitions = new Partitioner(state).getPartitionList(previousWatermark); Collections.sort(partitions, Partitioner.ascendingComparator); // {@link ConfigurationKeys.EXTRACT_TABLE_NAME_KEY} specify the output path for Extract String outputTableName = sourceEntity.getDestTableName(); log.info("Create extract output with table name is " + outputTableName); Extract extract = createExtract(tableType, nameSpaceName, outputTableName); // Setting current time for the full extract if (Boolean.valueOf(state.getProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY))) { extract.setFullTrue(System.currentTimeMillis()); } for (Partition partition : partitions) { WorkUnit workunit = WorkUnit.create(extract); workunit.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName()); workunit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName()); workunit.setProp(WORK_UNIT_STATE_VERSION_KEY, CURRENT_WORK_UNIT_STATE_VERSION); addLineageSourceInfo(state, sourceEntity, workunit); partition.serialize(workunit); workUnits.add(workunit); } return workUnits; }
static Set<SourceEntity> getSourceEntitiesHelper(State state) { if (state.contains(ConfigurationKeys.SOURCE_ENTITIES)) { log.info("Using entity names in " + ConfigurationKeys.SOURCE_ENTITIES); HashSet<SourceEntity> res = new HashSet<>(); for (String sourceEntityName: state.getPropAsList(ConfigurationKeys.SOURCE_ENTITIES)) { res.add(SourceEntity.fromSourceEntityName(sourceEntityName)); } return res; } else if (state.contains(ConfigurationKeys.SOURCE_ENTITY) || state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)) { Optional<SourceEntity> sourceEntity = SourceEntity.fromState(state); // Guaranteed to be present log.info("Using entity name in " + sourceEntity.get()); return ImmutableSet.of(sourceEntity.get()); } throw new IllegalStateException(String.format("One of the following properties must be specified: %s, %s.", ConfigurationKeys.SOURCE_ENTITIES, ConfigurationKeys.SOURCE_ENTITY)); }
/**
 * Loads per-table properties from the config store.
 *
 * <p>For every table, the config found under
 * {@code <config store URI>/QUERY_BASED_SOURCE/<dataset name>} is fetched and converted to a
 * {@link State}.
 *
 * @param tables the entities whose configuration should be loaded
 * @param state state providing {@link ConfigurationKeys#CONFIG_MANAGEMENT_STORE_URI}; must be set
 * @return a map from each entity to the state built from its config
 * @throws NullPointerException if the config store URI property is missing
 * @throws RuntimeException if the config for any table cannot be retrieved
 */
private static Map<SourceEntity, State> getTableSpecificPropsFromConfigStore(
    Collection<SourceEntity> tables, State state) {
  ConfigClient configClient = ConfigClientCache.getClient(VersionStabilityPolicy.STRONG_LOCAL_STABILITY);
  String configStoreUri = state.getProp(ConfigurationKeys.CONFIG_MANAGEMENT_STORE_URI);
  Preconditions.checkNotNull(configStoreUri);

  Map<SourceEntity, State> propsByTable = Maps.newHashMap();
  for (SourceEntity table : tables) {
    try {
      State tableProps = ConfigUtils.configToState(configClient.getConfig(
          PathUtils.combinePaths(configStoreUri, QUERY_BASED_SOURCE, table.getDatasetName()).toUri()));
      propsByTable.put(table, tableProps);
    } catch (VersionDoesNotExistException | ConfigStoreFactoryDoesNotExistsException
        | ConfigStoreCreationException e) {
      throw new RuntimeException("Unable to get table config for " + table, e);
    }
  }
  return propsByTable;
}
/**
 * Builds a {@link SourceEntity} from the properties of the given state.
 *
 * <ul>
 *   <li>If {@link ConfigurationKeys#SOURCE_ENTITY} is set, it is the source entity name; the
 *       destination table name is {@link ConfigurationKeys#EXTRACT_TABLE_NAME_KEY}, defaulting
 *       to the sanitized entity name.</li>
 *   <li>Otherwise, if only {@link ConfigurationKeys#EXTRACT_TABLE_NAME_KEY} is set, it is used
 *       for both names.</li>
 * </ul>
 *
 * @param state the state to read from
 * @return the entity, or absent if neither property is set
 */
public static Optional<SourceEntity> fromState(State state) {
  if (state.contains(ConfigurationKeys.SOURCE_ENTITY)) {
    String entityName = state.getProp(ConfigurationKeys.SOURCE_ENTITY);
    String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sanitizeEntityName(entityName));
    return Optional.of(new SourceEntity(entityName, tableName));
  }

  if (state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY)) {
    // Only the table name is known: reuse it as the entity name.
    String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
    return Optional.of(new SourceEntity(tableName, tableName));
  }

  return Optional.absent();
}
/**
 * Two source entities are equal when they are of the same class and their dataset names are
 * equal (both {@code null} counts as equal). No other property takes part in the comparison.
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }
  if (obj == null || getClass() != obj.getClass()) {
    return false;
  }

  SourceEntity that = (SourceEntity) obj;
  if (getDatasetName() == null) {
    return that.getDatasetName() == null;
  }
  return getDatasetName().equals(that.getDatasetName());
}
/**
 * Looks up dataset-specific properties for the given entities in the job state.
 *
 * <p>Entities are indexed by dataset name, the names are passed to
 * {@link DatasetUtils#getDatasetSpecificProps}, and the returned per-name states are re-keyed by
 * the corresponding entity. If several entities share a dataset name, the last one iterated wins.
 *
 * @param entities the entities to look up
 * @param state the state containing the dataset-specific properties
 * @return a map from entity to its specific properties, with one entry per dataset name returned by the lookup
 */
public static Map<SourceEntity, State> getTableSpecificPropsFromState(
    Iterable<SourceEntity> entities, SourceState state) {
  Map<String, SourceEntity> entityByDatasetName = new HashMap<>();
  for (SourceEntity candidate : entities) {
    entityByDatasetName.put(candidate.getDatasetName(), candidate);
  }

  Map<String, State> propsByDatasetName =
      DatasetUtils.getDatasetSpecificProps(entityByDatasetName.keySet(), state);

  Map<SourceEntity, State> propsByEntity = new HashMap<>();
  for (Map.Entry<String, State> namedProps : propsByDatasetName.entrySet()) {
    SourceEntity owner = entityByDatasetName.get(namedProps.getKey());
    propsByEntity.put(owner, namedProps.getValue());
  }
  return propsByEntity;
}
/**
 * Creates a {@link SourceEntity} from a source entity name alone, using the sanitized form of
 * that name as the destination table name.
 *
 * @param sourceEntityName the name of the entity in the source system
 * @return the corresponding source entity
 */
public static SourceEntity fromSourceEntityName(String sourceEntityName) {
  String destTableName = sanitizeEntityName(sourceEntityName);
  return new SourceEntity(sourceEntityName, destTableName);
}
/**
 * Attaches MySQL lineage source information to the work unit.
 *
 * <p>The source dataset is named {@code <database>.<source entity name>} on the MySQL platform
 * and carries the JDBC connection URL as metadata. Nothing is recorded when no lineage info is
 * available.
 *
 * @param sourceState state providing the connection host, port and schema (database)
 * @param entity the entity whose source entity name identifies the dataset
 * @param workUnit the work unit to annotate
 */
protected void addLineageSourceInfo(SourceState sourceState, SourceEntity entity, WorkUnit workUnit) {
  String host = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME);
  String port = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_PORT);
  String database = sourceState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);

  // Host and database are trimmed for the URL only; the dataset name below uses the raw database value.
  String trimmedHost = host.trim();
  String trimmedDatabase = database.trim();
  String connectionUrl = "jdbc:mysql://" + trimmedHost + ":" + port + "/" + trimmedDatabase;

  String datasetName = database + "." + entity.getSourceEntityName();
  DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_MYSQL, datasetName);
  source.addMetadata(DatasetConstants.CONNECTION_URL, connectionUrl);

  if (!lineageInfo.isPresent()) {
    return;
  }
  lineageInfo.get().setSource(source, workUnit);
}
}
/**
 * Filters entities by source entity name using the configured blacklist and whitelist patterns.
 *
 * @param state the state holding the {@code ENTITY_BLACKLIST} and {@code ENTITY_WHITELIST} pattern lists
 * @param unfilteredEntities the candidate entities
 * @return the entities whose source entity name survives the filters
 */
static Set<SourceEntity> getFilteredSourceEntitiesHelper(SourceState state,
    Iterable<SourceEntity> unfilteredEntities) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, ENTITY_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, ENTITY_WHITELIST);

  Set<SourceEntity> survivors = new HashSet<>();
  for (SourceEntity candidate : unfilteredEntities) {
    if (!DatasetFilterUtils.survived(candidate.getSourceEntityName(), blacklist, whitelist)) {
      continue;
    }
    survivors.add(candidate);
  }
  return survivors;
}
/**
 * Hash code derived solely from the dataset name; a {@code null} dataset name contributes 0.
 */
@Override
public int hashCode() {
  int datasetNameHash = (getDatasetName() == null) ? 0 : getDatasetName().hashCode();
  // Same value as the conventional "31 * 1 + fieldHash" single-field formula.
  return 31 + datasetNameHash;
}
}
// Derive the source entity from the previous work unit state; if none can be derived,
// a warning that includes the state is logged.
Optional<SourceEntity> sourceEntity = SourceEntity.fromState(previousWus); if (!sourceEntity.isPresent()) { log.warn("Missing source entity for WorkUnit state: " + previousWus);
// The histogram is requested by source entity name (not the destination table name),
// together with the watermark column, the state and the partition.
Histogram histogram = getHistogram(sourceEntity.getSourceEntityName(), watermarkColumn, state, partition);
// Checks that res contains entities built from the names "Table2" and "Table3", then that it has
// exactly one element equal to SourceEntity("Table3", "PropShouldNotBeIgnored").
// NOTE(review): the "Table3" containment assertion appears twice in a row, and a size of 1 cannot
// hold together with distinct Table2 and Table3 entries — presumably these assertions come from
// separate test cases; confirm against the full test file.
Assert.assertTrue(res.contains(SourceEntity.fromSourceEntityName("Table2")), "Missing Table2 in " + res); Assert.assertTrue(res.contains(SourceEntity.fromSourceEntityName("Table3")), "Missing Table3 in " + res); Assert.assertTrue(res.contains(SourceEntity.fromSourceEntityName("Table3")), "Missing Table3 in " + res); SourceEntity expected = new SourceEntity("Table3", "PropShouldNotBeIgnored"); Assert.assertEquals(res.size(), 1); Assert.assertTrue(res.contains(expected), "Missing Table3 in " + res);
// Fixture setup for entity i: build the entity from its name, create an APPEND_ONLY extract for its
// destination table, then create three work units, each carrying the source entity name, the
// destination table name and a low watermark of 10 * i. Each is wrapped in a WorkUnitState tied to
// the previous job state with a runtime high watermark of 20 * i.
SourceEntity sourceEntity = SourceEntity.fromSourceEntityName(sourceEntityName); sourceEntities[i] = sourceEntity; extracts[i] = new Extract(TableType.APPEND_ONLY, "", sourceEntity.getDestTableName()); for (int j = 0; j < 3; ++j) { WorkUnit wu = new WorkUnit(extracts[i]); wu.setProp(ConfigurationKeys.SOURCE_ENTITY, sourceEntity.getSourceEntityName()); wu.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, 10 * i); wu.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, sourceEntity.getDestTableName()); WorkUnitState wuState = new WorkUnitState(wu, prevJobState); wuState.setProp(ConfigurationKeys.WORK_UNIT_STATE_RUNTIME_HIGH_WATER_MARK, 20 * i);
/**
 * Exercises the {@link SourceEntity} factories: creation from a bare entity name (with and
 * without characters that need sanitizing) and creation from a {@link State} holding the entity
 * name, the table name, or both.
 */
@Test
public void testSourceEntity() {
  // A plain name is used unchanged for the entity, table and dataset names.
  SourceEntity plain = SourceEntity.fromSourceEntityName("SourceEntity1");
  Assert.assertEquals(plain.getSourceEntityName(), "SourceEntity1");
  Assert.assertEquals(plain.getDestTableName(), "SourceEntity1");
  Assert.assertEquals(plain.getDatasetName(), "SourceEntity1");

  // '$' is replaced in the destination table name only.
  SourceEntity sanitized = SourceEntity.fromSourceEntityName("SourceEntity$2");
  Assert.assertEquals(sanitized.getSourceEntityName(), "SourceEntity$2");
  Assert.assertEquals(sanitized.getDestTableName(), "SourceEntity_2");
  Assert.assertEquals(sanitized.getDatasetName(), "SourceEntity$2");

  // Both the entity name and the table name are configured.
  State bothKeys = new State();
  bothKeys.setProp(ConfigurationKeys.SOURCE_ENTITY, "SourceEntity3");
  bothKeys.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, "SourceEntity3_Table");
  Optional<SourceEntity> fromBothKeys = SourceEntity.fromState(bothKeys);
  Assert.assertTrue(fromBothKeys.isPresent());
  SourceEntity explicitTable = fromBothKeys.get();
  Assert.assertEquals(explicitTable.getSourceEntityName(), "SourceEntity3");
  Assert.assertEquals(explicitTable.getDestTableName(), "SourceEntity3_Table");
  Assert.assertEquals(explicitTable.getDatasetName(), "SourceEntity3");
  Assert.assertEquals(explicitTable, new SourceEntity("SourceEntity3", "SourceEntity3_Table"));

  // Only the entity name is configured.
  State entityOnly = new State();
  entityOnly.setProp(ConfigurationKeys.SOURCE_ENTITY, "SourceEntity$4");
  Optional<SourceEntity> fromEntityOnly = SourceEntity.fromState(entityOnly);
  Assert.assertTrue(fromEntityOnly.isPresent());
  Assert.assertEquals(fromEntityOnly.get(), SourceEntity.fromSourceEntityName("SourceEntity$4"));

  // Only the table name is configured.
  State tableOnly = new State();
  tableOnly.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, "Table5");
  Optional<SourceEntity> fromTableOnly = SourceEntity.fromState(tableOnly);
  Assert.assertTrue(fromTableOnly.isPresent());
  Assert.assertEquals(fromTableOnly.get(), SourceEntity.fromSourceEntityName("Table5"));
}
/**
 * Verifies that work units generated by {@code SalesforceSource} carry lineage source
 * information: a serialized {@code DatasetDescriptor} on the "salesforce" platform named after
 * the source entity, and a lineage name equal to the entity name.
 */
@Test
void testSourceLineageInfo() {
  SourceState sourceState = new SourceState();
  sourceState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "salesforce");
  sourceState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "snapshot_append");
  // A single user-specified partition keeps the number of generated work units at one.
  sourceState.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  sourceState.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140213000000,20170407152123");
  sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");

  QueryBasedSource.SourceEntity sourceEntity = QueryBasedSource.SourceEntity.fromSourceEntityName("contacts");
  SalesforceSource source = new SalesforceSource(new LineageInfo(ConfigFactory.empty()));
  List<WorkUnit> workUnits = source.generateWorkUnits(sourceEntity, sourceState, 20140213000000L);
  Assert.assertEquals(workUnits.size(), 1);

  String expected = "{\"object-type\":\"org.apache.gobblin.dataset.DatasetDescriptor\","
      + "\"object-data\":{\"platform\":\"salesforce\",\"metadata\":{},\"name\":\"contacts\"}}";
  // TestNG's assertEquals takes (actual, expected); keeping that order makes failure messages accurate.
  Assert.assertEquals(workUnits.get(0).getProp("gobblin.event.lineage.source"), expected);
  Assert.assertEquals(workUnits.get(0).getProp("gobblin.event.lineage.name"), "contacts");
}