@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  initLogger(state);
  lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
@Override
protected void addLineageSourceInfo(SourceState sourceState, SourceEntity entity, WorkUnit workUnit) {
  DatasetDescriptor source =
      new DatasetDescriptor(DatasetConstants.PLATFORM_SALESFORCE, entity.getSourceEntityName());
  if (lineageInfo.isPresent()) {
    lineageInfo.get().setSource(source, workUnit);
  }
}
/**
 * Load all lineage information from {@link State}s of a dataset
 *
 * @param states All states which belong to the same dataset
 * @return A collection of {@link LineageEventBuilder}s put in the state
 */
public static Collection<LineageEventBuilder> load(Collection<? extends State> states) {
  Preconditions.checkArgument(states != null && !states.isEmpty());
  Set<LineageEventBuilder> allEvents = Sets.newHashSet();
  for (State state : states) {
    Map<String, Set<LineageEventBuilder>> branchedEvents = load(state);
    branchedEvents.values().forEach(allEvents::addAll);
  }
  return allEvents;
}
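// A hedged usage sketch of load(...) above: gather the lineage events recorded
// across one dataset's committed task states and build Gobblin tracking events
// from them. `taskStates` is an assumed variable; emitting the built events is
// left to whatever event submitter the job uses.
Collection<LineageEventBuilder> builders = LineageInfo.load(taskStates);
for (LineageEventBuilder builder : builders) {
  GobblinTrackingEvent trackingEvent = builder.build();
  // hand trackingEvent to the job's event submitter here
}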
/**
 * Put data {@link Descriptor}s of a destination dataset to a state
 *
 * @param descriptors Either a single-item list holding the dataset descriptor itself,
 *                    or a list of dataset partition descriptors
 */
public void putDestination(List<Descriptor> descriptors, int branchId, State state) {
  if (!hasLineageInfo(state)) {
    log.warn("State has no lineage info but branch {} puts {} descriptors", branchId, descriptors.size());
    return;
  }
  log.info(String.format("Put destination %s for branch %d", Descriptor.toJson(descriptors), branchId));
  synchronized (state.getProp(getKey(NAME_KEY))) {
    List<Descriptor> resolvedDescriptors = new ArrayList<>();
    for (Descriptor descriptor : descriptors) {
      Descriptor resolvedDescriptor = resolver.resolve(descriptor, state);
      if (resolvedDescriptor == null) {
        continue;
      }
      resolvedDescriptors.add(resolvedDescriptor);
    }
    state.setProp(getKey(BRANCH, branchId, LineageEventBuilder.DESTINATION),
        Descriptor.toJson(resolvedDescriptors));
  }
}
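// Illustrative call site for putDestination(List, int, State), assuming a
// publisher that writes one hourly partition of an HDFS dataset on branch 0.
// The platform, path and partition name are example values borrowed from the
// tests further below, not required constants.
DatasetDescriptor destinationDataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent");
Descriptor destination = new PartitionDescriptor("hourly/2018/08/15/15", destinationDataset);
lineageInfo.putDestination(Lists.newArrayList(destination), 0, workUnitState);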
private void maySubmitLineageEvent(JobState.DatasetState datasetState) {
  Collection<TaskState> allStates = datasetState.getTaskStates();
  Collection<TaskState> states = Lists.newArrayList();
  // Filter out failed states or states that don't have lineage info
  for (TaskState state : allStates) {
    if (state.getWorkingState() == WorkUnitState.WorkingState.COMMITTED && LineageInfo.hasLineageInfo(state)) {
      states.add(state);
    }
  }
  if (states.isEmpty()) {
    log.info("Will not submit lineage events as no state contains lineage info");
    return;
  }
  try {
    if (StringUtils.isEmpty(datasetUrn)) {
      // This dataset may contain different kinds of LineageEvent
      for (Map.Entry<String, Collection<TaskState>> entry : aggregateByLineageEvent(states).entrySet()) {
        submitLineageEvent(entry.getKey(), entry.getValue());
      }
    } else {
      submitLineageEvent(datasetUrn, states);
    }
  } finally {
    // Purge lineage info from all states
    for (TaskState taskState : allStates) {
      LineageInfo.purgeLineageInfo(taskState);
    }
  }
}
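// Hedged sketch of the submit step invoked above; the real submitLineageEvent
// is not part of this excerpt. Under that assumption, it would load the
// builders recorded in the committed task states and build one tracking event
// per builder for the named dataset.
private void submitLineageEvent(String dataset, Collection<TaskState> states) {
  Collection<LineageEventBuilder> events = LineageInfo.load(states);
  for (LineageEventBuilder event : events) {
    GobblinTrackingEvent trackingEvent = event.build();
    // emit trackingEvent through the job's event submitter
  }
}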
LineageInfo lineageInfo = LineageInfo.getLineageInfo(state.getTaskBroker()).get();
DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
lineageInfo.setSource(source, state);
BaseDataPublisher publisher = new BaseDataPublisher(state);
publisher.publishData(state);
Assert.assertTrue(state.contains("gobblin.event.lineage.branch.1.destination"));
Collection<LineageEventBuilder> events = LineageInfo.load(ImmutableList.of(state));
Assert.assertEquals(events.size(), 4);
@Test
public void testEventForPartitionedDataset() {
  final String topic = "testTopic";
  final String kafka = "kafka";
  final String hdfs = "hdfs";
  final String path = "/data/tracking/PageViewEvent";
  final String partitionName = "hourly/2018/08/15/15";

  State state = new State();
  LineageInfo lineageInfo = getLineageInfo();
  DatasetDescriptor source = new DatasetDescriptor(kafka, topic);
  lineageInfo.setSource(source, state);
  DatasetDescriptor destinationDataset = new DatasetDescriptor(hdfs, path);
  PartitionDescriptor destination = new PartitionDescriptor(partitionName, destinationDataset);
  lineageInfo.putDestination(destination, 0, state);

  Map<String, Set<LineageEventBuilder>> events = LineageInfo.load(state);
  LineageEventBuilder event = first(events.get("0"));
  verify(event, topic, source, destination);

  // Verify gobblin tracking event
  GobblinTrackingEvent trackingEvent = event.build();
  Assert.assertTrue(LineageEventBuilder.isLineageEvent(trackingEvent));
  Assert.assertEquals(LineageEventBuilder.fromEvent(trackingEvent), event);
}
/**
 * Test lineage info is set on publishing a single task
 */
@Test
public void testPublishSingleTask() throws IOException {
  WorkUnitState state = buildTaskState(1);
  LineageInfo lineageInfo = LineageInfo.getLineageInfo(state.getTaskBroker()).get();
  DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.setSource(source, state);
  BaseDataPublisher publisher = new BaseDataPublisher(state);
  publisher.publishData(state);
  Assert.assertTrue(state.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertFalse(state.contains("gobblin.event.lineage.branch.1.destination"));
}
workUnit.setProp(ConfigurationKeys.JOB_ID_KEY, "123456");
Optional<LineageInfo> lineageInfo = LineageInfo.getLineageInfo(getSharedJobBroker(workUnit.getProperties()));
HiveAvroToOrcSource src = new HiveAvroToOrcSource();
Assert.assertTrue(LineageUtils.shouldSetLineageInfo(workUnit));
Collection<LineageEventBuilder> lineageEventBuilders = LineageInfo.load(Collections.singleton(taskState));
Assert.assertEquals(lineageEventBuilders.size(), 2);
/**
 * Put a {@link DatasetDescriptor} of a destination dataset to a state
 *
 * <p>
 * Only the {@link org.apache.gobblin.writer.DataWriter} or {@link org.apache.gobblin.publisher.DataPublisher}
 * is supposed to put the destination dataset information. Since different branches may put their
 * destinations concurrently, the method is implemented to be thread-safe
 * </p>
 *
 * @deprecated Use {@link #putDestination(List, int, State)}
 */
@Deprecated
public void putDestination(Descriptor destination, int branchId, State state) {
  putDestination(Lists.newArrayList(destination), branchId, state);
}
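// Migration sketch for callers of the deprecated overload above: wrap the
// single descriptor in a list and call the preferred variant directly.
lineageInfo.putDestination(destination, 0, state);                       // deprecated
lineageInfo.putDestination(Lists.newArrayList(destination), 0, state);   // preferred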
@Override
public SharedResourceFactoryResponse<LineageInfo> createResource(SharedResourcesBroker<GobblinScopeTypes> broker,
    ScopedConfigView<GobblinScopeTypes, EmptyKey> config) throws NotConfiguredException {
  return new ResourceInstance<>(new LineageInfo(config.getConfig()));
}
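// Callers do not invoke this factory directly; they resolve the shared instance
// through a broker, as the tests in this excerpt do. A minimal sketch, assuming
// `state` carries a task broker:
Optional<LineageInfo> lineageInfo = LineageInfo.getLineageInfo(state.getTaskBroker());
if (lineageInfo.isPresent()) {
  lineageInfo.get().setSource(new DatasetDescriptor("kafka", "testTopic"), state);
}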
private static Map<String, Collection<TaskState>> aggregateByLineageEvent(Collection<TaskState> states) {
  Map<String, Collection<TaskState>> statesByEvents = Maps.newHashMap();
  for (TaskState state : states) {
    String eventName = LineageInfo.getFullEventName(state);
    Collection<TaskState> statesForEvent = statesByEvents.computeIfAbsent(eventName, k -> Lists.newArrayList());
    statesForEvent.add(state);
  }
  return statesByEvents;
}
LineageInfo lineageInfo = getLineageInfo();
DatasetDescriptor source = new DatasetDescriptor(kafka, topic);
lineageInfo.setSource(source, state0);
DatasetDescriptor destination00 = new DatasetDescriptor(hdfs, "/data/tracking");
destination00.addMetadata(branch, "0");
lineageInfo.putDestination(destination00, 0, state0);
DatasetDescriptor destination01 = new DatasetDescriptor(mysql, "kafka.testTopic");
destination01.addMetadata(branch, "1");
lineageInfo.putDestination(destination01, 1, state0);

Map<String, Set<LineageEventBuilder>> events = LineageInfo.load(state0);
verify(first(events.get("0")), topic, source, destination00);
verify(first(events.get("1")), topic, source, destination01);

// state1 shares the same source; its destinations are added branch by branch below
lineageInfo.setSource(source, state1);
List<State> states = Lists.newArrayList();
states.add(state0);
states.add(state1);
Collection<LineageEventBuilder> eventsList = LineageInfo.load(states);
Assert.assertTrue(eventsList.size() == 2);
Assert.assertEquals(getLineageEvent(eventsList, 0, hdfs), first(events.get("0")));

lineageInfo.putDestination(destination12, 2, state1);
eventsList = LineageInfo.load(states);
Assert.assertTrue(eventsList.size() == 3);
Assert.assertEquals(getLineageEvent(eventsList, 0, hdfs), first(events.get("0")));

lineageInfo.putDestination(destination10, 0, state1);
DatasetDescriptor destination11 = new DatasetDescriptor("hive", "kafka.testTopic1");
/**
 * Test lineage info is set on publishing multiple tasks
 */
@Test
public void testPublishMultiTasks() throws IOException {
  WorkUnitState state1 = buildTaskState(2);
  WorkUnitState state2 = buildTaskState(2);
  LineageInfo lineageInfo = LineageInfo.getLineageInfo(state1.getTaskBroker()).get();
  DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.setSource(source, state1);
  lineageInfo.setSource(source, state2);
  BaseDataPublisher publisher = new BaseDataPublisher(state1);
  publisher.publishData(ImmutableList.of(state1, state2));
  Assert.assertTrue(state1.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertTrue(state1.contains("gobblin.event.lineage.branch.1.destination"));
  Assert.assertTrue(state2.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertTrue(state2.contains("gobblin.event.lineage.branch.1.destination"));
}
@VisibleForTesting
public static void setDestLineageInfo(WorkUnitState wus, Optional<LineageInfo> lineageInfo) {
  if (!lineageInfo.isPresent()) {
    return;
  }
  HiveWorkUnit hiveWorkUnit = new HiveWorkUnit(wus.getWorkunit());
  ConvertibleHiveDataset convertibleHiveDataset = (ConvertibleHiveDataset) hiveWorkUnit.getHiveDataset();
  List<DatasetDescriptor> destDatasets = convertibleHiveDataset.getDestDatasets();
  // Destination datasets are registered on branches 1..n (branch id i + 1)
  for (int i = 0; i < destDatasets.size(); i++) {
    lineageInfo.get().putDestination(destDatasets.get(i), i + 1, wus);
  }
}
@Test
void testSourceLineageInfo() {
  SourceState sourceState = new SourceState();
  sourceState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "salesforce");
  sourceState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "snapshot_append");
  sourceState.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  sourceState.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140213000000,20170407152123");
  sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
  QueryBasedSource.SourceEntity sourceEntity = QueryBasedSource.SourceEntity.fromSourceEntityName("contacts");

  SalesforceSource source = new SalesforceSource(new LineageInfo(ConfigFactory.empty()));
  List<WorkUnit> workUnits = source.generateWorkUnits(sourceEntity, sourceState, 20140213000000L);
  Assert.assertEquals(workUnits.size(), 1);

  String expected = "{\"object-type\":\"org.apache.gobblin.dataset.DatasetDescriptor\","
      + "\"object-data\":{\"platform\":\"salesforce\",\"metadata\":{},\"name\":\"contacts\"}}";
  Assert.assertEquals(workUnits.get(0).getProp("gobblin.event.lineage.source"), expected);
  Assert.assertEquals(workUnits.get(0).getProp("gobblin.event.lineage.name"), "contacts");
}
if (state instanceof SourceState) {
  lineageInfo = LineageInfo.getLineageInfo(((SourceState) state).getBroker());
} else if (state instanceof WorkUnitState) {
  lineageInfo = LineageInfo.getLineageInfo(((WorkUnitState) state).getTaskBrokerNullable());
} else {
  lineageInfo = Optional.absent();
}