/**
 * Put a {@link DatasetDescriptor} of a destination dataset to a state.
 *
 * <p>
 * Only the {@link org.apache.gobblin.writer.DataWriter} or {@link org.apache.gobblin.publisher.DataPublisher}
 * is supposed to put the destination dataset information. Since different branches may concurrently put,
 * the method is implemented to be threadsafe
 * </p>
 *
 * @param destination descriptor of the destination dataset to record
 * @param branchId fork branch the destination belongs to
 * @param state state object the lineage information is written into
 * @deprecated Use {@link #putDestination(List, int, State)}
 */
@Deprecated
public void putDestination(Descriptor destination, int branchId, State state) {
  // Delegate to the list-based overload with a single-element (mutable) list.
  putDestination(Lists.newArrayList(destination), branchId, state);
}
/**
 * Records each destination dataset of the convertible Hive dataset backing {@code wus}
 * as lineage, one branch per destination (branch ids start at 1).
 *
 * @param wus work unit state holding the Hive work unit and receiving the lineage info
 * @param lineageInfo optional lineage recorder; nothing is recorded when absent
 */
@VisibleForTesting
public static void setDestLineageInfo(WorkUnitState wus, Optional<LineageInfo> lineageInfo) {
  // Hoisted out of the loop: when no LineageInfo is available there is nothing to do,
  // and we avoid needlessly building the HiveWorkUnit and performing the cast.
  if (!lineageInfo.isPresent()) {
    return;
  }
  HiveWorkUnit hiveWorkUnit = new HiveWorkUnit(wus.getWorkunit());
  ConvertibleHiveDataset convertibleHiveDataset = (ConvertibleHiveDataset) hiveWorkUnit.getHiveDataset();
  List<DatasetDescriptor> destDatasets = convertibleHiveDataset.getDestDatasets();
  for (int i = 0; i < destDatasets.size(); i++) {
    // Branch ids are 1-based here (i + 1); branch 0 is presumably reserved — TODO confirm.
    lineageInfo.get().putDestination(destDatasets.get(i), i + 1, wus);
  }
}
private void addLineageInfo(WorkUnitState state, int branchId) { if (!this.lineageInfo.isPresent()) { LOG.info("Will not add lineage info"); return; } // Final dataset descriptor DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId); List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId); List<Descriptor> descriptors = new ArrayList<>(); if (partitions.size() == 0) { // Report as dataset level lineage descriptors.add(datasetDescriptor); } else { // Report as partition level lineage for (PartitionDescriptor partition : partitions) { descriptors.add(partition.copyWithNewDataset(datasetDescriptor)); } } this.lineageInfo.get().putDestination(descriptors, branchId, state); }
// Record the copyable file's destination descriptors as lineage under branch 0
// (presumably distcp-style copies are single-branch — TODO confirm).
lineageInfo.get().putDestination(copyableFile.getDestinationData(), 0, wus);
// Two destinations for state0: branch 0 goes to HDFS, branch 1 to MySQL.
DatasetDescriptor destination00 = new DatasetDescriptor(hdfs, "/data/tracking");
destination00.addMetadata(branch, "0");
lineageInfo.putDestination(destination00, 0, state0);
DatasetDescriptor destination01 = new DatasetDescriptor(mysql, "kafka.testTopic");
destination01.addMetadata(branch, "1");
lineageInfo.putDestination(destination01, 1, state0);
// destination12 is defined earlier in this test (not visible here); branch 2 of state1.
lineageInfo.putDestination(destination12, 2, state1);
// Three events so far: two from state0, one from state1.
eventsList = LineageInfo.load(states);
Assert.assertTrue(eventsList.size() == 3);
// Adding two more destinations on state1's branches 0 and 1 merges into one more
// loaded event (4 total) — presumably load() groups per state/branch; verify.
lineageInfo.putDestination(destination10, 0, state1);
DatasetDescriptor destination11 = new DatasetDescriptor("hive", "kafka.testTopic1");
destination11.addMetadata(branch, "1");
lineageInfo.putDestination(destination11, 1, state1);
eventsList = LineageInfo.load(states);
Assert.assertTrue(eventsList.size() == 4);
@Test public void testEventForPartitionedDataset() { final String topic = "testTopic"; final String kafka = "kafka"; final String hdfs = "hdfs"; final String path = "/data/tracking/PageViewEvent"; final String partitionName = "hourly/2018/08/15/15"; State state = new State(); LineageInfo lineageInfo = getLineageInfo(); DatasetDescriptor source = new DatasetDescriptor(kafka, topic); lineageInfo.setSource(source, state); DatasetDescriptor destinationDataset = new DatasetDescriptor(hdfs, path); PartitionDescriptor destination = new PartitionDescriptor(partitionName, destinationDataset); lineageInfo.putDestination(destination, 0, state); Map<String, Set<LineageEventBuilder>> events = LineageInfo.load(state); LineageEventBuilder event = first(events.get("0")); verify(event, topic, source, destination); // Verify gobblin tracking event GobblinTrackingEvent trackingEvent = event.build(); Assert.assertEquals(LineageEventBuilder.isLineageEvent(trackingEvent), true); Assert.assertEquals(LineageEventBuilder.fromEvent(trackingEvent), event); }
/**
 * Records a single destination {@link DatasetDescriptor} for a branch into the state.
 *
 * <p>
 * Only the {@link org.apache.gobblin.writer.DataWriter} or {@link org.apache.gobblin.publisher.DataPublisher}
 * is supposed to put the destination dataset information. Since different branches may concurrently put,
 * the method is implemented to be threadsafe
 * </p>
 *
 * @deprecated Use {@link #putDestination(List, int, State)}
 */
@Deprecated
public void putDestination(Descriptor destination, int branchId, State state) {
  // Wrap the lone descriptor and forward to the list-based overload.
  List<Descriptor> single = Lists.newArrayList(destination);
  putDestination(single, branchId, state);
}
/**
 * Publishes each destination dataset of the work unit's convertible Hive dataset as
 * lineage, assigning branch ids 1..n in destination order.
 *
 * @param wus work unit state holding the Hive work unit and receiving the lineage info
 * @param lineageInfo optional lineage recorder; no-op per destination when absent
 */
@VisibleForTesting
public static void setDestLineageInfo(WorkUnitState wus, Optional<LineageInfo> lineageInfo) {
  HiveWorkUnit workUnit = new HiveWorkUnit(wus.getWorkunit());
  ConvertibleHiveDataset dataset = (ConvertibleHiveDataset) workUnit.getHiveDataset();
  List<DatasetDescriptor> destinations = dataset.getDestDatasets();
  int branch = 0;
  for (DatasetDescriptor destination : destinations) {
    branch++;  // branch ids start at 1
    if (lineageInfo.isPresent()) {
      lineageInfo.get().putDestination(destination, branch, wus);
    }
  }
}
/**
 * Adds destination lineage for the given branch to {@code state}: partition-level
 * descriptors when the writer recorded partitions, otherwise a dataset-level one.
 *
 * @param state work unit state the lineage is written into
 * @param branchId fork branch being reported
 */
private void addLineageInfo(WorkUnitState state, int branchId) {
  if (!this.lineageInfo.isPresent()) {
    LOG.info("Will not add lineage info");
    return;
  }

  // Final dataset descriptor
  DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId);
  // NOTE: getPartitionInfoAndClean also removes the partition info from the state.
  List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId);
  List<Descriptor> descriptors = new ArrayList<>();
  if (partitions.size() == 0) {
    // Report as dataset level lineage
    descriptors.add(datasetDescriptor);
  } else {
    // Report as partition level lineage
    for (PartitionDescriptor partition : partitions) {
      // Re-parent each partition onto the final dataset descriptor.
      descriptors.add(partition.copyWithNewDataset(datasetDescriptor));
    }
  }
  this.lineageInfo.get().putDestination(descriptors, branchId, state);
}
// Publish the file's destination data as lineage on branch 0 (the only branch used
// for file copies here, it seems — verify against the enclosing publisher/writer).
lineageInfo.get().putDestination(copyableFile.getDestinationData(), 0, wus);