@Override
public DatasetDescriptor copy() {
  return new DatasetDescriptor(this);
}
/**
 * Deserialize a {@link DatasetDescriptor} from a string map
 *
 * @deprecated use {@link Descriptor#deserialize(String)}
 */
@Deprecated
public static DatasetDescriptor fromDataMap(Map<String, String> dataMap) {
  DatasetDescriptor descriptor =
      new DatasetDescriptor(dataMap.get(PLATFORM_KEY), dataMap.get(NAME_KEY));
  dataMap.forEach((key, value) -> {
    if (!key.equals(PLATFORM_KEY) && !key.equals(NAME_KEY)) {
      descriptor.addMetadata(key, value);
    }
  });
  return descriptor;
}
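// Illustrative usage sketch (not from the original source) of the deprecated
// round-trip above, assuming PLATFORM_KEY and NAME_KEY are the same constants
// the descriptor was serialized under:
Map<String, String> dataMap = new HashMap<>();
dataMap.put(PLATFORM_KEY, "hdfs");
dataMap.put(NAME_KEY, "/data/tracking/PageViewEvent");
dataMap.put("fsUri", "hdfs://test.com:2018"); // any non-reserved key becomes metadata

DatasetDescriptor restored = DatasetDescriptor.fromDataMap(dataMap);
// restored.getPlatform() is "hdfs", restored.getName() is "/data/tracking/PageViewEvent",
// and restored.getMetadata() contains only the "fsUri" entry.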
@Override
protected void addLineageSourceInfo(SourceState sourceState, SourceEntity entity, WorkUnit workUnit) {
  DatasetDescriptor source =
      new DatasetDescriptor(DatasetConstants.PLATFORM_SALESFORCE, entity.getSourceEntityName());
  if (lineageInfo.isPresent()) {
    lineageInfo.get().setSource(source, workUnit);
  }
}
@Override
public Descriptor getDataDescriptor() {
  // The dataset results from WriterUtils.getWriterOutputDir(properties, this.numBranches, this.branchId).
  // The writer dataset might not be the same as the published dataset.
  DatasetDescriptor datasetDescriptor =
      new DatasetDescriptor(fs.getScheme(), outputFile.getParent().toString());
  if (partitionKey == null) {
    return datasetDescriptor;
  }
  return new PartitionDescriptor(partitionKey, datasetDescriptor);
}
@Override
public DatasetDescriptor resolve(DatasetDescriptor raw, State state) {
  ImmutableMap<String, String> metadata = raw.getMetadata();
  Preconditions.checkArgument(metadata.containsKey(DatasetConstants.FS_SCHEME),
      String.format("Hive Dataset Descriptor must contain metadata %s to create Hdfs Dataset Descriptor",
          DatasetConstants.FS_SCHEME));
  // Fixed: this precondition must check FS_LOCATION, not FS_SCHEME a second time.
  Preconditions.checkArgument(metadata.containsKey(DatasetConstants.FS_LOCATION),
      String.format("Hive Dataset Descriptor must contain metadata %s to create Hdfs Dataset Descriptor",
          DatasetConstants.FS_LOCATION));
  DatasetDescriptor datasetDescriptor =
      new DatasetDescriptor(metadata.get(DatasetConstants.FS_SCHEME), metadata.get(DatasetConstants.FS_LOCATION));
  datasetDescriptor.addMetadata(HIVE_TABLE, raw.getName());
  return datasetDescriptor;
}
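// Illustrative only: how the Hive-to-HDFS resolver above might be exercised.
// The class name HiveToHdfsResolver is hypothetical; the metadata keys are the
// ones the preconditions above require.
DatasetDescriptor hive = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, "db.table");
hive.addMetadata(DatasetConstants.FS_SCHEME, "hdfs");
hive.addMetadata(DatasetConstants.FS_LOCATION, "/data/db/table");

DatasetDescriptor hdfs = new HiveToHdfsResolver().resolve(hive, new State());
// hdfs.getPlatform() is "hdfs", hdfs.getName() is "/data/db/table",
// and its HIVE_TABLE metadata is "db.table".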
/** Create destination dataset descriptor */
protected DatasetDescriptor createDestinationDescriptor(WorkUnitState state, int branchId) {
  Path publisherOutputDir = getPublisherOutputDir(state, branchId);
  FileSystem fs = this.publisherFileSystemByBranches.get(branchId);
  DatasetDescriptor destination = new DatasetDescriptor(fs.getScheme(), publisherOutputDir.toString());
  destination.addMetadata(DatasetConstants.FS_URI, fs.getUri().toString());
  destination.addMetadata(DatasetConstants.BRANCH, String.valueOf(branchId));
  return destination;
}
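// Hypothetical wiring sketch, not the publisher's actual call site: the
// destination descriptor is typically recorded through LineageInfo, mirroring
// the putDestination(...) call in the partitioned-dataset test later in this
// section (assuming putDestination accepts any Descriptor).
DatasetDescriptor destination = createDestinationDescriptor(state, 0);
if (lineageInfo.isPresent()) {
  lineageInfo.get().putDestination(destination, 0, state);
}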
DatasetDescriptor getDestinationDataset() {
  String destinationTable = this.getTargetDatabase() + "." + this.getTargetTable();
  DatasetDescriptor destinationDataset = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, destinationTable);
  destinationDataset.addMetadata(DatasetConstants.FS_URI, this.getTargetFs().getUri().toString());
  return destinationDataset;
}
protected void addLineageSourceInfo(SourceState sourceState, SourceEntity entity, WorkUnit workUnit) {
  String host = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME);
  String port = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_PORT);
  String database = sourceState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
  String connectionUrl = "jdbc:mysql://" + host.trim() + ":" + port + "/" + database.trim();
  DatasetDescriptor source =
      new DatasetDescriptor(DatasetConstants.PLATFORM_MYSQL, database + "." + entity.getSourceEntityName());
  source.addMetadata(DatasetConstants.CONNECTION_URL, connectionUrl);
  if (lineageInfo.isPresent()) {
    lineageInfo.get().setSource(source, workUnit);
  }
}
@Override
public Descriptor getDataDescriptor() {
  DatasetDescriptor dataset = new DatasetDescriptor("testPlatform", "testDataset");
  return new PartitionDescriptor(this.partition, dataset);
}
private DatasetDescriptor createSourceDataset() {
  try {
    String sourceTable = getTable().getDbName() + "." + getTable().getTableName();
    DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, sourceTable);

    Path sourcePath = getTable().getDataLocation();
    log.info(String.format("[%s] Source path %s being used in conversion", this.getClass().getName(), sourcePath));
    String sourceLocation = Path.getPathWithoutSchemeAndAuthority(sourcePath).toString();
    FileSystem sourceFs = sourcePath.getFileSystem(new Configuration());
    source.addMetadata(DatasetConstants.FS_SCHEME, sourceFs.getScheme());
    source.addMetadata(DatasetConstants.FS_LOCATION, sourceLocation);
    return source;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
DatasetDescriptor getSourceDataset() {
  String sourceTable = dataset.getTable().getDbName() + "." + dataset.getTable().getTableName();
  DatasetDescriptor sourceDataset = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, sourceTable);
  sourceDataset.addMetadata(DatasetConstants.FS_URI, dataset.getFs().getUri().toString());
  return sourceDataset;
}
private List<DatasetDescriptor> createDestDatasets() {
  List<DatasetDescriptor> destDatasets = new ArrayList<>();
  for (String format : getDestFormats()) {
    Optional<ConversionConfig> conversionConfigForFormat = getConversionConfigForFormat(format);
    if (!conversionConfigForFormat.isPresent()) {
      continue;
    }
    String destTable = conversionConfigForFormat.get().getDestinationDbName() + "."
        + conversionConfigForFormat.get().getDestinationTableName();
    DatasetDescriptor dest = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, destTable);
    String destLocation = conversionConfigForFormat.get().getDestinationDataPath() + Path.SEPARATOR + "final";
    dest.addMetadata(DatasetConstants.FS_SCHEME, getSourceDataset().getMetadata().get(DatasetConstants.FS_SCHEME));
    dest.addMetadata(DatasetConstants.FS_LOCATION, destLocation);
    destDatasets.add(dest);
  }
  return destDatasets;
}
@Test
public void testPartitionDescriptor() {
  DatasetDescriptor dataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent");
  String partitionName = "hourly/2018/08/14/18";
  PartitionDescriptor partition = new PartitionDescriptor(partitionName, dataset);

  // Test copy with new dataset
  DatasetDescriptor dataset2 = new DatasetDescriptor("hive", "/data/tracking/PageViewEvent");
  Descriptor partition2 = partition.copyWithNewDataset(dataset2);
  Assert.assertEquals(partition2.getName(), partition.getName());
  Assert.assertEquals(((PartitionDescriptor) partition2).getDataset(), dataset2);

  // Test copy
  PartitionDescriptor partition3 = partition.copy();
  Assert.assertEquals(partition3.getDataset(), dataset);
  Assert.assertEquals(partition3.getName(), partitionName);
}
@Override
public DatasetDescriptor resolve(DatasetDescriptor raw, State state) {
  DatasetDescriptor descriptor = new DatasetDescriptor(raw.getPlatform(), DATASET_NAME);
  raw.getMetadata().forEach(descriptor::addMetadata);
  return descriptor;
}
@Test
public void testAsDescriptorResolver() {
  DescriptorResolver resolver = new TestDatasetResolver();
  State state = new State();

  // Test dataset descriptor
  DatasetDescriptor dataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent");
  Descriptor descriptor = resolver.resolve(dataset, state);
  Assert.assertTrue(descriptor.getClass().isAssignableFrom(DatasetDescriptor.class));
  Assert.assertEquals(descriptor.getName(), TestDatasetResolver.DATASET_NAME);

  // Test partition descriptor
  String partitionName = "hourly/2018/08/14/18";
  PartitionDescriptor partition = new PartitionDescriptor(partitionName, dataset);
  descriptor = resolver.resolve(partition, state);
  Assert.assertTrue(descriptor.getClass().isAssignableFrom(DatasetDescriptor.class));
  Assert.assertEquals(descriptor.getName(), TestDatasetResolver.DATASET_NAME);

  // Test unsupported descriptor
  Assert.assertNull(resolver.resolve(new MockDescriptor("test"), state));
}
@Test
public void testDatasetDescriptor() {
  DatasetDescriptor dataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent");
  dataset.addMetadata("fsUri", "hdfs://test.com:2018");

  DatasetDescriptor copy = dataset.copy();
  Assert.assertEquals(copy.getName(), dataset.getName());
  Assert.assertEquals(copy.getPlatform(), dataset.getPlatform());
  Assert.assertEquals(copy.getMetadata(), dataset.getMetadata());
  Assert.assertEquals(dataset, copy);
  Assert.assertEquals(dataset.hashCode(), copy.hashCode());
}
@Test
public void testSerializeDeserialize() throws Exception {
  CopyableFile copyableFile =
      new CopyableFile(new FileStatus(10, false, 12, 100, 12345, new Path("/path")), new Path("/destination"),
          new OwnerAndPermission("owner", "group", FsPermission.getDefault()),
          Lists.newArrayList(new OwnerAndPermission("owner2", "group2", FsPermission.getDefault())),
          "checksum".getBytes(), PreserveAttributes.fromMnemonicString(""), "", 0, 0,
          Maps.<String, String>newHashMap(), "");

  DatasetDescriptor dataset = new DatasetDescriptor("hive", "db.table");
  PartitionDescriptor descriptor = new PartitionDescriptor("datepartition=2018/09/05", dataset);
  copyableFile.setDestinationData(descriptor);

  String s = CopyEntity.serialize(copyableFile);
  CopyEntity de = CopyEntity.deserialize(s);
  Assert.assertEquals(de, copyableFile);
}
/** Test that lineage info is set when publishing a single task */
@Test
public void testPublishSingleTask() throws IOException {
  WorkUnitState state = buildTaskState(1);
  LineageInfo lineageInfo = LineageInfo.getLineageInfo(state.getTaskBroker()).get();
  DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.setSource(source, state);

  BaseDataPublisher publisher = new BaseDataPublisher(state);
  publisher.publishData(state);
  Assert.assertTrue(state.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertFalse(state.contains("gobblin.event.lineage.branch.1.destination"));
}
/** Test that lineage info is set when publishing multiple tasks */
@Test
public void testPublishMultiTasks() throws IOException {
  WorkUnitState state1 = buildTaskState(2);
  WorkUnitState state2 = buildTaskState(2);
  LineageInfo lineageInfo = LineageInfo.getLineageInfo(state1.getTaskBroker()).get();
  DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.setSource(source, state1);
  lineageInfo.setSource(source, state2);

  BaseDataPublisher publisher = new BaseDataPublisher(state1);
  publisher.publishData(ImmutableList.of(state1, state2));
  Assert.assertTrue(state1.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertTrue(state1.contains("gobblin.event.lineage.branch.1.destination"));
  Assert.assertTrue(state2.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertTrue(state2.contains("gobblin.event.lineage.branch.1.destination"));
}
@Test
public void testEventForPartitionedDataset() {
  final String topic = "testTopic";
  final String kafka = "kafka";
  final String hdfs = "hdfs";
  final String path = "/data/tracking/PageViewEvent";
  final String partitionName = "hourly/2018/08/15/15";

  State state = new State();
  LineageInfo lineageInfo = getLineageInfo();

  DatasetDescriptor source = new DatasetDescriptor(kafka, topic);
  lineageInfo.setSource(source, state);
  DatasetDescriptor destinationDataset = new DatasetDescriptor(hdfs, path);
  PartitionDescriptor destination = new PartitionDescriptor(partitionName, destinationDataset);
  lineageInfo.putDestination(destination, 0, state);

  Map<String, Set<LineageEventBuilder>> events = LineageInfo.load(state);
  LineageEventBuilder event = first(events.get("0"));
  verify(event, topic, source, destination);

  // Verify the Gobblin tracking event
  GobblinTrackingEvent trackingEvent = event.build();
  Assert.assertTrue(LineageEventBuilder.isLineageEvent(trackingEvent));
  Assert.assertEquals(LineageEventBuilder.fromEvent(trackingEvent), event);
}