@Override public Descriptor getDataDescriptor() { // Dataset is resulted from WriterUtils.getWriterOutputDir(properties, this.numBranches, this.branchId) // The writer dataset might not be same as the published dataset DatasetDescriptor datasetDescriptor = new DatasetDescriptor(fs.getScheme(), outputFile.getParent().toString()); if (partitionKey == null) { return datasetDescriptor; } return new PartitionDescriptor(partitionKey, datasetDescriptor); }
@Test public void testPartitionDescriptor() { DatasetDescriptor dataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent"); String partitionName = "hourly/2018/08/14/18"; PartitionDescriptor partition = new PartitionDescriptor(partitionName, dataset); // Test copy with new dataset DatasetDescriptor dataset2 = new DatasetDescriptor("hive", "/data/tracking/PageViewEvent"); Descriptor partition2 = partition.copyWithNewDataset(dataset2); Assert.assertEquals(partition2.getName(), partition.getName()); Assert.assertEquals(((PartitionDescriptor)partition2).getDataset(), dataset2); // Test copy PartitionDescriptor partition3 = partition.copy(); Assert.assertEquals(partition3.getDataset(), dataset); Assert.assertEquals(partition3.getName(), partitionName); } }
/**
 * Get the partition info of a work unit from the {@code state}. The partition info is then removed
 * from the {@code state} to avoid persisting useless information.
 *
 * <p>
 * In Gobblin, only the {@link PartitionedDataWriter} knows all partitions written for a work unit.
 * Each partition {@link DataWriter} decides the actual form of a dataset partition.
 * </p>
 */
public static List<PartitionDescriptor> getPartitionInfoAndClean(State state, int branchId) {
  String partitionsKey = getPartitionsKey(branchId);
  String serialized = state.getProp(partitionsKey);
  if (!Strings.isNullOrEmpty(serialized)) {
    // Consume the property so it is not persisted beyond this read
    state.removeProp(partitionsKey);
    return PartitionDescriptor.fromPartitionJsonList(serialized);
  }
  // Nothing recorded for this branch
  return Lists.newArrayList();
}
}
@Override
public PartitionDescriptor copy() {
  // An identical partition: same name, backed by the same dataset descriptor
  PartitionDescriptor duplicate = new PartitionDescriptor(getName(), dataset);
  return duplicate;
}
@Override default Descriptor resolve(Descriptor raw, State state) { DatasetDescriptor rawDataset; if (raw instanceof DatasetDescriptor) { rawDataset = (DatasetDescriptor) raw; } else if (raw instanceof PartitionDescriptor) { rawDataset = ((PartitionDescriptor) raw).getDataset(); } else { // type not supported return null; } return resolve(rawDataset, state); } }
private void addLineageInfo(WorkUnitState state, int branchId) { if (!this.lineageInfo.isPresent()) { LOG.info("Will not add lineage info"); return; } // Final dataset descriptor DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId); List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId); List<Descriptor> descriptors = new ArrayList<>(); if (partitions.size() == 0) { // Report as dataset level lineage descriptors.add(datasetDescriptor); } else { // Report as partition level lineage for (PartitionDescriptor partition : partitions) { descriptors.add(partition.copyWithNewDataset(datasetDescriptor)); } } this.lineageInfo.get().putDestination(descriptors, branchId, state); }
/**
 * Serialize partition info to {@link #state} if any partitions were written.
 */
private void serializePartitionInfoToState() {
  List<PartitionDescriptor> partitionDescriptors = new ArrayList<>();

  for (DataWriter writer : partitionWriters.asMap().values()) {
    Descriptor descriptor = writer.getDataDescriptor();
    if (descriptor instanceof PartitionDescriptor) {
      partitionDescriptors.add((PartitionDescriptor) descriptor);
    } else if (null == descriptor) {
      log.warn("Drop partition info as writer {} returns a null PartitionDescriptor", writer.toString());
    } else {
      log.warn("Drop partition info as writer {} does not return a PartitionDescriptor", writer.toString());
    }
  }

  if (partitionDescriptors.isEmpty()) {
    log.info("Partitions info not available. Will not serialize partitions");
  } else {
    state.setProp(getPartitionsKey(branchId), PartitionDescriptor.toPartitionJsonList(partitionDescriptors));
  }
}
/**
 * Create a partition descriptor with the same name but attached to the given {@code dataset}.
 */
public PartitionDescriptor copyWithNewDataset(DatasetDescriptor dataset) {
  String partitionName = getName();
  return new PartitionDescriptor(partitionName, dataset);
}
@Override
public int hashCode() {
  // Standard 31-based combination of the dataset hash and the partition name hash
  // (numerically identical to accumulating dataset first, then the name)
  return 31 * dataset.hashCode() + getName().hashCode();
}
@Override default Descriptor resolve(Descriptor raw, State state) { DatasetDescriptor rawDataset; if (raw instanceof DatasetDescriptor) { rawDataset = (DatasetDescriptor) raw; } else if (raw instanceof PartitionDescriptor) { rawDataset = ((PartitionDescriptor) raw).getDataset(); } else { // type not supported return null; } return resolve(rawDataset, state); } }
private void addLineageInfo(WorkUnitState state, int branchId) { if (!this.lineageInfo.isPresent()) { LOG.info("Will not add lineage info"); return; } // Final dataset descriptor DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId); List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId); List<Descriptor> descriptors = new ArrayList<>(); if (partitions.size() == 0) { // Report as dataset level lineage descriptors.add(datasetDescriptor); } else { // Report as partition level lineage for (PartitionDescriptor partition : partitions) { descriptors.add(partition.copyWithNewDataset(datasetDescriptor)); } } this.lineageInfo.get().putDestination(descriptors, branchId, state); }
/**
 * Serialize partition info to {@link #state} if any partitions were written.
 */
private void serializePartitionInfoToState() {
  List<PartitionDescriptor> partitionDescriptors = new ArrayList<>();

  for (DataWriter writer : partitionWriters.asMap().values()) {
    Descriptor descriptor = writer.getDataDescriptor();
    if (descriptor instanceof PartitionDescriptor) {
      partitionDescriptors.add((PartitionDescriptor) descriptor);
    } else if (null == descriptor) {
      log.warn("Drop partition info as writer {} returns a null PartitionDescriptor", writer.toString());
    } else {
      log.warn("Drop partition info as writer {} does not return a PartitionDescriptor", writer.toString());
    }
  }

  if (partitionDescriptors.isEmpty()) {
    log.info("Partitions info not available. Will not serialize partitions");
  } else {
    state.setProp(getPartitionsKey(branchId), PartitionDescriptor.toPartitionJsonList(partitionDescriptors));
  }
}
@Override
public Descriptor getDataDescriptor() {
  // Return a fixed test dataset so assertions can verify partition reporting
  return new PartitionDescriptor(this.partition, new DatasetDescriptor("testPlatform", "testDataset"));
}
}
@Override
public PartitionDescriptor copy() {
  // An identical partition: same name, backed by the same dataset descriptor
  PartitionDescriptor duplicate = new PartitionDescriptor(getName(), dataset);
  return duplicate;
}
/**
 * Get the partition info of a work unit from the {@code state}. The partition info is then removed
 * from the {@code state} to avoid persisting useless information.
 *
 * <p>
 * In Gobblin, only the {@link PartitionedDataWriter} knows all partitions written for a work unit.
 * Each partition {@link DataWriter} decides the actual form of a dataset partition.
 * </p>
 */
public static List<PartitionDescriptor> getPartitionInfoAndClean(State state, int branchId) {
  String partitionsKey = getPartitionsKey(branchId);
  String serialized = state.getProp(partitionsKey);
  if (!Strings.isNullOrEmpty(serialized)) {
    // Consume the property so it is not persisted beyond this read
    state.removeProp(partitionsKey);
    return PartitionDescriptor.fromPartitionJsonList(serialized);
  }
  // Nothing recorded for this branch
  return Lists.newArrayList();
}
}
@Test public void testAsDescriptorResolver() { DescriptorResolver resolver = new TestDatasetResolver(); State state = new State(); // Test dataset descriptor DatasetDescriptor dataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent"); Descriptor descriptor = resolver.resolve(dataset, state); Assert.assertTrue(descriptor.getClass().isAssignableFrom(DatasetDescriptor.class)); Assert.assertEquals(descriptor.getName(), TestDatasetResolver.DATASET_NAME); // Test partition descriptor String partitionName = "hourly/2018/08/14/18"; PartitionDescriptor partition = new PartitionDescriptor(partitionName, dataset); descriptor = resolver.resolve(partition, state); Assert.assertTrue(descriptor.getClass().isAssignableFrom(DatasetDescriptor.class)); Assert.assertEquals(descriptor.getName(), TestDatasetResolver.DATASET_NAME); // Test unsupported descriptor Assert.assertEquals(resolver.resolve(new MockDescriptor("test"), state), null); }
/**
 * Create a partition descriptor with the same name but attached to the given {@code dataset}.
 */
public PartitionDescriptor copyWithNewDataset(DatasetDescriptor dataset) {
  String partitionName = getName();
  return new PartitionDescriptor(partitionName, dataset);
}
@Override
public int hashCode() {
  // Standard 31-based combination of the dataset hash and the partition name hash
  // (numerically identical to accumulating dataset first, then the name)
  return 31 * dataset.hashCode() + getName().hashCode();
}