/**
 * Get the {@link PartitionStrategy}, if this dataset is partitioned. Calling
 * this method on a non-partitioned dataset is an error. Instead, use the
 * {@link #isPartitioned()} method prior to invocation.
 */
public PartitionStrategy getPartitionStrategy() {
  Preconditions.checkState(isPartitioned(),
      "Attempt to retrieve the partition strategy on a non-partitioned descriptor:%s",
      this);
  return partitionStrategy;
}
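// Illustrative usage sketch (not part of the original source; the helper name
// describePartitioning is hypothetical). It shows the guard recommended by the
// Javadoc above: check isPartitioned() before calling getPartitionStrategy(),
// which otherwise fails the precondition with an IllegalStateException.
static String describePartitioning(DatasetDescriptor descriptor) {
  if (descriptor.isPartitioned()) {
    // safe: the guard guarantees a strategy is present
    return "partitioned: " + descriptor.getPartitionStrategy();
  } else {
    return "not partitioned";
  }
}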
public void addExistingPartitions() {
  if (partitionListener != null && descriptor.isPartitioned()) {
    for (Iterator<Path> i = dirIterator(); i.hasNext(); ) {
      Path partition = i.next();
      LOG.info("Adding partition {}", partition);
      partitionListener.partitionAdded(namespace, name, partition.toString());
    }
  }
}
@Override @SuppressWarnings("deprecation") public void dropPartition(PartitionKey key) { Preconditions.checkState(descriptor.isPartitioned(), "Attempt to drop a partition on a non-partitioned dataset (name:%s)", name); Preconditions.checkNotNull(key, "Partition key may not be null"); LOG.debug("Dropping partition with key:{} dataset:{}", key, name); Path partitionDirectory = toDirectoryName(directory, key); try { if (!fileSystem.delete(partitionDirectory, true)) { throw new IOException("Partition directory " + partitionDirectory + " for key " + key + " does not exist"); } } catch (IOException e) { throw new DatasetIOException("Unable to locate or drop dataset partition directory " + partitionDirectory, e); } }
PathIterator pathIterator() {
  if (dataset.getDescriptor().isPartitioned()) {
    return new PathIterator(fs, root, partitionIterator());
  } else {
    return new PathIterator(fs, root, null);
  }
}
@Override
public void setConf(Configuration configuration) {
  conf = configuration;
  View<E> view = load(configuration);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (view.getDataset().getDescriptor().isPartitioned() && partitionDir != null) {
    delegate = getDelegateInputFormatForPartition(view.getDataset(), partitionDir, conf);
  } else {
    delegate = getDelegateInputFormat(view, conf);
  }
}
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart())).second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);

  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}
public static void checkPartitionedBy(DatasetDescriptor descriptor, String fieldName) {
  Preconditions.checkArgument(descriptor.isPartitioned(),
      "Descriptor %s is not partitioned", descriptor);
  Preconditions.checkArgument(
      Accessor.getDefault().hasPartitioner(descriptor.getPartitionStrategy(), fieldName),
      "Descriptor %s is not partitioned by '%s'", descriptor, fieldName);
}
public Builder(Dataset dataset) {
  if (!dataset.getDescriptor().isPartitioned()) {
    throw new DatasetException("Dataset is not partitioned");
  }
  this.strategy = dataset.getDescriptor().getPartitionStrategy();
  this.values = Maps.newHashMap();
}
/**
 * Returns an iterator that provides all leaf-level directories in this view.
 *
 * @return leaf-directory iterator
 */
Iterator<Path> dirIterator() {
  if (dataset.getDescriptor().isPartitioned()) {
    return Iterators.transform(partitionIterator(), new Function<StorageKey, Path>() {
      @Override
      @edu.umd.cs.findbugs.annotations.SuppressWarnings(
          value = "NP_PARAMETER_MUST_BE_NONNULL_BUT_MARKED_AS_NULLABLE",
          justification = "False positive, initialized above as non-null.")
      public Path apply(@Nullable StorageKey key) {
        return new Path(root, key.getPath());
      }
    });
  } else {
    return Iterators.singletonIterator(root);
  }
}
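// Illustrative sketch (not part of the original source; assumes it lives in the
// same class so the package-private dirIterator() above is accessible). It logs
// every leaf directory, which is the root itself for a non-partitioned dataset
// and one directory per partition otherwise.
void logLeafDirectories() {
  for (Iterator<Path> dirs = dirIterator(); dirs.hasNext(); ) {
    LOG.info("Leaf directory: {}", dirs.next());
  }
}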
@SuppressWarnings("unchecked") private static <E, V extends View<E>> V view(Dataset<E> dataset, Map<String, String> uriOptions) { if (dataset instanceof AbstractDataset) { DatasetDescriptor descriptor = dataset.getDescriptor(); Schema schema = descriptor.getSchema(); PartitionStrategy strategy = null; if (descriptor.isPartitioned()) { strategy = descriptor.getPartitionStrategy(); } Constraints constraints = Constraints.fromQueryMap( schema, strategy, uriOptions); return (V) ((AbstractDataset) dataset).filter(constraints); } else { return (V) dataset; } } }
private static Schema getEmbeddedSchema(DatasetDescriptor descriptor) {
  // the SchemaManager stores schemas, so this embeds the column mapping and
  // partition strategy in the schema. the result is parsed by
  // AvroKeyEntitySchemaParser
  Schema schema = descriptor.getSchema();
  if (descriptor.isColumnMapped()) {
    schema = ColumnMappingParser
        .embedColumnMapping(schema, descriptor.getColumnMapping());
  }
  if (descriptor.isPartitioned()) {
    schema = PartitionStrategyParser
        .embedPartitionStrategy(schema, descriptor.getPartitionStrategy());
  }
  return schema;
}
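// Illustrative sketch (not part of the original source; the record and field
// names are hypothetical). A descriptor built with a partition strategy takes
// the isPartitioned() branch above, so the strategy is embedded into the schema
// returned by getEmbeddedSchema and can be recovered later by the parser named
// in the comment.
static Schema buildEmbeddedUserSchema() {
  Schema userSchema = SchemaBuilder.record("User").fields()
      .requiredString("username")
      .requiredLong("created_at")
      .endRecord();
  PartitionStrategy byUser = new PartitionStrategy.Builder()
      .hash("username", 16)
      .build();
  DatasetDescriptor partitioned = new DatasetDescriptor.Builder()
      .schema(userSchema)
      .partitionStrategy(byUser)
      .build();
  return getEmbeddedSchema(partitioned);
}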
@Override
public DatasetWriter<E> newWriter() {
  checkSchemaForWrite();
  AbstractDatasetWriter<E> writer;
  if (dataset.getDescriptor().isPartitioned()) {
    writer = PartitionedDatasetWriter.newWriter(this);
  } else {
    writer = FileSystemWriter.newWriter(
        fs, root, -1, -1 /* get from descriptor */,
        dataset.getDescriptor(), this.getAccessor().getWriteSchema());
  }
  writer.initialize();
  return writer;
}
@Override
public <E> Dataset<E> load(String namespace, String name, Class<E> type) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");

  LOG.debug("Loading dataset: {}", name);

  DatasetDescriptor descriptor = metadataProvider.load(namespace, name);

  FileSystemDataset<E> ds = new FileSystemDataset.Builder<E>()
      .namespace(namespace)
      .name(name)
      .configuration(conf)
      .descriptor(descriptor)
      .type(type)
      .uri(new URIBuilder(getUri(), namespace, name).build())
      .partitionKey(descriptor.isPartitioned() ? new PartitionKey() : null)
      .partitionListener(getPartitionListener())
      .build();

  LOG.debug("Loaded dataset:{}", ds);

  return ds;
}
boolean deleteAllUnsafe(boolean useTrash) {
  boolean deleted = false;
  if (dataset.getDescriptor().isPartitioned()) {
    for (StorageKey key : partitionIterator()) {
      deleted = (useTrash
          ? FileSystemUtil.cleanlyMoveToTrash(fs, root, key.getPath())
          : FileSystemUtil.cleanlyDelete(fs, root, key.getPath())) || deleted;
      if (listener != null) {
        // the relative path is the partition name, so we can simply delete it
        // in Hive
        listener.partitionDeleted(dataset.getNamespace(), dataset.getName(),
            key.getPath().toString());
      }
    }
  } else {
    for (Path path : pathIterator()) {
      deleted = (useTrash
          ? FileSystemUtil.cleanlyMoveToTrash(fs, root, path)
          : FileSystemUtil.cleanlyDelete(fs, root, path)) || deleted;
    }
  }
  return deleted;
}
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files in parent
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files in parent
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
@Test
public void testWriteAndRead() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("test")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schemaUri(USER_SCHEMA_URL)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  Assert.assertFalse("Dataset is not partitioned",
      ds.getDescriptor().isPartitioned());

  writeTestUsers(ds, 10);
  checkTestUsers(ds, 10);
}