return Datasets.load(uri).getDataset().getDescriptor().getSchema(); } else if ("resource".equals(uri.getScheme())) { try (InputStream in = Resources.getResource(uri.getSchemeSpecificPart()).openStream()) {
Format format = descriptor.getFormat(); Preconditions.checkArgument(allowedFormats().contains(format.getName()), "Unsupported format: " + format.getName()); Schema newSchema = descriptor.getSchema(); if (datasetSchema == null || !newSchema.equals(datasetSchema)) { this.datasetSchema = descriptor.getSchema();
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  // Run the literal through a DatasetDescriptor so the partition strategy
  // embedded in the schema (if any) is parsed out alongside the Avro schema.
  DatasetDescriptor parsed = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroKeySchema(parsed.getSchema(), parsed.getPartitionStrategy());
}
try { outputStream = fs.create(schemaPath, true /* overwrite */ ); outputStream.write(descriptor.getSchema().toString(true) .getBytes(Charsets.UTF_8)); outputStream.flush(); new Path(metadataLocation, SCHEMA_DIRECTORY_NAME)); manager.writeSchema(descriptor.getSchema()); properties.setProperty(FORMAT_FIELD_NAME, descriptor.getFormat().getName()); properties.setProperty(COMPRESSION_TYPE_FIELD_NAME, descriptor.getCompressionType().getName()); final URI dataLocation = descriptor.getLocation(); if (dataLocation != null) { properties.setProperty(LOCATION_FIELD_NAME, dataLocation.toString()); if (descriptor.isPartitioned()) { properties.setProperty(PARTITION_EXPRESSION_FIELD_NAME, Accessor.getDefault().toExpression(descriptor.getPartitionStrategy())); for (String property : descriptor.listProperties()) { properties.setProperty(property, descriptor.getProperty(property));
private static Schema getEmbeddedSchema(DatasetDescriptor descriptor) { // the SchemaManager stores schemas, so this embeds the column mapping and // partition strategy in the schema. the result is parsed by // AvroKeyEntitySchemaParser Schema schema = descriptor.getSchema(); if (descriptor.isColumnMapped()) { schema = ColumnMappingParser .embedColumnMapping(schema, descriptor.getColumnMapping()); } if (descriptor.isPartitioned()) { schema = PartitionStrategyParser .embedPartitionStrategy(schema, descriptor.getPartitionStrategy()); } return schema; }
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  // create two Avro files in the same parent directory
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);
  // both files sit at the same depth, so discovery should merge them into a
  // single potential dataset rooted at their shared parent
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned",
      descriptor.isPartitioned());
}
@Test
public void testLoad() {
  // make sure the dataset exists before attempting to load its metadata
  ensureCreated();
  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);
  Assert.assertNotNull("DatasetDescriptor should be returned", loaded);
  // the loaded descriptor must round-trip the metadata it was created with
  Assert.assertEquals("Schema should match",
      testDescriptor.getSchema(), loaded.getSchema());
  Assert.assertEquals("PartitionStrategy should match",
      testDescriptor.getPartitionStrategy(), loaded.getPartitionStrategy());
  Assert.assertEquals("Format should match",
      testDescriptor.getFormat(), loaded.getFormat());
}
/**
 * Prints a human-readable summary of {@code dataset} to the console logger:
 * URI, schema, partition strategy, column mapping, and custom properties.
 */
private static void printInfo(Logger console, Dataset<?> dataset) {
  DatasetDescriptor desc = dataset.getDescriptor();
  // strip any embedded partition strategy and column mapping so the schema
  // printed is the user-facing one, not the internal embedded form
  Schema cleaned = ColumnMappingParser.removeEmbeddedMapping(
      PartitionStrategyParser.removeEmbeddedStrategy(desc.getSchema()));
  String schema = cleaned.toString(true);
  console.info("\nDataset \"{}\":", dataset.getName());
  console.info("\tURI: \"{}\"", dataset.getUri());
  console.info("\tSchema: {}", indent(schema));
  if (desc.isPartitioned()) {
    console.info("\tPartition strategy: {}",
        indent(desc.getPartitionStrategy().toString(true)));
  } else {
    console.info("\tNot partitioned");
  }
  if (desc.isColumnMapped()) {
    console.info("\tColumn mapping: {}",
        indent(desc.getColumnMapping().toString(true)));
  }
  Collection<String> properties = desc.listProperties();
  if (!properties.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    for (String prop : properties) {
      sb.append("\n\t\t").append(prop).append("=").append(desc.getProperty(prop));
    }
    console.info("\tProperties:{}", sb.toString());
  }
}
// Applies query options from a dataset URI as constraints on the dataset and
// returns the resulting view. Datasets that are not AbstractDataset cannot be
// filtered here, so they are returned unchanged.
@SuppressWarnings("unchecked")
private static <E, V extends View<E>> V view(Dataset<E> dataset,
    Map<String, String> uriOptions) {
  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    // a null strategy signals an unpartitioned dataset to fromQueryMap
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(
        schema, strategy, uriOptions);
    return (V) ((AbstractDataset) dataset).filter(constraints);
  } else {
    // not filterable: the unchecked cast is the best we can do
    return (V) dataset;
  }
}
}
/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder readFrom(View<?> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  // if this is a partitioned dataset, add the partition location
  if (view instanceof FileSystemDataset) {
    conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation()));
  }
  // add descriptor properties to the config
  for (String property : descriptor.listProperties()) {
    conf.set(property, descriptor.getProperty(property));
  }
  if (DataModelUtil.isGeneric(view.getType())) {
    Schema datasetSchema = view.getDataset().getDescriptor().getSchema();
    // only set the read schema if the view is a projection; otherwise the
    // dataset's own schema is used by default
    if (!datasetSchema.equals(view.getSchema())) {
      withSchema(view.getSchema());
    }
  } else {
    // specific/reflect types carry their schema with the class
    withType(view.getType());
  }
  conf.set(KITE_INPUT_URI, view.getUri().toString());
  return this;
}
// Updates the stored metadata for an HBase-backed dataset, migrating the
// managed schema when the embedded schema has changed. Only the default
// namespace is supported and the descriptor must carry a column mapping.
@Override
public DatasetDescriptor update(String namespace, String name,
    DatasetDescriptor descriptor) {
  Preconditions.checkArgument(DEFAULT_NAMESPACE.equals(namespace),
      "Non-default namespaces are not supported");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");
  // warns about compatibility problems rather than failing outright
  Compatibility.checkAndWarn(
      namespace,
      HBaseMetadataProvider.getTableName(name),
      descriptor.getSchema());
  Preconditions.checkArgument(descriptor.isColumnMapped(),
      "Cannot update dataset %s: missing column mapping", name);
  String tableName = getTableName(name);
  String entityName = getEntityName(name);
  // refresh the cache so the version check below sees the current schemas
  schemaManager.refreshManagedSchemaCache(tableName, entityName);
  // the schema manager stores only schemas, so the column mapping and
  // partition strategy are embedded into the schema before storage
  Schema newSchema = getEmbeddedSchema(descriptor);
  String schemaString = newSchema.toString(true);
  EntitySchema entitySchema = new AvroEntitySchema(
      newSchema, schemaString, descriptor.getColumnMapping());
  // migrate only when this exact schema version is not already registered
  if (!schemaManager.hasSchemaVersion(tableName, entityName, entitySchema)) {
    schemaManager.migrateSchema(tableName, entityName, schemaString);
  } else {
    LOG.info("Schema hasn't changed, not migrating: (" + name + ")");
  }
  return getDatasetDescriptor(newSchema, descriptor.getLocation());
}
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema) {
  // Parse the literal via a DatasetDescriptor so that any column mapping
  // embedded in the schema is extracted along with the Avro schema. The raw
  // literal itself is kept so the stored text matches the input exactly.
  DatasetDescriptor parsed = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroEntitySchema(
      parsed.getSchema(), rawSchema, parsed.getColumnMapping());
}
@Test
public void testUpdatePreviousFormat() throws IOException {
  // lay down metadata using the old on-disk repository layout
  useOldRepositoryFormat();
  DatasetDescriptor oldFormatDescriptor = provider.load(NAMESPACE, NAME);
  Path namedDirectory = new Path(oldFormatDescriptor.getLocation());
  Path metadataDirectory = new Path(namedDirectory, ".metadata");
  Path schemaDirectory = new Path(metadataDirectory, "schemas");
  Path newSchemaLocation = new Path(schemaDirectory, "1.avsc");
  // Performing an update against a dataset in the old location should bring it
  // into the new location.
  DatasetDescriptor updated =
      new DatasetDescriptor.Builder(oldFormatDescriptor).build();
  provider.update(NAMESPACE, NAME, updated);
  Assert.assertEquals(testDescriptor.getSchema(),
      oldFormatDescriptor.getSchema());
  Assert.assertTrue("Schema should exist at the new location.",
      fileSystem.exists(newSchemaLocation));
}
}
/**
 * Verifies that {@code descriptor} is partitioned and that its partition
 * strategy includes a partitioner for {@code fieldName}.
 *
 * @throws IllegalArgumentException if either check fails
 */
public static void checkPartitionedBy(DatasetDescriptor descriptor,
    String fieldName) {
  // a partition strategy only exists on partitioned descriptors
  Preconditions.checkArgument(descriptor.isPartitioned(),
      "Descriptor %s is not partitioned", descriptor);
  boolean partitionedByField = Accessor.getDefault()
      .hasPartitioner(descriptor.getPartitionStrategy(), fieldName);
  Preconditions.checkArgument(partitionedByField,
      "Descriptor %s is not partitioned by '%s'", descriptor, fieldName);
}
.build(); Assert.assertTrue("Descriptor should have partition strategy", descriptor.isPartitioned()); .column("real_name", "u", "name") .build(); Assert.assertEquals(expected, descriptor.getColumnMapping());
/**
 * Get the {@link PartitionStrategy}, if this dataset is partitioned. Calling
 * this method on a non-partitioned dataset is an error. Instead, use the
 * {@link #isPartitioned()} method prior to invocation.
 */
public PartitionStrategy getPartitionStrategy() {
  // guard clause: same IllegalStateException and message that
  // Preconditions.checkState would produce
  if (!isPartitioned()) {
    throw new IllegalStateException(
        "Attempt to retrieve the partition strategy on a non-partitioned descriptor:"
            + this);
  }
  return partitionStrategy;
}
// lazily creates one StorageKey per thread, bound to the descriptor's
// partition strategy
// NOTE(review): StorageKey appears to be mutable and reused per thread —
// confirm against callers before sharing instances across threads.
@Override
protected StorageKey initialValue() {
  return new StorageKey(descriptor.getPartitionStrategy());
}
};
// Creates a reader over the file at {@code path}, configured from the
// descriptor's properties. The reader starts in the NEW state and must be
// initialized before use.
public InputFormatReader(FileSystem fs, Path path, DatasetDescriptor descriptor) {
  this.fs = fs;
  this.path = path;
  this.descriptor = descriptor;
  this.state = ReaderWriterState.NEW;
  // set up the configuration from the descriptor properties
  this.conf = new Configuration(fs.getConf());
  for (String prop : descriptor.listProperties()) {
    conf.set(prop, descriptor.getProperty(prop));
  }
  // NOTE(review): FAKE_ID is presumably a placeholder attempt id since this
  // reader runs outside a real MR task — confirm where FAKE_ID is declared.
  this.attemptContext = Hadoop.TaskAttemptContext.ctor.newInstance(conf, FAKE_ID);
}
/**
 * Checks that the {@code existing} {@link DatasetDescriptor} can be replaced
 * by {@code updated}.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param updated a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkUpdate(DatasetDescriptor existing,
    DatasetDescriptor updated) {
  // a dataset's location is fixed at creation and may never change
  checkNotChanged("location", existing.getLocation(), updated.getLocation());
  // delegates schema/format/strategy compatibility rules
  checkCompatible(existing, updated);
}
/** * Returns whether the value of the descriptor property is {@code true}. * * @param property a String property name * @param descriptor a {@link DatasetDescriptor} * @return {@code true} if set and "true", {@code false} otherwise. */ public static boolean isEnabled(String property, DatasetDescriptor descriptor) { if (descriptor.hasProperty(property)) { // return true if and only if the property value is "true" return Boolean.valueOf(descriptor.getProperty(property)); } return false; }