return Datasets.load(uri).getDataset().getDescriptor().getSchema(); } else if ("resource".equals(uri.getScheme())) { try (InputStream in = Resources.getResource(uri.getSchemeSpecificPart()).openStream()) {
Format format = descriptor.getFormat(); Preconditions.checkArgument(allowedFormats().contains(format.getName()), "Unsupported format: " + format.getName()); Schema newSchema = descriptor.getSchema(); if (datasetSchema == null || !newSchema.equals(datasetSchema)) { this.datasetSchema = descriptor.getSchema();
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  // Run the literal through a DatasetDescriptor so the partition strategy
  // embedded in the schema (if any) is parsed out alongside the Avro schema.
  DatasetDescriptor parsed = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroKeySchema(parsed.getSchema(), parsed.getPartitionStrategy());
}
try { outputStream = fs.create(schemaPath, true /* overwrite */ ); outputStream.write(descriptor.getSchema().toString(true) .getBytes(Charsets.UTF_8)); outputStream.flush(); new Path(metadataLocation, SCHEMA_DIRECTORY_NAME)); manager.writeSchema(descriptor.getSchema()); properties.setProperty(FORMAT_FIELD_NAME, descriptor.getFormat().getName()); properties.setProperty(COMPRESSION_TYPE_FIELD_NAME, descriptor.getCompressionType().getName()); final URI dataLocation = descriptor.getLocation(); if (dataLocation != null) { properties.setProperty(LOCATION_FIELD_NAME, dataLocation.toString()); if (descriptor.isPartitioned()) { properties.setProperty(PARTITION_EXPRESSION_FIELD_NAME, Accessor.getDefault().toExpression(descriptor.getPartitionStrategy())); for (String property : descriptor.listProperties()) { properties.setProperty(property, descriptor.getProperty(property));
private static Schema getEmbeddedSchema(DatasetDescriptor descriptor) { // the SchemaManager stores schemas, so this embeds the column mapping and // partition strategy in the schema. the result is parsed by // AvroKeyEntitySchemaParser Schema schema = descriptor.getSchema(); if (descriptor.isColumnMapped()) { schema = ColumnMappingParser .embedColumnMapping(schema, descriptor.getColumnMapping()); } if (descriptor.isPartitioned()) { schema = PartitionStrategyParser .embedPartitionStrategy(schema, descriptor.getPartitionStrategy()); } return schema; }
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  // create two Avro files in the same parent directory
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);
  // both files sit at the same depth, so discovery should merge them into a
  // single potential dataset rooted at their shared parent
  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned",
      descriptor.isPartitioned());
}
@Test
public void testLoad() {
  // make sure the dataset exists before attempting to load its metadata
  ensureCreated();
  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);
  Assert.assertNotNull("DatasetDescriptor should be returned", loaded);
  // the loaded descriptor must round-trip the metadata it was created with
  Assert.assertEquals("Schema should match",
      testDescriptor.getSchema(), loaded.getSchema());
  Assert.assertEquals("PartitionStrategy should match",
      testDescriptor.getPartitionStrategy(), loaded.getPartitionStrategy());
  Assert.assertEquals("Format should match",
      testDescriptor.getFormat(), loaded.getFormat());
}
/**
 * Prints a human-readable summary of {@code dataset} to the console logger:
 * URI, schema, partition strategy, column mapping, and custom properties.
 */
private static void printInfo(Logger console, Dataset<?> dataset) {
  DatasetDescriptor desc = dataset.getDescriptor();
  // strip any embedded partition strategy and column mapping so the schema
  // printed is the user-facing one, not the internal embedded form
  Schema cleaned = ColumnMappingParser.removeEmbeddedMapping(
      PartitionStrategyParser.removeEmbeddedStrategy(desc.getSchema()));
  String schema = cleaned.toString(true);
  console.info("\nDataset \"{}\":", dataset.getName());
  console.info("\tURI: \"{}\"", dataset.getUri());
  console.info("\tSchema: {}", indent(schema));
  if (desc.isPartitioned()) {
    console.info("\tPartition strategy: {}",
        indent(desc.getPartitionStrategy().toString(true)));
  } else {
    console.info("\tNot partitioned");
  }
  if (desc.isColumnMapped()) {
    console.info("\tColumn mapping: {}",
        indent(desc.getColumnMapping().toString(true)));
  }
  Collection<String> properties = desc.listProperties();
  if (!properties.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    for (String prop : properties) {
      sb.append("\n\t\t").append(prop).append("=").append(desc.getProperty(prop));
    }
    console.info("\tProperties:{}", sb.toString());
  }
}
// Applies query options from a dataset URI as constraints on the dataset and
// returns the resulting view. Datasets that are not AbstractDataset cannot be
// filtered here, so they are returned unchanged.
@SuppressWarnings("unchecked")
private static <E, V extends View<E>> V view(Dataset<E> dataset,
    Map<String, String> uriOptions) {
  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    // a null strategy signals an unpartitioned dataset to fromQueryMap
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(
        schema, strategy, uriOptions);
    return (V) ((AbstractDataset) dataset).filter(constraints);
  } else {
    // not filterable: the unchecked cast is the best we can do
    return (V) dataset;
  }
}
}
/**
 * Adds configuration for {@code DatasetKeyInputFormat} to read from the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder readFrom(View<?> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  // if this is a partitioned dataset, add the partition location
  if (view instanceof FileSystemDataset) {
    conf.set(KITE_PARTITION_DIR, String.valueOf(descriptor.getLocation()));
  }
  // add descriptor properties to the config
  for (String property : descriptor.listProperties()) {
    conf.set(property, descriptor.getProperty(property));
  }
  if (DataModelUtil.isGeneric(view.getType())) {
    Schema datasetSchema = view.getDataset().getDescriptor().getSchema();
    // only set the read schema if the view is a projection; otherwise the
    // dataset's own schema is used by default
    if (!datasetSchema.equals(view.getSchema())) {
      withSchema(view.getSchema());
    }
  } else {
    // specific/reflect types carry their schema with the class
    withType(view.getType());
  }
  conf.set(KITE_INPUT_URI, view.getUri().toString());
  return this;
}
// Updates the stored metadata for an HBase-backed dataset, migrating the
// managed schema when the embedded schema has changed. Only the default
// namespace is supported and the descriptor must carry a column mapping.
@Override
public DatasetDescriptor update(String namespace, String name,
    DatasetDescriptor descriptor) {
  Preconditions.checkArgument(DEFAULT_NAMESPACE.equals(namespace),
      "Non-default namespaces are not supported");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");
  // warns about compatibility problems rather than failing outright
  Compatibility.checkAndWarn(
      namespace,
      HBaseMetadataProvider.getTableName(name),
      descriptor.getSchema());
  Preconditions.checkArgument(descriptor.isColumnMapped(),
      "Cannot update dataset %s: missing column mapping", name);
  String tableName = getTableName(name);
  String entityName = getEntityName(name);
  // refresh the cache so the version check below sees the current schemas
  schemaManager.refreshManagedSchemaCache(tableName, entityName);
  // the schema manager stores only schemas, so the column mapping and
  // partition strategy are embedded into the schema before storage
  Schema newSchema = getEmbeddedSchema(descriptor);
  String schemaString = newSchema.toString(true);
  EntitySchema entitySchema = new AvroEntitySchema(
      newSchema, schemaString, descriptor.getColumnMapping());
  // migrate only when this exact schema version is not already registered
  if (!schemaManager.hasSchemaVersion(tableName, entityName, entitySchema)) {
    schemaManager.migrateSchema(tableName, entityName, schemaString);
  } else {
    LOG.info("Schema hasn't changed, not migrating: (" + name + ")");
  }
  return getDatasetDescriptor(newSchema, descriptor.getLocation());
}
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema) {
  // Parse the literal via a DatasetDescriptor so that any column mapping
  // embedded in the schema is extracted along with the Avro schema. The raw
  // literal itself is kept so the stored text matches the input exactly.
  DatasetDescriptor parsed = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroEntitySchema(
      parsed.getSchema(), rawSchema, parsed.getColumnMapping());
}
@Test
public void testUpdatePreviousFormat() throws IOException {
  // lay down metadata using the old on-disk repository layout
  useOldRepositoryFormat();
  DatasetDescriptor oldFormatDescriptor = provider.load(NAMESPACE, NAME);
  Path namedDirectory = new Path(oldFormatDescriptor.getLocation());
  Path metadataDirectory = new Path(namedDirectory, ".metadata");
  Path schemaDirectory = new Path(metadataDirectory, "schemas");
  Path newSchemaLocation = new Path(schemaDirectory, "1.avsc");
  // Performing an update against a dataset in the old location should bring it
  // into the new location.
  DatasetDescriptor updated =
      new DatasetDescriptor.Builder(oldFormatDescriptor).build();
  provider.update(NAMESPACE, NAME, updated);
  Assert.assertEquals(testDescriptor.getSchema(),
      oldFormatDescriptor.getSchema());
  Assert.assertTrue("Schema should exist at the new location.",
      fileSystem.exists(newSchemaLocation));
}
}
/**
 * Verifies that {@code descriptor} is partitioned and that its partition
 * strategy includes a partitioner for {@code fieldName}.
 *
 * @throws IllegalArgumentException if either check fails
 */
public static void checkPartitionedBy(DatasetDescriptor descriptor,
    String fieldName) {
  // a partition strategy only exists on partitioned descriptors
  Preconditions.checkArgument(descriptor.isPartitioned(),
      "Descriptor %s is not partitioned", descriptor);
  boolean partitionedByField = Accessor.getDefault()
      .hasPartitioner(descriptor.getPartitionStrategy(), fieldName);
  Preconditions.checkArgument(partitionedByField,
      "Descriptor %s is not partitioned by '%s'", descriptor, fieldName);
}
.build(); Assert.assertTrue("Descriptor should have partition strategy", descriptor.isPartitioned()); .column("real_name", "u", "name") .build(); Assert.assertEquals(expected, descriptor.getColumnMapping());
/**
 * Get the {@link PartitionStrategy}, if this dataset is partitioned. Calling
 * this method on a non-partitioned dataset is an error. Instead, use the
 * {@link #isPartitioned()} method prior to invocation.
 */
public PartitionStrategy getPartitionStrategy() {
  // guard clause: same IllegalStateException and message that
  // Preconditions.checkState would produce
  if (!isPartitioned()) {
    throw new IllegalStateException(
        "Attempt to retrieve the partition strategy on a non-partitioned descriptor:"
            + this);
  }
  return partitionStrategy;
}
// lazily creates one StorageKey per thread, bound to the descriptor's
// partition strategy
// NOTE(review): StorageKey appears to be mutable and reused per thread —
// confirm against callers before sharing instances across threads.
@Override
protected StorageKey initialValue() {
  return new StorageKey(descriptor.getPartitionStrategy());
}
};
// Creates a reader over the file at {@code path}, configured from the
// descriptor's properties. The reader starts in the NEW state and must be
// initialized before use.
public InputFormatReader(FileSystem fs, Path path, DatasetDescriptor descriptor) {
  this.fs = fs;
  this.path = path;
  this.descriptor = descriptor;
  this.state = ReaderWriterState.NEW;
  // set up the configuration from the descriptor properties
  this.conf = new Configuration(fs.getConf());
  for (String prop : descriptor.listProperties()) {
    conf.set(prop, descriptor.getProperty(prop));
  }
  // NOTE(review): FAKE_ID is presumably a placeholder attempt id since this
  // reader runs outside a real MR task — confirm where FAKE_ID is declared.
  this.attemptContext = Hadoop.TaskAttemptContext.ctor.newInstance(conf, FAKE_ID);
}
/**
 * Checks that the {@code existing} {@link DatasetDescriptor} can be replaced
 * by {@code updated}.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param updated a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkUpdate(DatasetDescriptor existing,
    DatasetDescriptor updated) {
  // a dataset's location is fixed at creation and may never change
  checkNotChanged("location", existing.getLocation(), updated.getLocation());
  // delegates schema/format/strategy compatibility rules
  checkCompatible(existing, updated);
}
/** * Returns whether the value of the descriptor property is {@code true}. * * @param property a String property name * @param descriptor a {@link DatasetDescriptor} * @return {@code true} if set and "true", {@code false} otherwise. */ public static boolean isEnabled(String property, DatasetDescriptor descriptor) { if (descriptor.hasProperty(property)) { // return true if and only if the property value is "true" return Boolean.valueOf(descriptor.getProperty(property)); } return false; }