DatasetDescriptor.Builder descriptorBuilder =
    new DatasetDescriptor.Builder(descriptor);
configureSchema(descriptorBuilder, avroSchemaFile, avroSchemaReflectClass);

// prefer a local column-mapping file; fall back to the classpath copy
if (columnDescriptor.exists()) {
  descriptorBuilder.columnMapping(columnDescriptor);
} else {
  descriptorBuilder.columnMapping(
      Resources.getResource(columnDescriptorFile).openStream());
}

// apply the update through the dataset URI when one was given, otherwise
// directly against the repository (condition assumed; the fragment does
// not show how the dataset was addressed)
if (uri != null) {
  Datasets.update(uri, descriptorBuilder.build());
} else {
  repo.update(datasetNamespace, datasetName, descriptorBuilder.build());
}
private SavePolicy(Context context) {
  String uri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  Preconditions.checkArgument(uri != null, "Must set "
      + CONFIG_KITE_ERROR_DATASET_URI + " when " + CONFIG_FAILURE_POLICY
      + "=save");
  if (Datasets.exists(uri)) {
    dataset = Datasets.load(uri, AvroFlumeEvent.class);
  } else {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(uri, descriptor, AvroFlumeEvent.class);
  }
  nEventsHandled = 0;
}
/**
 * Configure the {@link Dataset}'s schema from a String URI. A schema is
 * required and can be set using one of the methods {@code schema},
 * {@code schemaLiteral}, {@code schemaUri}, or
 * {@code schemaFromAvroDataFile}.
 *
 * @param uri a String URI
 * @return An instance of the builder for method chaining.
 * @throws IOException
 *           If the schema cannot be read from the URI
 *
 * @since 0.8.0
 */
public Builder schemaUri(String uri) throws IOException {
  return schemaUri(URI.create(uri));
}
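// A minimal usage sketch of schemaUri (not from the source): this assumes
// a user.avsc Avro schema is available on the classpath, which the
// "resource:" scheme resolves against; the same pattern appears in the
// view tests below.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaUri("resource:schema/user.avsc")
    .build();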
@Test
public void testTSV() {
  final DatasetDescriptor desc = new DatasetDescriptor.Builder()
      .property("kite.csv.delimiter", "\t")
      .property("kite.csv.lines-to-skip", "1")
      .schema(STRINGS)
      .build();
  final CSVFileReader<GenericData.Record> reader =
      new CSVFileReader<GenericData.Record>(localfs, tsvFile, desc,
          // assumed final argument: an entity accessor for the STRINGS schema
          DataModelUtil.accessor(GenericData.Record.class, STRINGS));
}
@Test
public void testCreateViewWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class))
      .thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  View<GenericRecord> view = Datasets.create(datasetUri, descriptor);
  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Override
public int run(String[] args) throws Exception {
  // where the schema is stored
  URI schemaURI = URI.create("resource:simple-log.avsc");

  // create a Parquet dataset for long-term storage
  Datasets.create("dataset:file:/tmp/data/logs",
      new DatasetDescriptor.Builder()
          .format(Formats.PARQUET)
          .schemaUri(schemaURI)
          .partitionStrategy(new PartitionStrategy.Builder()
              .year("timestamp", "year")
              .month("timestamp", "month")
              .day("timestamp", "day")
              .build())
          .build(), Record.class);

  // create an Avro dataset to temporarily hold data
  Datasets.create("dataset:file:/tmp/data/logs_staging",
      new DatasetDescriptor.Builder()
          .format(Formats.AVRO)
          .schemaUri(schemaURI)
          .partitionStrategy(new PartitionStrategy.Builder()
              .day("timestamp", "day")
              .build())
          .build(), Record.class);

  return 0;
}
@Test
public void testLoadView() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<Object> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", Object.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<Object> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<Object> view =
      Datasets.<Object, RefinableView<Object>>load(datasetUri, Object.class);
  verify(repo).load("ns", "test", Object.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testCreateView() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<Object> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, Object.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<Object> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<Object> view =
      Datasets.<Object, RefinableView<Object>>create(datasetUri, descriptor,
          Object.class);
  verify(repo).create("ns", "test", descriptor, Object.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testKeyMappingSourceMustBeIdentityPartitioned() {
  // the mapping is accepted when the source field is identity partitioned
  Assert.assertNotNull(new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 16)
          .identity("id")
          .build())
      .columnMapping(new ColumnMapping.Builder()
          .key("id")
          .build())
      .build());
  TestHelpers.assertThrows("Should reject mapping source not id partitioned",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(new PartitionStrategy.Builder()
                  .hash("id", 16)
                  .build())
              .columnMapping(new ColumnMapping.Builder()
                  .key("id")
                  .build())
              .build();
        }
      });
}
@Test
public void testLoadViewWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<GenericRecord> view = Datasets.load(datasetUri);
  verify(repo).load("ns", "test", GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testCreateViewStringUriWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class))
      .thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  View<GenericRecord> view = Datasets.create(datasetUri.toString(),
      descriptor);
  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Override
public DatasetDescriptor update(String namespace, String name,
    DatasetDescriptor descriptor) {
  Compatibility.checkDatasetName(namespace, name);
  Compatibility.checkDescriptor(descriptor);

  String resolved = resolveNamespace(namespace, name);
  if (resolved != null) {
    Table table = getMetaStoreUtil().getTable(resolved, name);
    Path managerPath = new Path(new Path(table.getSd().getLocation()),
        SCHEMA_DIRECTORY);
    SchemaManager manager = SchemaManager.create(conf, managerPath);
    DatasetDescriptor newDescriptor;
    try {
      URI schemaURI = manager.writeSchema(descriptor.getSchema());
      newDescriptor = new DatasetDescriptor.Builder(descriptor)
          .schemaUri(schemaURI)
          .build();
    } catch (IOException e) {
      throw new DatasetIOException("Unable to create schema", e);
    }
    HiveUtils.updateTableSchema(table, newDescriptor);
    getMetaStoreUtil().alterTable(table);
    return descriptor;
  }

  throw new DatasetNotFoundException(
      "Hive table not found: " + namespace + "." + name);
}
@Test
public void testLoadViewStringUriWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<GenericRecord> view = Datasets.load(datasetUri.toString());
  verify(repo).load("ns", "test", GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
/**
 * Configure the dataset's column mappings from a URI.
 *
 * @param uri A URI to a column mapping JSON file
 * @return This builder for method chaining
 * @throws ValidationException
 *           If the content is not valid JSON-encoded column mappings
 * @throws java.io.IOException
 *           If accessing the URI results in an IOException
 *
 * @since 0.14.0
 */
public Builder columnMappingUri(URI uri) throws IOException {
  InputStream in = null;
  boolean threw = true;
  try {
    in = open(uri);
    columnMapping(in);
    threw = false;
  } finally {
    Closeables.close(in, threw);
  }
  return this;
}
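// A minimal usage sketch of columnMappingUri (not from the source): the
// column-mapping.json name is hypothetical and must point at a JSON
// column-mapping definition. Note that key mappings additionally require
// the source field to be identity partitioned, as the key-mapping test
// above demonstrates.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaUri("resource:schema/user.avsc")
    .columnMappingUri(URI.create("resource:column-mapping.json"))
    .build();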
@Override
public DatasetDescriptor create(String namespace, String name,
    DatasetDescriptor descriptor) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  if (exists(namespace, name)) {
    throw new DatasetExistsException(
        "Dataset already exists for name: " + name);
  }

  DatasetDescriptor newDescriptor;
  if (descriptor.getLocation() == null) {
    newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(fs.makeQualified(new Path(newLocation(name))))
        .build();
  } else {
    // don't need to modify it
    newDescriptor = descriptor;
  }

  // save and return
  if (!descriptors.containsKey(namespace)) {
    descriptors.put(namespace, Maps.<String, DatasetDescriptor>newHashMap());
  }
  Map<String, DatasetDescriptor> datasets = descriptors.get(namespace);
  datasets.put(name, newDescriptor);

  return newDescriptor;
}
@Test
@SuppressWarnings("deprecation")
public void testWriteToSubpartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2)
      .hash("email", 3)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .partitionStrategy(partitionStrategy)
          .build())
      .type(Record.class)
      .build();

  PartitionKey key = new PartitionKey(1);
  FileSystemDataset<Record> userPartition =
      (FileSystemDataset<Record>) ds.getPartition(key, true);
  Assert.assertEquals(key, userPartition.getPartitionKey());

  writeTestUsers(userPartition, 1);
  Assert.assertTrue("Partitioned directory exists",
      fileSystem.exists(new Path(testDirectory,
          "username_part=1/email_hash=2")));
  Assert.assertEquals(1, readTestUsersInPartition(ds, key, "email_hash"));
}
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.AVRO)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
@Before
public void setup() throws Exception {
  this.conf = (distributed ?
      MiniDFSTest.getConfiguration() : new Configuration());
  this.fs = FileSystem.get(conf);
  this.trashPolicy = TrashPolicy.getInstance(conf, fs, fs.getHomeDirectory());

  this.repo = newRepo();
  this.strategy = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .build();
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  repo.delete("ns", "test");
  this.unbounded = repo.create("ns", "test", testDescriptor);

  this.valueDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:value.avsc")
      .build();
  repo.delete("ns", "value");
  this.valueView = repo.create("ns", "value", valueDescriptor);
  this.testValueView = repo.load("ns", "value", TestValue.class);
}
@Before
public void createTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");
  Datasets.delete("dataset:file:/tmp/datasets/temporary");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      descriptor, TestRecord.class);

  descriptor = new DatasetDescriptor.Builder(descriptor)
      .property("kite.writer.cache-size", "20")
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      descriptor, TestRecord.class);

  // create a second dataset with the same partitioning for replacement parts
  temporary = Datasets.create("dataset:file:/tmp/datasets/temporary",
      descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
  writeTestRecords(temporary);
}