DatasetDescriptor.Builder descriptorBuilder =
    new DatasetDescriptor.Builder(descriptor);
configureSchema(descriptorBuilder, avroSchemaFile, avroSchemaReflectClass);

// prefer a local column-mapping file; fall back to the classpath copy
if (columnDescriptor.exists()) {
  descriptorBuilder.columnMapping(columnDescriptor);
} else {
  descriptorBuilder.columnMapping(
      Resources.getResource(columnDescriptorFile).openStream());
}

// apply the update through the dataset URI when one was given, otherwise
// directly against the repository (condition assumed; the fragment does
// not show how the dataset was addressed)
if (uri != null) {
  Datasets.update(uri, descriptorBuilder.build());
} else {
  repo.update(datasetNamespace, datasetName, descriptorBuilder.build());
}
private SavePolicy(Context context) {
  String uri = context.getString(CONFIG_KITE_ERROR_DATASET_URI);
  Preconditions.checkArgument(uri != null, "Must set "
      + CONFIG_KITE_ERROR_DATASET_URI + " when " + CONFIG_FAILURE_POLICY
      + "=save");
  if (Datasets.exists(uri)) {
    dataset = Datasets.load(uri, AvroFlumeEvent.class);
  } else {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(AvroFlumeEvent.class)
        .build();
    dataset = Datasets.create(uri, descriptor, AvroFlumeEvent.class);
  }
  nEventsHandled = 0;
}
/**
 * Configure the {@link Dataset}'s schema from a String URI. A schema is
 * required and can be set using one of the methods {@code schema},
 * {@code schemaLiteral}, {@code schemaUri}, or
 * {@code schemaFromAvroDataFile}.
 *
 * @param uri a String URI
 * @return An instance of the builder for method chaining.
 * @throws IOException
 *           If the schema cannot be read from the URI
 *
 * @since 0.8.0
 */
public Builder schemaUri(String uri) throws IOException {
  return schemaUri(URI.create(uri));
}
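// A minimal usage sketch of schemaUri (not from the source): this assumes
// a user.avsc Avro schema is available on the classpath, which the
// "resource:" scheme resolves against; the same pattern appears in the
// view tests below.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaUri("resource:schema/user.avsc")
    .build();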
@Test
public void testTSV() {
  final DatasetDescriptor desc = new DatasetDescriptor.Builder()
      .property("kite.csv.delimiter", "\t")
      .property("kite.csv.lines-to-skip", "1")
      .schema(STRINGS)
      .build();
  final CSVFileReader<GenericData.Record> reader =
      new CSVFileReader<GenericData.Record>(localfs, tsvFile, desc,
          // assumed final argument: an entity accessor for the STRINGS schema
          DataModelUtil.accessor(GenericData.Record.class, STRINGS));
}
@Test
public void testCreateViewWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class))
      .thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  View<GenericRecord> view = Datasets.create(datasetUri, descriptor);
  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Override
public int run(String[] args) throws Exception {
  // where the schema is stored
  URI schemaURI = URI.create("resource:simple-log.avsc");

  // create a Parquet dataset for long-term storage
  Datasets.create("dataset:file:/tmp/data/logs",
      new DatasetDescriptor.Builder()
          .format(Formats.PARQUET)
          .schemaUri(schemaURI)
          .partitionStrategy(new PartitionStrategy.Builder()
              .year("timestamp", "year")
              .month("timestamp", "month")
              .day("timestamp", "day")
              .build())
          .build(), Record.class);

  // create an Avro dataset to temporarily hold data
  Datasets.create("dataset:file:/tmp/data/logs_staging",
      new DatasetDescriptor.Builder()
          .format(Formats.AVRO)
          .schemaUri(schemaURI)
          .partitionStrategy(new PartitionStrategy.Builder()
              .day("timestamp", "day")
              .build())
          .build(), Record.class);

  return 0;
}
@Test
public void testLoadView() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<Object> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", Object.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<Object> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<Object> view =
      Datasets.<Object, RefinableView<Object>>load(datasetUri, Object.class);
  verify(repo).load("ns", "test", Object.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testCreateView() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<Object> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, Object.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<Object> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<Object> view =
      Datasets.<Object, RefinableView<Object>>create(datasetUri, descriptor,
          Object.class);
  verify(repo).create("ns", "test", descriptor, Object.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testKeyMappingSourceMustBeIdentityPartitioned() {
  // the mapping is accepted when the source field is identity partitioned
  Assert.assertNotNull(new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 16)
          .identity("id")
          .build())
      .columnMapping(new ColumnMapping.Builder()
          .key("id")
          .build())
      .build());
  TestHelpers.assertThrows("Should reject mapping source not id partitioned",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          new DatasetDescriptor.Builder()
              .schema(USER_SCHEMA)
              .partitionStrategy(new PartitionStrategy.Builder()
                  .hash("id", 16)
                  .build())
              .columnMapping(new ColumnMapping.Builder()
                  .key("id")
                  .build())
              .build();
        }
      });
}
@Test
public void testLoadViewWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<GenericRecord> view = Datasets.load(datasetUri);
  verify(repo).load("ns", "test", GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Test
public void testCreateViewStringUriWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.create("ns", "test", descriptor, GenericRecord.class))
      .thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  View<GenericRecord> view = Datasets.create(datasetUri.toString(),
      descriptor);
  verify(repo).create("ns", "test", descriptor, GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
@Override
public DatasetDescriptor update(String namespace, String name,
    DatasetDescriptor descriptor) {
  Compatibility.checkDatasetName(namespace, name);
  Compatibility.checkDescriptor(descriptor);

  String resolved = resolveNamespace(namespace, name);
  if (resolved != null) {
    Table table = getMetaStoreUtil().getTable(resolved, name);
    Path managerPath = new Path(new Path(table.getSd().getLocation()),
        SCHEMA_DIRECTORY);
    SchemaManager manager = SchemaManager.create(conf, managerPath);
    DatasetDescriptor newDescriptor;
    try {
      URI schemaURI = manager.writeSchema(descriptor.getSchema());
      newDescriptor = new DatasetDescriptor.Builder(descriptor)
          .schemaUri(schemaURI)
          .build();
    } catch (IOException e) {
      throw new DatasetIOException("Unable to create schema", e);
    }
    HiveUtils.updateTableSchema(table, newDescriptor);
    getMetaStoreUtil().alterTable(table);
    return descriptor;
  }

  throw new DatasetNotFoundException(
      "Hive table not found: " + namespace + "." + name);
}
@Test
public void testLoadViewStringUriWithoutType() throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:schema/user.avsc")
      .build();
  Constraints constraints = new Constraints(descriptor.getSchema(), null)
      .with("username", "user1")
      .with("email", "user1@example.com");
  AbstractDataset<GenericRecord> ds = mock(AbstractDataset.class);
  when(repo.load("ns", "test", GenericRecord.class)).thenReturn(ds);
  when(ds.getDescriptor()).thenReturn(descriptor);
  AbstractRefinableView<GenericRecord> userAndEmailView =
      mock(AbstractRefinableView.class);
  when(ds.filter(constraints)).thenReturn(userAndEmailView);
  URI datasetUri = new URIBuilder(repoUri, "ns", "test")
      .with("username", "user1")
      .with("email", "user1@example.com")
      .with("ignoredOption", "abc")
      .build();
  RefinableView<GenericRecord> view = Datasets.load(datasetUri.toString());
  verify(repo).load("ns", "test", GenericRecord.class);
  verifyNoMoreInteractions(repo);
  verify(ds).getDescriptor();
  verify(ds).filter(constraints);
  verifyNoMoreInteractions(ds);
  verifyNoMoreInteractions(userAndEmailView);
  Assert.assertEquals(userAndEmailView, view);
}
/**
 * Configure the dataset's column mappings from a URI.
 *
 * @param uri A URI to a column mapping JSON file
 * @return This builder for method chaining
 * @throws ValidationException
 *           If the content is not valid JSON-encoded column mappings
 * @throws java.io.IOException
 *           If accessing the URI results in an IOException
 *
 * @since 0.14.0
 */
public Builder columnMappingUri(URI uri) throws IOException {
  InputStream in = null;
  boolean threw = true;
  try {
    in = open(uri);
    columnMapping(in);
    threw = false;
  } finally {
    Closeables.close(in, threw);
  }
  return this;
}
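// A minimal usage sketch of columnMappingUri (not from the source): the
// column-mapping.json name is hypothetical and must point at a JSON
// column-mapping definition. Note that key mappings additionally require
// the source field to be identity partitioned, as the key-mapping test
// above demonstrates.
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaUri("resource:schema/user.avsc")
    .columnMappingUri(URI.create("resource:column-mapping.json"))
    .build();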
@Override
public DatasetDescriptor create(String namespace, String name,
    DatasetDescriptor descriptor) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  if (exists(namespace, name)) {
    throw new DatasetExistsException(
        "Dataset already exists for name: " + name);
  }

  DatasetDescriptor newDescriptor;
  if (descriptor.getLocation() == null) {
    newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(fs.makeQualified(new Path(newLocation(name))))
        .build();
  } else {
    // don't need to modify it
    newDescriptor = descriptor;
  }

  // save and return
  if (!descriptors.containsKey(namespace)) {
    descriptors.put(namespace, Maps.<String, DatasetDescriptor>newHashMap());
  }
  Map<String, DatasetDescriptor> datasets = descriptors.get(namespace);
  datasets.put(name, newDescriptor);

  return newDescriptor;
}
@Test
@SuppressWarnings("deprecation")
public void testWriteToSubpartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2)
      .hash("email", 3)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .partitionStrategy(partitionStrategy)
          .build())
      .type(Record.class)
      .build();

  PartitionKey key = new PartitionKey(1);
  FileSystemDataset<Record> userPartition =
      (FileSystemDataset<Record>) ds.getPartition(key, true);
  Assert.assertEquals(key, userPartition.getPartitionKey());

  writeTestUsers(userPartition, 1);
  Assert.assertTrue("Partitioned directory exists",
      fileSystem.exists(new Path(testDirectory,
          "username_part=1/email_hash=2")));
  Assert.assertEquals(1, readTestUsersInPartition(ds, key, "email_hash"));
}
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.AVRO)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schema(USER_SCHEMA)
          .format(Formats.PARQUET)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();
  ds.merge(dsUpdate);
}
@Before
public void setup() throws Exception {
  this.conf = (distributed ?
      MiniDFSTest.getConfiguration() : new Configuration());
  this.fs = FileSystem.get(conf);
  this.trashPolicy = TrashPolicy.getInstance(conf, fs, fs.getHomeDirectory());

  this.repo = newRepo();
  this.strategy = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .build();
  this.testDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  repo.delete("ns", "test");
  this.unbounded = repo.create("ns", "test", testDescriptor);

  this.valueDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:value.avsc")
      .build();
  repo.delete("ns", "value");
  this.valueView = repo.create("ns", "value", valueDescriptor);
  this.testValueView = repo.load("ns", "value", TestValue.class);
}
@Before
public void createTestDatasets() {
  Datasets.delete("dataset:file:/tmp/datasets/unpartitioned");
  Datasets.delete("dataset:file:/tmp/datasets/partitioned");
  Datasets.delete("dataset:file:/tmp/datasets/temporary");

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned",
      descriptor, TestRecord.class);

  descriptor = new DatasetDescriptor.Builder(descriptor)
      .property("kite.writer.cache-size", "20")
      .partitionStrategy(new PartitionStrategy.Builder()
          .hash("id", 4)
          .build())
      .build();
  partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned",
      descriptor, TestRecord.class);

  // create a second dataset with the same partitioning for replacement parts
  temporary = Datasets.create("dataset:file:/tmp/datasets/temporary",
      descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
  writeTestRecords(temporary);
}