.hash("username", 2).hash("email", 3).build(); final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .format(format) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build();
.getSubpartitionStrategy(partitionStrategy, partitionDepth); return new FileSystemDataset.Builder<E>() .namespace(namespace) .name(name) .fileSystem(fileSystem) .uri(uri) .descriptor(new DatasetDescriptor.Builder(descriptor) .location(partitionDirectory) .partitionStrategy(subpartitionStrategy) .build()) .type(type) .partitionKey(key) .partitionListener(partitionListener) .build();
.hash("username", 2).hash("email", 3).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .format(format) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build();
"username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_NULLABLE_SCHEMA) .format(format) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build();
"username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .format(format) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build();
"username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .format(format) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build(); FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .format(format) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build();
.hash("username", 2).build(); FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("partitioned-users") .configuration(getConfiguration()) .descriptor(new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .format(format) .partitionStrategy(partitionStrategy) .build()) .type(Record.class) .build();
PartitionStrategy subPartitionStrategy = Accessor.getDefault() .getSubpartitionStrategy(partitionStrategy, 1); Builder<E> builder = new FileSystemDataset.Builder<E>() .namespace(namespace) .name(name) .fileSystem(fileSystem) .uri(uri) .descriptor(new DatasetDescriptor.Builder(descriptor) .location(p) .partitionStrategy(subPartitionStrategy) .build()) .type(type) .partitionKey(key) .partitionListener(partitionListener); partitions.add(builder.build());
name, newDescriptor.getSchema(), newDescriptor.getLocation() }); FileSystemDataset<E> dataset = new FileSystemDataset.Builder<E>() .namespace(namespace) .name(name) .configuration(conf) .descriptor(newDescriptor) .type(type) .uri(new URIBuilder(getUri(), namespace, name).build()) .partitionKey(newDescriptor.isPartitioned() ? new PartitionKey() : null) .partitionListener(getPartitionListener()) .build();
/**
 * Merging must be rejected when the two datasets disagree on partitioning.
 *
 * <p>The target dataset hashes only {@code username}; the update dataset
 * hashes {@code username} and {@code email}. Both share the same schema and
 * location, so the partition strategy is the only difference. The test passes
 * only if a {@link ValidationException} is thrown.
 */
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentPartitionStrategies() throws IOException {
  // Target: partitioned on a single hash field.
  PartitionStrategy usernameOnly = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();
  DatasetDescriptor targetDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .location(testDirectory)
      .partitionStrategy(usernameOnly)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(targetDescriptor)
      .type(Record.class)
      .build();

  // Update: same schema and location, but partitioned on two hash fields.
  PartitionStrategy usernameAndEmail = new PartitionStrategy.Builder()
      .hash("username", 2)
      .hash("email", 3)
      .build();
  DatasetDescriptor updateDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .location(testDirectory)
      .partitionStrategy(usernameAndEmail)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(updateDescriptor)
      .type(Record.class)
      .build();

  ds.merge(dsUpdate);
}
@Test public void testReadySignalUpdatesModifiedTime() { final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>() .namespace("ns") .name("users") .configuration(getConfiguration()) .descriptor( new DatasetDescriptor.Builder().schema(USER_SCHEMA).format(format) .location(testDirectory).build()) .type(Record.class) .uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()), "ns", "name")) .build(); Assert.assertFalse("Dataset should not be ready before being signaled", ds.isReady()); // the modified time depends on the filesystem, and may only be granular to the second // signal and check until the modified time is after the current time, or until // enough time has past that the signal should have been distinguishable long signaledTime = 0; long currentTime = System.currentTimeMillis(); while(currentTime >= signaledTime && (System.currentTimeMillis() - currentTime) <= 2000) { ds.signalReady(); signaledTime = ds.getLastModified(); } Assert.assertTrue("Dataset should have been signaled as ready", ds.isReady()); Assert.assertTrue("Signal should update the modified time", signaledTime > currentTime); Assert.assertFalse("Only the dataset should have been signaled", ((Signalable)ds.with("username", "bob")).isReady()); }
/**
 * Writes test users through a first-level partition obtained with
 * {@code getPartition(key, true)} and checks that the data lands in the
 * expected nested partition directory.
 *
 * <p>The strategy hashes {@code username} (named {@code username_part}) and
 * then {@code email}; the expected directory is
 * {@code username_part=1/email_hash=2} under the test directory.
 */
@Test
@SuppressWarnings("deprecation")
public void testWriteToSubpartition() throws IOException {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2)
      .hash("email", 3)
      .build();
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .compressionType(compressionType)
      .location(testDirectory)
      .partitionStrategy(strategy)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .type(Record.class)
      .build();

  // Ask for the partition with auto-create enabled.
  PartitionKey key = new PartitionKey(1);
  FileSystemDataset<Record> userPartition =
      (FileSystemDataset<Record>) ds.getPartition(key, true);
  Assert.assertEquals(key, userPartition.getPartitionKey());

  writeTestUsers(userPartition, 1);

  Path expectedDirectory = new Path(testDirectory, "username_part=1/email_hash=2");
  Assert.assertTrue("Partitioned directory exists",
      fileSystem.exists(expectedDirectory));
  Assert.assertEquals(1, readTestUsersInPartition(ds, key, "email_hash"));
}
/**
 * Merging must be rejected when the two datasets use different storage
 * formats.
 *
 * <p>The target dataset is Avro and the update dataset is Parquet; schema and
 * location are identical. The test passes only if a
 * {@link ValidationException} is thrown.
 */
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  DatasetDescriptor avroDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.AVRO)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(avroDescriptor)
      .type(Record.class)
      .build();

  DatasetDescriptor parquetDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(parquetDescriptor)
      .type(Record.class)
      .build();

  ds.merge(dsUpdate);
}
@Override public <E> Dataset<E> update(String namespace, String name, DatasetDescriptor descriptor, Class<E> type) { Preconditions.checkNotNull(namespace, "Namespace cannot be null"); Preconditions.checkNotNull(name, "Dataset name cannot be null"); Preconditions.checkNotNull(descriptor, "Descriptor cannot be null"); DatasetDescriptor oldDescriptor = metadataProvider.load(namespace, name); // oldDescriptor is valid if load didn't throw NoSuchDatasetException Compatibility.checkUpdate(oldDescriptor, descriptor); DatasetDescriptor updatedDescriptor = metadataProvider.update(namespace, name, descriptor); LOG.debug("Updated dataset: {} schema: {} location: {}", new Object[] { name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation() }); return new FileSystemDataset.Builder<E>() .namespace(namespace) .name(name) .configuration(conf) .descriptor(updatedDescriptor) .type(type) .uri(new URIBuilder(getUri(), namespace, name).build()) .partitionKey(updatedDescriptor.isPartitioned() ? new PartitionKey() : null) .partitionListener(getPartitionListener()) .build(); }
/**
 * Merging must be rejected when the two datasets have different schemas.
 *
 * <p>The target dataset uses {@code STRING_SCHEMA} and the update dataset
 * uses {@code USER_SCHEMA}; both point at the same location. The test passes
 * only if a {@link ValidationException} is thrown.
 */
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentSchemas() throws IOException {
  DatasetDescriptor stringDescriptor = new DatasetDescriptor.Builder()
      .schema(STRING_SCHEMA)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(stringDescriptor)
      .type(Record.class)
      .build();

  DatasetDescriptor userDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(userDescriptor)
      .type(Record.class)
      .build();

  ds.merge(dsUpdate);
}
/**
 * Asking a freshly built partitioned dataset for a partition with
 * auto-create disabled ({@code getPartition(key, false)}) must return
 * {@code null}.
 */
@Test
@SuppressWarnings("deprecation")
public void testGetPartitionReturnsNullIfNoAutoCreate() throws IOException {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .partitionStrategy(strategy)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .type(Record.class)
      .build();

  PartitionKey requestedKey = new PartitionKey(1);
  Assert.assertNull(ds.getPartition(requestedKey, false));
}
/**
 * Prepares the fixture shared by the tests: a file system handle, a qualified
 * temporary directory, a two-level hash partition strategy, and a partitioned
 * dataset located in that directory.
 */
@Before
public void setUp() throws IOException {
  fileSystem = FileSystem.get(new Configuration());

  // Qualify the temp directory path against the file system obtained above.
  Path tempPath = new Path(Files.createTempDir().getAbsolutePath());
  testDirectory = fileSystem.makeQualified(tempPath);

  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2)
      .hash("email", 3)
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .location(testDirectory)
      .partitionStrategy(partitionStrategy)
      .build();

  dataset = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(new Configuration())
      .uri(URI.create("test"))
      .descriptor(descriptor)
      .type(Record.class)
      .build();
}
/**
 * Round-trip test on an unpartitioned dataset: builds the dataset from a
 * schema URI, confirms its descriptor reports no partitioning, then runs the
 * {@code writeTestUsers} and {@code checkTestUsers} helpers with the same
 * argument (10).
 */
@Test
public void testWriteAndRead() throws IOException {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri(USER_SCHEMA_URL)
      .format(format)
      .compressionType(compressionType)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("test")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .type(Record.class)
      .build();

  Assert.assertFalse("Dataset is not partitioned",
      ds.getDescriptor().isPartitioned());

  writeTestUsers(ds, 10);
  checkTestUsers(ds, 10);
}
/**
 * For an unpartitioned dataset, {@code dirIterator()} must yield exactly one
 * path, and that path must equal the dataset's test directory.
 */
@Test
public void testPathIterator_Directory() {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .compressionType(compressionType)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .type(Record.class)
      .build();

  // Collect the iterator's output so size and content can both be checked.
  List<Path> directories = Lists.newArrayList(ds.dirIterator());

  Assert.assertEquals(
      "dirIterator for non-partitioned dataset should yield a single path.",
      1, directories.size());
  Assert.assertEquals("dirIterator should yield absolute paths.",
      testDirectory, directories.get(0));
}
/**
 * A dataset with no constraints applied starts out not ready and reports
 * ready after {@code signalReady()} is called on it.
 */
@Test
public void signalReadyOnUnboundedDataset() {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .build();
  final FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(descriptor)
      .type(Record.class)
      .uri(URIBuilder.build(URI.create("repo:" + testDirectory.toUri()), "ns", "name"))
      .build();

  Assert.assertFalse("Unbounded dataset has not been signaled", ds.isReady());

  ds.signalReady();

  Assert.assertTrue("Unbounded dataset has been signaled and should be ready",
      ds.isReady());
}