/**
 * Sets the {@link DatasetDescriptor} that describes the dataset being built.
 *
 * @param descriptor a non-null descriptor whose location is already set
 * @return this for method chaining
 * @throws NullPointerException if {@code descriptor} is null
 * @throws IllegalArgumentException if the descriptor has no location
 */
public Builder<E> descriptor(DatasetDescriptor descriptor) {
  // Check for null explicitly so callers get a clear message instead of a
  // bare NPE from the getLocation() dereference below.
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");
  Preconditions.checkArgument(descriptor.getLocation() != null,
      "Dataset location cannot be null");
  this.descriptor = descriptor;
  return this;
}
/**
 * Checks that the {@code existing} {@link DatasetDescriptor} can be replaced
 * by {@code updated}.
 *
 * <p>The location must be identical in both descriptors; a changed location
 * is rejected before the remaining compatibility checks run.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param updated a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkUpdate(DatasetDescriptor existing, DatasetDescriptor updated) {
  // Location is immutable across updates: fail fast if it differs.
  checkNotChanged("location", existing.getLocation(), updated.getLocation());
  // Delegate the remaining checks (schema, format, etc. — see checkCompatible).
  checkCompatible(existing, updated);
}
/**
 * Returns whether the dataset's storage location uses the local
 * file-system scheme.
 *
 * @param dataset the dataset to inspect
 * @return true if the dataset has a location with the local FS scheme
 */
private static boolean isLocal(Dataset<?> dataset) {
  URI datasetLocation = dataset.getDescriptor().getLocation();
  if (datasetLocation == null) {
    // A dataset with no location cannot be local.
    return false;
  }
  return LOCAL_FS_SCHEME.equals(datasetLocation.getScheme());
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder writeTo(View<?> view) {
  if (view instanceof FileSystemDataset) {
    // File-system datasets also carry their location in the configuration.
    FileSystemDataset fsDataset = (FileSystemDataset) view;
    String location = String.valueOf(fsDataset.getDescriptor().getLocation());
    conf.set(KITE_PARTITION_DIR, location);
  }
  withType(view.getType());
  return writeTo(view.getUri());
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder writeTo(View<?> view) {
  if (view instanceof FileSystemDataset) {
    // Record the dataset's location so the output format can find it.
    FileSystemDataset asFsDataset = (FileSystemDataset) view;
    conf.set(KITE_PARTITION_DIR,
        String.valueOf(asFsDataset.getDescriptor().getLocation()));
  }
  withType(view.getType());
  return writeTo(view.getUri());
}
/**
 * Builds the configured {@link FileSystemDataset}.
 *
 * <p>Requires that a namespace, name, descriptor (with a location), and
 * type have been set, plus either a Hadoop {@code Configuration} or a
 * {@code FileSystem} from which to resolve the dataset directory.
 *
 * @return a new {@code FileSystemDataset} for the configured location
 * @throws IllegalStateException if any required property is missing
 * @throws DatasetIOException if the FileSystem cannot be accessed
 */
public FileSystemDataset<E> build() {
  Preconditions.checkState(this.namespace != null, "No namespace defined");
  Preconditions.checkState(this.name != null, "No dataset name defined");
  Preconditions.checkState(this.descriptor != null,
      "No dataset descriptor defined");
  Preconditions.checkState((conf != null) || (fileSystem != null),
      "Configuration or FileSystem must be set");
  Preconditions.checkState(type != null, "No type specified");
  // Check the location explicitly so a descriptor without one fails with a
  // clear message rather than an NPE on the toString() call below.
  Preconditions.checkState(descriptor.getLocation() != null,
      "No dataset location defined");

  this.directory = new Path(descriptor.getLocation().toString());

  if (fileSystem == null) {
    try {
      this.fileSystem = directory.getFileSystem(conf);
    } catch (IOException ex) {
      throw new DatasetIOException("Cannot access FileSystem", ex);
    }
  }

  // Qualify the directory so the dataset always has an absolute location.
  Path absoluteDirectory = fileSystem.makeQualified(directory);
  return new FileSystemDataset<E>(
      fileSystem, absoluteDirectory, namespace, name, descriptor, uri,
      partitionKey, partitionListener, type);
}
}
@Test
public void testCreatePath() throws IOException {
  // Creating a dataset should materialize its data directory on disk.
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME, testDescriptor);
  URI dataLocation = dataset.getDescriptor().getLocation();
  Assert.assertNotNull(
      "FileSystemDatasetRepository should return descriptor locations",
      dataLocation);
  Assert.assertTrue("Dataset data directory:" + dataLocation + " should exist",
      fileSystem.exists(new Path(dataLocation)));
}
@Test
public void testCreateIgnoresLocation() throws IOException {
  // The provider decides locations itself; the returned descriptor
  // should not carry one.
  DatasetDescriptor result = provider.create(NAMESPACE, NAME, testDescriptor);
  Assert.assertNull("Created descriptor should not have a location",
      result.getLocation());
}
@Test
public void testDeleteRemovesMetadataFiles() throws IOException {
  testCreateMetadataFiles();

  DatasetDescriptor descriptor = provider.load(NAMESPACE, NAME);

  // Resolve the metadata layout under the dataset's named directory.
  Path datasetDirectory = new Path(descriptor.getLocation());
  Path metadataDir = new Path(datasetDirectory, ".metadata");
  Path descriptorProperties = new Path(metadataDir, "descriptor.properties");
  Path schemasDir = new Path(metadataDir, "schemas");

  boolean deleted = provider.delete(NAMESPACE, NAME);
  Assert.assertTrue(deleted);

  // Deleting metadata must remove the .metadata tree but leave the
  // dataset's data directory in place.
  Assert.assertFalse("Descriptor properties file should not exist",
      fileSystem.exists(descriptorProperties));
  Assert.assertFalse("Descriptor schema directory should not exist",
      fileSystem.exists(schemasDir));
  Assert.assertFalse("Metadata directory should not exist",
      fileSystem.exists(metadataDir));
  Assert.assertTrue("Named directory should still exist for name:" + NAME,
      fileSystem.exists(datasetDirectory));
}
@Test
public void testDeleteRemovesDatasetPath() throws IOException {
  ensureCreated();

  Dataset<Record> loaded = repo.load(NAMESPACE, NAME);
  URI location = loaded.getDescriptor().getLocation();
  Path datasetPath = new Path(location);

  // The data path exists before deletion and is gone afterwards.
  Assert.assertTrue(fileSystem.exists(datasetPath));
  repo.delete(NAMESPACE, NAME);
  Assert.assertFalse(fileSystem.exists(datasetPath));
}
@Test
public void testUpdatePreviousFormat() throws IOException {
  // Start from a repository laid out in the old on-disk format.
  useOldRepositoryFormat();
  DatasetDescriptor oldFormatDescriptor = provider.load(NAMESPACE, NAME);
  // Expected new-format schema location under <dataset>/.metadata/schemas/.
  Path namedDirectory = new Path(oldFormatDescriptor.getLocation());
  Path metadataDirectory = new Path(namedDirectory, ".metadata");
  Path schemaDirectory = new Path(metadataDirectory, "schemas");
  Path newSchemaLocation = new Path(schemaDirectory, "1.avsc");
  // Performing an update against a dataset in the old location should bring it
  // into the new location.
  DatasetDescriptor updated = new DatasetDescriptor.Builder(oldFormatDescriptor).build();
  provider.update(NAMESPACE, NAME, updated);
  // NOTE(review): this compares against oldFormatDescriptor, loaded before
  // the update — presumably intentional since the schema is unchanged, but
  // asserting on a freshly loaded descriptor would be stronger; confirm.
  Assert.assertEquals(testDescriptor.getSchema(), oldFormatDescriptor.getSchema());
  Assert.assertTrue("Schema should exist at the new location.",
      fileSystem.exists(newSchemaLocation));
}
}
@Test public void testUpdateFailsWithLocationChange() { ensureCreated(); Dataset<Record> dataset = repo.load(NAMESPACE, NAME); URI location = dataset.getDescriptor().getLocation(); DatasetDescriptor changed = new DatasetDescriptor.Builder(dataset.getDescriptor()) .location(new Path(testDirectory, "newDataLocation").toUri()) .build(); try { repo.update(NAMESPACE, NAME, changed); Assert.fail("Should fail due to data location change"); } catch (ValidationException ex) { // expected } Assert.assertEquals( location, repo.load(NAMESPACE, NAME).getDescriptor().getLocation()); }
@Test public void testMultipleAvroFilesInOneFolder() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files in parent Path parent = new Path(folder.toURI()); createAvroUserFile(fs, parent); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned()); }
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files in the same parent directory
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  // Both files share one depth, so no mixed-depth flag is expected.
  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned());
}
@Test public void testCreateWithLocation() throws URISyntaxException { Assert.assertFalse("Sanity check", provider.exists(NAMESPACE, NAME)); String auth = getDFS().getUri().getAuthority(); URI requestedLocation = new URI("hdfs://" + auth + "/tmp/data/my_data_set"); DatasetDescriptor requested = new DatasetDescriptor.Builder(testDescriptor) .location(requestedLocation) .build(); final DatasetDescriptor created; try { created = provider.create(NAMESPACE, NAME, requested); } catch (UnsupportedOperationException ex) { // this is expected if the provider doesn't support requested locations return; } // if supported, the location should be unchanged. Assert.assertNotNull("Descriptor should be returned", created); Assert.assertTrue("Descriptor should exist", provider.exists(NAMESPACE, NAME)); Assert.assertEquals("Requested locations should match", requestedLocation, created.getLocation()); }
@Test
public void testRelative() {
  DatasetRepository repository =
      DatasetRepositories.repositoryFor("repo:file:target/data");
  repository.delete("ns", "test");
  repository.create("ns", "test", descriptor);

  Dataset<Record> dataset = Datasets.<Record, Dataset<Record>>
      load("dataset:file:target/data/ns/test", Record.class);

  Assert.assertNotNull("Should load dataset", dataset);
  Assert.assertTrue(dataset instanceof FileSystemDataset);

  // A relative repo URI resolves against the current working directory.
  Path workingDir = localFS.makeQualified(new Path("."));
  Assert.assertEquals("Locations should match",
      new Path(workingDir, "target/data/ns/test").toUri(),
      dataset.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repository.load("ns", "test").getDescriptor(), dataset.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", dataset.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", dataset.getName());

  repository.delete("ns", "test");
}
@Test public void testCreate() { Assert.assertFalse("Sanity check", testProvider.exists(NAMESPACE, NAME)); Dataset dataset = repo.create(NAMESPACE, NAME, testDescriptor); Assert.assertNotNull("Dataset should be returned", dataset); Assert.assertTrue("Dataset should exist", repo.exists(NAMESPACE, NAME)); DatasetDescriptor saved = testProvider.load(NAMESPACE, NAME); Assert.assertNotNull("Dataset metadata is stored under name", saved); Assert.assertEquals("Saved metadata is returned", saved, dataset.getDescriptor()); // TODO: Add test for namespace accessor Assert.assertEquals("Dataset name is propagated", NAME, dataset.getName()); Assert.assertEquals("Dataset schema is propagated", testDescriptor.getSchema(), saved.getSchema()); Assert.assertNotNull("Dataset should have a URI location", saved.getLocation()); Assert.assertNotNull("Dataset location should have a scheme", saved.getLocation().getScheme()); }
@Test
public void testAbsolute() {
  DatasetRepository repository =
      DatasetRepositories.repositoryFor("repo:file:/tmp/data");
  repository.delete("ns", "test");
  repository.create("ns", "test", descriptor);

  Dataset<Record> dataset = Datasets.<Record, Dataset<Record>>
      load("dataset:file:/tmp/data/ns/test", Record.class);

  Assert.assertNotNull("Should load dataset", dataset);
  Assert.assertTrue(dataset instanceof FileSystemDataset);

  // An absolute file: repo URI maps directly to the dataset location.
  Assert.assertEquals("Locations should match",
      URI.create("file:/tmp/data/ns/test"),
      dataset.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repository.load("ns", "test").getDescriptor(), dataset.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", dataset.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", dataset.getName());

  repository.delete("ns", "test");
}
@Test
public void testAbsoluteTrailingSlash() {
  DatasetRepository repository = DatasetRepositories
      .repositoryFor("repo:hdfs://" + hdfsAuth + "/tmp/data/");
  repository.delete("ns", "test");
  repository.create("ns", "test", descriptor);

  // Load using a URI with a trailing slash; it should normalize away.
  Dataset<Object> dataset = Datasets.<Object, Dataset<Object>>
      load("dataset:hdfs://" + hdfsAuth + "/tmp/data/ns/test/", Object.class);

  Assert.assertNotNull("Should load dataset", dataset);
  Assert.assertTrue(dataset instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      dataset.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repository.load("ns", "test").getDescriptor(), dataset.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", dataset.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", dataset.getName());

  repository.delete("ns", "test");
}
@Test
public void testAbsolute() {
  DatasetRepository repository = DatasetRepositories
      .repositoryFor("repo:hdfs://" + hdfsAuth + "/tmp/data");
  repository.delete("ns", "test");
  repository.create("ns", "test", descriptor);

  Dataset<Object> dataset = Datasets.<Object, Dataset<Object>>
      load("dataset:hdfs://" + hdfsAuth + "/tmp/data/ns/test", Object.class);

  Assert.assertNotNull("Should load dataset", dataset);
  Assert.assertTrue(dataset instanceof FileSystemDataset);

  // The dataset's location should mirror the absolute HDFS repo URI.
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      dataset.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repository.load("ns", "test").getDescriptor(), dataset.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", dataset.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", dataset.getName());

  repository.delete("ns", "test");
}