@Override public Void call() throws IOException { FileSystemUtil.findPotentialDatasets(fs, root); return null; } });
/**
 * Scans the (empty) temporary root directory and expects an empty result.
 */
@Test
public void testEmptyDirectory() throws IOException {
  FileSystem localFs = LocalFileSystem.getInstance();
  Path searchRoot = new Path(temp.getRoot().toURI());

  // nothing was created under the root, so nothing should be discovered
  Collection<DatasetDescriptor> noDatasets = Lists.newArrayList();

  Assert.assertEquals("Should succeed and find no datasets",
      noDatasets,
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));
}
@Test public void testSingleUnknownFile() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a single Avro file Path parent = new Path(folder.toURI()); createUnknownFile(fs, parent); Collection<DatasetDescriptor> expected = Lists.newArrayList(); Assert.assertEquals("Should succeed and find no datasets", expected, FileSystemUtil.findPotentialDatasets(fs, root)); }
@Test public void testMultipleUnknownFiles() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a single Avro file Path parent = new Path(folder.toURI()); createUnknownFile(fs, parent); createUnknownFile(fs, parent); Collection<DatasetDescriptor> expected = Lists.newArrayList(); Assert.assertEquals("Should succeed and find no datasets", expected, FileSystemUtil.findPotentialDatasets(fs, root)); }
/**
 * Creates a dataset via {@code Datasets.create} without writing any records
 * and expects the scan of the root to return no datasets.
 */
@Test
public void testEmptyDataset() throws Exception {
  File datasetDir = temp.newFolder("a/b/c/d/e/dataset_name");
  FileSystem localFs = LocalFileSystem.getInstance();
  Path searchRoot = new Path(temp.getRoot().toURI());

  DatasetDescriptor schemaOnly = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();
  URI datasetUri = URI.create("dataset:file:" + datasetDir.getAbsolutePath());

  // the dataset is created but never written to
  Datasets.create(datasetUri, schemaOnly);

  Collection<DatasetDescriptor> noDatasets = Lists.newArrayList();

  Assert.assertEquals("Should succeed and find no datasets",
      noDatasets,
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));
}
@Test public void testSingleAvroFile() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a single Avro file Path parent = new Path(folder.toURI()); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri().getPath(), parent(descriptor.getLocation()).getPath()); Assert.assertTrue("Should be a .avro file", descriptor.getLocation().toString().endsWith(".avro")); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned()); }
@Test public void testSingleParquetFile() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a single Avro file Path parent = new Path(folder.toURI()); createParquetEventFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri().getPath(), parent(descriptor.getLocation()).getPath()); Assert.assertTrue("Should be a .parquet file", descriptor.getLocation().toString().endsWith(".parquet")); Assert.assertEquals("Should use event schema", EVENT_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Parquet format", Formats.PARQUET, descriptor.getFormat()); Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned()); }
@Test public void testUnpartitionedDataset() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset); writeUserToView(dataset); DatasetDescriptor expected = dataset.getDescriptor(); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertEquals("Should succeed and find an equivalent descriptor", expected, actual); }
@Test public void testMultipleAvroFilesInOneFolder() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files in parent Path parent = new Path(folder.toURI()); createAvroUserFile(fs, parent); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned()); }
@Test public void testMultipleParquetFilesInOneFolder() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a single Avro file Path parent = new Path(folder.toURI()); createParquetEventFile(fs, parent); createParquetEventFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use event schema", EVENT_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Parquet format", Formats.PARQUET, descriptor.getFormat()); Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned()); }
@Test public void testMultipleMergeTablesAtDifferentDepths() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("part", "int") .build(); Assert.assertTrue("Should flag data at mixed depth in the directory tree", DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor)); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
@Test public void testMultipleAvroFilesInSeparateFolders() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, new Path(parent, "2")); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("part", "int") .build(); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
@Test public void testMultipleParquetFilesInSeparateFolders() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createParquetEventFile(fs, new Path(parent, "part")); createParquetEventFile(fs, new Path(parent, "2")); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("partition_1", "string") .build(); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", EVENT_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Parquet format", Formats.PARQUET, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
.findPotentialDatasets(fs, root);
.findPotentialDatasets(fs, root);
@Test public void testMultipleAvroFilesAtDifferentDepths() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("part", "int") .build(); Assert.assertTrue("Should flag data at mixed depth in the directory tree", DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor)); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
.findPotentialDatasets(fs, root);
@Test public void testPartitionedDataset() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .partitionStrategy(new PartitionStrategy.Builder() .hash("id", 4) .build()) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset); writeUserToView(dataset); Path datasetPath = new Path(folder.toURI()); Path partitionPath = new Path(datasetPath, "id_hash=1"); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Location should be at the partition directory", partitionPath.toUri(), actual.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat()); Assert.assertFalse("Should not be partitioned", actual.isPartitioned()); }
@Test public void testPartitionedDatasetWithEscapedChars() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .partitionStrategy(new PartitionStrategy.Builder() .provided("s") .build()) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset.with("s", "test/-0")); writeUserToView(dataset.with("s", "test/-0")); Path datasetPath = new Path(folder.toURI()); Path partitionPath = new Path(datasetPath, "s=test%2F-0"); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Location should be at the partition directory", URI.create(partitionPath.toString()), actual.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat()); Assert.assertFalse("Should not be partitioned", actual.isPartitioned()); }
.findPotentialDatasets(fs, root);