/** * Returns whether the value of the descriptor property is {@code true}. * * @param property a String property name * @param descriptor a {@link DatasetDescriptor} * @return {@code true} if set and "true", {@code false} otherwise. */ public static boolean isEnabled(String property, DatasetDescriptor descriptor) { if (descriptor.hasProperty(property)) { // return true if and only if the property value is "true" return Boolean.valueOf(descriptor.getProperty(property)); } return false; }
/** * Returns whether the value of the descriptor property is {@code false}. * * @param property a String property name * @param descriptor a {@link DatasetDescriptor} * @return {@code true} if set and "false", {@code false} otherwise. */ public static boolean isDisabled(String property, DatasetDescriptor descriptor) { if (descriptor.hasProperty(property)) { // return true if and only if the property value is "false" return !Boolean.valueOf(descriptor.getProperty(property)); } return false; }
/**
 * Applies descriptor-level HBase settings to the given column family.
 * <p>
 * Currently this only sets the replication scope from
 * {@code REPLICATION_ID_PROP}, when that property is present.
 *
 * @param column the {@link HColumnDescriptor} to configure
 * @param descriptor a {@link DatasetDescriptor} that may carry HBase properties
 * @return the same {@code column}, for call chaining
 * @throws IllegalArgumentException if the replication scope value is not an int
 */
private HColumnDescriptor configure(HColumnDescriptor column,
    DatasetDescriptor descriptor) {
  if (descriptor.hasProperty(REPLICATION_ID_PROP)) {
    String value = descriptor.getProperty(REPLICATION_ID_PROP);
    try {
      // parseInt avoids the boxing of Integer.valueOf; setScope takes an int
      column.setScope(Integer.parseInt(value));
    } catch (NumberFormatException e) {
      throw new IllegalArgumentException(
          "Invalid replication scope: " + value, e);
    }
  }
  return column;
}
/**
 * Applies descriptor-level HBase settings to the given column family.
 * <p>
 * Currently this only sets the replication scope from
 * {@code REPLICATION_ID_PROP}, when that property is present.
 *
 * @param column the {@link HColumnDescriptor} to configure
 * @param descriptor a {@link DatasetDescriptor} that may carry HBase properties
 * @return the same {@code column}, for call chaining
 * @throws IllegalArgumentException if the replication scope value is not an int
 */
private HColumnDescriptor configure(HColumnDescriptor column,
    DatasetDescriptor descriptor) {
  if (descriptor.hasProperty(REPLICATION_ID_PROP)) {
    String value = descriptor.getProperty(REPLICATION_ID_PROP);
    try {
      // parseInt avoids the boxing of Integer.valueOf; setScope takes an int
      column.setScope(Integer.parseInt(value));
    } catch (NumberFormatException e) {
      throw new IllegalArgumentException(
          "Invalid replication scope: " + value, e);
    }
  }
  return column;
}
/** * Returns the value of the property parsed as a long, or the default value. * <p> * If the value cannot be parsed as a long, this will return the default * value. * * @param prop a String property name * @param descriptor a {@link DatasetDescriptor} * @param defaultValue default value if prop is not present or is invalid * @return the value of prop parsed as a long or the default value */ public static long getLong(String prop, DatasetDescriptor descriptor, long defaultValue) { if (descriptor.hasProperty(prop)) { String asString = descriptor.getProperty(prop); try { return Long.parseLong(asString); } catch (NumberFormatException e) { // return the default value } } return defaultValue; }
@Test public void testSingleAvroFile() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a single Avro file Path parent = new Path(folder.toURI()); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri().getPath(), parent(descriptor.getLocation()).getPath()); Assert.assertTrue("Should be a .avro file", descriptor.getLocation().toString().endsWith(".avro")); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertFalse("Should not be partitioned", descriptor.isPartitioned()); }
@Test
public void testSingleParquetFile() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create a single Parquet file (comment previously said "Avro")
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri().getPath(), parent(descriptor.getLocation()).getPath());
  Assert.assertTrue("Should be a .parquet file",
      descriptor.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned",
      descriptor.isPartitioned());
}
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files in parent; multiple files should make the dataset
  // location the containing directory rather than a single file
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  // location is the parent directory itself, not a .avro file
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned",
      descriptor.isPartitioned());
}
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files in parent (comment previously said
  // "a single Avro file"); multiple files should make the dataset location
  // the containing directory rather than a single file
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, parent);
  createParquetEventFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  // location is the parent directory itself, not a .parquet file
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertFalse("Should not be partitioned",
      descriptor.isPartitioned());
}
users1.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", new Path(parent, "part=1").toUri(), users2.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", new Path(parent, "part=2").toUri(),
users.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", new Path(parent, "part=1").toUri(), events.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", new Path(parent, "part=2").toUri(),
@Test
public void testMultipleAvroFilesInSeparateFolders() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files under separate folders; note one directory uses
  // the key=value form ("part=1") and the other does not ("2") —
  // presumably this exercises mixed partition-name detection; the expected
  // strategy below still uses the "part" name — TODO confirm intent
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, new Path(parent, "2"));

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  // expected: a single provided partition named "part" typed int
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
@Test public void testMultipleParquetFilesInSeparateFolders() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createParquetEventFile(fs, new Path(parent, "part")); createParquetEventFile(fs, new Path(parent, "2")); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("partition_1", "string") .build(); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", EVENT_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Parquet format", Formats.PARQUET, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
users.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", new Path(parent, "part=1").toUri(), events.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", new Path(parent, "part=2").toUri(),
@Test public void testPartitionedDatasetWithEscapedChars() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .partitionStrategy(new PartitionStrategy.Builder() .provided("s") .build()) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset.with("s", "test/-0")); writeUserToView(dataset.with("s", "test/-0")); Path datasetPath = new Path(folder.toURI()); Path partitionPath = new Path(datasetPath, "s=test%2F-0"); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Location should be at the partition directory", URI.create(partitionPath.toString()), actual.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat()); Assert.assertFalse("Should not be partitioned", actual.isPartitioned()); }
@Test
public void testCustomProperties() {
  final String propName = "my.custom.property";
  final String propValue = "string";
  DatasetDescriptor descriptorWithProp =
      new DatasetDescriptor.Builder(testDescriptor)
          .property(propName, propValue)
          .build();

  // the property should survive create()
  DatasetDescriptor created = provider
      .create(NAMESPACE, NAME, descriptorWithProp);
  Assert.assertTrue("Should have custom property",
      created.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, created.getProperty(propName));
  Assert.assertTrue("List should contain property name",
      created.listProperties().contains(propName));

  // the property should also survive a load round-trip
  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);
  Assert.assertTrue("Should have custom property",
      loaded.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, loaded.getProperty(propName));
  // BUG FIX: this previously re-checked `created.listProperties()`,
  // which defeats the round-trip verification; check `loaded` instead
  Assert.assertTrue("List should contain property name",
      loaded.listProperties().contains(propName));
}
@Test public void testPartitionedDataset() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .partitionStrategy(new PartitionStrategy.Builder() .hash("id", 4) .build()) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset); writeUserToView(dataset); Path datasetPath = new Path(folder.toURI()); Path partitionPath = new Path(datasetPath, "id_hash=1"); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Location should be at the partition directory", partitionPath.toUri(), actual.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat()); Assert.assertFalse("Should not be partitioned", actual.isPartitioned()); }
avro.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), parquet.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(),