valuesType = fieldPartitioner.get(VALUES).asText(); builder.provided(name, valuesType);
public static PartitionStrategy strategy(FileSystem fs, Path location) throws IOException { if (!fs.exists(location)) { return null; } List<Pair<String, Class<? extends Comparable>>> pairs = visit( new GetPartitionInfo(), fs, location); if (pairs == null || pairs.isEmpty() || pairs.size() <= 1) { return null; } PartitionStrategy.Builder builder = new PartitionStrategy.Builder(); // skip the initial partition because it is the containing directory for (int i = 1; i < pairs.size(); i += 1) { Pair<String, Class<? extends Comparable>> pair = pairs.get(i); builder.provided( pair.first() == null ? "partition_" + i : pair.first(), ProvidedFieldPartitioner.valuesString(pair.second())); } return builder.build(); }
strategyBuilder.minute(fieldName); } else if ("provided".equals(partitionerType)) { strategyBuilder.provided(fieldName); } else { throw new ValidationException(
.endRecord(); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("version", "int") .build(); DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
@Test public void testProvided() { checkParser(new PartitionStrategy.Builder().provided("version").build(), "[ {\"type\": \"provided\", \"name\": \"version\"} ]"); checkParser(new PartitionStrategy.Builder().provided("version").build(), "[ {\"type\": \"provided\", \"name\": \"version\", \"values\": \"string\"} ]"); checkParser(new PartitionStrategy.Builder().provided("version", "string").build(), "[ {\"type\": \"provided\", \"name\": \"version\", \"values\": \"string\"} ]"); checkParser(new PartitionStrategy.Builder().provided("version", "int").build(), "[ {\"type\": \"provided\", \"name\": \"version\", \"values\": \"int\"} ]"); checkParser(new PartitionStrategy.Builder().provided("version", "long").build(), "[ {\"type\": \"provided\", \"name\": \"version\", \"values\": \"long\"} ]");
@Test public void testProvidedPartitionStringUpdate() { final PartitionStrategy provided = new PartitionStrategy.Builder() .provided("part", "string") .build();
@Test public void testPartitionedDatasetWithEscapedChars() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .partitionStrategy(new PartitionStrategy.Builder() .provided("s") .build()) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset.with("s", "test/-0")); writeUserToView(dataset.with("s", "test/-0")); Path datasetPath = new Path(folder.toURI()); Path partitionPath = new Path(datasetPath, "s=test%2F-0"); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Location should be at the partition directory", URI.create(partitionPath.toString()), actual.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat()); Assert.assertFalse("Should not be partitioned", actual.isPartitioned()); }
.schema(event) .partitionStrategy(new PartitionStrategy.Builder() .provided("v", "int") .year("created_at") .month("created_at")
@Test public void testProvidedPartitionLongUpdate() { final PartitionStrategy provided = new PartitionStrategy.Builder() .provided("part", "long") .build();
@Test
public void testMultipleFields() {
  // one partitioner of every supported type, in declaration order
  PartitionStrategy expected = new PartitionStrategy.Builder()
      .provided("version")
      .hash("username", 64)
      .identity("username", "u")
      .year("time")
      .month("time")
      .day("time")
      .hour("time")
      .minute("time")
      .dateFormat("time", "datetime", "yyyy_MM_dd_HHmmss")
      .build();

  // JSON representation with one entry per partitioner, same order
  String json = "[ "
      + "{\"type\": \"provided\", \"name\": \"version\"},"
      + "{\"type\": \"hash\", \"source\": \"username\", \"buckets\": 64},"
      + "{\"type\": \"identity\","
      + "\"source\": \"username\", \"name\": \"u\"},"
      + "{\"type\": \"year\", \"source\": \"time\"},"
      + "{\"type\": \"month\", \"source\": \"time\"},"
      + "{\"type\": \"day\", \"source\": \"time\"},"
      + "{\"type\": \"hour\", \"source\": \"time\"},"
      + "{\"type\": \"minute\", \"source\": \"time\"},"
      + "{\"type\": \"dateFormat\", \"source\": \"time\", "
      + "\"name\": \"datetime\", \"format\": \"yyyy_MM_dd_HHmmss\"}"
      + " ]";

  checkParser(expected, json);
}
@Test public void testMultipleAvroFilesAtDifferentDepths() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("part", "int") .build(); Assert.assertTrue("Should flag data at mixed depth in the directory tree", DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor)); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
@Test public void testProvidedPartitionIntUpdate() { final PartitionStrategy provided = new PartitionStrategy.Builder() .provided("part", "int") .build(); // existing partition data can be any int value Compatibility.checkStrategyUpdate( provided, new PartitionStrategy.Builder() .hash("s", "part", 16) .build(), PROVIDED_TEST_SCHEMA); Compatibility.checkStrategyUpdate( provided, new PartitionStrategy.Builder() .identity("l", "part") .build(), PROVIDED_TEST_SCHEMA); Compatibility.checkStrategyUpdate( provided, new PartitionStrategy.Builder() .identity("s", "part") .build(), PROVIDED_TEST_SCHEMA); }
@Test public void testMultipleParquetFilesInSeparateFolders() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createParquetEventFile(fs, new Path(parent, "part")); createParquetEventFile(fs, new Path(parent, "2")); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("partition_1", "string") .build(); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", EVENT_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Parquet format", Formats.PARQUET, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
@Test public void testMultipleAvroFilesInSeparateFolders() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, new Path(parent, "2")); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("part", "int") .build(); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
@Test public void testMultipleMergeTablesAtDifferentDepths() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a two Avro files under separate folders Path parent = new Path(folder.toURI()); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, new Path(parent, "part=1")); createAvroUserFile(fs, parent); DatasetDescriptor descriptor = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); PartitionStrategy strategy = new PartitionStrategy.Builder() .provided("part", "int") .build(); Assert.assertTrue("Should flag data at mixed depth in the directory tree", DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor)); Assert.assertEquals("Should be directly under parent", parent.toUri(), descriptor.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, descriptor.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, descriptor.getFormat()); Assert.assertEquals("Should be partitioned by part=int", strategy, descriptor.getPartitionStrategy()); }
/** * Builds a {@link PartitionStrategy} from a list of Hive partition fields. * * @param fields a List of FieldSchemas * @return a PartitionStrategy for the Hive partitions */ @VisibleForTesting static PartitionStrategy fromPartitionColumns(List<FieldSchema> fields) { PartitionStrategy.Builder builder = new PartitionStrategy.Builder(); for (FieldSchema hiveSchema : fields) { TypeInfo type = HiveSchemaConverter.parseTypeInfo(hiveSchema.getType()); // any types not in the map will be treated as Strings builder.provided(hiveSchema.getName(), PROVIDED_TYPES.get(type.getTypeName())); } return builder.build(); }
@Test
public void testProvidedPartitionNameUpdate() {
  final PartitionStrategy existing = new PartitionStrategy.Builder()
      .provided("part", "string")
      .build();

  // replacing "part" with a partitioner named "other" must be rejected
  TestHelpers.assertThrows("Should not allow changing the partition name",
      ValidationException.class, new Runnable() {
        @Override
        public void run() {
          PartitionStrategy renamed = new PartitionStrategy.Builder()
              .identity("s", "other")
              .build();
          Compatibility.checkStrategyUpdate(
              existing, renamed, PROVIDED_TEST_SCHEMA);
        }
      });
}
/**
 * Builds a {@link PartitionStrategy} from a list of Hive partition fields.
 *
 * Each Hive partition column becomes a provided partitioner whose values
 * type is looked up by the Hive type name.
 *
 * @param fields a List of FieldSchemas
 * @return a PartitionStrategy for the Hive partitions
 */
@VisibleForTesting
static PartitionStrategy fromPartitionColumns(List<FieldSchema> fields) {
  PartitionStrategy.Builder builder = new PartitionStrategy.Builder();
  for (FieldSchema hiveSchema : fields) {
    TypeInfo type = HiveSchemaConverter.parseTypeInfo(hiveSchema.getType());
    // any types not in the map will be treated as Strings
    // NOTE(review): relies on provided(name, null) defaulting the values
    // type to string — confirm against PartitionStrategy.Builder#provided
    builder.provided(hiveSchema.getName(), PROVIDED_TYPES.get(type.getTypeName()));
  }
  return builder.build();
}