/**
 * Prepares the fixture: a qualified temporary directory, a two-field hash
 * partition strategy, and the "partitioned-users" dataset located in that
 * directory.
 */
@Before
public void setUp() throws IOException {
  fileSystem = FileSystem.get(new Configuration());

  // Qualify the temp directory against the file system.
  Path tempDir = new Path(Files.createTempDir().getAbsolutePath());
  testDirectory = fileSystem.makeQualified(tempDir);

  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", "username_part", 2)
      .hash("email", 3)
      .build();

  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .location(testDirectory)
      .partitionStrategy(partitionStrategy)
      .build();

  dataset = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(new Configuration())
      .uri(URI.create("test"))
      .descriptor(usersDescriptor)
      .type(Record.class)
      .build();
}
/**
 * Creates a repository rooted in a fresh temporary directory, registers the
 * hash-partitioned "users" dataset in it, and wraps that dataset in an
 * unconstrained view.
 */
@Before
@SuppressWarnings("unchecked")
public void setUp() throws IOException {
  conf = new Configuration();
  fileSystem = FileSystem.get(conf);
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  repo = new FileSystemDatasetRepository(
      conf,
      testDirectory,
      new EnusrePartitionPathDoesNotExistMetadataProvider(conf, testDirectory));

  partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(partitionStrategy)
      .build();

  FileSystemDataset<Object> users =
      (FileSystemDataset<Object>) repo.create("ns", "users", usersDescriptor);

  view = new FileSystemView<Object>(users, null, null, Object.class);
}
/**
 * Creates the hash-partitioned "users" dataset in a repository rooted at a
 * fresh temporary directory and opens a partitioned writer over a view of it.
 */
@Before
@SuppressWarnings("unchecked")
public void setUp() throws IOException {
  conf = new Configuration();
  fileSystem = FileSystem.get(conf);
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  repo = new FileSystemDatasetRepository(conf, testDirectory);

  PartitionStrategy byUsernameHash = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(byUsernameHash)
      .build();

  FileSystemDataset<Object> users =
      (FileSystemDataset<Object>) repo.create("ns", "users", usersDescriptor);

  FileSystemView<Object> usersView =
      new FileSystemView<Object>(users, null, null, Object.class);
  writer = PartitionedDatasetWriter.newWriter(usersView);
}
@Test public void testProvidedPartitionIntUpdate() { final PartitionStrategy provided = new PartitionStrategy.Builder() .provided("part", "int") .build(); // existing partition data can be any int value Compatibility.checkStrategyUpdate( provided, new PartitionStrategy.Builder() .hash("s", "part", 16) .build(), PROVIDED_TEST_SCHEMA); Compatibility.checkStrategyUpdate( provided, new PartitionStrategy.Builder() .identity("l", "part") .build(), PROVIDED_TEST_SCHEMA); Compatibility.checkStrategyUpdate( provided, new PartitionStrategy.Builder() .identity("s", "part") .build(), PROVIDED_TEST_SCHEMA); }
@Test public void testPartitionedDataset() throws Exception { File folder = temp.newFolder("a/b/c/d/e/dataset_name"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath()); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(USER_SCHEMA) .partitionStrategy(new PartitionStrategy.Builder() .hash("id", 4) .build()) .build(); Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor); // write two so that the descriptor uses the directory rather than a file writeUserToView(dataset); writeUserToView(dataset); Path datasetPath = new Path(folder.toURI()); Path partitionPath = new Path(datasetPath, "id_hash=1"); DatasetDescriptor actual = Iterables.getOnlyElement( FileSystemUtil.findPotentialDatasets(fs, root)); Assert.assertFalse("Should not flag at mixed depth", descriptor.hasProperty("kite.filesystem.mixed-depth")); Assert.assertEquals("Location should be at the partition directory", partitionPath.toUri(), actual.getLocation()); Assert.assertEquals("Should use user schema", USER_SCHEMA, actual.getSchema()); Assert.assertEquals("Should have Avro format", Formats.AVRO, actual.getFormat()); Assert.assertFalse("Should not be partitioned", actual.isPartitioned()); }
/**
 * Embeds a new partition strategy into a schema that already carries a
 * different embedded strategy and checks that parsing the resulting schema
 * yields the new strategy.
 */
@Test
public void testReplaceEmbeddedPartitionStrategy() {
  PartitionStrategy replacement = new PartitionStrategy.Builder()
      .hash("username", 16)
      .identity("username", "u")
      .build();

  // Schema whose embedded "partitions" are based on real_name, not username.
  String schemaJson = "{" +
      " \"type\": \"record\"," +
      " \"name\": \"User\"," +
      " \"partitions\": [" +
      " {\"type\": \"hash\", \"source\": \"real_name\", \"buckets\": 64}," +
      " {\"type\": \"identity\", \"source\": \"real_name\", \"name\": \"r\"}" +
      " ]," +
      " \"fields\": [" +
      " {\"name\": \"id\", \"type\": \"long\"}," +
      " {\"name\": \"username\", \"type\": \"string\"}," +
      " {\"name\": \"real_name\", \"type\": \"string\"}" +
      " ]" +
      "}";
  Schema original = new Schema.Parser().parse(schemaJson);

  // Precondition: the original has a strategy and it differs from the replacement.
  Assert.assertTrue(PartitionStrategyParser.hasEmbeddedStrategy(original));
  Assert.assertFalse(
      PartitionStrategyParser.parseFromSchema(original).equals(replacement));

  Schema embedded =
      PartitionStrategyParser.embedPartitionStrategy(original, replacement);

  Assert.assertTrue(PartitionStrategyParser.hasEmbeddedStrategy(embedded));
  Assert.assertEquals(replacement,
      PartitionStrategyParser.parseFromSchema(embedded));
}
}
/**
 * Runs the descriptor compatibility check on a strategy that combines
 * date, identity, hash and range partitioners over the test schema's fields;
 * the test passes when the check does not throw.
 */
@Test
public void testAllowedPartitionSchemaCombinations() {
  PartitionStrategy everyPartitioner = new PartitionStrategy.Builder()
      // time-based partitioners
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .hour("timestamp")
      .minute("timestamp")
      // identity partitioners
      .identity("message", "message_copy")
      .identity("timestamp", "ts")
      .identity("number", "num")
      // hash partitioners
      .hash("message", 48)
      .hash("timestamp", 48)
      .hash("number", 48)
      .hash("payload", 48)
      .hash("float", 48)
      .hash("double", 48)
      .hash("bool", 48)
      // range partitioners
      .range("number", 5, 10, 15, 20)
      .range("message", "m", "z", "M", "Z")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(schema)
      .partitionStrategy(everyPartitioner)
      .build();

  Compatibility.checkDescriptor(descriptor);
}
@Before public void createTestDatasets() { Datasets.delete("dataset:file:/tmp/datasets/unpartitioned"); Datasets.delete("dataset:file:/tmp/datasets/partitioned"); Datasets.delete("dataset:file:/tmp/datasets/temporary"); DatasetDescriptor descriptor = new DatasetDescriptor.Builder() .schema(TestRecord.class) .build(); unpartitioned = Datasets.create("dataset:file:/tmp/datasets/unpartitioned", descriptor, TestRecord.class); descriptor = new DatasetDescriptor.Builder(descriptor) .property("kite.writer.cache-size", "20") .partitionStrategy(new PartitionStrategy.Builder() .hash("id", 4) .build()) .build(); partitioned = Datasets.create("dataset:file:/tmp/datasets/partitioned", descriptor, TestRecord.class); // create a second dataset with the same partitioning for replacement parts temporary = Datasets.create("dataset:file:/tmp/datasets/temporary", descriptor, TestRecord.class); writeTestRecords(unpartitioned); writeTestRecords(partitioned); writeTestRecords(temporary); }
/**
 * Asserts that {@code getPartition} returns null for partition key 1 when
 * called with {@code false} as its second argument.
 */
@Test
@SuppressWarnings("deprecation")
public void testGetPartitionReturnsNullIfNoAutoCreate() throws IOException {
  PartitionStrategy byUsernameHash = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  DatasetDescriptor usersDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(format)
      .location(testDirectory)
      .partitionStrategy(byUsernameHash)
      .build();

  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("partitioned-users")
      .configuration(getConfiguration())
      .descriptor(usersDescriptor)
      .type(Record.class)
      .build();

  PartitionKey key = new PartitionKey(1);
  Assert.assertNull(ds.getPartition(key, false));
}
/**
 * Creates the partitioned dataset "test2" through the repository and checks
 * that the metadata loaded from the provider matches the returned dataset and
 * the requested schema and partition strategy.
 */
@Test
public void testCreatePartitioned() throws IOException {
  PartitionStrategy byUsernameHash =
      new PartitionStrategy.Builder().hash("username", 3).build();
  DatasetDescriptor requested = new DatasetDescriptor.Builder(testDescriptor)
      .partitionStrategy(byUsernameHash)
      .build();

  Assert.assertFalse("Sanity check", testProvider.exists(NAMESPACE, "test2"));

  Dataset dataset = repo.create(NAMESPACE, "test2", requested);

  DatasetDescriptor saved = testProvider.load(NAMESPACE, "test2");

  Assert.assertNotNull("Dataset metadata is stored under name", saved);
  Assert.assertEquals("Saved metadata is returned",
      saved, dataset.getDescriptor());
  Assert.assertEquals("Dataset name is propagated",
      "test2", dataset.getName());
  Assert.assertEquals("Dataset schema is propagated",
      requested.getSchema(), saved.getSchema());
  Assert.assertEquals("Dataset partition strategy propagated",
      requested.getPartitionStrategy(), saved.getPartitionStrategy());
}
/**
 * Embeds a partition strategy into a schema whose JSON has no "partitions"
 * entry and checks that the strategy can be parsed back from the result.
 */
@Test
public void testAddEmbeddedPartitionStrategy() {
  PartitionStrategy toEmbed = new PartitionStrategy.Builder()
      .hash("username", 16)
      .identity("username", "u")
      .build();

  String schemaJson = "{" +
      " \"type\": \"record\"," +
      " \"name\": \"User\"," +
      " \"fields\": [" +
      " {\"name\": \"id\", \"type\": \"long\"}," +
      " {\"name\": \"username\", \"type\": \"string\"}," +
      " {\"name\": \"real_name\", \"type\": \"string\"}" +
      " ]" +
      "}";
  Schema original = new Schema.Parser().parse(schemaJson);

  Schema embedded =
      PartitionStrategyParser.embedPartitionStrategy(original, toEmbed);

  Assert.assertTrue(PartitionStrategyParser.hasEmbeddedStrategy(embedded));
  Assert.assertEquals(toEmbed,
      PartitionStrategyParser.parseFromSchema(embedded));
}
/**
 * Recreates the unpartitioned and partitioned test datasets under
 * /tmp/datasets and writes the test records into both.
 */
@Before
public void createTestDatasets() {
  final String unpartitionedUri = "dataset:file:/tmp/datasets/unpartitioned";
  final String partitionedUri = "dataset:file:/tmp/datasets/partitioned";

  // Remove anything left behind by a previous run.
  Datasets.delete(unpartitionedUri);
  Datasets.delete(partitionedUri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(TestRecord.class)
      .build();
  unpartitioned = Datasets.create(unpartitionedUri, descriptor, TestRecord.class);

  PartitionStrategy byIdHash = new PartitionStrategy.Builder()
      .hash("id", 4)
      .build();
  descriptor = new DatasetDescriptor.Builder(descriptor)
      .partitionStrategy(byIdHash)
      .build();
  partitioned = Datasets.create(partitionedUri, descriptor, TestRecord.class);

  writeTestRecords(unpartitioned);
  writeTestRecords(partitioned);
}
@Test public void test() throws Exception { final PartitionStrategy p = new PartitionStrategy.Builder() .identity("month", "month_ordinal", 12) .hash("userId", 7) .build(); List<FieldPartitioner> fieldPartitioners = p.getFieldPartitioners(); Assert.assertEquals(2, fieldPartitioners.size()); FieldPartitioner fp0 = fieldPartitioners.get(0); assertEquals("month_ordinal", fp0.getName()); assertEquals(12, fp0.getCardinality()); FieldPartitioner fp1 = fieldPartitioners.get(1); assertEquals("userId_hash", fp1.getName()); assertEquals(7, fp1.getCardinality()); assertEquals(12 * 7, p.getCardinality()); // useful for writers }
/**
 * Creates the repository under test and the "test" dataset in namespace "ns",
 * partitioned by year/month/day of "timestamp" and a hash of "user_id".
 */
@Before
public void setup() throws Exception {
  conf = new Configuration();
  fs = FileSystem.get(conf);
  repo = newRepo();

  strategy = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .hash("user_id", 2)
      .build();

  testDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  testDataset = repo.create("ns", "test", testDescriptor);
}
/**
 * Attempts to build a descriptor that combines a hash partition strategy on
 * "id" with a column mapping keyed on "id"; the built descriptor is discarded.
 */
@Override
public void run() {
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA);

  PartitionStrategy byIdHash = new PartitionStrategy.Builder()
      .hash("id", 16)
      .build();
  builder = builder.partitionStrategy(byIdHash);

  ColumnMapping idKeyMapping = new ColumnMapping.Builder()
      .key("id")
      .build();
  builder.columnMapping(idKeyMapping).build();
}
}
/**
 * Attempts to build a descriptor that has a hash partition strategy on
 * "array" but no schema; the built descriptor is discarded.
 */
@Override
public void run() {
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();

  PartitionStrategy byArrayHash = new PartitionStrategy.Builder()
      .hash("array", 48)
      .build();

  builder.partitionStrategy(byArrayHash).build();
}
});
/**
 * Parses the key schema with an explicit hash strategy on "keyPart1" and
 * checks that the key field is typed INT and that the resulting strategy has
 * exactly one field partitioner.
 */
@Test
public void testOverridePartitionStrategy() {
  PartitionStrategy override = new PartitionStrategy.Builder()
      .hash("keyPart1", "keyPart1", 10)
      .build();

  AvroKeySchema keySchema = parser.parseKeySchema(entitySchema, override);

  assertEquals(
      Type.INT,
      keySchema.getAvroSchema().getField("keyPart1").schema().getType());

  int partitionerCount = Accessor.getDefault()
      .getFieldPartitioners(keySchema.getPartitionStrategy())
      .size();
  assertEquals(1, partitionerCount);
}
/**
 * Attempts to build a descriptor whose record schema only declares "field"
 * while the partition strategy hashes "array"; the built descriptor is
 * discarded.
 */
@Override
public void run() {
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder()
      .schema(SchemaBuilder.record("Record").fields()
          .requiredString("field")
          .endRecord());

  PartitionStrategy byArrayHash = new PartitionStrategy.Builder()
      .hash("array", 48)
      .build();

  builder.partitionStrategy(byArrayHash).build();
}
});
/**
 * Attempts to build a descriptor whose schema is an array of floats together
 * with a hash partition strategy on "array"; the built descriptor is
 * discarded.
 */
@Override
public void run() {
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder()
      .schema(Schema.createArray(Schema.create(Schema.Type.FLOAT)));

  PartitionStrategy byArrayHash = new PartitionStrategy.Builder()
      .hash("array", 48)
      .build();

  builder.partitionStrategy(byArrayHash).build();
}
});
/**
 * Attempts to build a descriptor with an explicitly null schema and a hash
 * partition strategy on "array"; the built descriptor is discarded.
 */
@Override
public void run() {
  // The null schema is passed first, exactly as in the chained form.
  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder()
      .schema((Schema) null);

  PartitionStrategy byArrayHash = new PartitionStrategy.Builder()
      .hash("array", 48)
      .build();

  builder.partitionStrategy(byArrayHash).build();
}
});