@Override
public int run(String[] args) throws Exception {
  // Load the users dataset
  Dataset<Record> users = Datasets.load(
      "dataset:hdfs:/tmp/data/users", Record.class);

  // Get a reader for the view of users whose favorite color is green
  DatasetReader<Record> reader = null;
  try {
    reader = users.with("favoriteColor", "green").newReader();
    for (GenericRecord user : reader) {
      System.out.println(user);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
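// Sketch of an equivalent read loop using try-with-resources (Java 7+),
// assuming a Kite release in which DatasetReader implements
// java.io.Closeable; the explicit null check and finally block then
// become unnecessary. The variable name is illustrative.
try (DatasetReader<Record> greenUsers =
    users.with("favoriteColor", "green").newReader()) {
  for (GenericRecord user : greenUsers) {
    System.out.println(user);
  }
}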
@Test
public void testConstraintWithEncodedCharacters() {
  assertViewUriEquivalent("encoded constraints",
      "view:file:/tmp/test_name?color=a%2Fb",
      test.with("color", "a/b"));
}
@Test
public void testMultiValueConstraintWithEncodedCharacters() {
  assertViewUriEquivalent("encoded multi-value constraints",
      "view:file:/tmp/test_name?color=a%2Cb,c",
      test.with("color", "a,b", "c"));
}
@Test
public void testMixedConstraintViews() {
  assertViewUriEquivalent("id, color, and time constraints",
      "view:file:/tmp/test_name?color=,orange&id=exists()&timestamp=[0,9)",
      test.with("color", "", "orange").with("id")
          .from("timestamp", 0L).toBefore("timestamp", 9L));
}
@Test
public void testSimpleViews() {
  assertViewUriEquivalent("dataset",
      "dataset:file:/tmp/test_name", test);
  assertViewUriEquivalent("to constraint",
      "view:file:/tmp/test_name?timestamp=(,0]",
      test.to("timestamp", 0L));
  assertViewUriEquivalent("View with toBefore constraint",
      "view:file:/tmp/test_name?timestamp=(,0)",
      test.toBefore("timestamp", 0L));
  assertViewUriEquivalent("View with from constraint",
      "view:file:/tmp/test_name?timestamp=[0,)",
      test.from("timestamp", 0L));
  assertViewUriEquivalent("View with fromAfter constraint",
      "view:file:/tmp/test_name?timestamp=(0,)",
      test.fromAfter("timestamp", 0L));
  assertViewUriEquivalent("View with in(\"\") constraint",
      "view:file:/tmp/test_name?color=in()",
      test.with("color", ""));
  assertViewUriEquivalent("View with in constraint",
      "view:file:/tmp/test_name?color=orange,red",
      test.with("color", "orange", "red"));
  assertViewUriEquivalent("View with exists constraint",
      "view:file:/tmp/test_name?id=",
      test.with("id"));
}
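// The view-URI tests above assume a helper along these lines. This is a
// hypothetical sketch, not the project's actual implementation: load the
// view from the URI, check that it is equivalent to the fluently built
// view, then check that the built view round-trips through its own URI.
private static void assertViewUriEquivalent(String desc, String uri,
                                            View<Record> expected) {
  View<Record> loaded = Datasets.load(uri, Record.class);
  Assert.assertEquals("Should load equivalent view: " + desc,
      expected, loaded);
  Assert.assertEquals("Should round-trip through its URI: " + desc,
      expected, Datasets.load(loaded.getUri(), Record.class));
}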
@Test @SuppressWarnings("deprecation") public void testSignalReadyOutputView() throws Exception { Assume.assumeTrue(!Hadoop.isHadoop1()); populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); View<Record> outputView = outputDataset.with("name", "apple", "banana", "carrot"); DatasetKeyOutputFormat.configure(job).appendTo(outputView).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); Assert.assertFalse("Output dataset should not be signaled ready", ((Signalable)outputDataset).isReady()); Assert.assertTrue("Output view should be signaled ready", ((Signalable)outputView).isReady()); }
@Test
public void testPartitionedDatasetWithEscapedChars() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .provided("s")
          .build())
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset.with("s", "test/-0"));
  writeUserToView(dataset.with("s", "test/-0"));

  Path datasetPath = new Path(folder.toURI());
  Path partitionPath = new Path(datasetPath, "s=test%2F-0");

  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertFalse("Should not flag at mixed depth",
      actual.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      URI.create(partitionPath.toString()), actual.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned",
      actual.isPartitioned());
}
@Test
public void testRefineIdentity() throws Exception {
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .identity("user_id")
      .build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:standard_event.avsc")
      .partitionStrategy(strategy)
      .build();

  // Create a separate dataset to avoid conflicts with the above.
  Dataset<StandardEvent> identityDataset = repo.create(
      "ns", "test_identity", descriptor);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = identityDataset.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  assertContentEquals(Sets.newHashSet(sepEvent, novEvent),
      identityDataset.with("user_id", 0L));
}
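// The content check above relies on a helper like this hypothetical sketch:
// read every record in the view into a set and compare it to the expected
// set. The cast picks Guava's newHashSet(Iterable) overload, assuming
// DatasetReader is both an Iterator and an Iterable.
private static <E> void assertContentEquals(Set<E> expected, View<E> view)
    throws IOException {
  DatasetReader<E> reader = null;
  try {
    reader = view.newReader();
    Assert.assertEquals(expected, Sets.newHashSet((Iterable<E>) reader));
  } finally {
    Closeables.close(reader, false);
  }
}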
Assert.assertNotNull("toBefore should succeed",
    notPartitioned.toBefore("timestamp", now));
Assert.assertNotNull("with should succeed",
    notPartitioned.with("timestamp", now));