/**
 * Returns the scratch dataset for this task attempt, creating it from the
 * job dataset's descriptor if it does not already exist.
 *
 * @param taskContext the task attempt context used to locate the repository
 * @return the existing or newly created task attempt {@link Dataset}
 */
private static <E> Dataset<E> loadOrCreateTaskAttemptDataset(TaskAttemptContext taskContext) {
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  DatasetRepository repo = getDatasetRepository(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    return repo.load(TEMP_NAMESPACE, taskAttemptDatasetName);
  } else {
    // Load the job dataset only when its descriptor is actually needed to
    // create the attempt dataset; the original loaded it unconditionally.
    Dataset<E> jobDataset = loadJobDataset(taskContext);
    return repo.create(TEMP_NAMESPACE, taskAttemptDatasetName,
        copy(jobDataset.getDescriptor()));
  }
}
/**
 * Commits a task attempt by merging its scratch dataset into the job
 * dataset, then deleting the scratch dataset unless the repository is a
 * {@link TemporaryDatasetRepository} (which is removed wholesale at job
 * cleanup). A missing scratch dataset means the attempt wrote nothing, so
 * the commit is a no-op.
 *
 * @param taskContext the context of the task attempt being committed
 * @throws IOException if committing fails
 */
@Override
@SuppressWarnings("unchecked")
public void commitTask(TaskAttemptContext taskContext) throws IOException {
  DatasetRepository repo = getDatasetRepository(taskContext);
  boolean inTempRepo = repo instanceof TemporaryDatasetRepository;
  // Use the shared helper rather than re-implementing the load inline, for
  // consistency with the other job-lifecycle methods in this class.
  Dataset<E> jobDataset = loadJobDataset(taskContext);
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    Dataset<E> taskAttemptDataset = repo.load(TEMP_NAMESPACE, taskAttemptDatasetName);
    ((Mergeable<Dataset<E>>) jobDataset).merge(taskAttemptDataset);
    if (!inTempRepo) {
      repo.delete(TEMP_NAMESPACE, taskAttemptDatasetName);
    }
  }
}
/** * List the {@link Dataset} URIs in the repository identified by the URI. * <p> * URI formats are defined by {@code Dataset} implementations. The repository * URIs you pass to this method must begin with {@code repo:}. For example, to * list the {@code Dataset} URIs for the Hive repository, provide the URI * {@code repo:hive}. * * @param uri a {@code DatasetRepository} URI * @return the URIs present in the {@code DatasetRepository} * @throws NullPointerException if {@code uri} is null * @throws IllegalArgumentException if {@code uri} is not a repository URI */ public static Collection<URI> list(URI uri) { boolean isRepo = URIBuilder.REPO_SCHEME.equals(uri.getScheme()); Preconditions.checkArgument(isRepo, "Not a repository URI: " + uri); DatasetRepository repo = Registration .open(URI.create(uri.getRawSchemeSpecificPart())); // build a URI for each dataset name URI repoUri = repo.getUri(); List<URI> datasets = Lists.newArrayList(); for (String namespace : repo.namespaces()) { for (String dataset : repo.datasets(namespace)) { datasets.add(new URIBuilder(repoUri, namespace, dataset).build()); } } return datasets; }
/**
 * Removes this task attempt's scratch dataset from the temp namespace if it
 * exists; does nothing otherwise.
 *
 * @param taskContext the task attempt whose scratch dataset is removed
 */
private static void deleteTaskAttemptDataset(TaskAttemptContext taskContext) {
  DatasetRepository repository = getDatasetRepository(taskContext);
  String attemptName = getTaskAttemptDatasetName(taskContext);
  if (!repository.exists(TEMP_NAMESPACE, attemptName)) {
    return; // nothing was created for this attempt
  }
  repository.delete(TEMP_NAMESPACE, attemptName);
}
/**
 * Deletes the dataset identified by the given {@code dataset:} URI, either
 * moving it to the trash or removing it outright.
 *
 * @param uri a {@code Dataset} URI
 * @param useTrash if {@code true}, move the dataset to trash instead of
 *                 deleting it permanently
 * @return the result of the underlying repository delete/trash call
 * @throws IllegalArgumentException if {@code uri} is not a dataset URI
 */
private static boolean deleteWithTrash(URI uri, boolean useTrash) {
  Preconditions.checkArgument(
      URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
      "Not a dataset URI: " + uri);

  Pair<DatasetRepository, Map<String, String>> match =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  DatasetRepository repository = match.first();
  Map<String, String> options = match.second();
  String namespace = options.get(URIBuilder.NAMESPACE_OPTION);
  String datasetName = options.get(URIBuilder.DATASET_NAME_OPTION);

  if (useTrash) {
    return repository.moveToTrash(namespace, datasetName);
  }
  return repository.delete(namespace, datasetName);
}
/**
 * Loads an HBase-backed dataset through {@link Datasets#load} and verifies
 * the returned instance and its descriptor match the repository's copy.
 */
@Test
public void testBasic() {
  DatasetRepository repository = DatasetRepositories.repositoryFor(repositoryUri);
  // Start from a clean slate, then create the dataset under test.
  repository.delete("default", "test");
  repository.create("default", "test", descriptor);

  URI datasetUri = URI.create("dataset:hbase:" + zk + "/test");
  RandomAccessDataset<Object> loaded = Datasets
      .<Object, RandomAccessDataset<Object>>load(datasetUri, Object.class);

  Assert.assertNotNull("Should load dataset", loaded);
  Assert.assertTrue(loaded instanceof DaoDataset);
  Assert.assertEquals("Descriptors should match",
      repository.load("default", "test").getDescriptor(),
      loaded.getDescriptor());

  repository.delete("default", "test");
}
@Test public void testUpdateFailsWithFormatChange() { Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder(testDescriptor) .format(Formats.AVRO) .build()); DatasetDescriptor changed = new DatasetDescriptor.Builder(dataset.getDescriptor()) .format(Formats.PARQUET) .build(); try { repo.update(NAMESPACE, NAME, changed); Assert.fail("Should fail due to format change"); } catch (ValidationException e) { // expected } Assert.assertEquals( Formats.AVRO, repo.load(NAMESPACE, NAME).getDescriptor().getFormat()); }
// Creating a dataset with a null namespace must fail fast with an NPE.
@Test(expected=NullPointerException.class)
public void testCreateNullNamespace() {
  repo.create(null, NAME, testDescriptor);
}
/**
 * Loads the job-scoped dataset for the given job from the temp namespace.
 *
 * @param jobContext the job whose dataset should be loaded
 * @return the job's {@link Dataset}
 */
private static <E> Dataset<E> loadJobDataset(JobContext jobContext) {
  return getDatasetRepository(jobContext)
      .load(TEMP_NAMESPACE, getJobDatasetName(jobContext));
}
/**
 * Verifies that {@code exists} reflects the full create/delete lifecycle of
 * a dataset.
 */
@Test
public void testExists() {
  // Not there before creation...
  Assert.assertFalse(repo.exists(NAMESPACE, "test1"));

  DatasetDescriptor minimal =
      new DatasetDescriptor.Builder().schema(testSchema).build();
  repo.create(NAMESPACE, "test1", minimal);

  // ...there after creation...
  Assert.assertTrue(repo.exists(NAMESPACE, "test1"));

  repo.delete(NAMESPACE, "test1");

  // ...and gone again after deletion.
  Assert.assertFalse(repo.exists(NAMESPACE, "test1"));
}
/**
 * Deletes the job-scoped dataset for the given job from the temp namespace.
 *
 * @param jobContext the job whose dataset should be removed
 */
private static void deleteJobDataset(JobContext jobContext) {
  getDatasetRepository(jobContext)
      .delete(TEMP_NAMESPACE, getJobDatasetName(jobContext));
}
@Test public void testUpdateSuccessfulWithCompatibleSchemaChangeFieldRemoved() { Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder() .schema(testSchema).build()); writeTestUsers(dataset, 5, 0, "email"); checkTestUsers(dataset, 5, "email"); Schema testSchemaV2 = SchemaBuilder.record("user").fields() .requiredString("username") .endRecord(); Dataset<Record> datasetV2 = repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(dataset.getDescriptor()) .schema(testSchemaV2) .build()); Assert.assertEquals("Dataset schema is updated", testSchemaV2, datasetV2 .getDescriptor().getSchema()); // test that the old records can be read back with the new schema checkTestUsers(datasetV2, 5, new String[0]); // write more users and test that the mixed set can be read back with the new schema writeTestUsers(datasetV2, 5, 5, new String[0]); checkTestUsers(datasetV2, 10, new String[0]); }
/**
 * Verifies that {@code datasets(namespace)} tracks creations and deletions:
 * the listing grows as datasets are created and shrinks as they are deleted,
 * ending empty.
 */
@Test
public void testListDatasets() {
  assertListedDatasets();
  repo.create(NAMESPACE, "test1", testDescriptor);
  assertListedDatasets("test1");
  repo.create(NAMESPACE, "test2", testDescriptor);
  assertListedDatasets("test1", "test2");
  repo.create(NAMESPACE, "test3", testDescriptor);
  assertListedDatasets("test1", "test2", "test3");
  repo.delete(NAMESPACE, "test2");
  assertListedDatasets("test1", "test3");
  repo.delete(NAMESPACE, "test3");
  assertListedDatasets("test1");
  repo.delete(NAMESPACE, "test1");
  assertListedDatasets();
}

/** Asserts that the repository lists exactly the given dataset names. */
private void assertListedDatasets(String... names) {
  Assert.assertEquals(ImmutableMultiset.copyOf(names),
      ImmutableMultiset.copyOf(repo.datasets(NAMESPACE)));
}
/**
 * Deleting a dataset must also remove its backing data path from the
 * filesystem.
 */
@Test
public void testDeleteRemovesDatasetPath() throws IOException {
  ensureCreated();

  Path dataLocation =
      new Path(repo.load(NAMESPACE, NAME).getDescriptor().getLocation());
  Assert.assertTrue(fileSystem.exists(dataLocation));

  repo.delete(NAMESPACE, NAME);

  // The backing directory should be gone along with the metadata.
  Assert.assertFalse(fileSystem.exists(dataLocation));
}
Map<String, String> uriOptions = pair.second(); return (D) repo.update( uriOptions.get(URIBuilder.NAMESPACE_OPTION), uriOptions.get(URIBuilder.DATASET_NAME_OPTION), descriptor, type);
@Test public void testCreate() { Assert.assertFalse("Sanity check", testProvider.exists(NAMESPACE, NAME)); Dataset dataset = repo.create(NAMESPACE, NAME, testDescriptor); Assert.assertNotNull("Dataset should be returned", dataset); Assert.assertTrue("Dataset should exist", repo.exists(NAMESPACE, NAME)); DatasetDescriptor saved = testProvider.load(NAMESPACE, NAME); Assert.assertNotNull("Dataset metadata is stored under name", saved); Assert.assertEquals("Saved metadata is returned", saved, dataset.getDescriptor()); // TODO: Add test for namespace accessor Assert.assertEquals("Dataset name is propagated", NAME, dataset.getName()); Assert.assertEquals("Dataset schema is propagated", testDescriptor.getSchema(), saved.getSchema()); Assert.assertNotNull("Dataset should have a URI location", saved.getLocation()); Assert.assertNotNull("Dataset location should have a scheme", saved.getLocation().getScheme()); }
/**
 * Builds the legacy (pre-namespace) repository URI for a dataset URI by
 * appending the namespace to the storage path for hdfs, file, and hive
 * repositories; other repository schemes are returned unchanged.
 *
 * @param datasetUri the dataset URI used to resolve the repository
 * @param namespace the namespace to append to the storage path
 * @return the legacy repository URI
 * @throws DatasetException if the rewritten URI cannot be constructed
 */
@VisibleForTesting
URI getLegacyRepoUri(URI datasetUri, String namespace) {
  URI repoUri = DatasetRepositories.repositoryFor(datasetUri).getUri();
  URI inner = URI.create(repoUri.getSchemeSpecificPart());
  String storageScheme = inner.getScheme();

  // Only filesystem- and hive-backed repositories use path-based layouts
  // that need the namespace appended for the legacy form.
  if (!Sets.newHashSet("hdfs", "file", "hive").contains(storageScheme)) {
    return repoUri;
  }

  try {
    URI withNamespace = new URI(inner.getScheme(), inner.getUserInfo(),
        inner.getHost(), inner.getPort(),
        inner.getPath() + "/" + namespace,
        inner.getQuery(), inner.getFragment());
    return URI.create("repo:" + withNamespace.toString());
  } catch (URISyntaxException ex) {
    throw new DatasetException("Error generating legacy URI", ex);
  }
}
/**
 * Check whether a {@link Dataset} identified by the given URI exists.
 * <p>
 * URIs must begin with {@code dataset:}. The remainder of
 * the URI is implementation specific, depending on the dataset scheme.
 *
 * @param uri a {@code Dataset} URI
 * @return {@code true} if the dataset exists, {@code false} otherwise
 * @throws NullPointerException if {@code uri} is null
 * @throws IllegalArgumentException if {@code uri} is not a dataset URI
 */
public static boolean exists(URI uri) {
  Preconditions.checkArgument(
      URIBuilder.DATASET_SCHEME.equals(uri.getScheme()),
      "Not a dataset URI: " + uri);

  Pair<DatasetRepository, Map<String, String>> match =
      Registration.lookupDatasetUri(URI.create(uri.getRawSchemeSpecificPart()));
  Map<String, String> options = match.second();

  return match.first().exists(
      options.get(URIBuilder.NAMESPACE_OPTION),
      options.get(URIBuilder.DATASET_NAME_OPTION));
}