@Override public FileSystemWriter<E> load(StorageKey key) throws Exception { Preconditions.checkState(view.getDataset() instanceof FileSystemDataset, "FileSystemWriters cannot create writer for " + view.getDataset()); FileSystemDataset dataset = (FileSystemDataset) view.getDataset(); Path partition = convert.fromKey(key); FileSystemWriter<E> writer = FileSystemWriter.newWriter( dataset.getFileSystem(), new Path(dataset.getDirectory(), partition), conf.getRollIntervalMillis(), conf.getTargetFileSize(), dataset.getDescriptor(), view.getAccessor().getWriteSchema()); PartitionListener listener = dataset.getPartitionListener(); if (listener != null) { listener.partitionAdded( dataset.getNamespace(), dataset.getName(), partition.toString()); } // initialize the writer after calling the listener // this lets the listener decide if and how to create the // partition directory writer.initialize(); return writer; }
/**
 * Returns the delegate input format for a single partition of a
 * file-system-backed dataset, identified by its partition directory.
 *
 * @throws UnsupportedOperationException if the dataset is not a
 *     {@code FileSystemDataset}
 * @throws DatasetException if no partition key can be derived from the
 *     given directory
 */
private InputFormat<E, Void> getDelegateInputFormatForPartition(
    Dataset<E> dataset, String partitionDir, Configuration conf) {
  if (!(dataset instanceof FileSystemDataset)) {
    throw new UnsupportedOperationException("Partitions only supported for " +
        "FileSystemDataset. Dataset: " + dataset);
  }

  FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
  LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
      dataset, partitionDir);

  // Derive the partition key from the directory path.
  PartitionKey partitionKey = fsDataset.keyFromDirectory(new Path(partitionDir));
  LOG.debug("Partition key: {}", partitionKey);

  // Guard clause: no key means the directory does not name a partition.
  if (partitionKey == null) {
    throw new DatasetException("Cannot find partition " + partitionDir);
  }

  PartitionedDataset<E> partition = fsDataset.getPartition(partitionKey, false);
  LOG.debug("Partition: {}", partition);
  return getDelegateInputFormat(partition, conf);
}
@SuppressWarnings("unchecked") private boolean setInputPaths(JobContext jobContext, Job job) throws IOException { List<Path> paths = Lists.newArrayList((Iterator) (view == null ? dataset.pathIterator() : view.pathIterator())); LOG.debug("Input paths: {}", paths); if (paths.isEmpty()) { return false; } FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()])); // the following line is needed for Hadoop 1, otherwise the paths are not set Configuration contextConf = Hadoop.JobContext .getConfiguration.invoke(jobContext); Configuration jobConf = Hadoop.JobContext .getConfiguration.invoke(job); contextConf.set("mapred.input.dir", jobConf.get("mapred.input.dir")); return true; }
/**
 * Creates a view over the given file-system dataset.
 *
 * @param dataset the backing dataset; its file system and root directory
 *     are cached on this view
 * @param listener optional listener notified of partition changes
 * @param signalManager optional manager for readiness signals
 * @param type the entity class of this view
 */
FileSystemView(FileSystemDataset<E> dataset,
               @Nullable PartitionListener listener,
               @Nullable SignalManager signalManager,
               Class<E> type) {
  super(dataset, type);
  this.listener = listener;
  this.signalManager = signalManager;
  // Cache the dataset's file system and root directory for path operations.
  this.fs = dataset.getFileSystem();
  this.root = dataset.getDirectory();
}
/**
 * Creates an input format that reads the entire given dataset.
 *
 * @param dataset the file-system dataset to read
 * @param conf the configuration to populate with format properties
 */
public FileSystemViewKeyInputFormat(FileSystemDataset<E> dataset,
    Configuration conf) {
  this.dataset = dataset;
  this.view = null;  // no restricting view: read the whole dataset
  LOG.debug("Dataset: {}", dataset);

  // Propagate the storage format, schema, and entity type into the config.
  Format storageFormat = dataset.getDescriptor().getFormat();
  setConfigProperties(conf, storageFormat, dataset.getSchema(), dataset.getType());
}
FileSystemPartitionView<TestRecord> partition0 = partitioned.getPartitionView( new Path("id_hash=0")); FileSystemPartitionView<TestRecord> temp0 = temporary.getPartitionView( new Path("id_hash=0")); new Path(partitioned.getDirectory(), "id_hash=1"), new Path(partitioned.getDirectory(), "0")); local.rename( new Path(partitioned.getDirectory(), "id_hash=2"), new Path(partitioned.getDirectory(), "hash=0")); local.rename( new Path(partitioned.getDirectory(), "id_hash=3"), new Path(partitioned.getDirectory(), "id_hash=00")); partitioned.canReplace(partition0)); Assert.assertFalse( "Should not allow replacement test with a different dataset", partitioned.canReplace(temp0)); partitioned.replace(partition0, temp0); Iterators.transform(partitioned.pathIterator(), new GetFilename())); Assert.assertEquals("Should contain the replacement files", replacementFiles, replacedFiles); Iterator<Path> dirIterator = partitioned.dirIterator(); Path onlyDirectory = dirIterator.next(); Assert.assertFalse("Should contain only one directory",
.build(); Assert.assertTrue("Dataset is partitioned", ds.getDescriptor() .isPartitioned()); Assert.assertEquals(partitionStrategy, ds.getDescriptor() .getPartitionStrategy()); checkTestUsers(ds, 10); List<Path> dirPaths = Lists.newArrayList(ds.dirIterator()); ds.getPartition(new PartitionKey(1, 2), false); List<Path> leafPaths = Lists.newArrayList(partition.dirIterator()); Assert.assertEquals(1, leafPaths.size()); final Path leafPath = leafPaths.get(0); Assert.assertTrue("dirIterator should yield absolute paths.", leafPath.isAbsolute()); Assert.assertEquals(new PartitionKey(1, 2), ds.keyFromDirectory(leafPath)); Assert.assertEquals(new PartitionKey(1), ds.keyFromDirectory(leafPath.getParent())); Assert.assertEquals(new PartitionKey(), ds.keyFromDirectory(leafPath.getParent().getParent()));
@Test public void testUnpartitionedReplace() { // recreate temporary without a partition strategy Datasets.delete("dataset:file:/tmp/datasets/temporary"); DatasetDescriptor descriptor = new DatasetDescriptor .Builder(unpartitioned.getDescriptor()) .location((URI) null) // clear the location .build(); temporary = Datasets.create("dataset:file:/tmp/datasets/temporary", descriptor, TestRecord.class); Assert.assertTrue("Should allow replacing an unpartitioned dataset", unpartitioned.canReplace(unpartitioned)); // make sure there are multiple files writeTestRecords(unpartitioned); writeTestRecords(unpartitioned); writeTestRecords(temporary); writeTestRecords(temporary); Set<String> originalFiles = Sets.newHashSet( Iterators.transform(unpartitioned.pathIterator(), new GetFilename())); Set<String> replacementFiles = Sets.newHashSet( Iterators.transform(temporary.pathIterator(), new GetFilename())); Iterators.transform(temporary.pathIterator(), new GetFilename()); Assert.assertFalse("Sanity check", originalFiles.equals(replacementFiles)); unpartitioned.replace(unpartitioned, temporary); Set<String> replacedFiles = Sets.newHashSet( Iterators.transform(unpartitioned.pathIterator(), new GetFilename())); Assert.assertEquals("Should contain the replacement files", replacementFiles, replacedFiles); }
@Override public void merge(FileSystemDataset<E> update) { DatasetDescriptor updateDescriptor = update.getDescriptor(); for (PartitionView<E> src : update.getCoveringPartitions()) { if (src instanceof FileSystemPartitionView) { URI relative = ((FileSystemPartitionView<E>) src).getRelativeLocation(); PartitionView<E> dest = relative != null ? getPartitionView(relative) : unbounded;
partitioned.canReplace(partitioned)); Assert.assertTrue( "Should not allow replacement test with a different dataset", !partitioned.canReplace(temporary)); new Path(temporary.getDirectory(), "id_hash=1"), true /* recursive */); local.delete( new Path(temporary.getDirectory(), "id_hash=3"), true /* recursive */); new Path(partitioned.getDirectory(), "id_hash=2"), true /* recursive */); Iterators.transform(partitioned.pathIterator(), new GetFilename())); Set<String> replacementFiles = Sets.newHashSet( Iterators.transform(temporary.pathIterator(), new GetFilename())); Assert.assertFalse("Sanity check", originalFiles.equals(replacementFiles)); partitioned.replace(partitioned, temporary); Iterators.transform(partitioned.pathIterator(), new GetFilename())); Assert.assertEquals("Should contain the only the replacement files", replacementFiles, replacedFiles);
@Test
public void testReplaceSinglePartition() {
  FileSystemPartitionView<TestRecord> partition0 =
      partitioned.getPartitionView(new Path("id_hash=0"));
  FileSystemPartitionView<TestRecord> temp0 =
      temporary.getPartitionView(new Path("id_hash=0"));

  Assert.assertTrue("Should allow replacing a single partition",
      partitioned.canReplace(partition0));
  Assert.assertFalse(
      "Should not allow replacement test with a different dataset",
      partitioned.canReplace(temp0));

  // snapshot the file names on both sides of the replacement
  Set<String> replacementFiles = Sets.newHashSet(
      Iterators.transform(temp0.pathIterator(), new GetFilename()));
  Set<String> originalPartitionFiles = Sets.newHashSet(
      Iterators.transform(partition0.pathIterator(), new GetFilename()));

  Assert.assertEquals("Sanity check",
      originalPartitionFiles.size(), replacementFiles.size());
  Assert.assertFalse("Sanity check",
      originalPartitionFiles.equals(replacementFiles));

  // the expected result: everything outside partition0, plus the replacement
  Set<String> expectedFiles = Sets.newHashSet(
      Iterators.transform(partitioned.pathIterator(), new GetFilename()));
  expectedFiles.removeAll(originalPartitionFiles);
  expectedFiles.addAll(replacementFiles);

  partitioned.replace(partition0, temp0);

  Set<String> replacedFiles = Sets.newHashSet(
      Iterators.transform(partitioned.pathIterator(), new GetFilename()));
  Assert.assertEquals("Should contain the replacement files",
      expectedFiles, replacedFiles);
}
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder writeTo(View<?> view) {
  if (view instanceof FileSystemDataset) {
    // Use a wildcard rather than a raw type: the entity type is unknown
    // here and only the descriptor's location is needed.
    FileSystemDataset<?> dataset = (FileSystemDataset<?>) view;
    conf.set(KITE_PARTITION_DIR,
        String.valueOf(dataset.getDescriptor().getLocation()));
  }
  withType(view.getType());
  return writeTo(view.getUri());
}
.build(); Assert.assertTrue("Dataset is partitioned", ds.getDescriptor() .isPartitioned()); Assert.assertEquals(partitionStrategy, ds.getDescriptor() .getPartitionStrategy()); for (Dataset dataset : ds.getPartitions()) { Assert.assertFalse("Partitions should not have further partitions", dataset.getDescriptor().isPartitioned());
@Test
public void testPartitionedReplace() {
  Assert.assertTrue("Should allow replacing a whole dataset",
      partitioned.canReplace(partitioned));
  // use assertFalse instead of assertTrue(..., !expr): the message says
  // "should not allow", so the negative assertion reads directly
  Assert.assertFalse(
      "Should not allow replacement test with a different dataset",
      partitioned.canReplace(temporary));

  Set<String> originalFiles = Sets.newHashSet(
      Iterators.transform(partitioned.pathIterator(), new GetFilename()));
  Set<String> replacementFiles = Sets.newHashSet(
      Iterators.transform(temporary.pathIterator(), new GetFilename()));
  Assert.assertEquals("Sanity check",
      originalFiles.size(), replacementFiles.size());
  Assert.assertFalse("Sanity check", originalFiles.equals(replacementFiles));

  partitioned.replace(partitioned, temporary);

  Set<String> replacedFiles = Sets.newHashSet(
      Iterators.transform(partitioned.pathIterator(), new GetFilename()));
  Assert.assertEquals("Should contain the replacement files",
      replacementFiles, replacedFiles);
}
@Test public void testRestrictedRead() throws IOException { FileSystemPartitionView<TestRecord> partition0 = partitioned .getPartitionView(URI.create("id_hash=0")); FileSystemPartitionView<TestRecord> partition1 = partitioned .getPartitionView(URI.create("id_hash=1")); FileSystemPartitionView<TestRecord> partition2 = partitioned .getPartitionView(URI.create("id_hash=2")); FileSystemPartitionView<TestRecord> partition3 = partitioned .getPartitionView(URI.create("id_hash=3")); local.rename( new Path(partition1.getLocation()), new Path(partitioned.getDirectory(), "0")); local.rename( new Path(partition2.getLocation()), new Path(partitioned.getDirectory(), "hash=0")); local.rename( new Path(partition3.getLocation()), new Path(partitioned.getDirectory(), "id_hash=00"));
@Override public void run() { ds.keyFromDirectory(new Path(leafPath, "extra_dir")); } });
@Test
public void testCoveringPartitions() {
  // an unpartitioned dataset is covered by one view at its root
  Iterable<PartitionView<TestRecord>> partitions = unpartitioned
      .getCoveringPartitions();
  Assert.assertEquals("Should have a single partition view at the root",
      unpartitioned.getPartitionView(URI.create(
          "file:/tmp/datasets/unpartitioned")),
      Iterables.getOnlyElement(partitions));

  partitions = partitioned.getCoveringPartitions();
  Set<PartitionView<TestRecord>> expected = Sets.newHashSet();
  expected.add(partitioned.getPartitionView(URI.create(
      "file:/tmp/datasets/partitioned/id_hash=0")));
  expected.add(partitioned.getPartitionView(new Path(
      "file:/tmp/datasets/partitioned/id_hash=1")));
  expected.add(partitioned.getPartitionView(URI.create(
      "file:/tmp/datasets/partitioned/id_hash=2")));
  expected.add(partitioned.getPartitionView(new Path(
      "file:/tmp/datasets/partitioned/id_hash=3")));
  Assert.assertEquals("Should have a partition view for each partition",
      expected, Sets.newHashSet(partitions));

  // deleting a partition removes it from the covering set
  PartitionView<TestRecord> firstPartition = partitioned.getPartitionView(
      URI.create("file:/tmp/datasets/partitioned/id_hash=0"));
  firstPartition.deleteAll();
  expected.remove(firstPartition);

  Assert.assertEquals("Should have a partition view for each partition",
      expected, Sets.newHashSet(partitions));
}
FileSystemPartitionView<E> dest = getPartitionView( ((FileSystemPartitionView<E>) src).getRelativeLocation()); new Path(unbounded.getLocation().toString()), "replace" /* data should replace to recover from a failure */ ); deleteAll(); // remove all existing files FileSystemUtil.finishMove(fileSystem, staged);
/**
 * Returns the total size in bytes of all files under this view's
 * directories.
 *
 * @throws DatasetIOException if a directory cannot be listed
 */
@Override
public long getSize() {
  long totalBytes = 0;
  Iterator<Path> directories = dirIterator();
  while (directories.hasNext()) {
    Path directory = directories.next();
    try {
      // sum the length of every file directly under this directory
      for (FileStatus status : fileSystem.listStatus(directory)) {
        totalBytes += status.getLen();
      }
    } catch (IOException e) {
      throw new DatasetIOException("Cannot find size of " + directory, e);
    }
  }
  return totalBytes;
}
/**
 * Returns the root directory of the given dataset, or {@code null} when
 * the dataset is not backed by a file system.
 */
public Path getDirectory(Dataset<?> dataset) {
  // guard clause: only file-system datasets have a directory
  if (!(dataset instanceof FileSystemDataset)) {
    return null;
  }
  return ((FileSystemDataset<?>) dataset).getDirectory();
}