private static <E> boolean usePerTaskAttemptDatasets(View<E> target, Configuration conf) { // For performance reasons we should skip the intermediate task attempt and job output datasets if the // file system does not support efficient renaming (such as S3), and write to the target dataset directly. if (!FileSystemUtil.supportsRename(URI.create(target.getUri().getSchemeSpecificPart()), conf)) { return false; } // new API output committers are not called properly in Hadoop 1 return !Hadoop.isHadoop1() && target.getDataset() instanceof Mergeable; }
: fs.delete(absolute, true /* include any files */); deleted |= deleteParentDirectoriesIfEmpty(fs, root, absolute);
boolean deleteAllUnsafe(boolean useTrash) { boolean deleted = false; if (dataset.getDescriptor().isPartitioned()) { for (StorageKey key : partitionIterator()) { deleted = (useTrash ? FileSystemUtil.cleanlyMoveToTrash(fs, root, key.getPath()) : FileSystemUtil.cleanlyDelete(fs, root, key.getPath())) || deleted; if (listener != null) { // the relative path is the partition name, so we can simply delete it // in Hive listener.partitionDeleted(dataset.getNamespace(), dataset.getName(), key.getPath().toString()); } } } else { for (Path path : pathIterator()) { deleted = (useTrash ? FileSystemUtil.cleanlyMoveToTrash(fs, root, path) : FileSystemUtil.cleanlyDelete(fs, root, path)) || deleted; } } return deleted; }
/**
 * Finds potential datasets by crawling a directory tree.
 * <p>
 * This method looks for any data files and directories that appear to form a
 * dataset. It deliberately ignores information that may be stored in the
 * Hive metastore or .metadata folders.
 * <p>
 * Recognizes only Avro, Parquet, and JSON data files.
 *
 * @param fs a FileSystem for the root path
 * @param path a root Path that will be searched
 * @return a Collection with a DatasetDescriptor for each potential dataset.
 * @throws IOException
 */
public static Collection<DatasetDescriptor> findPotentialDatasets(
    FileSystem fs, Path path) throws IOException {
  Result result = visit(new FindDatasets(), fs, path);
  List<DatasetDescriptor> found = Lists.newArrayList();
  // A crawl yields either a single table, a group of tables, or nothing.
  if (result instanceof Result.Table) {
    found.add(descriptor(fs, (Result.Table) result));
  } else if (result instanceof Result.Group) {
    for (Result.Table table : ((Result.Group) result).tables) {
      found.add(descriptor(fs, table));
    }
  }
  return found;
}
@Override
public Void call() throws IOException {
  // Crawl the root for potential datasets; the returned descriptors are
  // discarded — only the success/failure of the crawl matters here.
  FileSystemUtil.findPotentialDatasets(fs, root);
  return null;
}
});
FileSystemUtil.ensureLocationExists(newDescriptor, conf);
private static DatasetDescriptor descriptor(FileSystem fs, Result.Table table) throws IOException { // inspect the path to determine the partition strategy PartitionStrategy strategy = strategy(fs, table.location); DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder() .format(table.format) .schema(table.schema) .partitionStrategy(strategy) .location(table.location); if (table.depth < 0) { builder.property("kite.filesystem.mixed-depth", "true"); } return builder.build(); }
@Test
public void testWriteWithOldSchema() throws IOException {
  // Records written with TEST_SCHEMA should land in files carrying the
  // (older) writer schema the writer was configured with.
  Schema writerSchema = SchemaBuilder.record("Message").fields()
      .requiredLong("id")
      .requiredString("message")
      .endRecord();

  fsWriter = newWriter(testDirectory, TEST_SCHEMA, writerSchema);
  init(fsWriter);

  for (long i = 0; i < 1000; i += 1) {
    fsWriter.write(new GenericRecordBuilder(TEST_SCHEMA)
        .set("id", i)
        .set("message", "test-" + i)
        .build());
  }
  fsWriter.close();

  FileStatus[] stats = fs.listStatus(testDirectory, PathFilters.notHidden());
  Assert.assertEquals("Should match with writer schema",
      writerSchema,
      FileSystemUtil.schema("record", fs, stats[0].getPath()));
}
static boolean cleanlyDelete(FileSystem fs, Path root, Path path) {
  // Convenience overload: delete permanently instead of moving to trash.
  final boolean useTrash = false;
  return cleanlyDelete(fs, root, path, useTrash);
}
@Test
public void testEmptyDirectory() throws IOException {
  // Crawling an empty directory must succeed and report nothing.
  FileSystem fs = LocalFileSystem.getInstance();
  Path root = new Path(temp.getRoot().toURI());
  Collection<DatasetDescriptor> none = Lists.newArrayList();
  Assert.assertEquals("Should succeed and find no datasets",
      none, FileSystemUtil.findPotentialDatasets(fs, root));
}
FileSystemUtil.ensureLocationExists(newDescriptor, conf);
static boolean cleanlyMoveToTrash(FileSystem fs, Path root, Path path) {
  // Convenience overload: move to trash rather than deleting permanently.
  final boolean useTrash = true;
  return cleanlyDelete(fs, root, path, useTrash);
}
@Test
public void testSupportsRenameConfigNotSet() {
  // Without explicit configuration, rename support defaults off for S3
  // schemes and on for file systems with a real rename (HDFS, local).
  URI s3a = URI.create("s3a://bucket/path");
  URI s3n = URI.create("s3n://bucket/path");
  URI hdfs = URI.create("hdfs://cluster/path");
  URI file = URI.create("file:///path");
  Assert.assertFalse("Should default to false for S3A",
      FileSystemUtil.supportsRename(s3a, new Configuration()));
  Assert.assertFalse("Should default to false for S3N",
      FileSystemUtil.supportsRename(s3n, new Configuration()));
  Assert.assertTrue("Should default to true for HDFS",
      FileSystemUtil.supportsRename(hdfs, new Configuration()));
  Assert.assertTrue("Should default to true for FILE",
      FileSystemUtil.supportsRename(file, new Configuration()));
}
@Test public void testSingleUnknownFile() throws Exception { File folder = temp.newFolder("a/b/c/d/e"); Path root = new Path(temp.getRoot().toURI()); FileSystem fs = LocalFileSystem.getInstance(); // create a single Avro file Path parent = new Path(folder.toURI()); createUnknownFile(fs, parent); Collection<DatasetDescriptor> expected = Lists.newArrayList(); Assert.assertEquals("Should succeed and find no datasets", expected, FileSystemUtil.findPotentialDatasets(fs, root)); }
FileSystemUtil.ensureLocationExists(newDescriptor, conf);
@Test
public void testDeleteParentDirectoriesIfEmptyCatchesFileNotFoundException() throws Exception {
  // If listing a parent throws FileNotFoundException (e.g. deleted by a
  // concurrent process), the cleanup must swallow it and report no deletion.
  FileSystem fs = mock(FileSystem.class);
  Path root = mock(Path.class);
  Path path = mock(Path.class);
  Path firstParent = mock(Path.class);

  when(path.getParent()).thenReturn(firstParent);
  when(firstParent.getParent()).thenReturn(root);
  when(fs.listStatus(firstParent)).thenThrow(FileNotFoundException.class);

  boolean pathDeleted = FileSystemUtil.deleteParentDirectoriesIfEmpty(fs, root, path);

  verify(fs).listStatus(firstParent);
  assertFalse(pathDeleted);
}
changed |= (useTrash ? FileSystemUtil.cleanlyMoveToTrash(fs, rootDirectory, dataLocation) : FileSystemUtil.cleanlyDelete(fs, rootDirectory, dataLocation)); } else { try {
// Removes the repository's backing storage directory and its contents.
public void delete() {
  FileSystemUtil.cleanlyDelete(fs, root, storage);
  LOG.debug("Deleted temporary dataset repository with storage {}.", storage);
}
}
private FileSystemWriter(FileSystem fs, Path path, long rollIntervalMillis,
                         long targetFileSize, DatasetDescriptor descriptor,
                         Schema writerSchema) {
  Preconditions.checkNotNull(fs, "File system is not defined");
  Preconditions.checkNotNull(path, "Destination directory is not defined");
  Preconditions.checkNotNull(descriptor, "Descriptor is not defined");

  this.fs = fs;
  this.directory = path;
  this.rollIntervalMillis = rollIntervalMillis;
  this.targetFileSize = targetFileSize;
  this.descriptor = descriptor;
  // Own a copy so per-dataset settings below cannot leak into the shared conf.
  this.conf = new Configuration(fs.getConf());
  this.state = ReaderWriterState.NEW;
  this.schema = writerSchema;

  // Copy file format settings from custom properties into the Configuration.
  for (String key : descriptor.listProperties()) {
    conf.set(key, descriptor.getProperty(key));
  }

  // Skip temp-file creation (and the final rename) on file systems that
  // cannot rename efficiently; write the file directly instead.
  this.useTempPath = FileSystemUtil.supportsRename(fs.getUri(), conf);
}