List<Metadata> metadata =
    FileSystems.match(Collections.singletonList(pattern)).get(0).metadata();
for (Metadata meta : metadata) {
  outputFiles.add(new File(meta.resourceId().toString()));
}
@Override
protected List<MatchResult> match(List<String> specs) {
  ImmutableList.Builder<MatchResult> resultsBuilder = ImmutableList.builder();
  for (String spec : specs) {
    try {
      FileStatus[] fileStatuses = fileSystem.globStatus(new Path(spec));
      if (fileStatuses == null) {
        resultsBuilder.add(MatchResult.create(Status.NOT_FOUND, Collections.emptyList()));
        continue;
      }
      List<Metadata> metadata = new ArrayList<>();
      for (FileStatus fileStatus : fileStatuses) {
        if (fileStatus.isFile()) {
          URI uri = dropEmptyAuthority(fileStatus.getPath().toUri().toString());
          metadata.add(
              Metadata.builder()
                  .setResourceId(new HadoopResourceId(uri))
                  .setIsReadSeekEfficient(true)
                  .setSizeBytes(fileStatus.getLen())
                  .build());
        }
      }
      resultsBuilder.add(MatchResult.create(Status.OK, metadata));
    } catch (IOException e) {
      resultsBuilder.add(MatchResult.create(Status.ERROR, e));
    }
  }
  return resultsBuilder.build();
}
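For context, a minimal sketch of calling the filesystem-agnostic entry point that dispatches to implementations like the one above; the glob and the options setup here are illustrative assumptions, not from the original:

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class MatchExample {
  public static void main(String[] args) throws IOException {
    // Register the available FileSystem implementations with default options.
    FileSystems.setDefaultPipelineOptions(PipelineOptionsFactory.create());

    // Match a glob; each spec in the input list yields one MatchResult.
    List<MatchResult> results =
        FileSystems.match(Collections.singletonList("/tmp/data/part-*")); // assumed path
    for (MatchResult result : results) {
      // metadata() throws if the match failed; inspect result.status() to avoid that.
      for (Metadata metadata : result.metadata()) {
        System.out.printf("%s (%d bytes)%n", metadata.resourceId(), metadata.sizeBytes());
      }
    }
  }
}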
List<FileBasedSource<T>> splitResults = new ArrayList<>(expandedFiles.size());
for (Metadata metadata : expandedFiles) {
  FileBasedSource<T> split = createForSubrangeOfFile(metadata, 0, metadata.sizeBytes());
  verify(
      split.getMode() == Mode.SINGLE_FILE_OR_SUBRANGE,
      "%s.createForSubrangeOfFile must return a source in mode %s",
      split,
      Mode.SINGLE_FILE_OR_SUBRANGE);
  splitResults.add(split);
}
// First assertion: the uncompressed file (the PAssert subject is elided in the original).
.satisfies(
    input -> {
      assertEquals(path, input.getMetadata().resourceId().toString());
      assertEquals("Hello world".length(), input.getMetadata().sizeBytes());
      assertEquals(Compression.UNCOMPRESSED, input.getCompression());
      assertTrue(input.getMetadata().isReadSeekEfficient());
      try {
        assertEquals("Hello world", input.readFullyAsUTF8String());
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return null;
    });
// Second assertion: the gzipped copy of the same content.
.satisfies(
    input -> {
      assertEquals(pathGZ, input.getMetadata().resourceId().toString());
      assertFalse(input.getMetadata().sizeBytes() == "Hello world".length());
      assertEquals(Compression.GZIP, input.getCompression());
      assertFalse(input.getMetadata().isReadSeekEfficient());
      try {
        assertEquals("Hello world", input.readFullyAsUTF8String());
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return null;
    });
MatchResult singleMatch =
    Iterables.getOnlyElement(
        FileSystems.match(Collections.singletonList(tempDir.toString() + "*")));
for (Metadata matchResult : singleMatch.metadata()) {
  if (allMatches.add(matchResult.resourceId())) {
    LOG.info("Will also remove unknown temporary file {}", matchResult.resourceId());
  }
}
long srcSize = srcMeta.sizeBytes();
boolean shouldDownload = true;
if (Files.exists(dst)) {
  // Assumed completion of the truncated snippet: skip the download when the
  // local copy already has the same size as the source.
  shouldDownload = Files.size(dst) != srcSize;
}
List<MatchResult> matches = FileSystems.match(Collections.singletonList(pattern));
for (Metadata expectedFile : Iterables.getOnlyElement(matches).metadata()) {
  expectedFiles.add(new File(expectedFile.resourceId().toString()));
}
List<String> files = new ArrayList<>(strs.size());
for (Metadata match : matches) {
  String filename = match.resourceId().toString();
  files.add(filename);
  CharBuffer buf = CharBuffer.allocate((int) new File(filename).length());
  // Assumed completion of the truncated snippet: read the whole file into the buffer.
  try (Reader reader = Files.newBufferedReader(Paths.get(filename), StandardCharsets.UTF_8)) {
    reader.read(buf);
  }
}
.metadata()
.stream()
.filter(metadata -> metadata.resourceId().getFilename().endsWith(".js"))
.map(Metadata::resourceId)
// Assumed completion of the truncated chain: collect the resource IDs as strings.
.map(ResourceId::toString)
.collect(Collectors.toList());
@Override
public final BoundedReader<T> createReader(PipelineOptions options) throws IOException {
  // Validate the current source prior to creating a reader for it.
  this.validate();
  String fileOrPattern = fileOrPatternSpec.get();

  if (mode == Mode.FILEPATTERN) {
    long startTime = System.currentTimeMillis();
    List<Metadata> fileMetadata =
        FileSystems.match(fileOrPattern, emptyMatchTreatment).metadata();
    LOG.info("Matched {} files for pattern {}", fileMetadata.size(), fileOrPattern);
    List<FileBasedReader<T>> fileReaders = new ArrayList<>();
    for (Metadata metadata : fileMetadata) {
      long endOffset = metadata.sizeBytes();
      fileReaders.add(
          createForSubrangeOfFile(metadata, 0, endOffset).createSingleFileReader(options));
    }
    LOG.debug(
        "Creating a reader for file pattern {} took {} ms",
        fileOrPattern,
        System.currentTimeMillis() - startTime);
    if (fileReaders.size() == 1) {
      return fileReaders.get(0);
    }
    return new FilePatternReader(this, fileReaders);
  } else {
    return createSingleFileReader(options);
  }
}
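As a rough usage sketch of the pattern-expansion path above, here is how a concrete FileBasedSource (AvroSource, which ships with Beam) can be read directly; the path and schema string are assumptions:

// Sketch only: reading a concrete FileBasedSource outside a pipeline.
static void readAvro() throws IOException {
  AvroSource<GenericRecord> source =
      AvroSource.from("/tmp/data/*.avro").withSchema(SCHEMA_STRING); // SCHEMA_STRING is assumed
  try (BoundedSource.BoundedReader<GenericRecord> reader =
      source.createReader(PipelineOptionsFactory.create())) {
    // In FILEPATTERN mode this reader concatenates one per-file reader per match.
    for (boolean more = reader.start(); more; more = reader.advance()) {
      GenericRecord record = reader.getCurrent();
      // ... process record ...
    }
  }
}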
@ProcessElement
public void processElement(ProcessContext context) {
  ResourceId inputFile = context.element().resourceId();
  Compression compression = compressionValue.get();

  // Add the compression extension to the output filename. Example: demo.txt -> demo.txt.gz
  String outputFilename = inputFile.getFilename() + compression.getSuggestedSuffix();

  // Resolve the necessary resources to perform the transfer.
  ResourceId outputDir = FileSystems.matchNewResource(destinationLocation.get(), true);
  ResourceId outputFile =
      outputDir.resolve(outputFilename, StandardResolveOptions.RESOLVE_FILE);
  ResourceId tempFile =
      outputDir.resolve("temp-" + outputFilename, StandardResolveOptions.RESOLVE_FILE);

  // Perform the copy of the compressed channel to the destination.
  try (ReadableByteChannel readerChannel = FileSystems.open(inputFile)) {
    try (WritableByteChannel writerChannel =
        compression.writeCompressed(FileSystems.create(tempFile, MimeTypes.BINARY))) {
      // Execute the copy to the temporary file.
      ByteStreams.copy(readerChannel, writerChannel);
    }

    // Rename the temporary file to the output file.
    FileSystems.rename(ImmutableList.of(tempFile), ImmutableList.of(outputFile));

    // Output the path to the compressed file.
    context.output(outputFile.toString());
  } catch (IOException e) {
    LOG.error("Error occurred during compression of {}", inputFile.toString(), e);
    context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage()));
  }
}
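A hedged sketch of how a DoFn like this is typically wired up. The transform names, the CompressFn class name and constructor, and OUTPUT_TAG are assumptions; DEADLETTER_TAG appears in the DoFn above:

// Sketch only: the DoFn consumes MatchResult.Metadata, so FileIO.match() feeds it directly.
PCollectionTuple results =
    pipeline
        .apply("MatchFiles", FileIO.match().filepattern(inputPattern))
        .apply(
            "CompressFiles",
            ParDo.of(new CompressFn(destinationLocation, compressionValue)) // assumed ctor
                .withOutputTags(OUTPUT_TAG, TupleTagList.of(DEADLETTER_TAG)));
results.get(OUTPUT_TAG); // paths of compressed files
results.get(DEADLETTER_TAG); // KV<input path, error message>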
/**
 * Create a {@code FileBasedSource} based on a single file. This constructor must be used when
 * creating a new {@code FileBasedSource} for a subrange of a single file. Additionally, this
 * constructor must be used to create new {@code FileBasedSource}s when subclasses implement the
 * method {@link #createForSubrangeOfFile}.
 *
 * <p>See {@link OffsetBasedSource} for detailed descriptions of {@code minBundleSize}, {@code
 * startOffset}, and {@code endOffset}.
 *
 * @param fileMetadata specification of the file represented by the {@link FileBasedSource}, in
 *     suitable form for use with {@link FileSystems#match(List)}.
 * @param minBundleSize minimum bundle size in bytes.
 * @param startOffset starting byte offset.
 * @param endOffset ending byte offset. If the specified value {@code >= #getMaxEndOffset()} it
 *     implies {@code #getMaxEndOffset()}.
 */
protected FileBasedSource(
    Metadata fileMetadata, long minBundleSize, long startOffset, long endOffset) {
  super(startOffset, endOffset, minBundleSize);
  mode = Mode.SINGLE_FILE_OR_SUBRANGE;
  this.singleFileMetadata = checkNotNull(fileMetadata, "fileMetadata");
  this.fileOrPatternSpec = StaticValueProvider.of(fileMetadata.resourceId().toString());
  // This field will be unused in this mode.
  this.emptyMatchTreatment = EmptyMatchTreatment.DISALLOW;
}
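Per the contract in this Javadoc, a subclass's createForSubrangeOfFile typically just forwards to this constructor. A minimal sketch, where LineSource is a hypothetical subclass, not from the original:

// Hypothetical subclass method showing the intended delegation.
@Override
protected FileBasedSource<String> createForSubrangeOfFile(
    Metadata fileMetadata, long start, long end) {
  // Reuse this source's minimum bundle size; read only [start, end) of the file.
  return new LineSource(fileMetadata, getMinBundleSize(), start, end);
}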
@Override
protected void startReading(ReadableByteChannel channel) throws IOException {
  try {
    metadata = readMetadataFromFile(getCurrentSource().getSingleFileMetadata().resourceId());
  } catch (IOException e) {
    throw new RuntimeException(
        "Error reading metadata from file " + getCurrentSource().getSingleFileMetadata(), e);
  }

  long startOffset = getCurrentSource().getStartOffset();
  byte[] syncMarker = metadata.getSyncMarker();
  long syncMarkerLength = syncMarker.length;

  if (startOffset != 0) {
    // Rewind in order to find the sync marker ending the previous block.
    long position = Math.max(0, startOffset - syncMarkerLength);
    ((SeekableByteChannel) channel).position(position);
    startOffset = position;
  }

  // Satisfy the postcondition.
  stream = createStream(channel);
  countStream = new CountingInputStream(stream);
  synchronized (progressLock) {
    currentBlockOffset = startOffset + advancePastNextSyncMarker(stream, syncMarker);
    currentBlockSizeBytes = 0;
  }
}
@ProcessElement
public void processElement(ProcessContext c) throws Exception {
  ReadableFile file = c.element();
  InputStream stream = Channels.newInputStream(file.open());
  try (InputStream tikaStream = TikaInputStream.get(stream)) {
    Parser parser =
        tikaConfig == null ? new AutoDetectParser() : new AutoDetectParser(tikaConfig);
    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);

    Metadata tikaMetadata =
        spec.getInputMetadata() != null ? spec.getInputMetadata() : new Metadata();
    if (spec.getContentTypeHint() != null) {
      tikaMetadata.set(Metadata.CONTENT_TYPE, spec.getContentTypeHint());
    }

    String location = file.getMetadata().resourceId().toString();
    ParseResult res;
    ContentHandler tikaHandler = new ToTextContentHandler();
    try {
      parser.parse(tikaStream, tikaHandler, tikaMetadata, context);
      res = ParseResult.success(location, tikaHandler.toString(), tikaMetadata);
    } catch (Exception e) {
      res = ParseResult.failure(location, tikaHandler.toString(), tikaMetadata, e);
    }
    c.output(res);
  }
}
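For reference, a minimal sketch of reaching this parser through TikaIO's public API; the filepattern is an assumption:

// Sketch only: parse matched files with Tika and collect ParseResults.
PCollection<ParseResult> results =
    pipeline
        .apply(FileIO.match().filepattern("/path/to/docs/*")) // assumed pattern
        .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
        .apply(TikaIO.parseFiles());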
@ProcessElement
public void process(ProcessContext c) {
  MatchResult.Metadata metadata = c.element();
  if (metadata.resourceId().isDirectory()) {
    switch (spec.getDirectoryTreatment()) {
      case SKIP:
        return;
      case PROHIBIT:
        throw new IllegalArgumentException(
            "Trying to read " + metadata.resourceId() + " which is a directory");
      default:
        throw new UnsupportedOperationException(
            "Unknown DirectoryTreatment: " + spec.getDirectoryTreatment());
    }
  }

  Compression compression =
      (spec.getCompression() == Compression.AUTO)
          ? Compression.detect(metadata.resourceId().getFilename())
          : spec.getCompression();
  c.output(
      new ReadableFile(
          MatchResult.Metadata.builder()
              .setResourceId(metadata.resourceId())
              .setSizeBytes(metadata.sizeBytes())
              .setIsReadSeekEfficient(
                  metadata.isReadSeekEfficient() && compression == Compression.UNCOMPRESSED)
              .build(),
          compression));
}
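This DoFn is the heart of FileIO.readMatches(); callers normally reach it through the transform rather than directly. A minimal usage sketch (the pattern is an assumption):

// Sketch only: match files and convert each Metadata into a ReadableFile.
PCollection<FileIO.ReadableFile> files =
    pipeline
        .apply(FileIO.match().filepattern("gs://bucket/logs/*.gz")) // assumed pattern
        .apply(
            FileIO.readMatches()
                .withCompression(Compression.AUTO) // detect by extension, as above
                .withDirectoryTreatment(FileIO.ReadMatches.DirectoryTreatment.SKIP));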
/**
 * Checks whether the total number of files is correct by comparing it with the number parsed
 * from a shard name using a name template. If no template is specified, "SSSS-of-NNNN" is used
 * as the default, and "NNNN" is the expected total number of files.
 *
 * @return {@code true} if at least one shard name matches the template and the total number of
 *     given files equals the number parsed from the shard name.
 */
@VisibleForTesting
boolean checkTotalNumOfFiles(Collection<Metadata> files) {
  for (Metadata fileMetadata : files) {
    String fileName = fileMetadata.resourceId().getFilename();

    if (fileName == null) {
      // This path has zero elements.
      continue;
    }
    Matcher matcher = shardTemplate.matcher(fileName);
    if (!matcher.matches()) {
      // The shard name doesn't match the pattern; check the next shard.
      continue;
    }
    // Once matched, extract the total number of shards and compare it to the file count.
    return files.size() == Integer.parseInt(matcher.group("numshards"));
  }
  return false;
}
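A small sketch of the kind of shard template this check relies on. The exact Pattern below is an assumption (the class compiles its own from the default "SSSS-of-NNNN" template), but the named group "numshards" matches the code above:

// Assumed equivalent of the compiled default template "SSSS-of-NNNN".
Pattern shardTemplate = Pattern.compile(".*-(?<shardnum>\\d+)-of-(?<numshards>\\d+)");
Matcher matcher = shardTemplate.matcher("output-0002-of-0004");
if (matcher.matches()) {
  int expectedTotal = Integer.parseInt(matcher.group("numshards")); // 4
  // checkTotalNumOfFiles compares this against files.size().
}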
/**
 * Reads all the lines of all the files.
 *
 * <p>Not suitable for use except in testing of small data, since the data size may be far more
 * than can be reasonably processed serially, in-memory, by a single thread.
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> allLines = Lists.newArrayList();
  int i = 1;
  for (Metadata file : files) {
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> lines = CharStreams.readLines(reader);
      allLines.addAll(lines);
      LOG.debug("[{} of {}] Read {} lines from file: {}", i, files.size(), lines.size(), file);
    }
    i++;
  }
  return allLines;
}
@Test
public void testSchemaStringIsInterned() throws Exception {
  List<Bird> birds = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          birds,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename);
  String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString();
  // Add "" to the schema to make sure it is not interned.
  AvroSource<GenericRecord> sourceA = AvroSource.from(filename).withSchema("" + schema);
  AvroSource<GenericRecord> sourceB = AvroSource.from(filename).withSchema("" + schema);
  assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString());

  // Ensure that deserialization still goes through interning.
  AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB);
  assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString());
}