/**
 * Copies the file described by {@code src} to the local path {@code dst}.
 *
 * <p>Fails if {@code dst} already exists ({@link StandardOpenOption#CREATE_NEW}).
 *
 * @param src metadata of the source file to copy
 * @param dst local destination path; must not already exist
 * @throws IOException if opening or transferring either channel fails
 * @throws IllegalStateException if the source ends before {@code src.sizeBytes()} bytes
 *     could be copied
 */
private static void copyToLocal(Metadata src, Path dst) throws IOException {
  // try-with-resources closes both channels even when the copy fails part-way;
  // the previous version leaked them on any exception.
  try (FileChannel dstCh =
          FileChannel.open(dst, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
      ReadableByteChannel srcCh = FileSystems.open(src.resourceId())) {
    long srcSize = src.sizeBytes();
    long copied = 0;
    while (copied < srcSize) {
      long n = dstCh.transferFrom(srcCh, copied, srcSize - copied);
      if (n <= 0) {
        // Source reached EOF earlier than its advertised size; stop instead of
        // looping forever on zero-byte transfers.
        break;
      }
      copied += n;
    }
    Preconditions.checkState(
        copied == srcSize, "Expected to copy %s bytes but copied %s", srcSize, copied);
  }
}
/** * Check if total number of files is correct by comparing with the number that is parsed from * shard name using a name template. If no template is specified, "SSSS-of-NNNN" will be used as * default, and "NNNN" will be the expected total number of files. * * @return {@code true} if at least one shard name matches template and total number of given * files equals the number that is parsed from shard name. */ @VisibleForTesting boolean checkTotalNumOfFiles(Collection<Metadata> files) { for (Metadata fileMedadata : files) { String fileName = fileMedadata.resourceId().getFilename(); if (fileName == null) { // this path has zero elements continue; } Matcher matcher = shardTemplate.matcher(fileName); if (!matcher.matches()) { // shard name doesn't match the pattern, check with the next shard continue; } // once match, extract total number of shards and compare to file list return files.size() == Integer.parseInt(matcher.group("numshards")); } return false; } }
/**
 * Reads all the lines of all the files.
 *
 * <p>Not suitable for use except in testing of small data, since the data size may be far more
 * than can be reasonably processed serially, in-memory, by a single thread.
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> result = Lists.newArrayList();
  int fileIndex = 0;
  for (Metadata file : files) {
    // Pre-increment so logging is 1-based, matching "[i of n]".
    fileIndex++;
    try (Reader reader =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> fileLines = CharStreams.readLines(reader);
      result.addAll(fileLines);
      LOG.debug(
          "[{} of {}] Read {} lines from file: {}",
          fileIndex,
          files.size(),
          fileLines.size(),
          file);
    }
  }
  return result;
}
}
/**
 * Reads all the lines of all the files.
 *
 * <p>Not suitable for use except in testing of small data, since the data size may be far more
 * than can be reasonably processed serially, in-memory, by a single thread.
 */
@VisibleForTesting
List<String> readLines(Collection<Metadata> files) throws IOException {
  List<String> collected = Lists.newArrayList();
  int position = 1;
  for (Metadata file : files) {
    try (Reader in =
        Channels.newReader(FileSystems.open(file.resourceId()), StandardCharsets.UTF_8.name())) {
      List<String> linesOfFile = CharStreams.readLines(in);
      collected.addAll(linesOfFile);
      // Log progress as a 1-based "[i of n]" counter.
      LOG.debug(
          "[{} of {}] Read {} lines from file: {}",
          position,
          files.size(),
          linesOfFile.size(),
          file);
    }
    position++;
  }
  return collected;
}
@Test public void testSchemaStringIsInterned() throws Exception { List<Bird> birds = createRandomRecords(100); String filename = generateTestFile( "tmp.avro", birds, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), DataFileConstants.NULL_CODEC); Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename); String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString(); // Add "" to the schema to make sure it is not interned. AvroSource<GenericRecord> sourceA = AvroSource.from(filename).withSchema("" + schema); AvroSource<GenericRecord> sourceB = AvroSource.from(filename).withSchema("" + schema); assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString()); // Ensure that deserialization still goes through interning AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB); assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString()); }
@Override protected final boolean startImpl() throws IOException { FileBasedSource<T> source = getCurrentSource(); this.channel = FileSystems.open(source.getSingleFileMetadata().resourceId()); if (channel instanceof SeekableByteChannel) { SeekableByteChannel seekChannel = (SeekableByteChannel) channel; seekChannel.position(source.getStartOffset()); } else { // Channel is not seekable. Must not be a subrange. checkArgument( source.mode != Mode.SINGLE_FILE_OR_SUBRANGE, "Subrange-based sources must only be defined for file types that support seekable " + " read channels"); checkArgument( source.getStartOffset() == 0, "Start offset %s is not zero but channel for reading the file is not seekable.", source.getStartOffset()); } startReading(channel); // Advance once to load the first record. return advanceImpl(); }
// Reads the table manifest JSON named by the element's value (relative to the import
// directory) and emits it keyed by the element's key.
@ProcessElement
public void processElement(ProcessContext c) {
  try {
    KV<String, String> kv = c.element();
    String filePath = GcsUtil.joinPath(importDirectory.get(), kv.getValue());
    // DISALLOW makes match() fail rather than return an empty result.
    MatchResult match = FileSystems.match(filePath, EmptyMatchTreatment.DISALLOW);
    ResourceId resourceId = match.metadata().get(0).resourceId();
    TableManifest.Builder builder = TableManifest.newBuilder();
    // Fix: specify UTF-8 explicitly — the previous InputStreamReader(stream) constructor
    // silently used the platform default charset. Also close the reader deterministically.
    try (InputStream stream = Channels.newInputStream(FileSystems.open(resourceId));
        Reader reader = new InputStreamReader(stream, java.nio.charset.StandardCharsets.UTF_8)) {
      JsonFormat.parser().merge(reader, builder);
    }
    c.output(KV.of(kv.getKey(), builder.build()));
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
}));
@ProcessElement public void processElement(ProcessContext context) { ResourceId inputFile = context.element().resourceId(); // Output a record to the failure file if the file doesn't match a known compression. if (!Compression.AUTO.isCompressed(inputFile.toString())) { String errorMsg = String.format(UNCOMPRESSED_ERROR_MSG, inputFile.toString(), SUPPORTED_COMPRESSIONS); context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), errorMsg)); } else { try { ResourceId outputFile = decompress(inputFile); context.output(outputFile.toString()); } catch (IOException e) { LOG.error(e.getMessage()); context.output(DEADLETTER_TAG, KV.of(inputFile.toString(), e.getMessage())); } } }
/** Reads the assigned offset sub-range of the input file and emits every record in it. */
@ProcessElement
public void process(ProcessContext c) throws IOException {
  ReadableFile file = c.element().getKey();
  OffsetRange range = c.element().getValue();
  // Wrap the delegate source so decompression matches the file's actual compression.
  FileBasedSource<T> source =
      CompressedSource.from(createSource.apply(file.getMetadata().resourceId().toString()))
          .withCompression(file.getCompression());
  try (BoundedSource.BoundedReader<T> reader =
      source
          .createForSubrangeOfFile(file.getMetadata(), range.getFrom(), range.getTo())
          .createReader(c.getPipelineOptions())) {
    boolean hasRecord = reader.start();
    while (hasRecord) {
      c.output(reader.getCurrent());
      hasRecord = reader.advance();
    }
  }
}
}
@Test public void testReadMetadataWithCodecs() throws Exception { // Test reading files generated using all codecs. String[] codecs = { DataFileConstants.NULL_CODEC, DataFileConstants.BZIP2_CODEC, DataFileConstants.DEFLATE_CODEC, DataFileConstants.SNAPPY_CODEC, DataFileConstants.XZ_CODEC }; List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT); for (String codec : codecs) { String filename = generateTestFile( codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); Metadata fileMeta = FileSystems.matchSingleFileSpec(filename); AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId()); assertEquals(codec, metadata.getCodec()); } }
/** Flattens the given match results into the matched files' local filesystem paths. */
private List<String> toFilenames(List<MatchResult> matchResults) {
  return FluentIterable.from(matchResults)
      .transformAndConcat(
          result -> {
            try {
              return result.metadata();
            } catch (IOException e) {
              // metadata() is declared to throw IOException; rethrow unchecked so the
              // lambda satisfies Function's signature.
              throw new RuntimeException(e);
            }
          })
      .transform(md -> ((LocalResourceId) md.resourceId()).getPath().toString())
      .toList();
}
}
/**
 * Reads the entire contents of the file matched by {@code passwordFile} as a UTF-8 string.
 *
 * @param passwordFile file spec that must resolve to exactly one file
 * @return the file's contents decoded as UTF-8
 * @throws IOException if matching or reading the file fails
 */
String readFromFile(String passwordFile) throws IOException {
  MatchResult.Metadata m = FileSystems.matchSingleFileSpec(passwordFile);
  LOGGER.info("Reading password from file: {}", m.resourceId().toString());
  // Fix: try-with-resources closes the channel; the previous version leaked it.
  try (InputStream inputStream = Channels.newInputStream(FileSystems.open(m.resourceId()))) {
    return CharStreams.toString(new InputStreamReader(inputStream, Charsets.UTF_8));
  }
}
/** Serializes {@code value}'s resource id, seek-efficiency flag, and byte size, in that order. */
@Override
public void encode(Metadata value, OutputStream os) throws IOException {
  RESOURCE_ID_CODER.encode(value.resourceId(), os);
  // The boolean flag is persisted as 1/0 through the int coder.
  int seekEfficientFlag = value.isReadSeekEfficient() ? 1 : 0;
  INT_CODER.encode(seekEfficientFlag, os);
  LONG_CODER.encode(value.sizeBytes(), os);
}
/** Maps file metadata to the string form of its resource id. */
@Override
public String apply(MatchResult.Metadata input) {
  ResourceId id = input.resourceId();
  return id.toString();
}
}
/**
 * Returns a {@link SeekableByteChannel} equivalent to {@link #open}, but fails if this file is
 * not {@link MatchResult.Metadata#isReadSeekEfficient seekable}.
 */
public SeekableByteChannel openSeekable() throws IOException {
  // Guard first: only seek-efficient files may be opened as seekable channels.
  boolean seekable = getMetadata().isReadSeekEfficient();
  checkState(seekable, "The file %s is not seekable", metadata.resourceId());
  return (SeekableByteChannel) open();
}
/**
 * Returns a {@link ReadableByteChannel} reading the data from this file, potentially
 * decompressing it using {@link #getCompression}.
 */
public ReadableByteChannel open() throws IOException {
  ReadableByteChannel rawChannel = FileSystems.open(metadata.resourceId());
  return compression.readDecompressed(rawChannel);
}
@Test public void testReadSchemaString() throws Exception { List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT); String codec = DataFileConstants.NULL_CODEC; String filename = generateTestFile( codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec); Metadata fileMeta = FileSystems.matchSingleFileSpec(filename); AvroMetadata metadata = AvroSource.readMetadataFromFile(fileMeta.resourceId()); // By default, parse validates the schema, which is what we want. Schema schema = new Schema.Parser().parse(metadata.getSchemaString()); assertEquals(4, schema.getFields().size()); }
// Loads a custom Tika configuration if one was supplied; otherwise leaves tikaConfig unset.
@Setup
public void setup() throws Exception {
  if (spec.getTikaConfigPath() != null) {
    ResourceId configResource =
        FileSystems.matchSingleFileSpec(spec.getTikaConfigPath().get()).resourceId();
    // Fix: close the config stream after parsing — the previous version leaked the
    // underlying channel opened by FileSystems.open.
    try (java.io.InputStream configStream =
        Channels.newInputStream(FileSystems.open(configResource))) {
      tikaConfig = new TikaConfig(configStream);
    }
  }
}
/** Converts a single match result into the matched files' GCS path strings. */
private List<String> toFilenames(MatchResult matchResult) throws IOException {
  return FluentIterable.from(matchResult.metadata())
      .transform(md -> ((GcsResourceId) md.resourceId()).getGcsPath().toString())
      .toList();
}
}
/** Prints the file's full path (scheme://directory/filename) and passes the metadata through. */
public MatchResult.Metadata apply(MatchResult.Metadata metadata) {
  ResourceId resourceId = metadata.resourceId();
  // Assemble the printed path once so the output matches scheme://dir + filename exactly.
  String fullPath =
      resourceId.getScheme()
          + "://"
          + resourceId.getCurrentDirectory()
          + resourceId.getFilename();
  System.out.println(fullPath);
  return metadata;
}
}));