/**
 * Expands this transform into match, read and parse steps.
 *
 * <p>Files matching {@code getFilepattern()} are read with {@link Compression#UNCOMPRESSED} and
 * the resulting readable files are passed to {@code parseFiles()}.
 *
 * @param input the pipeline start to which the matching step is applied
 * @return the parse results produced by {@code parseFiles()}
 */
@Override
public PCollection<ParseResult> expand(PBegin input) {
  FileIO.Match matchByPattern = FileIO.match().filepattern(getFilepattern());
  FileIO.ReadMatches readUncompressed =
      FileIO.readMatches().withCompression(Compression.UNCOMPRESSED);

  PCollection<FileIO.ReadableFile> readableFiles =
      input.apply(matchByPattern).apply(readUncompressed);
  return readableFiles.apply(parseFiles());
}
}
/**
 * Parses every file next to the {@code /valid/apache-beam-tika.odt} resource twice, once through
 * {@code TikaIO.parse()} and once through {@code FileIO} followed by {@code TikaIO.parseFiles()},
 * and asserts that both routes yield the same expected results in any order.
 */
@Test
public void testParseAndParseFiles() throws IOException {
  Path root =
      Paths.get(getClass().getResource("/valid/apache-beam-tika.odt").getPath()).getParent();

  ParseResult odtResult =
      ParseResult.success(
          root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata());
  ParseResult pdfZipResult =
      ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE);
  List<ParseResult> expected = Arrays.asList(odtResult, pdfZipResult);

  // Both routes use the same glob over the resource directory.
  String allFilesGlob = root.resolve("*").toString();

  // Route 1: the all-in-one parse transform.
  PCollection<ParseResult> parsedDirectly =
      p.apply("Parse", TikaIO.parse().filepattern(allFilesGlob));
  PCollection<ParseResult> parse =
      parsedDirectly.apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  // Route 2: explicit match + read, then parseFiles.
  PCollection<FileIO.ReadableFile> readableFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(allFilesGlob))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED));
  PCollection<ParseResult> parseFiles =
      readableFiles
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);

  p.run();
}
/**
 * Expands a collection of file patterns into the lines of the matched files.
 *
 * <p>Patterns are matched using {@code getMatchConfiguration()}, matches are opened with
 * {@code getCompression()} while prohibiting directories, and the files are read by
 * {@code readFiles()} configured with {@code getDelimiter()}.
 *
 * @param input the file patterns to match
 * @return the strings produced by the configured {@code readFiles()} transform
 */
@Override
public PCollection<String> expand(PCollection<String> input) {
  FileIO.MatchAll matchPatterns = FileIO.matchAll().withConfiguration(getMatchConfiguration());
  FileIO.ReadMatches openMatches =
      FileIO.readMatches()
          .withCompression(getCompression())
          .withDirectoryTreatment(DirectoryTreatment.PROHIBIT);

  PCollection<FileIO.ReadableFile> readableFiles =
      input.apply(matchPatterns).apply(openMatches);
  return readableFiles.apply(readFiles().withDelimiter(getDelimiter()));
}
/**
 * Expands a collection of file patterns into the records read from the matched files.
 *
 * <p>The schema is checked for null first. Patterns are then matched using
 * {@code getMatchConfiguration()}, directories are prohibited, and each file is read through a
 * {@code ReadAllViaFileBasedSource} built from the record class and schema.
 *
 * @param input the file patterns to match
 * @return the records of type {@code T}, encoded with an {@code AvroCoder}
 */
@Override
public PCollection<T> expand(PCollection<String> input) {
  checkNotNull(getSchema(), "schema");

  FileIO.MatchAll matchPatterns = FileIO.matchAll().withConfiguration(getMatchConfiguration());
  FileIO.ReadMatches openMatches =
      FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT);
  PCollection<FileIO.ReadableFile> readableFiles =
      input.apply(matchPatterns).apply(openMatches);

  return readableFiles.apply(
      "Read all via FileBasedSource",
      new ReadAllViaFileBasedSource<>(
          getDesiredBundleSizeBytes(),
          new CreateSourceFn<>(getRecordClass(), getSchema().toString()),
          AvroCoder.of(getRecordClass(), getSchema())));
}
/**
 * Writes the TINY and LARGE fixtures as both ZIP and UNCOMPRESSED files, reads them back via
 * {@code FileIO.matchAll()}, {@code FileIO.readMatches()} with AUTO compression and
 * {@code TextIO.readFiles()}, and asserts that each fixture's lines appear twice.
 */
@Test
@Category(NeedsRunner.class)
public void testReadFiles() throws IOException {
  Path tempFolderPath = tempFolder.getRoot().toPath();

  writeToFile(TINY, tempFolder, "readAllTiny1.zip", ZIP);
  writeToFile(TINY, tempFolder, "readAllTiny2.txt", UNCOMPRESSED);
  writeToFile(LARGE, tempFolder, "readAllLarge1.zip", ZIP);
  writeToFile(LARGE, tempFolder, "readAllLarge2.txt", UNCOMPRESSED);

  // One glob per fixture size; each glob matches the .zip and the .txt variant.
  String tinyGlob = tempFolderPath.resolve("readAllTiny*").toString();
  String largeGlob = tempFolderPath.resolve("readAllLarge*").toString();

  PCollection<String> patterns = p.apply(Create.of(tinyGlob, largeGlob));
  PCollection<FileIO.ReadableFile> readableFiles =
      patterns.apply(FileIO.matchAll()).apply(FileIO.readMatches().withCompression(AUTO));
  PCollection<String> lines =
      readableFiles.apply(TextIO.readFiles().withDesiredBundleSizeBytes(10));

  PAssert.that(lines).containsInAnyOrder(Iterables.concat(TINY, TINY, LARGE, LARGE));
  p.run();
}
/**
 * Builds and runs a pipeline that matches the files named by {@code options.getInput()}, reads
 * them as Parquet using {@code SCHEMA}, and maps each record to a string via
 * {@code GetRecordsFn}.
 *
 * @param options pipeline options supplying the input file pattern
 */
private static void runReadPipeline(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  // Build each step up front, then wire them together under their step names.
  FileIO.Match findFiles = FileIO.match().filepattern(options.getInput());
  FileIO.ReadMatches readMatched = FileIO.readMatches();
  ParquetIO.ReadFiles readParquet = ParquetIO.readFiles(SCHEMA);

  pipeline
      .apply("Find files", findFiles)
      .apply("Read matched files", readMatched)
      .apply("Read parquet files", readParquet)
      .apply("Map records to strings", MapElements.into(strings()).via(new GetRecordsFn()));

  pipeline.run();
}
/**
 * Expands a collection of file patterns into values produced by the configured parse function.
 *
 * <p>The output coder is inferred from {@code getCoder()}, the parse function and the pipeline's
 * coder registry. Patterns are matched using {@code getMatchConfiguration()}, directories are
 * prohibited, and each file is read through a {@code ReadAllViaFileBasedSource}.
 *
 * @param input the file patterns to match
 * @return the parsed values of type {@code T}
 */
@Override
public PCollection<T> expand(PCollection<String> input) {
  final SerializableFunction<GenericRecord, T> recordParser = getParseFn();
  final Coder<T> outputCoder =
      Parse.inferCoder(getCoder(), recordParser, input.getPipeline().getCoderRegistry());
  final SerializableFunction<String, FileBasedSource<T>> sourceFactory =
      new CreateParseSourceFn<>(recordParser, outputCoder);

  PCollection<FileIO.ReadableFile> readableFiles =
      input
          .apply(FileIO.matchAll().withConfiguration(getMatchConfiguration()))
          .apply(FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT));

  return readableFiles.apply(
      "Parse all via FileBasedSource",
      new ReadAllViaFileBasedSource<>(getDesiredBundleSizeBytes(), sourceFactory, outputCoder));
}
matches.apply("Read AUTO", FileIO.readMatches().withCompression(Compression.AUTO)); PCollection<FileIO.ReadableFile> decompressedDefault = matches.apply("Read default", FileIO.readMatches()); PCollection<FileIO.ReadableFile> decompressedUncompressed = matches.apply( "Read UNCOMPRESSED", FileIO.readMatches().withCompression(Compression.UNCOMPRESSED)); for (PCollection<FileIO.ReadableFile> c : Arrays.asList(decompressedAuto, decompressedDefault, decompressedUncompressed)) { p.apply("Match GZ", FileIO.match().filepattern(pathGZ)); PCollection<FileIO.ReadableFile> compressionAuto = matchesGZ.apply("Read GZ AUTO", FileIO.readMatches().withCompression(Compression.AUTO)); PCollection<FileIO.ReadableFile> compressionDefault = matchesGZ.apply("Read GZ default", FileIO.readMatches()); PCollection<FileIO.ReadableFile> compressionGzip = matchesGZ.apply("Read GZ GZIP", FileIO.readMatches().withCompression(Compression.GZIP)); for (PCollection<FileIO.ReadableFile> c : Arrays.asList(compressionAuto, compressionDefault, compressionGzip)) {
.apply(Values.create()) .apply(FileIO.matchAll()) .apply(FileIO.readMatches()) .apply( XmlIO.<Bird>readFiles()