/**
 * Expands this transform in three steps: match the configured filepattern, read the matches
 * with {@code Compression.UNCOMPRESSED}, and hand the readable files to {@code parseFiles()}.
 *
 * @param input the pipeline start the file match is applied to
 * @return the collection produced by applying {@code parseFiles()} to the matched files
 */
@Override
public PCollection<ParseResult> expand(PBegin input) {
  FileIO.Match matchFiles = FileIO.match().filepattern(getFilepattern());
  FileIO.ReadMatches readUncompressed =
      FileIO.readMatches().withCompression(Compression.UNCOMPRESSED);
  return input.apply(matchFiles).apply(readUncompressed).apply(parseFiles());
}
}
PCollection<MatchResult.Metadata> matchMetadata = p.apply( FileIO.match() .filepattern(basePath.resolve("*").toString()) .continuously(
public static void main(String[] args) { PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create(); Pipeline p = Pipeline.create(options); // final String filePattern = "/home/ismael/test/*.xml"; final String filePattern = "file:///home/ismael/test/*.xml"; p.apply("MatchXml", FileIO.match().filepattern(filePattern)) .apply( MapElements.via( new SimpleFunction<MatchResult.Metadata, MatchResult.Metadata>() { public MatchResult.Metadata apply(MatchResult.Metadata metadata) { ResourceId resourceId = metadata.resourceId(); System.out.println(resourceId.getScheme() + "://" + resourceId.getCurrentDirectory() + resourceId.getFilename()); return metadata; } })); p.run(); } }
p.apply( "Match existing", FileIO.match().filepattern(tmpFolder.getRoot().getAbsolutePath() + "/*"))) .containsInAnyOrder(metadata(firstPath, firstSize), metadata(secondPath, secondSize)); PAssert.that( p.apply( "Match existing with provider", FileIO.match() .filepattern(p.newProvider(tmpFolder.getRoot().getAbsolutePath() + "/*")))) .containsInAnyOrder(metadata(firstPath, firstSize), metadata(secondPath, secondSize)); p.apply( "Match non-existing ALLOW", FileIO.match() .filepattern(tmpFolder.getRoot().getAbsolutePath() + "/blah") .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW))) p.apply( "Match non-existing ALLOW_IF_WILDCARD", FileIO.match() .filepattern(tmpFolder.getRoot().getAbsolutePath() + "/blah*") .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW_IF_WILDCARD)))
/**
 * Checks that {@code TikaIO.parse()} and the explicit match / readMatches / {@code
 * TikaIO.parseFiles()} chain both yield the same two expected results for the test resource
 * directory.
 */
@Test
public void testParseAndParseFiles() throws IOException {
  // The directory holding the test documents is located through one known resource.
  String odtResource = getClass().getResource("/valid/apache-beam-tika.odt").getPath();
  Path root = Paths.get(odtResource).getParent();

  ParseResult odt =
      ParseResult.success(
          root.resolve("apache-beam-tika.odt").toString(), ODT_FILE, getOdtMetadata());
  ParseResult pdfZip =
      ParseResult.success(root.resolve("apache-beam-tika-pdf.zip").toString(), PDF_ZIP_FILE);
  List<ParseResult> expected = Arrays.asList(odt, pdfZip);

  String everything = root.resolve("*").toString();

  // Path 1: the all-in-one parse transform.
  PCollection<ParseResult> parse =
      p.apply("Parse", TikaIO.parse().filepattern(everything))
          .apply("FilterParse", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parse).containsInAnyOrder(expected);

  // Path 2: the same work spelled out as match -> readMatches -> parseFiles.
  PCollection<ParseResult> parseFiles =
      p.apply("ParseFiles", FileIO.match().filepattern(everything))
          .apply(FileIO.readMatches().withCompression(Compression.UNCOMPRESSED))
          .apply(TikaIO.parseFiles())
          .apply("FilterParseFiles", ParDo.of(new FilterMetadataFn()));
  PAssert.that(parseFiles).containsInAnyOrder(expected);

  p.run();
}
/**
 * Applies a wildcard match over the temporary folder without configuring an empty-match
 * treatment and expects the pipeline run to fail with a {@link FileNotFoundException} cause.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchDisallowEmptyDefault() throws IOException {
  String everythingInTmp = tmpFolder.getRoot().getAbsolutePath() + "/*";
  p.apply("Match", FileIO.match().filepattern(everythingInTmp));

  // The expectation is registered after the transform is applied and before the run.
  thrown.expectCause(isA(FileNotFoundException.class));
  p.run();
}
/**
 * Applies a match for the literal path {@code blah} under the temporary folder with {@code
 * EmptyMatchTreatment.ALLOW_IF_WILDCARD} and expects the pipeline run to fail with a {@link
 * FileNotFoundException} cause.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchDisallowEmptyNonWildcard() throws IOException {
  String literalPath = tmpFolder.getRoot().getAbsolutePath() + "/blah";
  FileIO.Match matchLiteral =
      FileIO.match()
          .filepattern(literalPath)
          .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  p.apply(matchLiteral);

  // The expectation is registered after the transform is applied and before the run.
  thrown.expectCause(isA(FileNotFoundException.class));
  p.run();
}
/**
 * Locates {@code spanner-export.json} under the import directory, maps the match to its
 * resource id, and maps that id to an {@code Export} via {@code readManifest}.
 *
 * @param input the pipeline start the manifest match is applied to
 * @return the collection of {@code Export} values produced from the matched manifest
 */
@Override
public PCollection<Export> expand(PBegin input) {
  // The manifest path is derived lazily from the import directory provider.
  NestedValueProvider<String, String> manifestFile =
      NestedValueProvider.of(importDirectory, s -> GcsUtil.joinPath(s, "spanner-export.json"));

  PCollection<MatchResult.Metadata> manifestMatches =
      input.apply("Read manifest", FileIO.match().filepattern(manifestFile));

  PCollection<ResourceId> manifestIds =
      manifestMatches.apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via(MatchResult.Metadata::resourceId));

  return manifestIds.apply(
      "Read manifest json",
      MapElements.into(TypeDescriptor.of(Export.class))
          .via(ReadExportManifestFile::readManifest));
}
/**
 * Creates and runs a pipeline that matches the input pattern from {@code options}, reads the
 * matched files through {@code ParquetIO.readFiles(SCHEMA)}, and maps each record to a string
 * with {@code GetRecordsFn}.
 *
 * @param options pipeline options; {@code getInput()} supplies the file pattern
 */
private static void runReadPipeline(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  FileIO.Match findFiles = FileIO.match().filepattern(options.getInput());
  FileIO.ReadMatches readMatched = FileIO.readMatches();

  pipeline
      .apply("Find files", findFiles)
      .apply("Read matched files", readMatched)
      .apply("Read parquet files", ParquetIO.readFiles(SCHEMA))
      .apply("Map records to strings", MapElements.into(strings()).via(new GetRecordsFn()));

  pipeline.run();
}
/**
 * Applies a wildcard match over the temporary folder with {@code
 * EmptyMatchTreatment.DISALLOW} set explicitly and expects the pipeline run to fail with a
 * {@link FileNotFoundException} cause.
 */
@Test
@Category(NeedsRunner.class)
public void testMatchDisallowEmptyExplicit() throws IOException {
  String everythingInTmp = tmpFolder.getRoot().getAbsolutePath() + "/*";
  FileIO.Match matchDisallowingEmpty =
      FileIO.match()
          .filepattern(everythingInTmp)
          .withEmptyMatchTreatment(EmptyMatchTreatment.DISALLOW);
  p.apply(matchDisallowingEmpty);

  // The expectation is registered after the transform is applied and before the run.
  thrown.expectCause(isA(FileNotFoundException.class));
  p.run();
}
.apply("MatchFile(s)", FileIO.match().filepattern(options.getInputFilePattern())) .apply( "DecompressFile(s)",
// Matches `path` and reads the matches with Compression.AUTO, then applies a second match
// for `pathGZ` and reads `matchesGZ` with Compression.AUTO as well.
// NOTE(review): in the lines visible here the result of the "Match GZ" apply is not assigned,
// and `matchesGZ` is never declared — presumably this is an elided excerpt and the
// declaration lives in the omitted text; confirm against the full file.
PCollection<MatchResult.Metadata> matches = p.apply("Match", FileIO.match().filepattern(path)); PCollection<FileIO.ReadableFile> decompressedAuto = matches.apply("Read AUTO", FileIO.readMatches().withCompression(Compression.AUTO)); p.apply("Match GZ", FileIO.match().filepattern(pathGZ)); PCollection<FileIO.ReadableFile> compressionAuto = matchesGZ.apply("Read GZ AUTO", FileIO.readMatches().withCompression(Compression.AUTO));
.apply("Match File(s)", FileIO.match().filepattern(options.getInputFilePattern())) .apply( "Compress File(s)",