/**
 * Polls the filepattern repeatedly at {@code pollInterval}, emitting newly matching files, until
 * {@code terminationCondition} fires. The resulting {@link PCollection} is unbounded.
 *
 * <p>This works only in runners supporting {@link Kind#SPLITTABLE_DO_FN}.
 */
@Experimental(Kind.SPLITTABLE_DO_FN)
public Read<T> watchForNewFiles(
    Duration pollInterval, TerminationCondition<String, ?> terminationCondition) {
  // Delegate to the match configuration, which carries the watch settings.
  MatchConfiguration continuousConfig =
      getMatchConfiguration().continuously(pollInterval, terminationCondition);
  return withMatchConfiguration(continuousConfig);
}
/**
 * Matches a filepattern using {@link FileSystems#match} and produces a collection of matched
 * resources (both files and directories) as {@link MatchResult.Metadata}.
 *
 * <p>By default the filepattern is matched once, yielding a bounded {@link PCollection}; use
 * {@link MatchAll#continuously(Duration, TerminationCondition)} to keep watching it for new
 * matches, which yields an unbounded {@link PCollection}.
 *
 * <p>A filepattern matching nothing is treated as {@link EmptyMatchTreatment#DISALLOW} unless
 * overridden via {@link Match#withEmptyMatchTreatment}.
 *
 * <p>Returned {@link MatchResult.Metadata} are deduplicated by filename: if the same file is
 * observed repeatedly with different metadata (e.g. because it is growing), only the first
 * observation is emitted and later changes to that file are ignored.
 */
public static Match match() {
  MatchConfiguration defaultConfiguration =
      MatchConfiguration.create(EmptyMatchTreatment.DISALLOW);
  return new AutoValue_FileIO_Match.Builder().setConfiguration(defaultConfiguration).build();
}
@Override public PCollection<T> expand(PBegin input) { checkNotNull(getFilepattern(), "filepattern"); checkNotNull(getSchema(), "schema"); if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) { return input.apply( "Read", org.apache.beam.sdk.io.Read.from( createSource( getFilepattern(), getMatchConfiguration().getEmptyMatchTreatment(), getRecordClass(), getSchema()))); } // All other cases go through ReadAll. ReadAll<T> readAll = (getRecordClass() == GenericRecord.class) ? (ReadAll<T>) readAllGenericRecords(getSchema()) : readAll(getRecordClass()); readAll = readAll.withMatchConfiguration(getMatchConfiguration()); return input .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of())) .apply("Via ReadAll", readAll); }
/**
 * Matches each incoming filepattern, either once via {@code MatchFn} or continuously via
 * {@link Watch} when a watch interval is configured, then reshuffles the results.
 */
@Override
public PCollection<MatchResult.Metadata> expand(PCollection<String> input) {
  PCollection<MatchResult.Metadata> matches;
  if (getConfiguration().getWatchInterval() != null) {
    matches =
        input
            .apply(
                "Continuously match filepatterns",
                Watch.growthOf(
                        Contextful.of(new MatchPollFn(), Requirements.empty()),
                        new ExtractFilenameFn())
                    .withPollInterval(getConfiguration().getWatchInterval())
                    .withTerminationPerInput(getConfiguration().getWatchTerminationCondition()))
            .apply(Values.create());
  } else {
    matches =
        input.apply(
            "Match filepatterns",
            ParDo.of(new MatchFn(getConfiguration().getEmptyMatchTreatment())));
  }
  // Redistribute matched metadata across workers before downstream processing.
  return matches.apply(Reshuffle.viaRandomKey());
}
/**
 * A {@link PTransform} that works like {@link #read}, but reads each file in a {@link
 * PCollection} of filepatterns.
 *
 * <p>Applies to both bounded and unbounded {@link PCollection PCollections}, so it suits a
 * stream of arriving filepatterns. Note that each filepattern is expanded exactly once when it
 * is processed — it is not watched for later matches — and each file is read once rather than
 * watched for new entries.
 */
public static ReadAll readAll() {
  MatchConfiguration defaultMatching =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_TextIO_ReadAll.Builder()
      .setMatchConfiguration(defaultMatching)
      .setCompression(Compression.AUTO)
      .build();
}
@Override public PCollection<String> expand(PBegin input) { checkNotNull(getFilepattern(), "need to set the filepattern of a TextIO.Read transform"); if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) { return input.apply("Read", org.apache.beam.sdk.io.Read.from(getSource())); } // All other cases go through ReadAll. return input .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of())) .apply( "Via ReadAll", readAll() .withCompression(getCompression()) .withMatchConfiguration(getMatchConfiguration()) .withDelimiter(getDelimiter())); }
@Override public PCollection<T> expand(PBegin input) { checkNotNull(getFilepattern(), "filepattern"); Coder<T> coder = inferCoder(getCoder(), getParseFn(), input.getPipeline().getCoderRegistry()); if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) { return input.apply( org.apache.beam.sdk.io.Read.from( AvroSource.from(getFilepattern()).withParseFn(getParseFn(), coder))); } // All other cases go through ParseAllGenericRecords. return input .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of())) .apply( "Via ParseAll", parseAllGenericRecords(getParseFn()) .withCoder(coder) .withMatchConfiguration(getMatchConfiguration())); }
/** Like {@link Read#withEmptyMatchTreatment}. */
public Parse<T> withEmptyMatchTreatment(EmptyMatchTreatment treatment) {
  // Thread the treatment through the shared match configuration.
  MatchConfiguration updated = getMatchConfiguration().withEmptyMatchTreatment(treatment);
  return withMatchConfiguration(updated);
}
/** Configures whether or not a filepattern matching no files is allowed. */
public Read<T> withEmptyMatchTreatment(EmptyMatchTreatment treatment) {
  // Thread the treatment through the shared match configuration.
  MatchConfiguration updated = getMatchConfiguration().withEmptyMatchTreatment(treatment);
  return withMatchConfiguration(updated);
}
/** Like {@link Match#withEmptyMatchTreatment}. */
public MatchAll withEmptyMatchTreatment(EmptyMatchTreatment treatment) {
  // Thread the treatment through the shared match configuration.
  MatchConfiguration updated = getConfiguration().withEmptyMatchTreatment(treatment);
  return withConfiguration(updated);
}
/** See {@link MatchConfiguration#withEmptyMatchTreatment(EmptyMatchTreatment)}. */
public Match withEmptyMatchTreatment(EmptyMatchTreatment treatment) {
  // Thread the treatment through the shared match configuration.
  MatchConfiguration updated = getConfiguration().withEmptyMatchTreatment(treatment);
  return withConfiguration(updated);
}
/** Same as {@link Read#withEmptyMatchTreatment}. */
public ReadAll withEmptyMatchTreatment(EmptyMatchTreatment treatment) {
  // Thread the treatment through the shared match configuration.
  MatchConfiguration updated = getMatchConfiguration().withEmptyMatchTreatment(treatment);
  return withMatchConfiguration(updated);
}
/** See {@link MatchConfiguration#withEmptyMatchTreatment}. */
public Read withEmptyMatchTreatment(EmptyMatchTreatment treatment) {
  // Thread the treatment through the shared match configuration.
  MatchConfiguration updated = getMatchConfiguration().withEmptyMatchTreatment(treatment);
  return withMatchConfiguration(updated);
}
/**
 * Like {@link #match}, but matches each filepattern in a collection of filepatterns.
 *
 * <p>Resources are not deduplicated between filepatterns: a resource matched by several
 * filepatterns is produced once per matching filepattern.
 *
 * <p>A filepattern matching nothing is treated as {@link EmptyMatchTreatment#ALLOW_IF_WILDCARD}
 * unless overridden via {@link MatchAll#withEmptyMatchTreatment}.
 */
public static MatchAll matchAll() {
  MatchConfiguration defaultConfiguration =
      MatchConfiguration.create(EmptyMatchTreatment.ALLOW_IF_WILDCARD);
  return new AutoValue_FileIO_MatchAll.Builder().setConfiguration(defaultConfiguration).build();
}
/** Reads Avro file(s) containing records of the specified schema. */
public static Read<GenericRecord> readGenericRecords(Schema schema) {
  // Builder setters are order-independent; group record-related settings first.
  return new AutoValue_AvroIO_Read.Builder<GenericRecord>()
      .setRecordClass(GenericRecord.class)
      .setSchema(schema)
      .setMatchConfiguration(MatchConfiguration.create(EmptyMatchTreatment.DISALLOW))
      .setHintMatchesManyFiles(false)
      .build();
}
/**
 * See {@link MatchConfiguration#continuously}.
 *
 * <p>This works only in runners supporting {@link Kind#SPLITTABLE_DO_FN}.
 */
@Experimental(Kind.SPLITTABLE_DO_FN)
public Read watchForNewFiles(
    Duration pollInterval, TerminationCondition<String, ?> terminationCondition) {
  // Delegate to the match configuration, which carries the watch settings.
  MatchConfiguration continuousConfig =
      getMatchConfiguration().continuously(pollInterval, terminationCondition);
  return withMatchConfiguration(continuousConfig);
}
/** Like {@link Match#continuously}. */
@Experimental(Experimental.Kind.SPLITTABLE_DO_FN)
public MatchAll continuously(
    Duration pollInterval, TerminationCondition<String, ?> terminationCondition) {
  // Delegate to the match configuration, which carries the watch settings.
  MatchConfiguration continuousConfig =
      getConfiguration().continuously(pollInterval, terminationCondition);
  return withConfiguration(continuousConfig);
}
/**
 * Continuously watches for new files at the given interval until the given termination
 * condition is reached, where the input to the condition is the filepattern.
 */
public MatchConfiguration continuously(
    Duration interval, TerminationCondition<String, ?> condition) {
  Builder updated = toBuilder();
  updated.setWatchInterval(interval);
  updated.setWatchTerminationCondition(condition);
  return updated.build();
}
/** Like {@link Read#watchForNewFiles}. */
@Experimental(Kind.SPLITTABLE_DO_FN)
public ParseAll<T> watchForNewFiles(
    Duration pollInterval, TerminationCondition<String, ?> terminationCondition) {
  // Delegate to the match configuration, which carries the watch settings.
  MatchConfiguration continuousConfig =
      getMatchConfiguration().continuously(pollInterval, terminationCondition);
  return withMatchConfiguration(continuousConfig);
}
/** Like {@link Read#watchForNewFiles}. */
@Experimental(Kind.SPLITTABLE_DO_FN)
public ReadAll<T> watchForNewFiles(
    Duration pollInterval, TerminationCondition<String, ?> terminationCondition) {
  // Delegate to the match configuration, which carries the watch settings.
  MatchConfiguration continuousConfig =
      getMatchConfiguration().continuously(pollInterval, terminationCondition);
  return withMatchConfiguration(continuousConfig);
}