@Override
protected Source.Reader<OutputT> createReader(PipelineOptions options) throws IOException {
  return source.createReader(options, null);
}
@SuppressWarnings("unchecked") public UnboundedSourceWrapper( String stepName, PipelineOptions pipelineOptions, UnboundedSource<OutputT, CheckpointMarkT> source, int parallelism) throws Exception { this.stepName = stepName; this.serializedOptions = new SerializablePipelineOptions(pipelineOptions); if (source.requiresDeduping()) { LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source); } Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder(); if (checkpointMarkCoder == null) { LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots."); checkpointCoder = null; } else { Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder = (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() { }); checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder); } // get the splits early. we assume that the generated splits are stable, // this is necessary so that the mapping of state to source is correct // when restoring splitSources = source.split(parallelism, pipelineOptions); }
@Override
public final PCollection<T> expand(PBegin input) {
  source.validate();

  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(),
      WindowingStrategy.globalDefault(),
      IsBounded.UNBOUNDED,
      source.getOutputCoder());
}
private UnboundedReader<OutputT> getReader(UnboundedSourceShard<OutputT, CheckpointMarkT> shard)
    throws IOException {
  UnboundedReader<OutputT> existing = shard.getExistingReader();
  if (existing == null) {
    CheckpointMarkT checkpoint = shard.getCheckpoint();
    if (checkpoint != null) {
      checkpoint = CoderUtils.clone(shard.getSource().getCheckpointMarkCoder(), checkpoint);
    }
    return shard.getSource().createReader(options, checkpoint);
  } else {
    return existing;
  }
}
// Reconstructed from a truncated snippet: the assignments to `source` and `mark`, and the
// receiver of the second builder chain, were elided in the original.
UnboundedSource<KafkaRecord<Integer, Long>, KafkaCheckpointMark> source =
    mkKafkaReadTransform(initialNumElements, new ValueAsTimestampFn())
        .makeSource()
        .split(1, PipelineOptionsFactory.create())
        .get(0);

UnboundedReader<KafkaRecord<Integer, Long>> reader = source.createReader(null, null);

// Clone the checkpoint mark through its coder, as a runner would when persisting it.
KafkaCheckpointMark mark =
    CoderUtils.clone(
        source.getCheckpointMarkCoder(), (KafkaCheckpointMark) reader.getCheckpointMark());

// Re-create the source and resume reading from the checkpoint mark.
source =
    mkKafkaReadTransform(initialNumElements, new ValueAsTimestampFn())
        .withTimestampFn(new ValueAsTimestampFn())
        .makeSource()
        .split(1, PipelineOptionsFactory.create())
        .get(0);

reader = source.createReader(null, mark);
@Override
public List<Readable<Object>> getReadables(final int desiredNumOfSplits) throws Exception {
  final List<Readable<Object>> readables = new ArrayList<>();
  source
      .split(desiredNumOfSplits, null)
      .forEach(unboundedSource -> readables.add(new UnboundedSourceReadable<>(unboundedSource)));
  return readables;
}
@Override
public Collection<CommittedBundle<UnboundedSourceShard<T, ?>>> getInitialInputs(
    AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> transform,
    int targetParallelism)
    throws Exception {
  UnboundedSource<T, ?> source = ReadTranslation.unboundedSourceFromTransform(transform);
  List<? extends UnboundedSource<T, ?>> splits = source.split(targetParallelism, options);
  UnboundedReadDeduplicator deduplicator =
      source.requiresDeduping()
          ? UnboundedReadDeduplicator.CachedIdDeduplicator.create()
          : NeverDeduplicator.create();
  ImmutableList.Builder<CommittedBundle<UnboundedSourceShard<T, ?>>> initialShards =
      ImmutableList.builder();
  for (UnboundedSource<T, ?> split : splits) {
    UnboundedSourceShard<T, ?> shard = UnboundedSourceShard.unstarted(split, deduplicator);
    initialShards.add(
        evaluationContext
            .<UnboundedSourceShard<T, ?>>createRootBundle()
            .add(WindowedValue.valueInGlobalWindow(shard))
            .commit(BoundedWindow.TIMESTAMP_MAX_VALUE));
  }
  return initialShards.build();
}
/**
 * Returns a unique identifier for the current record. This should be the same for each instance
 * of the same logical record read from the underlying data source.
 *
 * <p>It is only necessary to override this if {@link #requiresDeduping} has been overridden to
 * return true.
 *
 * <p>For example, this could be a hash of the record contents, or a logical ID present in the
 * record. If this is generated as a hash of the record contents, it should be at least 16 bytes
 * (128 bits) to avoid collisions.
 *
 * <p>This method has the same restrictions on when it can be called as {@link #getCurrent} and
 * {@link #getCurrentTimestamp}.
 *
 * @throws NoSuchElementException if the reader is at the beginning of the input and {@link
 *     #start} or {@link #advance} wasn't called, or if the last {@link #start} or {@link
 *     #advance} returned {@code false}.
 */
public byte[] getCurrentRecordId() throws NoSuchElementException {
  if (getCurrentSource().requiresDeduping()) {
    throw new IllegalStateException(
        "getCurrentRecordId() must be overridden if requiresDeduping returns true");
  }
  return EMPTY;
}
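For illustration, a minimal sketch of how a reader might satisfy this contract by hashing the record contents. The HashingReaderSketch class and its currentPayload field are hypothetical stand-ins for the reader's current record, not part of the API above:

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Hypothetical sketch, not a real Beam reader: shows one way to derive a stable,
// collision-resistant record id from the record contents.
class HashingReaderSketch {
  private final byte[] currentPayload; // stands in for the reader's current record

  HashingReaderSketch(byte[] currentPayload) {
    this.currentPayload = currentPayload;
  }

  // Identical payloads yield identical 32-byte ids, comfortably above the
  // recommended 16-byte (128-bit) minimum for hash-based ids.
  public byte[] getCurrentRecordId() {
    try {
      return MessageDigest.getInstance("SHA-256").digest(currentPayload);
    } catch (NoSuchAlgorithmException e) {
      throw new IllegalStateException(e); // SHA-256 is mandatory on every JRE
    }
  }
}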
public Coder<CheckpointMarkT> getCheckpointMarkCoder() {
  return source.getCheckpointMarkCoder();
}
@Override
public final PCollection<T> expand(PBegin input) {
  source.validate();

  if (source.requiresDeduping()) {
    return Pipeline.applyTransform(input, new ReadWithIds<>(source)).apply(new Deduplicate<>());
  } else {
    return Pipeline.applyTransform(input, new ReadWithIds<>(source))
        .apply("StripIds", ParDo.of(new ValueWithRecordId.StripIdsDoFn<>()));
  }
}
@Override
public PCollection<T> expand(PBegin input) {
  Coder<Shard<T>> shardCoder = SerializableCoder.of((Class<Shard<T>>) (Class) Shard.class);
  PCollection<ValueWithRecordId<T>> read =
      input
          .apply(
              "Create",
              Create.of(
                      new AutoValue_BoundedReadFromUnboundedSource_Shard.Builder<T>()
                          .setSource(source)
                          .setMaxNumRecords(maxNumRecords)
                          .setMaxReadTime(maxReadTime)
                          .build())
                  .withCoder(shardCoder))
          .apply("Split", ParDo.of(new SplitFn<>()))
          .setCoder(shardCoder)
          .apply("Reshuffle", Reshuffle.viaRandomKey())
          .apply("Read", ParDo.of(new ReadFn<>()))
          .setCoder(ValueWithRecordId.ValueWithRecordIdCoder.of(source.getOutputCoder()));
  if (source.requiresDeduping()) {
    read =
        read.apply(
            Distinct.<ValueWithRecordId<T>, byte[]>withRepresentativeValueFn(
                    ValueWithRecordId::getId)
                .withRepresentativeType(TypeDescriptor.of(byte[].class)));
  }
  return read.apply("StripIds", ParDo.of(new ValueWithRecordId.StripIdsDoFn<>()))
      .setCoder(source.getOutputCoder());
}
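As a point of comparison, a minimal sketch of the same Distinct-by-representative-value idiom on a hypothetical PCollection<String> named `words` (the collection and the UTF-8 key choice are assumptions for illustration; requires java.nio.charset.StandardCharsets):

// Assumed: `words` is an existing PCollection<String>. Two strings with equal
// UTF-8 bytes are treated as duplicates, mirroring the dedup-by-record-id above.
PCollection<String> deduped =
    words.apply(
        Distinct.<String, byte[]>withRepresentativeValueFn(
                (String s) -> s.getBytes(StandardCharsets.UTF_8))
            .withRepresentativeType(TypeDescriptor.of(byte[].class)));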
@Override
public Coder<T> getOutputCoder() {
  return source.getOutputCoder();
}

@Override
public void validate() {
  source.validate();
}
// Reconstructed from a truncated snippet: the assignments to `source` and `mark`, and the
// loop that reads past the skipped records, were elided in the original.
UnboundedSource<KafkaRecord<Integer, Long>, KafkaCheckpointMark> source =
    mkKafkaReadTransform(numElements, new ValueAsTimestampFn())
        .makeSource()
        .split(1, PipelineOptionsFactory.create())
        .get(0);

UnboundedReader<KafkaRecord<Integer, Long>> reader = source.createReader(null, null);
final int numToSkip = 20; // one from each partition.

// Reconstructed: consume the first numToSkip records before checkpointing
// (start() reads the first record, each advance() one more).
reader.start();
for (int i = 0; i < numToSkip - 1; ++i) {
  reader.advance();
}

// Clone the checkpoint mark through its coder, as a runner would when persisting it.
KafkaCheckpointMark mark =
    CoderUtils.clone(
        source.getCheckpointMarkCoder(), (KafkaCheckpointMark) reader.getCheckpointMark());

// Resume reading from the checkpoint mark.
reader = source.createReader(null, mark);
@Test
public void testUnboundedSourceCheckpointMark() throws Exception {
  UnboundedSource<Long, CounterMark> source =
      CountingSource.unboundedWithTimestampFn(new ValueAsTimestampFn());

  UnboundedReader<Long> reader = source.createReader(null, null);
  final long numToSkip = 3;
  assertTrue(reader.start());

  // Advance the source numToSkip elements and manually save state.
  for (long l = 0; l < numToSkip; ++l) {
    reader.advance();
  }

  // Confirm that we get the expected element in sequence before checkpointing.
  assertEquals(numToSkip, (long) reader.getCurrent());
  assertEquals(numToSkip, reader.getCurrentTimestamp().getMillis());

  // Checkpoint and restart, and confirm that the source continues correctly.
  CounterMark mark =
      CoderUtils.clone(source.getCheckpointMarkCoder(), (CounterMark) reader.getCheckpointMark());
  reader = source.createReader(null, mark);
  assertTrue(reader.start());

  // Confirm that we get the next element in sequence.
  assertEquals(numToSkip + 1, (long) reader.getCurrent());
  assertEquals(numToSkip + 1, reader.getCurrentTimestamp().getMillis());
}
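Cloning the mark through getCheckpointMarkCoder() rather than handing reader.getCheckpointMark() straight back to createReader is deliberate: the clone round-trips the mark through encode/decode, so the test also verifies that the mark survives serialization the way it must when a runner persists a checkpoint.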
List<? extends Source<T>> split(final PipelineOptions options) throws Exception {
  final List<MicrobatchSource<T, CheckpointMarkT>> result = new ArrayList<>();
  final List<? extends UnboundedSource<T, CheckpointMarkT>> splits =
      source.split(numInitialSplits, options);
  final int numSplits = splits.size();
  final long[] numRecords = splitNumRecords(maxNumRecords, numSplits);
  for (int i = 0; i < numSplits; i++) {
    // Splits must be stable and cannot change between consecutive executions.
    // For example, Kafka should not add partitions if more than one topic is read.
    result.add(
        new MicrobatchSource<>(
            splits.get(i), maxReadTime, 1, numRecords[i], i, sourceId, readerCacheInterval));
  }
  return result;
}
        .returns(withIdTypeInfo);

if (rawSource.requiresDeduping()) {
  source = nonDedupSource
@Override
public final PCollection<ValueWithRecordId<T>> expand(PInput input) {
  return PCollection.createPrimitiveOutputInternal(
      input.getPipeline(),
      WindowingStrategy.globalDefault(),
      IsBounded.UNBOUNDED,
      ValueWithRecordId.ValueWithRecordIdCoder.of(source.getOutputCoder()));
}
@SuppressWarnings("unchecked") public UnboundedSourceWrapper( String stepName, PipelineOptions pipelineOptions, UnboundedSource<OutputT, CheckpointMarkT> source, int parallelism) throws Exception { this.stepName = stepName; this.serializedOptions = new SerializablePipelineOptions(pipelineOptions); if (source.requiresDeduping()) { LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source); } Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder(); if (checkpointMarkCoder == null) { LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots."); checkpointCoder = null; } else { Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder = (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {}); checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder); } // get the splits early. we assume that the generated splits are stable, // this is necessary so that the mapping of state to source is correct // when restoring splitSources = source.split(parallelism, pipelineOptions); }
@Override
public void prepare() {
  try {
    reader = unboundedSource.createReader(null, null);
  } catch (final Exception e) {
    throw new RuntimeException(e);
  }
}