/**
 * Creates a {@code Read} transform that will read from an {@code HDFSFileSource} with the given
 * file name or pattern ("glob") using the given Hadoop
 * {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}, with key-value types specified
 * by the given key class and value class.
 */
public static <K, V, T extends FileInputFormat<K, V>> Read.Bounded<KV<K, V>> readFrom(
    String filepattern, Class<T> formatClass, Class<K> keyClass, Class<V> valueClass) {
  return Read.from(from(filepattern, formatClass, keyClass, valueClass));
}
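A hedged usage sketch of the factory above (the path is illustrative, not from the original): reading newline-delimited text through Hadoop's TextInputFormat yields LongWritable/Text pairs.

// Illustrative only: any FileInputFormat with matching key/value classes works here.
PCollection<KV<LongWritable, Text>> lines =
    pipeline.apply(
        HDFSFileSource.readFrom(
            "hdfs://namenode/path/to/data/*", // hypothetical path
            TextInputFormat.class,
            LongWritable.class,
            Text.class));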
@Override
public PCollection<T> expand(PBegin input) {
  getConfiguration().validate();
  return input.apply(org.apache.beam.sdk.io.Read.from(createSource()));
}
@Override
public PCollection<String> expand(PBegin input) {
  ConnectionConfiguration connectionConfiguration = getConnectionConfiguration();
  checkState(connectionConfiguration != null, "withConnectionConfiguration() is required");
  return input.apply(
      org.apache.beam.sdk.io.Read.from(new BoundedElasticsearchSource(this, null, null, null)));
}
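A hedged usage sketch (endpoint, index, and type names are illustrative): the expand() above only succeeds once a connection configuration has been supplied, e.g.

// Illustrative values only.
PCollection<String> documents =
    pipeline.apply(
        ElasticsearchIO.read()
            .withConnectionConfiguration(
                ElasticsearchIO.ConnectionConfiguration.create(
                    new String[] {"http://localhost:9200"}, "my-index", "my-type")));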
@Override
public PCollection<HCatRecord> expand(PBegin input) {
  checkArgument(getTable() != null, "withTable() is required");
  checkArgument(getConfigProperties() != null, "withConfigProperties() is required");
  return input.apply(org.apache.beam.sdk.io.Read.from(new BoundedHCatalogSource(this)));
}
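Similarly, a hedged sketch of satisfying the two checkArgument preconditions above (the metastore URI and table name are illustrative):

// Illustrative values only.
Map<String, String> configProperties =
    ImmutableMap.of("hive.metastore.uris", "thrift://metastore-host:9083");
PCollection<HCatRecord> records =
    pipeline.apply(
        HCatalogIO.read()
            .withConfigProperties(configProperties)
            .withTable("my_table"));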
@Test
public void failsWhenCustomUnboundedSourceIsNotSerializable() {
  thrown.expect(IllegalArgumentException.class);
  Read.from(new NotSerializableUnboundedSource());
}
@Test
public void failsWhenCustomBoundedSourceIsNotSerializable() {
  thrown.expect(IllegalArgumentException.class);
  Read.from(new NotSerializableBoundedSource());
}
@Override
public PCollection<Row> expand(PBegin input) {
  getBigtableConfig().validate();
  BigtableSource source =
      new BigtableSource(getBigtableConfig(), getRowFilter(), getKeyRanges(), null);
  return input.getPipeline().apply(org.apache.beam.sdk.io.Read.from(source));
}
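A hedged sketch of building the transform whose expand() appears above (project, instance, and table IDs are illustrative):

// Illustrative values only; Row here is com.google.bigtable.v2.Row.
PCollection<Row> rows =
    pipeline.apply(
        BigtableIO.read()
            .withProjectId("my-project")
            .withInstanceId("my-instance")
            .withTableId("my-table"));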
@Before
public void setup() {
  source = CountingSource.unboundedWithTimestampFn(new LongToInstantFn());
  longs = p.apply(Read.from(source));
  options = PipelineOptionsFactory.create();
  context = mock(EvaluationContext.class);
  factory = new UnboundedReadEvaluatorFactory(context, options);
  output = bundleFactory.createBundle(longs);
  graph = DirectGraphs.getGraph(p);
  when(context.createBundle(longs)).thenReturn(output);
}
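LongToInstantFn is referenced above but not shown; a plausible sketch, assuming it simply interprets each emitted long as an epoch-millisecond timestamp (an assumption, not the original definition):

// Assumed implementation: treat the counter value as epoch milliseconds.
private static class LongToInstantFn implements SerializableFunction<Long, Instant> {
  @Override
  public Instant apply(Long input) {
    return new Instant(input);
  }
}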
@Before
public void setup() {
  MockitoAnnotations.initMocks(this);
  source = CountingSource.upTo(10L);
  longs = p.apply(Read.from(source));
  options = PipelineOptionsFactory.create();
  factory =
      new BoundedReadEvaluatorFactory(
          context, options, Long.MAX_VALUE /* minimum size for dynamic splits */);
  bundleFactory = ImmutableListBundleFactory.create();
  longsProducer = DirectGraphs.getProducer(longs);
}
@Test
public void matcherProducesUnconsumedValueUnboundedRead() {
  Unbounded<Long> transform = Read.from(CountingSource.unbounded());
  pipeline.apply(transform);
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  validateConsumed();
}
@Test
public void matcherProducesUnconsumedValueBoundedRead() {
  Bounded<Long> transform = Read.from(CountingSource.upTo(20L));
  pipeline.apply(transform);
  UnconsumedReads.ensureAllReadsConsumed(pipeline);
  validateConsumed();
}
@Test
public void testForwardsDisplayData() {
  TestCountingSource src =
      new TestCountingSource(1234) {
        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
          builder.add(DisplayData.item("foo", "bar"));
        }
      };
  BoundedReadFromUnboundedSource<KV<Integer, Integer>> read =
      Read.from(src).withMaxNumRecords(5);
  assertThat(DisplayData.from(read), includesDisplayDataFor("source", src));
}
@Override
public PCollection<IndexedRecord> read(PBegin in) {
  LazyAvroCoder<IndexedRecord> lac = LazyAvroCoder.of();
  ExcelHdfsFileSource source =
      ExcelHdfsFileSource.of(
          doAs, path, lac, limit, encoding, sheetName, header, footer, excelFormat.name());
  source.getExtraHadoopConfiguration().addFrom(getExtraHadoopConfiguration());
  source.setLimit(limit);
  PCollection<KV<Void, IndexedRecord>> pc1 =
      in.apply(Read.from(source)).setCoder(source.getDefaultOutputCoder());
  PCollection<IndexedRecord> pc2 = pc1.apply(Values.<IndexedRecord>create());
  return pc2;
}
public static void main(String[] args) {
  Pipeline p = initializePipeline(args);
  KafkaOptions options = getOptions(p);
  FlinkKafkaConsumer08<MyType> kafkaConsumer =
      new FlinkKafkaConsumer08<>(
          options.getKafkaAvroTopic(),
          new AvroSerializationDeserializationSchema<>(MyType.class),
          getKafkaProps(options));
  p.apply(Read.from(UnboundedFlinkSource.of(kafkaConsumer)))
      .setCoder(AvroCoder.of(MyType.class))
      .apply(ParDo.of(new PrintFn<>()));
  p.run();
}
@Test
@Category(NeedsRunner.class)
public void testBoundedSource() {
  long numElements = 1000;
  PCollection<Long> input = p.apply(Read.from(CountingSource.upTo(numElements)));
  addCountingAsserts(input, numElements);
  p.run();
}
@Test
@Category(NeedsRunner.class)
public void testDataflowFile() throws IOException {
  List<String> data = createStringDataset(3, 50);
  String fileName = "file";
  File file = createFileWithData(fileName, data);
  TestFileBasedSource source = new TestFileBasedSource(file.getPath(), 64, null);
  PCollection<String> output = p.apply("ReadFileData", Read.from(source));
  PAssert.that(output).containsInAnyOrder(data);
  p.run();
}
@Test
@Category(NeedsRunner.class)
public void testEmptyBoundedSource() {
  PCollection<Long> input = p.apply(Read.from(CountingSource.upTo(0)));
  PAssert.that(input).empty();
  p.run();
}
@Test
@Category(NeedsRunner.class)
public void testUnboundedSource() {
  long numElements = 1000;
  PCollection<Long> input =
      p.apply(Read.from(CountingSource.unbounded()).withMaxNumRecords(numElements));
  addCountingAsserts(input, numElements);
  p.run();
}
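As an aside, withMaxNumRecords is not the only way to bound an unbounded read; a hedged variant using withMaxReadTime (the duration is illustrative):

// Bounds the same unbounded source by elapsed wall time instead of record count.
PCollection<Long> timeBounded =
    p.apply(Read.from(CountingSource.unbounded())
        .withMaxReadTime(Duration.standardSeconds(10)));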
@Test
public void splitsInputs() {
  Pipeline p = getPipeline();
  PCollection<Long> longs = p.apply(Read.from(MustSplitSource.of(CountingSource.upTo(3))));
  PAssert.that(longs).containsInAnyOrder(0L, 1L, 2L);
  p.run();
}
@Test
public void retainOnlyPrimitivesWithOnlyPrimitivesUnchanged() {
  Pipeline p = Pipeline.create();
  p.apply("Read", Read.from(CountingSource.unbounded()))
      .apply(
          "multi-do",
          ParDo.of(new TestFn()).withOutputTags(new TupleTag<>(), TupleTagList.empty()));
  Components originalComponents = PipelineTranslation.toProto(p).getComponents();
  Collection<String> primitiveComponents =
      QueryablePipeline.getPrimitiveTransformIds(originalComponents);
  assertThat(primitiveComponents, equalTo(originalComponents.getTransformsMap().keySet()));
}