public static void runAvroToCsv(SampleOptions options) throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline
      .apply("Read Avro files", AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply(
          "Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data", TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
static void runTfIdf(Options options) throws Exception {
  Pipeline pipeline = Pipeline.create(options);
  pipeline.getCoderRegistry().registerCoderForClass(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run().waitUntilFinish();
}
/**
 * Translates the pipeline by passing this class as a visitor.
 *
 * @param pipeline The pipeline to be translated
 */
public void translate(Pipeline pipeline) {
  pipeline.traverseTopologically(this);
}
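// A minimal sketch of the visitor contract that translate(Pipeline) relies on:
// Pipeline.traverseTopologically(visitor) walks the transform hierarchy and
// calls back into the visitor for each node. CountingVisitor is a hypothetical
// example for illustration, not part of the translator above.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.runners.TransformHierarchy;

class CountingVisitor extends Pipeline.PipelineVisitor.Defaults {
  // Counts the primitive (leaf) transforms seen during traversal.
  int primitiveTransforms = 0;

  @Override
  public void visitPrimitiveTransform(TransformHierarchy.Node node) {
    primitiveTransforms++;
  }
}

// Usage: pipeline.traverseTopologically(new CountingVisitor());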
@Override
public void finishSpecifyingOutput(
    String transformName, PInput input, PTransform<?, ?> transform) {
  this.coderOrFailure =
      inferCoderOrFail(
          input, transform, getPipeline().getCoderRegistry(), getPipeline().getSchemaRegistry());
  super.finishSpecifyingOutput(transformName, input, transform);
}
@Test
public void testValueProviderTopic() {
  StaticValueProvider<String> provider = StaticValueProvider.of("projects/project/topics/topic");
  Read<String> pubsubRead = PubsubIO.readStrings().fromTopic(provider);
  Pipeline.create().apply(pubsubRead);

  assertThat(pubsubRead.getTopicProvider(), not(nullValue()));
  assertThat(pubsubRead.getTopicProvider().isAccessible(), is(true));
  assertThat(pubsubRead.getTopicProvider().get().asPath(), equalTo(provider.get()));
}
@Test
public void getAdditionalInputsDelegates() {
  Map<TupleTag<?>, PValue> additionalInputs =
      ImmutableMap.of(new TupleTag<>("test_tag"), Pipeline.create().apply(Create.of("1")));
  when(delegate.getAdditionalInputs()).thenReturn(additionalInputs);
  assertThat(forwarding.getAdditionalInputs(), equalTo(additionalInputs));
}
private void verifyMergingStatefulParDoRejected(PipelineOptions options) throws Exception {
  Pipeline p = Pipeline.create(options);

  p.apply(Create.of(KV.of(13, 42)))
      .apply(Window.into(Sessions.withGapDuration(Duration.millis(1))))
      .apply(
          ParDo.of(
              new DoFn<KV<Integer, Integer>, Void>() {
                @StateId("fizzle")
                private final StateSpec<ValueState<Void>> voidState = StateSpecs.value();

                @ProcessElement
                public void process() {}
              }));

  thrown.expectMessage("merging");
  thrown.expect(UnsupportedOperationException.class);
  p.run();
}
@Test
public void rootTransforms() {
  Pipeline p = Pipeline.create();
  p.apply("UnboundedRead", Read.from(CountingSource.unbounded()))
      .apply(Window.into(FixedWindows.of(Duration.millis(5L))))
      .apply(Count.perElement());
  p.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));

  Components components = PipelineTranslation.toProto(p).getComponents();
  QueryablePipeline qp = QueryablePipeline.forPrimitivesIn(components);

  assertThat(qp.getRootTransforms(), hasSize(2));
  for (PTransformNode rootTransform : qp.getRootTransforms()) {
    assertThat(
        "Root transforms should have no inputs",
        rootTransform.getTransform().getInputsCount(),
        equalTo(0));
    assertThat(
        "Only added source reads to the pipeline",
        rootTransform.getTransform().getSpec().getUrn(),
        equalTo(PTransformTranslation.READ_TRANSFORM_URN));
  }
}
@Test
public void testTranslationModeOverrideWithUnboundedSources() {
  FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  options.setRunner(FlinkRunner.class);
  options.setStreaming(false);

  FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(GenerateSequence.from(0));
  flinkEnv.translate(pipeline);

  assertThat(options.isStreaming(), is(true));
}
@Test
public void testTranslationModeNoOverrideWithoutUnboundedSources() {
  boolean[] testArgs = new boolean[] {true, false};
  for (boolean streaming : testArgs) {
    FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
    options.setRunner(FlinkRunner.class);
    options.setStreaming(streaming);

    FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
    Pipeline pipeline = Pipeline.create(options);
    flinkEnv.translate(pipeline);

    assertThat(options.isStreaming(), is(streaming));
  }
}
@Test
public void getRootTransformsSucceeds() {
  Pipeline pipeline = Pipeline.create();
  pipeline.apply("impulse", Impulse.create());
  pipeline.apply("otherImpulse", Impulse.create());
  PortableGraph graph = PortableGraph.forPipeline(PipelineTranslation.toProto(pipeline));

  assertThat(graph.getRootTransforms(), hasSize(2));
  assertThat(
      graph.getRootTransforms().stream().map(PTransformNode::getId).collect(Collectors.toSet()),
      containsInAnyOrder("impulse", "otherImpulse"));
}
@Test
public void testCheckingForSuccessSkipsNonTentativeMetrics() throws Exception {
  DataflowPipelineJob job = spy(new DataflowPipelineJob(mockClient, "test-job", options, null));
  Pipeline p = TestPipeline.create(options);
  PCollection<Integer> pc = p.apply(Create.of(1, 2, 3));
  PAssert.that(pc).containsInAnyOrder(1, 2, 3);

  when(mockClient.getJobMetrics(anyString()))
      .thenReturn(buildJobMetrics(generateMockMetrics(true /* success */, false /* tentative */)));

  TestDataflowRunner runner = TestDataflowRunner.fromOptionsAndClient(options, mockClient);
  runner.updatePAssertCount(p);
  doReturn(State.RUNNING).when(job).getState();
  assertThat(runner.checkForPAssertSuccess(job), equalTo(Optional.<Boolean>absent()));
}
private void testViewNonmerging(
    Pipeline pipeline,
    PTransform<PCollection<KV<String, Integer>>, ? extends PCollectionView<?>> view) {
  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("Unable to create a side-input view from input");
  thrown.expectCause(
      ThrowableMessageMatcher.hasMessage(Matchers.containsString("Consumed by GroupByKey")));
  pipeline
      .apply(Create.of(KV.of("hello", 5)))
      .apply(
          Window.into(
              new InvalidWindows<>(
                  "Consumed by GroupByKey", FixedWindows.of(Duration.standardHours(1)))))
      .apply(view);
}
@Test
public void testTransformTranslatorMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline p = Pipeline.create(options);

  p.apply(Create.of(Arrays.asList(1, 2, 3))).apply(new TestTransform());

  thrown.expect(IllegalStateException.class);
  thrown.expectMessage(containsString("no translator registered"));
  DataflowPipelineTranslator.fromOptions(options)
      .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList());

  ArgumentCaptor<Job> jobCaptor = ArgumentCaptor.forClass(Job.class);
  Mockito.verify(mockJobs).create(eq(PROJECT_ID), eq(REGION_ID), jobCaptor.capture());
  assertValidJob(jobCaptor.getValue());
}
@Before
public void setUp() {
  PCollection<String> pc = Pipeline.create().apply(Create.of("1"));
  view1 =
      pc.apply(Window.into(FixedWindows.of(new Duration(WINDOW_MSECS_1))))
          .apply(View.asIterable());
  view2 =
      pc.apply(Window.into(FixedWindows.of(new Duration(WINDOW_MSECS_2))))
          .apply(View.asIterable());
}
public static void main(String[] args) throws Exception {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply(
          MqttIO.read()
              .withConnectionConfiguration(
                  MqttIO.ConnectionConfiguration.create("tcp://localhost:1883", "BEAM", "BEAM"))
              .withMaxNumRecords(5))
      .apply(
          ParDo.of(
              new DoFn<byte[], String>() {
                @ProcessElement
                public void processElement(ProcessContext processContext) {
                  byte[] element = processContext.element();
                  processContext.output(new String(element));
                }
              }))
      .apply(Window.<String>into(FixedWindows.of(Duration.standardSeconds(30))))
      .apply(
          TextIO.write()
              .to("hdfs://localhost/uc2")
              .withWindowedWrites()
              .withNumShards(1));
  pipeline.run();
}
@Test
public void testExpandWithDuplicates() {
  Pipeline p = TestPipeline.create();
  PCollection<Long> createOne = p.apply("CreateOne", Create.of(1L, 2L, 3L));

  PCollectionList<Long> list = PCollectionList.of(createOne).and(createOne).and(createOne);
  assertThat(list.expand().values(), containsInAnyOrder(createOne, createOne, createOne));
}
@Test
public void shouldRecognizeAndTranslateStreamingPipeline() {
  FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  options.setRunner(TestFlinkRunner.class);
  options.setFlinkMaster("[auto]");

  FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
  Pipeline pipeline = Pipeline.create();

  pipeline
      .apply(GenerateSequence.from(0).withRate(1, Duration.standardSeconds(1)))
      .apply(
          ParDo.of(
              new DoFn<Long, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                  c.output(Long.toString(c.element()));
                }
              }))
      .apply(Window.into(FixedWindows.of(Duration.standardHours(1))))
      .apply(TextIO.write().withNumShards(1).withWindowedWrites().to("/dummy/path"));

  flinkEnv.translate(pipeline);

  // No exception should be thrown.
}
@Test
public void shouldUseDefaultTempLocationIfNoneSet() {
  FlinkPipelineOptions options = PipelineOptionsFactory.as(FlinkPipelineOptions.class);
  options.setRunner(TestFlinkRunner.class);
  options.setFlinkMaster("clusterAddress");

  FlinkPipelineExecutionEnvironment flinkEnv = new FlinkPipelineExecutionEnvironment(options);
  Pipeline pipeline = Pipeline.create(options);
  flinkEnv.translate(pipeline);

  String defaultTmpDir = System.getProperty("java.io.tmpdir");
  assertThat(options.getFilesToStage(), hasItem(startsWith(defaultTmpDir)));
}