public static void runCsvToAvro(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert CSV to Avro pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile())) .apply("Convert CSV to Avro formatted data", ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter()))) .setCoder(AvroCoder.of(GenericRecord.class, schema)) .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson) .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
// Read the suspicious user ids from the given text file, one id per line.
PCollection<String> suspiciousUserIds = p.apply(TextIO.read().from(usersIdFile));
/** Reads text files matching {@code filePattern} and converts each line into a {@link Row}. */
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  PCollection<String> lines = begin.apply("ReadTextFiles", TextIO.read().from(filePattern));
  return lines.apply("StringToRow", readConverter);
}
/** Reads the input text file and extracts a timestamp from every line. */
@Override
public PCollection<String> expand(PBegin begin) {
  PCollection<String> lines = begin.apply(TextIO.read().from(inputFile));
  return lines.apply(ParDo.of(new ExtractTimestamps()));
}
}
/** Verifies that the file pattern appears in TextIO.Read's primitive display data. */
@Test
public void testSourceTransform() {
  PTransform<? super PBegin, ? extends POutput> read = TextIO.read().from("foo.*");
  DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();

  Set<DisplayData> data = evaluator.displayDataForPrimitiveSourceTransforms(read);

  assertThat(data, hasItem(hasDisplayItem("filePattern", "foo.*")));
}
}
/** Checks both the default and an explicitly supplied name for a TextIO.Read application. */
@Test
public void testReadNamed() throws Exception {
  File emptyFile = tempFolder.newFile();
  p.enableAbandonedNodeEnforcement(false);

  // Default name when none is supplied.
  assertEquals("TextIO.Read/Read.out", p.apply(TextIO.read().from("somefile")).getName());

  // An explicit name replaces the default prefix.
  assertEquals(
      "MyRead/Read.out", p.apply("MyRead", TextIO.read().from(emptyFile.getPath())).getName());
}
/**
 * Ensures that a runtime-provided input option can be handed to TextIO without its value being
 * accessed at pipeline-construction time.
 */
@Test
public void testRuntimeOptionsNotCalledInApply() throws Exception {
  p.enableAbandonedNodeEnforcement(false);

  RuntimeTestOptions runtimeOptions = PipelineOptionsFactory.as(RuntimeTestOptions.class);
  p.apply(TextIO.read().from(runtimeOptions.getInput()));
}
@Test public void testInaccessibleProvider() throws Exception { DataflowPipelineOptions options = buildPipelineOptions(); Pipeline pipeline = Pipeline.create(options); DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options); pipeline.apply(TextIO.read().from(new TestValueProvider())); // Check that translation does not fail. t.translate(pipeline, DataflowRunner.fromOptions(options), Collections.emptyList()); }
/** The file prefix must be exposed through TextIO.Read's primitive display data. */
@Test
@Category(ValidatesRunner.class)
public void testPrimitiveReadDisplayData() {
  DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
  TextIO.Read read = TextIO.read().from("foobar");

  Set<DisplayData> data = evaluator.displayDataForPrimitiveSourceTransforms(read);

  assertThat(
      "TextIO.Read should include the file prefix in its primitive display data",
      data,
      hasItem(hasDisplayItem(hasValue(startsWith("foobar")))));
}
/** Builds and runs the top-sessions pipeline over the Wikipedia input, blocking until done. */
public static void run(Options options) {
  Pipeline pipeline = Pipeline.create(options);
  // Fraction of sessions to sample.
  final double samplingThreshold = 0.1;

  pipeline
      .apply(TextIO.read().from(options.getWikiInput()))
      .apply(MapElements.via(new ParseTableRowJson()))
      .apply(new ComputeTopSessions(samplingThreshold))
      .apply("Write", TextIO.write().to(options.getOutput()));

  pipeline.run().waitUntilFinish();
}
static void runWordCount(WordCountOptions options) { Pipeline p = Pipeline.create(options); // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the // static FormatAsTextFn() to the ParDo transform. p.apply("ReadLines", TextIO.read().from(options.getInputFile())) .apply(new CountWords()) .apply(MapElements.via(new FormatAsTextFn())) .apply("WriteCounts", TextIO.write().to(options.getOutput())); p.run().waitUntilFinish(); }
/**
 * Entry point: parses command-line options, builds the word-count pipeline, and runs it.
 *
 * @param args command-line arguments, parsed into {@code WordCountOptions} with validation
 */
public static void main(String[] args) {
  WordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  // Block until the job completes (matches the other runners in this file); a bare run() lets
  // the JVM exit before an asynchronous runner can report failure.
  p.run().waitUntilFinish();
}
}
/**
 * Builds a simple read/write pipeline targeting the DataflowRunner and applies the runner's
 * transform overrides before returning it.
 */
private Pipeline buildPipeline(DataflowPipelineOptions options) {
  options.setRunner(DataflowRunner.class);
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("ReadMyFile", TextIO.read().from("gs://bucket/object"))
      .apply("WriteMyFile", TextIO.write().to("gs://bucket/object"));

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  return pipeline;
}
private Pipeline buildDataflowPipeline(DataflowPipelineOptions options) { options.setStableUniqueNames(CheckEnabled.ERROR); options.setRunner(DataflowRunner.class); Pipeline p = Pipeline.create(options); p.apply("ReadMyFile", TextIO.read().from("gs://bucket/object")) .apply("WriteMyFile", TextIO.write().to("gs://bucket/object")); // Enable the FileSystems API to know about gs:// URIs in this test. FileSystems.setDefaultPipelineOptions(options); return p; }
/** Display data for a compressed read must report both the file pattern and the compression. */
@Test
public void testReadDisplayData() {
  TextIO.Read read = TextIO.read().from("foo.*").withCompression(BZIP2);

  DisplayData data = DisplayData.from(read);

  assertThat(data, hasDisplayItem("filePattern", "foo.*"));
  assertThat(data, hasDisplayItem("compressionType", BZIP2.toString()));
}
/** Applying TextIO with runtime-provided input/output paths must not fail at construction. */
@Test
public void testTextIOWithRuntimeParameters() throws IOException {
  DataflowPipelineOptions dataflowOptions = buildPipelineOptions();
  RuntimeTestOptions runtimeOptions = dataflowOptions.as(RuntimeTestOptions.class);
  Pipeline pipeline = buildDataflowPipeline(dataflowOptions);

  pipeline
      .apply(TextIO.read().from(runtimeOptions.getInput()))
      .apply(TextIO.write().to(runtimeOptions.getOutput()));
}
/**
 * Word count without the composite transform: extract words, count them, format each count as a
 * string, and write the results. Blocks until the pipeline finishes.
 */
static void runWordCount(WordCountOptions options) {
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(ParDo.of(new ExtractWordsFn()))
      .apply(Count.perElement())
      .apply(ParDo.of(new FormatAsStringFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  pipeline.run().waitUntilFinish();
}