public static void runAvroToCsv(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert Avro To CSV pipeline.apply("Read Avro files", AvroIO.readGenericRecords(schemaJson).from(options.getInputFile())) .apply("Convert Avro to CSV formatted data", ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter()))) .apply("Write CSV formatted data", TextIO.write().to(options.getOutput()) .withSuffix(".csv")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
"Write to storage", TextIO.write().to(options.getTextWritePrefix()).withSuffix(".csv"));
/**
 * Entry point. Builds and runs a pipeline that reads Entities from Datastore,
 * passes each JSON-encoded Entity through a Javascript UDF, and writes the
 * resulting JSON to a TextIO sink.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreToTextOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(DatastoreToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      // Source: Datastore Entities matching the configured GQL query,
      // serialized as JSON strings.
      .apply(
          ReadJsonEntities.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      // Apply the user-supplied Javascript transform to each JSON payload.
      .apply(
          TransformTextViaJavascript.newBuilder()
              .setFileSystemPath(options.getJavascriptTextTransformGcsPath())
              .setFunctionName(options.getJavascriptTextTransformFunctionName())
              .build())
      // Sink: text files under the configured prefix, suffixed ".json".
      .apply(TextIO.write().to(options.getTextWritePrefix()).withSuffix(".json"));

  pipeline.run();
}
}
@Override
public PDone expand(PCollection<KV<String, KV<URI, Double>>> wordToUriAndTfIdf) {
  // Render each (word, (uri, tfIdf)) tuple as one comma+tab separated line.
  DoFn<KV<String, KV<URI, Double>>, String> formatFn =
      new DoFn<KV<String, KV<URI, Double>>, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          KV<String, KV<URI, Double>> element = c.element();
          KV<URI, Double> uriAndTfIdf = element.getValue();
          c.output(
              String.format(
                  "%s,\t%s,\t%f",
                  element.getKey(), uriAndTfIdf.getKey(), uriAndTfIdf.getValue()));
        }
      };

  // Format every element, then write the lines out as a CSV file set.
  return wordToUriAndTfIdf
      .apply("Format", ParDo.of(formatFn))
      .apply(TextIO.write().to(output).withSuffix(".csv"));
}
}
/**
 * Entry point. Builds and runs a pipeline that reads Entities from Datastore,
 * parses each Entity's schema, and counts the unique number of schemas,
 * writing the counts out as JSON text.
 *
 * @param args arguments to the pipeline
 */
public static void main(String[] args) {
  DatastoreSchemaCountToTextOptions options =
      PipelineOptionsFactory.fromArgs(args)
          .withValidation()
          .as(DatastoreSchemaCountToTextOptions.class);

  Pipeline pipeline = Pipeline.create(options);

  pipeline
      // Source: per-schema counts derived from the Entities matching the
      // configured GQL query.
      .apply(
          DatastoreReadSchemaCount.newBuilder()
              .setGqlQuery(options.getDatastoreReadGqlQuery())
              .setProjectId(options.getDatastoreReadProjectId())
              .setNamespace(options.getDatastoreReadNamespace())
              .build())
      // Sink: text files under the configured prefix, suffixed ".json".
      .apply(TextIO.write().to(options.getTextWritePrefix()).withSuffix(".json"));

  pipeline.run();
}
}
@Test
public void testWriteDisplayData() {
  // Configure every display-data-producing option on the write transform.
  TextIO.Write writeTransform =
      TextIO.write()
          .to("/foo")
          .withSuffix("bar")
          .withShardNameTemplate("-SS-of-NN-")
          .withNumShards(100)
          .withFooter("myFooter")
          .withHeader("myHeader");

  DisplayData data = DisplayData.from(writeTransform);

  // Each configured option must surface under its display-data key.
  assertThat(data, hasDisplayItem("filePrefix", "/foo"));
  assertThat(data, hasDisplayItem("fileSuffix", "bar"));
  assertThat(data, hasDisplayItem("fileHeader", "myHeader"));
  assertThat(data, hasDisplayItem("fileFooter", "myFooter"));
  assertThat(data, hasDisplayItem("shardNameTemplate", "-SS-of-NN-"));
  assertThat(data, hasDisplayItem("numShards", 100));
  assertThat(data, hasDisplayItem("writableByteChannelFactory", "UNCOMPRESSED"));
}