public static void runCsvToAvro(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert CSV to Avro pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile())) .apply("Convert CSV to Avro formatted data", ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter()))) .setCoder(AvroCoder.of(GenericRecord.class, schema)) .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson) .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
@SuppressWarnings("deprecation") // using AvroCoder#createDatumReader for tests. private void runTestWrite(String[] expectedElements, int numShards) throws IOException { File baseOutputFile = new File(tmpFolder.getRoot(), "prefix"); String outputFilePrefix = baseOutputFile.getAbsolutePath(); AvroIO.Write<String> write = AvroIO.write(String.class).to(outputFilePrefix).withSuffix(".avro"); if (numShards > 1) { write = write.withNumShards(numShards); } else { write = write.withoutSharding(); } writePipeline.apply(Create.of(ImmutableList.copyOf(expectedElements))).apply(write); writePipeline.run(); String shardNameTemplate = firstNonNull( write.inner.getShardTemplate(), DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE); assertTestOutputs(expectedElements, numShards, outputFilePrefix, shardNameTemplate); }
public static PipelineResult run(Options options) { Pipeline pipeline = Pipeline.create(options); BigtableIO.Read read = BigtableIO.read() .withProjectId(options.getBigtableProjectId()) .withInstanceId(options.getBigtableInstanceId()) .withTableId(options.getBigtableTableId()); // Do not validate input fields if it is running as a template. if (options.as(DataflowPipelineOptions.class).getTemplateLocation() != null) { read = read.withoutValidation(); } ValueProvider<String> filePathPrefix = DualInputNestedValueProvider.of( options.getOutputDirectory(), options.getFilenamePrefix(), new SerializableFunction<TranslatorInput<String, String>, String>() { @Override public String apply(TranslatorInput<String, String> input) { return new StringBuilder(input.getX()).append(input.getY()).toString(); } }); pipeline .apply("Read from Bigtable", read) .apply("Transform to Avro", MapElements.via(new BigtableToAvroFn())) .apply( "Write to Avro in GCS", AvroIO.write(BigtableRow.class).to(filePathPrefix).withSuffix(".avro")); return pipeline.run(); }
@Test
public void testWriteDisplayData() {
  // Configure a write with every display-data-relevant option set explicitly.
  AvroIO.Write<GenericClass> avroWrite =
      AvroIO.write(GenericClass.class)
          .to("/foo")
          .withShardNameTemplate("-SS-of-NN-")
          .withSuffix("bar")
          .withNumShards(100)
          .withCodec(CodecFactory.deflateCodec(6));

  DisplayData data = DisplayData.from(avroWrite);

  // Each configured option must surface under its expected display-data key.
  assertThat(data, hasDisplayItem("filePrefix", "/foo"));
  assertThat(data, hasDisplayItem("shardNameTemplate", "-SS-of-NN-"));
  assertThat(data, hasDisplayItem("fileSuffix", "bar"));
  assertThat(
      data,
      hasDisplayItem(
          "schema",
          "{\"type\":\"record\",\"name\":\"GenericClass\",\"namespace\":\"org.apache.beam.sdk.io"
              + ".AvroIOTest$\",\"fields\":[{\"name\":\"intField\",\"type\":\"int\"},"
              + "{\"name\":\"stringField\",\"type\":\"string\"}]}"));
  assertThat(data, hasDisplayItem("numShards", 100));
  assertThat(data, hasDisplayItem("codec", CodecFactory.deflateCodec(6).toString()));
}
}