public static void runCsvToAvro(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert CSV to Avro pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile())) .apply("Convert CSV to Avro formatted data", ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter()))) .setCoder(AvroCoder.of(GenericRecord.class, schema)) .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson) .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
/**
 * Writes records with the null (no-compression) codec into a single file, reads them back,
 * and verifies both the round-tripped values and the {@code avro.codec} metadata recorded
 * in the file header.
 */
@Test
@Category(NeedsRunner.class)
public void testWriteThenReadASingleFileWithNullCodec() throws Throwable {
  List<GenericClass> values =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File outputFile = tmpFolder.newFile("output.avro");

  writePipeline
      .apply(Create.of(values))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(outputFile.getAbsolutePath())
              .withoutSharding()
              .withCodec(CodecFactory.nullCodec()));
  writePipeline.run();

  PAssert.that(
          readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath())))
      .containsInAnyOrder(values);
  readPipeline.run();

  // Use parameterized DataFileStream/GenericDatumReader instead of raw types; this fixes the
  // unchecked warnings at the source and removes the need for @SuppressWarnings("unchecked").
  try (DataFileStream<GenericRecord> dataFileStream =
      new DataFileStream<>(
          new FileInputStream(outputFile), new GenericDatumReader<GenericRecord>())) {
    assertEquals("null", dataFileStream.getMetaString("avro.codec"));
  }
}
/**
 * Writes records with maximum-level deflate compression into a single file, reads them back,
 * and verifies both the round-tripped values and the {@code avro.codec} metadata recorded
 * in the file header.
 */
@Test
@Category(NeedsRunner.class)
public void testCompressedWriteAndReadASingleFile() throws Throwable {
  List<GenericClass> values =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File outputFile = tmpFolder.newFile("output.avro");

  writePipeline
      .apply(Create.of(values))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(outputFile.getAbsolutePath())
              .withoutSharding()
              .withCodec(CodecFactory.deflateCodec(9)));
  writePipeline.run();

  PAssert.that(
          readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath())))
      .containsInAnyOrder(values);
  readPipeline.run();

  // Use parameterized DataFileStream/GenericDatumReader instead of raw types; this fixes the
  // unchecked warnings at the source and removes the need for @SuppressWarnings("unchecked").
  try (DataFileStream<GenericRecord> dataFileStream =
      new DataFileStream<>(
          new FileInputStream(outputFile), new GenericDatumReader<GenericRecord>())) {
    assertEquals("deflate", dataFileStream.getMetaString("avro.codec"));
  }
}
/**
 * Verifies that an {@code AvroIO.Write} surfaces every configured property (prefix, shard
 * template, suffix, schema, shard count, codec) in its display data.
 */
@Test
public void testWriteDisplayData() {
  AvroIO.Write<GenericClass> write =
      AvroIO.write(GenericClass.class)
          .to("/foo")
          .withShardNameTemplate("-SS-of-NN-")
          .withSuffix("bar")
          .withNumShards(100)
          .withCodec(CodecFactory.deflateCodec(6));

  DisplayData displayData = DisplayData.from(write);

  // Expected JSON rendering of the GenericClass schema as registered in display data.
  String expectedSchema =
      "{\"type\":\"record\",\"name\":\"GenericClass\",\"namespace\":\"org.apache.beam.sdk.io"
          + ".AvroIOTest$\",\"fields\":[{\"name\":\"intField\",\"type\":\"int\"},"
          + "{\"name\":\"stringField\",\"type\":\"string\"}]}";

  assertThat(displayData, hasDisplayItem("filePrefix", "/foo"));
  assertThat(displayData, hasDisplayItem("shardNameTemplate", "-SS-of-NN-"));
  assertThat(displayData, hasDisplayItem("fileSuffix", "bar"));
  assertThat(displayData, hasDisplayItem("schema", expectedSchema));
  assertThat(displayData, hasDisplayItem("numShards", 100));
  assertThat(displayData, hasDisplayItem("codec", CodecFactory.deflateCodec(6).toString()));
}
}
/**
 * Checks that a deflate(9) codec configured via {@code withCodec} survives a Java
 * serialization round-trip of the write transform's codec wrapper.
 */
@Test
@SuppressWarnings("unchecked")
public void testWriteWithSerDeCustomDeflateCodec() throws Exception {
  AvroIO.Write<String> write =
      AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.deflateCodec(9));

  // Codec identity is compared via toString() since CodecFactory has no value equality.
  String expected = CodecFactory.deflateCodec(9).toString();
  String roundTripped = SerializableUtils.clone(write.inner.getCodec()).getCodec().toString();
  assertEquals(expected, roundTripped);
}
/**
 * Checks that an xz(9) codec configured via {@code withCodec} survives a Java
 * serialization round-trip of the write transform's codec wrapper.
 */
@Test
@SuppressWarnings("unchecked")
public void testWriteWithSerDeCustomXZCodec() throws Exception {
  AvroIO.Write<String> write =
      AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.xzCodec(9));

  // Codec identity is compared via toString() since CodecFactory has no value equality.
  String expected = CodecFactory.xzCodec(9).toString();
  String roundTripped = SerializableUtils.clone(write.inner.getCodec()).getCodec().toString();
  assertEquals(expected, roundTripped);
}
/** Checks that a snappy codec set via {@code withCodec} is reflected on the write transform. */
@Test
public void testWriteWithCustomCodec() throws Exception {
  AvroIO.Write<String> write =
      AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.snappyCodec());

  String configuredCodec = write.inner.getCodec().toString();
  assertEquals(SNAPPY_CODEC, configuredCodec);
}