public static void runCsvToAvro(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert CSV to Avro pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile())) .apply("Convert CSV to Avro formatted data", ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter()))) .setCoder(AvroCoder.of(GenericRecord.class, schema)) .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson) .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
write = AvroIO.write(GenericClass.class) .to(policy) .withTempDirectory( StaticValueProvider.of(
@Test @SuppressWarnings("unchecked") @Category(NeedsRunner.class) public void testMetadata() throws Exception { List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); File outputFile = tmpFolder.newFile("output.avro"); writePipeline .apply(Create.of(values)) .apply( AvroIO.write(GenericClass.class) .to(outputFile.getAbsolutePath()) .withoutSharding() .withMetadata( ImmutableMap.of( "stringKey", "stringValue", "longKey", 100L, "bytesKey", "bytesValue".getBytes(Charsets.UTF_8)))); writePipeline.run(); try (DataFileStream dataFileStream = new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { assertEquals("stringValue", dataFileStream.getMetaString("stringKey")); assertEquals(100L, dataFileStream.getMetaLong("longKey")); assertArrayEquals("bytesValue".getBytes(Charsets.UTF_8), dataFileStream.getMeta("bytesKey")); } }
.apply(AvroIO.write(GenericClass.class).to(outputFile.getAbsolutePath()).withoutSharding()); writePipeline.run();
"Write File(s)", AvroIO.write(AvroPubsubMessageRecord.class) .to( new WindowedFilenamePolicy( options.getOutputDirectory(),
public static PipelineResult run(Options options) { Pipeline pipeline = Pipeline.create(options); BigtableIO.Read read = BigtableIO.read() .withProjectId(options.getBigtableProjectId()) .withInstanceId(options.getBigtableInstanceId()) .withTableId(options.getBigtableTableId()); // Do not validate input fields if it is running as a template. if (options.as(DataflowPipelineOptions.class).getTemplateLocation() != null) { read = read.withoutValidation(); } ValueProvider<String> filePathPrefix = DualInputNestedValueProvider.of( options.getOutputDirectory(), options.getFilenamePrefix(), new SerializableFunction<TranslatorInput<String, String>, String>() { @Override public String apply(TranslatorInput<String, String> input) { return new StringBuilder(input.getX()).append(input.getY()).toString(); } }); pipeline .apply("Read from Bigtable", read) .apply("Transform to Avro", MapElements.via(new BigtableToAvroFn())) .apply( "Write to Avro in GCS", AvroIO.write(BigtableRow.class).to(filePathPrefix).withSuffix(".avro")); return pipeline.run(); }
/** * Tests that {@code AvroIO} can read an upgraded version of an old class, as long as the schema * resolution process succeeds. This test covers the case when a new, {@code @Nullable} field has * been added. * * <p>For more information, see http://avro.apache.org/docs/1.7.7/spec.html#Schema+Resolution */ @Test @Category(NeedsRunner.class) public void testWriteThenReadSchemaUpgrade() throws Throwable { List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); File outputFile = tmpFolder.newFile("output.avro"); writePipeline .apply(Create.of(values)) .apply(AvroIO.write(GenericClass.class).to(outputFile.getAbsolutePath()).withoutSharding()); writePipeline.run(); List<GenericClassV2> expected = ImmutableList.of(new GenericClassV2(3, "hi", null), new GenericClassV2(5, "bar", null)); PAssert.that( readPipeline.apply( AvroIO.read(GenericClassV2.class).from(outputFile.getAbsolutePath()))) .containsInAnyOrder(expected); readPipeline.run(); }
@Test public void testWriteDisplayData() { AvroIO.Write<GenericClass> write = AvroIO.write(GenericClass.class) .to("/foo") .withShardNameTemplate("-SS-of-NN-") .withSuffix("bar") .withNumShards(100) .withCodec(CodecFactory.deflateCodec(6)); DisplayData displayData = DisplayData.from(write); assertThat(displayData, hasDisplayItem("filePrefix", "/foo")); assertThat(displayData, hasDisplayItem("shardNameTemplate", "-SS-of-NN-")); assertThat(displayData, hasDisplayItem("fileSuffix", "bar")); assertThat( displayData, hasDisplayItem( "schema", "{\"type\":\"record\",\"name\":\"GenericClass\",\"namespace\":\"org.apache.beam.sdk.io" + ".AvroIOTest$\",\"fields\":[{\"name\":\"intField\",\"type\":\"int\"}," + "{\"name\":\"stringField\",\"type\":\"string\"}]}")); assertThat(displayData, hasDisplayItem("numShards", 100)); assertThat(displayData, hasDisplayItem("codec", CodecFactory.deflateCodec(6).toString())); } }
@Test @SuppressWarnings("unchecked") @Category(NeedsRunner.class) public void testCompressedWriteAndReadASingleFile() throws Throwable { List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); File outputFile = tmpFolder.newFile("output.avro"); writePipeline .apply(Create.of(values)) .apply( AvroIO.write(GenericClass.class) .to(outputFile.getAbsolutePath()) .withoutSharding() .withCodec(CodecFactory.deflateCodec(9))); writePipeline.run(); PAssert.that( readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()))) .containsInAnyOrder(values); readPipeline.run(); try (DataFileStream dataFileStream = new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { assertEquals("deflate", dataFileStream.getMetaString("avro.codec")); } }
@Test @SuppressWarnings("unchecked") @Category(NeedsRunner.class) public void testWriteThenReadASingleFileWithNullCodec() throws Throwable { List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); File outputFile = tmpFolder.newFile("output.avro"); writePipeline .apply(Create.of(values)) .apply( AvroIO.write(GenericClass.class) .to(outputFile.getAbsolutePath()) .withoutSharding() .withCodec(CodecFactory.nullCodec())); writePipeline.run(); PAssert.that( readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()))) .containsInAnyOrder(values); readPipeline.run(); try (DataFileStream dataFileStream = new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { assertEquals("null", dataFileStream.getMetaString("avro.codec")); } }
@Test @Category(NeedsRunner.class) public void testWriteThenReadJavaClass() throws Throwable { List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); File outputFile = tmpFolder.newFile("output.avro"); writePipeline .apply(Create.of(values)) .apply( AvroIO.write(GenericClass.class) .to(writePipeline.newProvider(outputFile.getAbsolutePath())) .withoutSharding()); writePipeline.run(); PAssert.that( readPipeline.apply( "Read", AvroIO.read(GenericClass.class) .from(readPipeline.newProvider(outputFile.getAbsolutePath())))) .containsInAnyOrder(values); readPipeline.run(); }
@SuppressWarnings("deprecation") // using AvroCoder#createDatumReader for tests. private void runTestWrite(String[] expectedElements, int numShards) throws IOException { File baseOutputFile = new File(tmpFolder.getRoot(), "prefix"); String outputFilePrefix = baseOutputFile.getAbsolutePath(); AvroIO.Write<String> write = AvroIO.write(String.class).to(outputFilePrefix).withSuffix(".avro"); if (numShards > 1) { write = write.withNumShards(numShards); } else { write = write.withoutSharding(); } writePipeline.apply(Create.of(ImmutableList.copyOf(expectedElements))).apply(write); writePipeline.run(); String shardNameTemplate = firstNonNull( write.inner.getShardTemplate(), DefaultFilenamePolicy.DEFAULT_UNWINDOWED_SHARD_TEMPLATE); assertTestOutputs(expectedElements, numShards, outputFilePrefix, shardNameTemplate); }
private <T extends GenericRecord> void testWriteThenReadGeneratedClass( AvroIO.Write<T> writeTransform, AvroIO.Read<T> readTransform) throws Exception { File outputFile = tmpFolder.newFile("output.avro"); List<T> values = ImmutableList.of( (T) new AvroGeneratedUser("Bob", 256, null), (T) new AvroGeneratedUser("Alice", 128, null), (T) new AvroGeneratedUser("Ted", null, "white")); writePipeline .apply(Create.of(values)) .apply( writeTransform .to(writePipeline.newProvider(outputFile.getAbsolutePath())) .withoutSharding()); writePipeline.run(); PAssert.that( readPipeline.apply( "Read", readTransform.from(readPipeline.newProvider(outputFile.getAbsolutePath())))) .containsInAnyOrder(values); readPipeline.run(); }
@Test @SuppressWarnings("unchecked") public void testWriteWithSerDeCustomDeflateCodec() throws Exception { AvroIO.Write<String> write = AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.deflateCodec(9)); assertEquals( CodecFactory.deflateCodec(9).toString(), SerializableUtils.clone(write.inner.getCodec()).getCodec().toString()); }
@Test @SuppressWarnings("unchecked") public void testWriteWithSerDeCustomXZCodec() throws Exception { AvroIO.Write<String> write = AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.xzCodec(9)); assertEquals( CodecFactory.xzCodec(9).toString(), SerializableUtils.clone(write.inner.getCodec()).getCodec().toString()); }
@Test public void testWriteWithCustomCodec() throws Exception { AvroIO.Write<String> write = AvroIO.write(String.class).to("/tmp/foo/baz").withCodec(CodecFactory.snappyCodec()); assertEquals(SNAPPY_CODEC, write.inner.getCodec().toString()); }
@Test public void testWriteWithDefaultCodec() throws Exception { AvroIO.Write<String> write = AvroIO.write(String.class).to("/tmp/foo/baz"); assertEquals(CodecFactory.snappyCodec().toString(), write.inner.getCodec().toString()); }
@Test public void testAvroIOGetName() { assertEquals("AvroIO.Read", AvroIO.read(String.class).from("/tmp/foo*/baz").getName()); assertEquals("AvroIO.Write", AvroIO.write(String.class).to("/tmp/foo/baz").getName()); }