public static void runAvroToCsv(SampleOptions options) throws IOException, IllegalArgumentException { FileSystems.setDefaultPipelineOptions(options); // Get Avro Schema String schemaJson = getSchema(options.getAvroSchema()); Schema schema = new Schema.Parser().parse(schemaJson); // Check schema field types before starting the Dataflow job checkFieldTypes(schema); // Create the Pipeline object with the options we defined above. Pipeline pipeline = Pipeline.create(options); // Convert Avro To CSV pipeline.apply("Read Avro files", AvroIO.readGenericRecords(schemaJson).from(options.getInputFile())) .apply("Convert Avro to CSV formatted data", ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter()))) .apply("Write CSV formatted data", TextIO.write().to(options.getOutput()) .withSuffix(".csv")); // Run the pipeline. pipeline.run().waitUntilFinish(); }
/**
 * Writes two records to a single, unsharded Avro file using the deflate codec (level 9), reads
 * them back through a second pipeline, and finally checks that the file's {@code avro.codec}
 * metadata entry is {@code "deflate"}.
 */
@Test
@Category(NeedsRunner.class)
public void testCompressedWriteAndReadASingleFile() throws Throwable {
  List<GenericClass> values =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File outputFile = tmpFolder.newFile("output.avro");

  writePipeline
      .apply(Create.of(values))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(outputFile.getAbsolutePath())
              .withoutSharding()
              .withCodec(CodecFactory.deflateCodec(9)));
  writePipeline.run();

  PAssert.that(
          readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath())))
      .containsInAnyOrder(values);
  readPipeline.run();

  // The file stream is declared as its own resource so that it is closed even when constructing
  // the DataFileStream fails. Explicit type arguments replace the former raw types, which is why
  // no @SuppressWarnings("unchecked") is needed on this method any more.
  try (FileInputStream fileStream = new FileInputStream(outputFile);
      DataFileStream<Object> dataFileStream =
          new DataFileStream<Object>(fileStream, new GenericDatumReader<Object>())) {
    assertEquals("deflate", dataFileStream.getMetaString("avro.codec"));
  }
}
/**
 * Writes two records to a single, unsharded Avro file using the null codec, reads them back
 * through a second pipeline, and finally checks that the file's {@code avro.codec} metadata
 * entry is {@code "null"}.
 */
@Test
@Category(NeedsRunner.class)
public void testWriteThenReadASingleFileWithNullCodec() throws Throwable {
  List<GenericClass> values =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File outputFile = tmpFolder.newFile("output.avro");

  writePipeline
      .apply(Create.of(values))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(outputFile.getAbsolutePath())
              .withoutSharding()
              .withCodec(CodecFactory.nullCodec()));
  writePipeline.run();

  PAssert.that(
          readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath())))
      .containsInAnyOrder(values);
  readPipeline.run();

  // The file stream is declared as its own resource so that it is closed even when constructing
  // the DataFileStream fails. Explicit type arguments replace the former raw types, which is why
  // no @SuppressWarnings("unchecked") is needed on this method any more.
  try (FileInputStream fileStream = new FileInputStream(outputFile);
      DataFileStream<Object> dataFileStream =
          new DataFileStream<Object>(fileStream, new GenericDatumReader<Object>())) {
    assertEquals("null", dataFileStream.getMetaString("avro.codec"));
  }
}
readPipeline.apply( "read_" + prefix, AvroIO.readGenericRecords(schemaFromPrefix(prefix)).from(expectedFilepattern)); PAssert.that(records).containsInAnyOrder(expectedElements.get(prefix));
"Read", AvroIO.read(GenericClass.class) .from(tmpFolder.getRoot().getAbsolutePath() + "/first*") .watchForNewFiles( Duration.millis(100),
PAssert.that( readPipeline.apply( "Read", AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()))) .containsInAnyOrder(values); PAssert.that( "Read withHintMatchesManyFiles", AvroIO.read(GenericClass.class) .from(outputFile.getAbsolutePath()) .withHintMatchesManyFiles())) .containsInAnyOrder(values);
"Read", AvroIO.read(GenericClass.class) .from(readPipeline.newProvider(outputFile.getAbsolutePath()))) .apply( MapElements.via(
/**
 * Verifies that {@code AvroIO} can read data written with an older class into an upgraded
 * version of that class, provided schema resolution succeeds. The upgrade exercised here is the
 * addition of a new {@code @Nullable} field, which is expected to come back as {@code null}.
 *
 * <p>For more information, see http://avro.apache.org/docs/1.7.7/spec.html#Schema+Resolution
 */
@Test
@Category(NeedsRunner.class)
public void testWriteThenReadSchemaUpgrade() throws Throwable {
  List<GenericClass> written =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File avroFile = tmpFolder.newFile("output.avro");
  String avroPath = avroFile.getAbsolutePath();

  // Write the records with the old class into one unsharded file.
  AvroIO.Write<GenericClass> writeOldClass =
      AvroIO.write(GenericClass.class).to(avroPath).withoutSharding();
  writePipeline.apply(Create.of(written)).apply(writeOldClass);
  writePipeline.run();

  // Reading with the upgraded class should leave the added field null.
  List<GenericClassV2> upgraded =
      ImmutableList.of(new GenericClassV2(3, "hi", null), new GenericClassV2(5, "bar", null));
  AvroIO.Read<GenericClassV2> readNewClass = AvroIO.read(GenericClassV2.class).from(avroPath);
  PAssert.that(readPipeline.apply(readNewClass)).containsInAnyOrder(upgraded);
  readPipeline.run();
}
/**
 * Writes two {@code GenericClass} records to a single unsharded Avro file and reads them back,
 * supplying the file path to both the write and the read transform through value providers
 * obtained from the respective test pipeline.
 */
@Test
@Category(NeedsRunner.class)
public void testWriteThenReadJavaClass() throws Throwable {
  List<GenericClass> records =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File avroFile = tmpFolder.newFile("output.avro");
  String avroPath = avroFile.getAbsolutePath();

  // Write phase: the destination is handed over as a provider created by the write pipeline.
  writePipeline
      .apply(Create.of(records))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(writePipeline.newProvider(avroPath))
              .withoutSharding());
  writePipeline.run();

  // Read phase: the source is handed over as a provider created by the read pipeline.
  PAssert.that(
          readPipeline.apply(
              "Read", AvroIO.read(GenericClass.class).from(readPipeline.newProvider(avroPath))))
      .containsInAnyOrder(records);
  readPipeline.run();
}
/**
 * Shared body for round-trip tests over the generated {@code AvroGeneratedUser} class: writes
 * three users with {@code writeTransform} to a single unsharded file, reads them back with
 * {@code readTransform}, and asserts that the same elements are returned.
 *
 * @param writeTransform the write transform under test; its destination and sharding are set
 *     here
 * @param readTransform the read transform under test; its source is set here
 */
private <T extends GenericRecord> void testWriteThenReadGeneratedClass(
    AvroIO.Write<T> writeTransform, AvroIO.Read<T> readTransform) throws Exception {
  File avroFile = tmpFolder.newFile("output.avro");
  String avroPath = avroFile.getAbsolutePath();

  // NOTE(review): the casts are unchecked; presumably callers only bind T to a type that
  // AvroGeneratedUser is assignable to. Verify against the call sites.
  List<T> users =
      ImmutableList.of(
          (T) new AvroGeneratedUser("Bob", 256, null),
          (T) new AvroGeneratedUser("Alice", 128, null),
          (T) new AvroGeneratedUser("Ted", null, "white"));

  writePipeline
      .apply(Create.of(users))
      .apply(writeTransform.to(writePipeline.newProvider(avroPath)).withoutSharding());
  writePipeline.run();

  PAssert.that(
          readPipeline.apply("Read", readTransform.from(readPipeline.newProvider(avroPath))))
      .containsInAnyOrder(users);
  readPipeline.run();
}
/**
 * Like {@link #from(ValueProvider)}, but takes the filepattern as a plain string. The string is
 * wrapped in a static value provider and passed to that overload.
 *
 * @param filepattern the filepattern to read from
 * @return the result of {@link #from(ValueProvider)} for the wrapped filepattern
 */
public Read<T> from(String filepattern) {
  ValueProvider<String> staticPattern = StaticValueProvider.of(filepattern);
  return from(staticPattern);
}
/**
 * Builds and starts a pipeline that reads {@code BigtableRow} records from Avro files, maps them
 * with {@code AvroToBigtableFn}, and writes the result to the Bigtable table named by the
 * options.
 *
 * @param options supplies the Bigtable project, instance and table ids and the input file
 *     pattern
 * @return the result handle returned by {@code Pipeline.run()}; this method does not wait on it
 */
public static PipelineResult run(Options options) {
  final Pipeline pipeline = Pipeline.create(options);

  // Sink: the target table is fully described by the project, instance and table ids.
  final BigtableIO.Write bigtableSink =
      BigtableIO.write()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId());

  // Source: Avro files matching the configured pattern.
  final AvroIO.Read<BigtableRow> avroSource =
      AvroIO.read(BigtableRow.class).from(options.getInputFilePattern());

  pipeline
      .apply("Read from Avro", avroSource)
      .apply("Transform to Bigtable", MapElements.via(new AvroToBigtableFn()))
      .apply("Write to Bigtable", bigtableSink);

  return pipeline.run();
}
/** Checks the names reported by a configured {@code AvroIO} read and write transform. */
@Test
public void testAvroIOGetName() {
  String readName = AvroIO.read(String.class).from("/tmp/foo*/baz").getName();
  assertEquals("AvroIO.Read", readName);

  String writeName = AvroIO.write(String.class).to("/tmp/foo/baz").getName();
  assertEquals("AvroIO.Write", writeName);
}
/**
 * Checks that the display data collected for the primitive source transforms of an
 * {@code AvroIO} generic-record read contains a {@code filePattern} item.
 */
@Test
@Category(ValidatesRunner.class)
public void testPrimitiveReadDisplayData() {
  DisplayDataEvaluator displayDataEvaluator = DisplayDataEvaluator.create();

  Schema stringSchema = Schema.create(Schema.Type.STRING);
  AvroIO.Read<GenericRecord> genericRead =
      AvroIO.readGenericRecords(stringSchema).from("/foo.*");

  Set<DisplayData> primitiveDisplayData =
      displayDataEvaluator.displayDataForPrimitiveSourceTransforms(genericRead);

  String failureMessage =
      "AvroIO.Read should include the file pattern in its primitive transform";
  assertThat(failureMessage, primitiveDisplayData, hasItem(hasDisplayItem("filePattern")));
}
/**
 * Checks that the display data of an {@code AvroIO} read exposes the configured file pattern
 * under the {@code filePattern} key.
 */
@Test
public void testReadDisplayData() {
  String pattern = "/foo.*";

  DisplayData readDisplayData = DisplayData.from(AvroIO.read(String.class).from(pattern));

  assertThat(readDisplayData, hasDisplayItem("filePattern", pattern));
}