@Test @SuppressWarnings("unchecked") @Category(NeedsRunner.class) public void testWriteThenReadASingleFileWithNullCodec() throws Throwable { List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); File outputFile = tmpFolder.newFile("output.avro"); writePipeline .apply(Create.of(values)) .apply( AvroIO.write(GenericClass.class) .to(outputFile.getAbsolutePath()) .withoutSharding() .withCodec(CodecFactory.nullCodec())); writePipeline.run(); PAssert.that( readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()))) .containsInAnyOrder(values); readPipeline.run(); try (DataFileStream dataFileStream = new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { assertEquals("null", dataFileStream.getMetaString("avro.codec")); } }
readPipeline.apply( "read_" + prefix, AvroIO.readGenericRecords(schemaFromPrefix(prefix)).from(expectedFilepattern)); PAssert.that(records).containsInAnyOrder(expectedElements.get(prefix));
"Read", AvroIO.read(GenericClass.class) .from(tmpFolder.getRoot().getAbsolutePath() + "/first*") .watchForNewFiles( Duration.millis(100), Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3)))))
PAssert.that( readPipeline.apply( "Read", AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()))) .containsInAnyOrder(values); PAssert.that( "Read withHintMatchesManyFiles", AvroIO.read(GenericClass.class) .from(outputFile.getAbsolutePath()) .withHintMatchesManyFiles())) .containsInAnyOrder(values); PAssert.that(
"Read", AvroIO.read(GenericClass.class) .from(readPipeline.newProvider(outputFile.getAbsolutePath()))) .apply( MapElements.via(
/**
 * Tests that {@code AvroIO} can read an upgraded version of an old class, as long as the schema
 * resolution process succeeds. This test covers the case when a new, {@code @Nullable} field has
 * been added.
 *
 * <p>For more information, see http://avro.apache.org/docs/1.7.7/spec.html#Schema+Resolution
 */
@Test
@Category(NeedsRunner.class)
public void testWriteThenReadSchemaUpgrade() throws Throwable {
  List<GenericClass> values =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File outputFile = tmpFolder.newFile("output.avro");

  writePipeline
      .apply(Create.of(values))
      .apply(AvroIO.write(GenericClass.class).to(outputFile.getAbsolutePath()).withoutSharding());
  writePipeline.run();

  List<GenericClassV2> expected =
      ImmutableList.of(new GenericClassV2(3, "hi", null), new GenericClassV2(5, "bar", null));

  PAssert.that(
          readPipeline.apply(
              AvroIO.read(GenericClassV2.class).from(outputFile.getAbsolutePath())))
      .containsInAnyOrder(expected);
  readPipeline.run();
}
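// A minimal sketch of the two POJOs the schema-upgrade test above relies on. The field and
// constructor names here are assumptions for illustration (equals/hashCode omitted); the key
// point is that GenericClassV2 adds a new @Nullable field, which Avro schema resolution fills
// with null when reading data written with the old schema. Assumes Beam's
// @DefaultCoder(AvroCoder.class) and org.apache.avro.reflect.Nullable.
@DefaultCoder(AvroCoder.class)
static class GenericClass {
  int intField;
  String stringField;

  GenericClass() {}

  GenericClass(int intField, String stringField) {
    this.intField = intField;
    this.stringField = stringField;
  }
}

@DefaultCoder(AvroCoder.class)
static class GenericClassV2 {
  int intField;
  String stringField;
  @Nullable String nullableField; // the newly added field

  GenericClassV2() {}

  GenericClassV2(int intField, String stringField, @Nullable String nullableField) {
    this.intField = intField;
    this.stringField = stringField;
    this.nullableField = nullableField;
  }
}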
@Test @SuppressWarnings("unchecked") @Category(NeedsRunner.class) public void testCompressedWriteAndReadASingleFile() throws Throwable { List<GenericClass> values = ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar")); File outputFile = tmpFolder.newFile("output.avro"); writePipeline .apply(Create.of(values)) .apply( AvroIO.write(GenericClass.class) .to(outputFile.getAbsolutePath()) .withoutSharding() .withCodec(CodecFactory.deflateCodec(9))); writePipeline.run(); PAssert.that( readPipeline.apply(AvroIO.read(GenericClass.class).from(outputFile.getAbsolutePath()))) .containsInAnyOrder(values); readPipeline.run(); try (DataFileStream dataFileStream = new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) { assertEquals("deflate", dataFileStream.getMetaString("avro.codec")); } }
@Test
@Category(NeedsRunner.class)
public void testWriteThenReadJavaClass() throws Throwable {
  List<GenericClass> values =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  File outputFile = tmpFolder.newFile("output.avro");

  writePipeline
      .apply(Create.of(values))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(writePipeline.newProvider(outputFile.getAbsolutePath()))
              .withoutSharding());
  writePipeline.run();

  PAssert.that(
          readPipeline.apply(
              "Read",
              AvroIO.read(GenericClass.class)
                  .from(readPipeline.newProvider(outputFile.getAbsolutePath()))))
      .containsInAnyOrder(values);
  readPipeline.run();
}
@Override
public PCollection<T> expand(PBegin input) {
  checkNotNull(getFilepattern(), "filepattern");
  checkNotNull(getSchema(), "schema");

  if (getMatchConfiguration().getWatchInterval() == null && !getHintMatchesManyFiles()) {
    return input.apply(
        "Read",
        org.apache.beam.sdk.io.Read.from(
            createSource(
                getFilepattern(),
                getMatchConfiguration().getEmptyMatchTreatment(),
                getRecordClass(),
                getSchema())));
  }

  // All other cases go through ReadAll.
  ReadAll<T> readAll =
      (getRecordClass() == GenericRecord.class)
          ? (ReadAll<T>) readAllGenericRecords(getSchema())
          : readAll(getRecordClass());
  readAll = readAll.withMatchConfiguration(getMatchConfiguration());
  return input
      .apply("Create filepattern", Create.ofProvider(getFilepattern(), StringUtf8Coder.of()))
      .apply("Via ReadAll", readAll);
}
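// A minimal usage sketch (hypothetical pipeline `p`, filepattern, and record type MyRecord):
// setting the many-files hint (or a watch interval, see watchForNewFiles below) makes expand()
// above take the ReadAll branch instead of expanding to a single bounded source.
p.apply(
    "Read many small files",
    AvroIO.read(MyRecord.class)
        .from("gs://my-bucket/many-small-files/*.avro")
        .withHintMatchesManyFiles());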
private <T extends GenericRecord> void testWriteThenReadGeneratedClass(
    AvroIO.Write<T> writeTransform, AvroIO.Read<T> readTransform) throws Exception {
  File outputFile = tmpFolder.newFile("output.avro");
  List<T> values =
      ImmutableList.of(
          (T) new AvroGeneratedUser("Bob", 256, null),
          (T) new AvroGeneratedUser("Alice", 128, null),
          (T) new AvroGeneratedUser("Ted", null, "white"));

  writePipeline
      .apply(Create.of(values))
      .apply(
          writeTransform
              .to(writePipeline.newProvider(outputFile.getAbsolutePath()))
              .withoutSharding());
  writePipeline.run();

  PAssert.that(
          readPipeline.apply(
              "Read",
              readTransform.from(readPipeline.newProvider(outputFile.getAbsolutePath()))))
      .containsInAnyOrder(values);
  readPipeline.run();
}
/** Like {@link #from(ValueProvider)}. */
public Read<T> from(String filepattern) {
  return from(StaticValueProvider.of(filepattern));
}
/**
 * Continuously watches for new files matching the filepattern, polling it at the given
 * interval, until the given termination condition is reached. The returned {@link PCollection}
 * is unbounded.
 *
 * <p>This works only in runners supporting {@link Kind#SPLITTABLE_DO_FN}.
 */
@Experimental(Kind.SPLITTABLE_DO_FN)
public Read<T> watchForNewFiles(
    Duration pollInterval, TerminationCondition<String, ?> terminationCondition) {
  return withMatchConfiguration(
      getMatchConfiguration().continuously(pollInterval, terminationCondition));
}
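// A minimal usage sketch (hypothetical bucket path, schema, and pipeline `p`): poll the
// filepattern every 30 seconds and stop watching once no new files have appeared for an hour.
// Assumes org.joda.time.Duration and org.apache.beam.sdk.transforms.Watch are imported.
PCollection<GenericRecord> records =
    p.apply(
        AvroIO.readGenericRecords(schema)
            .from("gs://my-bucket/incoming/*.avro")
            .watchForNewFiles(
                Duration.standardSeconds(30),
                Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));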
/** Configures whether or not a filepattern matching no files is allowed. */
public Read<T> withEmptyMatchTreatment(EmptyMatchTreatment treatment) {
  return withMatchConfiguration(getMatchConfiguration().withEmptyMatchTreatment(treatment));
}
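// A minimal usage sketch (hypothetical path and pipeline `p`): treat a filepattern that matches
// no files as an empty input instead of failing the read.
// Assumes org.apache.beam.sdk.io.fs.EmptyMatchTreatment is imported.
PCollection<GenericClass> maybeEmpty =
    p.apply(
        AvroIO.read(GenericClass.class)
            .from("gs://my-bucket/optional-input/*.avro")
            .withEmptyMatchTreatment(EmptyMatchTreatment.ALLOW));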
@Override
public void populateDisplayData(DisplayData.Builder builder) {
  super.populateDisplayData(builder);
  builder
      .addIfNotNull(
          DisplayData.item("filePattern", getFilepattern()).withLabel("Input File Pattern"))
      .include("matchConfiguration", getMatchConfiguration());
}
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  BigtableIO.Write write =
      BigtableIO.write()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId());

  pipeline
      .apply(
          "Read from Avro",
          AvroIO.read(BigtableRow.class).from(options.getInputFilePattern()))
      .apply("Transform to Bigtable", MapElements.via(new AvroToBigtableFn()))
      .apply("Write to Bigtable", write);

  return pipeline.run();
}
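// A minimal sketch of an entry point for the pipeline above, assuming the Options interface
// extends PipelineOptions and declares the getters used in run(). Illustration only; the real
// template's main method may differ.
public static void main(String[] args) {
  Options options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  run(options);
}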
@Test
@Category(ValidatesRunner.class)
public void testPrimitiveReadDisplayData() {
  DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();

  AvroIO.Read<GenericRecord> read =
      AvroIO.readGenericRecords(Schema.create(Schema.Type.STRING)).from("/foo.*");

  Set<DisplayData> displayData = evaluator.displayDataForPrimitiveSourceTransforms(read);
  assertThat(
      "AvroIO.Read should include the file pattern in its primitive transform",
      displayData,
      hasItem(hasDisplayItem("filePattern")));
}
@Test
public void testReadDisplayData() {
  AvroIO.Read<String> read = AvroIO.read(String.class).from("/foo.*");

  DisplayData displayData = DisplayData.from(read);
  assertThat(displayData, hasDisplayItem("filePattern", "/foo.*"));
}
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline
      .apply(
          "Read Avro files",
          AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply(
          "Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply(
          "Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
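// A hedged sketch of what the ConvertAvroToCsv DoFn referenced above could look like; the real
// sample's implementation may differ. It emits one CSV line per record by joining the schema's
// top-level field values with the configured delimiter. Assumes org.apache.beam.sdk.transforms.DoFn,
// org.apache.avro.Schema, and org.apache.avro.generic.GenericRecord are imported.
static class ConvertAvroToCsv extends DoFn<GenericRecord, String> {
  private final String schemaJson;
  private final String delimiter;
  private transient Schema schema;

  ConvertAvroToCsv(String schemaJson, String delimiter) {
    this.schemaJson = schemaJson;
    this.delimiter = delimiter;
  }

  @Setup
  public void setup() {
    // Parse the schema once per DoFn instance rather than once per element.
    schema = new Schema.Parser().parse(schemaJson);
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    GenericRecord record = c.element();
    StringBuilder row = new StringBuilder();
    for (Schema.Field field : schema.getFields()) {
      if (row.length() > 0) {
        row.append(delimiter);
      }
      Object value = record.get(field.name());
      row.append(value == null ? "" : value.toString());
    }
    c.output(row.toString());
  }
}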