@ProcessElement
public void processElement(ProcessContext c) {
  String[] columns = c.element().split(DELIMITER);
  try {
    Long singerId = Long.parseLong(columns[0].trim());
    Long albumId = Long.parseLong(columns[1].trim());
    String albumTitle = columns[2].trim();
    c.output(new Album(singerId, albumId, albumTitle));
  } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
    LOG.info("ParseAlbum: parse error on '" + c.element() + "': " + e.getMessage());
  }
}
}
@Override
public PCollection<Long> expand(PCollection<Struct> input) {
  return input.apply(ParDo.of(new EstimateStructSizeFn()));
}
@ProcessElement
public void processElement(
    @Element InputT element, OutputReceiver<OutputT> receiver, ProcessContext c)
    throws Exception {
  receiver.output(fn.getClosure().apply(element, Fn.Context.wrapProcessContext(c)));
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig =
      SpannerConfig.create()
          .withInstanceId(options.getInstanceId())
          .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords =
      p.apply(SpannerIO.read()
              .withSpannerConfig(spannerConfig)
              .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
                  + ".table_catalog = '' AND t.table_schema = ''"))
          .apply(
              MapElements.into(TypeDescriptor.of(ReadOperation.class))
                  .via((SerializableFunction<Struct, ReadOperation>) input -> {
                    String tableName = input.getString(0);
                    return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
                  }))
          .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize =
      allRecords.apply(EstimateSize.create()).apply(Sum.longsGlobally());

  dbEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
@DoFn.ProcessElement
public void processElement(
    @DoFn.Element InputT element, OutputReceiver<OutputT> outReceiver) {
  outReceiver.output(transformFn.apply(element));
}
}
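For comparison, the same per-element mapping can be expressed with Beam's built-in MapElements transform; the sketch below is illustrative only, and the lines collection and the length lambda are assumed names rather than part of the sample.

// Hypothetical usage: map a PCollection<String> named "lines" to element lengths.
PCollection<Integer> lengths =
    lines.apply(
        "ComputeLengths",
        MapElements.into(TypeDescriptors.integers()).via((String s) -> s.length()));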
.apply(MapElements.via(new SimpleFunction<String, MutationGroup>() {
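One hedged way the SimpleFunction above might be completed is sketched below; the delimiter, column order, table and column names, the spannerConfig variable, and the final grouped write are illustrative assumptions, not the sample's actual parsing logic.

// Hypothetical completion: build a MutationGroup so that the primary mutation and
// its related mutation are committed atomically and routed to the same worker.
.apply(MapElements.via(new SimpleFunction<String, MutationGroup>() {
  @Override
  public MutationGroup apply(String line) {
    String[] columns = line.split(",");  // assumed comma-delimited input
    long singerId = Long.parseLong(columns[0].trim());

    Mutation primary =
        Mutation.newInsertOrUpdateBuilder("Singers")
            .set("singerId").to(singerId)
            .set("firstName").to(columns[1].trim())
            .set("lastName").to(columns[2].trim())
            .build();
    Mutation related =
        Mutation.newInsertOrUpdateBuilder("Albums")
            .set("singerId").to(singerId)
            .set("albumId").to(Long.parseLong(columns[3].trim()))
            .set("albumTitle").to(columns[4].trim())
            .build();
    return MutationGroup.create(primary, related);
  }
}))
// Grouped writes accept PCollection<MutationGroup> instead of individual mutations.
.apply(SpannerIO.write().withSpannerConfig(spannerConfig).grouped());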
/** Watches the growth of the given poll function. See class documentation for more details. */
public static <InputT, OutputT> Growth<InputT, OutputT, OutputT> growthOf(
    Growth.PollFn<InputT, OutputT> pollFn, Requirements requirements) {
  return new AutoValue_Watch_Growth.Builder<InputT, OutputT, OutputT>()
      .setTerminationPerInput(Growth.never())
      .setPollFn(Contextful.of(pollFn, requirements))
      // Use null as a signal that this is the identity function and the output coder can be
      // reused as the key coder.
      .setOutputKeyFn(null)
      .build();
}
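Because growthOf() starts from Growth.never() as the termination condition, callers normally supply their own. A minimal sketch of how that surfaces through TextIO's file watching (which is built on Watch) is shown below; the bucket path and durations are illustrative values, not taken from the sample.

// Hypothetical usage: continuously watch a file pattern for new files, stopping
// once no new files have appeared for an hour.
PCollection<String> lines =
    p.apply(
        TextIO.read()
            .from("gs://example-bucket/logs/*.txt")
            .watchForNewFiles(
                Duration.standardSeconds(30),
                Watch.Growth.afterTimeSinceNewOutput(Duration.standardHours(1))));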
.apply(Sum.longsGlobally())
.apply(ToString.elements())
.apply(TextIO.write().to(options.getOutput()).withoutSharding());
@ProcessElement
public void processElement(ProcessContext c) {
  String[] columns = c.element().split(DELIMITER);
  try {
    Long singerId = Long.parseLong(columns[0].trim());
    String firstName = columns[1].trim();
    String lastName = columns[2].trim();
    c.output(new Singer(singerId, firstName, lastName));
  } catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
    LOG.info("ParseSinger: parse error on '" + c.element() + "': " + e.getMessage());
  }
}
}
public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get the Avro schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro to CSV
  pipeline
      .apply("Read Avro files",
          AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records =
      p.apply(
          SpannerIO.read()
              .withInstanceId(instanceId)
              .withDatabaseId(databaseId)
              .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize =
      records
          // Estimate the size of every row
          .apply(EstimateSize.create())
          // Sum all the row sizes to get the total estimated size of the table
          .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
}
@ProcessElement
public void processElement(ProcessContext ctx) {
  GenericRecord genericRecord = ctx.element();
  Schema schema = new Schema.Parser().parse(schemaJson);
  StringBuilder row = new StringBuilder();

  for (Schema.Field field : schema.getFields()) {
    String fieldType = field.schema().getType().toString().toLowerCase();
    if (!acceptedTypes.contains(fieldType)) {
      LOG.error("Data transformation doesn't support: " + fieldType);
      throw new IllegalArgumentException("Field type " + fieldType + " is not supported.");
    }
    if (row.length() > 0) {
      row.append(delimiter);
    }
    row.append(genericRecord.get(field.name()));
  }
  ctx.output(row.toString());
}
}
.apply("ParseSingers", ParDo.of(new ParseSinger())) .apply("CreateSingerMutation", ParDo.of(new DoFn<Singer, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename())) .apply("ParseAlbums", ParDo.of(new ParseAlbum())); .apply("CreateAlbumMutation", ParDo.of(new DoFn<Album, Mutation>() { @ProcessElement public void processElement(ProcessContext c) {
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get the Avro schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline
      .apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}