public static void runAvroToCsv(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert Avro To CSV
  pipeline
      .apply("Read Avro files",
          AvroIO.readGenericRecords(schemaJson).from(options.getInputFile()))
      .apply("Convert Avro to CSV formatted data",
          ParDo.of(new ConvertAvroToCsv(schemaJson, options.getCsvDelimiter())))
      .apply("Write CSV formatted data",
          TextIO.write().to(options.getOutput()).withSuffix(".csv"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
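The ConvertAvroToCsv DoFn referenced above is not part of the snippet; a minimal sketch, assuming a flat schema of primitive fields, could look like this:

// Hypothetical sketch of the ConvertAvroToCsv DoFn referenced above;
// assumes a flat schema whose fields can be stringified directly.
static class ConvertAvroToCsv extends DoFn<GenericRecord, String> {
  private final String schemaJson;
  private final String delimiter;

  ConvertAvroToCsv(String schemaJson, String delimiter) {
    this.schemaJson = schemaJson;
    this.delimiter = delimiter;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    GenericRecord record = c.element();
    Schema schema = new Schema.Parser().parse(schemaJson);
    StringBuilder row = new StringBuilder();
    // Emit the field values in schema order, separated by the delimiter.
    for (Schema.Field field : schema.getFields()) {
      if (row.length() > 0) {
        row.append(delimiter);
      }
      row.append(record.get(field.name()));
    }
    c.output(row.toString());
  }
}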
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline
      .apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data",
          AvroIO.writeGenericRecords(schemaJson)
              .to(options.getOutput())
              .withCodec(CodecFactory.snappyCodec())
              .withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
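Note that setCoder(AvroCoder.of(GenericRecord.class, schema)) is needed because Beam cannot infer a coder for GenericRecord on its own. The ConvertCsvToAvro DoFn is likewise not shown; a hedged sketch, assuming string-typed fields in schema order:

// Hypothetical sketch of the ConvertCsvToAvro DoFn referenced above;
// assumes one CSV column per schema field, in schema order.
static class ConvertCsvToAvro extends DoFn<String, GenericRecord> {
  private final String schemaJson;
  private final String delimiter;

  ConvertCsvToAvro(String schemaJson, String delimiter) {
    this.schemaJson = schemaJson;
    this.delimiter = delimiter;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    Schema schema = new Schema.Parser().parse(schemaJson);
    String[] values = c.element().split(Pattern.quote(delimiter), -1);
    GenericRecord record = new GenericData.Record(schema);
    List<Schema.Field> fields = schema.getFields();
    // Assign each CSV column to the corresponding schema field.
    for (int i = 0; i < fields.size() && i < values.length; i++) {
      record.put(fields.get(i).name(), values[i]);
    }
    c.output(record);
  }
}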
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  String instanceId = options.getInstanceId();
  String databaseId = options.getDatabaseId();

  // [START spanner_dataflow_read]
  // Query for all the columns and rows in the specified Spanner table
  PCollection<Struct> records = p.apply(
      SpannerIO.read()
          .withInstanceId(instanceId)
          .withDatabaseId(databaseId)
          .withQuery("SELECT * FROM " + options.getTable()));
  // [END spanner_dataflow_read]

  PCollection<Long> tableEstimatedSize = records
      // Estimate the size of every row
      .apply(EstimateSize.create())
      // Sum all the row sizes to get the total estimated size of the table
      .apply(Sum.longsGlobally());

  // Write the total size to a file
  tableEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
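EstimateSize is a helper transform that is not shown in the snippet; a hedged sketch of how such a transform could approximate row sizes:

// Hypothetical sketch of the EstimateSize transform used above: maps each
// Spanner Struct to a rough byte-size estimate.
static class EstimateSize extends PTransform<PCollection<Struct>, PCollection<Long>> {
  public static EstimateSize create() {
    return new EstimateSize();
  }

  @Override
  public PCollection<Long> expand(PCollection<Struct> input) {
    return input.apply(ParDo.of(new EstimateStructSizeFn()));
  }

  static class EstimateStructSizeFn extends DoFn<Struct, Long> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      Struct row = c.element();
      long size = 0;
      for (int i = 0; i < row.getColumnCount(); i++) {
        if (row.isNull(i)) {
          continue;
        }
        switch (row.getColumnType(i).getCode()) {
          case STRING:
            size += row.getString(i).length();
            break;
          default:
            size += 8; // crude fixed estimate for non-string columns
            break;
        }
      }
      c.output(size);
    }
  }
}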
p.apply("ReadSingers", TextIO.read().from(options.getSingersFilename())) .apply("ParseSingers", ParDo.of(new ParseSinger())) .apply("CreateSingerMutation", ParDo.of(new DoFn<Singer, Mutation>() { @ProcessElement public void processElement(ProcessContext c) { .apply("WriteSingers", SpannerIO.write() .withInstanceId(instanceId) .withDatabaseId(databaseId)); .apply("ReadAlbums", TextIO.read().from(options.getAlbumsFilename())) .apply("ParseAlbums", ParDo.of(new ParseAlbum()));
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  SpannerConfig spannerConfig = SpannerConfig.create()
      .withInstanceId(options.getInstanceId())
      .withDatabaseId(options.getDatabaseId());

  // [START spanner_dataflow_readall]
  PCollection<Struct> allRecords = p
      .apply(SpannerIO.read()
          .withSpannerConfig(spannerConfig)
          .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
              + ".table_catalog = '' AND t.table_schema = ''"))
      .apply(MapElements.into(TypeDescriptor.of(ReadOperation.class))
          .via((SerializableFunction<Struct, ReadOperation>) input -> {
            String tableName = input.getString(0);
            return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
          }))
      .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
  // [END spanner_dataflow_readall]

  PCollection<Long> dbEstimatedSize = allRecords
      .apply(EstimateSize.create())
      .apply(Sum.longsGlobally());

  dbEstimatedSize
      .apply(ToString.elements())
      .apply(TextIO.write().to(options.getOutput()).withoutSharding());

  p.run().waitUntilFinish();
}
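The three stages work together: the first read discovers every table name from information_schema, the MapElements step turns each name into a ReadOperation, and SpannerIO.readAll() then executes one full-table query per operation, so a single pipeline can estimate the size of the whole database.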
// The snippet begins mid-statement; the SpannerConfig declaration is
// reconstructed following the pattern of the other samples.
SpannerConfig spannerConfig = SpannerConfig.create()
    .withInstanceId(instanceId)
    .withDatabaseId(databaseId);

PCollectionView<Transaction> tx = p.apply(
    SpannerIO.createTransaction()
        .withSpannerConfig(spannerConfig)
        .withTimestampBound(TimestampBound.strong()));

PCollection<Struct> singers = p.apply(SpannerIO.read()
    .withSpannerConfig(spannerConfig)
    .withQuery("SELECT SingerID, FirstName, LastName FROM Singers")
    .withTransaction(tx));

PCollection<Struct> albums = p.apply(SpannerIO.read()
    .withSpannerConfig(spannerConfig)
    .withQuery("SELECT SingerId, AlbumId, AlbumTitle FROM Albums")
    .withTransaction(tx));

singers.apply(MapElements.via(new SimpleFunction<Struct, String>() {
  @Override
  public String apply(Struct input) {
    return Joiner.on(DELIMITER).join(
        input.getLong(0), input.getString(1), input.getString(2));
  }
})).apply(TextIO.write().to(options.getSingersFilename()).withoutSharding());

albums.apply(MapElements.via(new SimpleFunction<Struct, String>() {
  @Override
  public String apply(Struct input) {
    // Body elided in the original snippet; completed by analogy with the
    // singers branch above.
    return Joiner.on(DELIMITER).join(
        input.getLong(0), input.getLong(1), input.getString(2));
  }
})).apply(TextIO.write().to(options.getAlbumsFilename()).withoutSharding());
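Because both reads share the tx view created by SpannerIO.createTransaction() with TimestampBound.strong(), the Singers and Albums exports observe the same consistent snapshot of the database rather than two independent read points.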
public static void runPipeline(Pipeline p) {
  System.out.println("Sleep time: " + TearDown.SLEEP_TIME + " ms");
  long tId = Thread.currentThread().getId();
  long beginTs = System.currentTimeMillis();

  p.apply(Create.of("value"))
      .apply(ParDo.of(new LongTearDownFn()));
  p.run().waitUntilFinish();

  long endTs = System.currentTimeMillis();
  System.out.println("Thread #" + tId + ", run for " + (endTs - beginTs) + " ms");
}
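LongTearDownFn is not shown; given the surrounding timing code, a plausible sketch is a DoFn that sleeps in its teardown hook so the shutdown cost becomes measurable:

// Hypothetical sketch of the LongTearDownFn used above: passes elements
// through, then sleeps in @Teardown to simulate slow worker shutdown.
static class LongTearDownFn extends DoFn<String, String> {
  @ProcessElement
  public void processElement(ProcessContext c) {
    c.output(c.element());
  }

  @Teardown
  public void tearDown() throws InterruptedException {
    Thread.sleep(TearDown.SLEEP_TIME);
  }
}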
@VisibleForTesting
static Pipeline buildPipeline(ImportOptions opts) {
  Pipeline pipeline = Pipeline.create(Utils.tweakOptions(opts));
  pipeline
      .apply("Read Sequence File",
          Read.from(new ShuffledSource<>(createSource(opts.getSourcePattern()))))
      .apply("Create Mutations", ParDo.of(new HBaseResultToMutationFn()))
      .apply("Write to Bigtable", createSink(opts));
  return pipeline;
}
private static void runReadPipeline(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  pipeline
      .apply("Find files", FileIO.match().filepattern(options.getInput()))
      .apply("Read matched files", FileIO.readMatches())
      .apply("Read parquet files", ParquetIO.readFiles(SCHEMA))
      .apply("Map records to strings", MapElements.into(strings()).via(new GetRecordsFn()));

  pipeline.run();
}
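GetRecordsFn is not included in the snippet; a minimal sketch, assuming it only needs each record's string form:

// Hypothetical sketch of the GetRecordsFn used above: renders each
// GenericRecord via its JSON-style toString().
static class GetRecordsFn extends SimpleFunction<GenericRecord, String> {
  @Override
  public String apply(GenericRecord record) {
    return record.toString();
  }
}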
static void runWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
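FormatAsTextFn is referenced but not shown; in the style of the standard Beam WordCount example, it would render each word/count pair as a line of text, roughly:

// Sketch of a FormatAsTextFn in the style of the Beam WordCount example.
public static class FormatAsTextFn extends SimpleFunction<KV<String, Long>, String> {
  @Override
  public String apply(KV<String, Long> input) {
    return input.getKey() + ": " + input.getValue();
  }
}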
public static void run(Options options) {
  Pipeline p = Pipeline.create(options);
  double samplingThreshold = 0.1;

  p.apply(TextIO.read().from(options.getWikiInput()))
      .apply(MapElements.via(new ParseTableRowJson()))
      .apply(new ComputeTopSessions(samplingThreshold))
      .apply("Write", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
public static void main(String[] args) {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation()
      .as(Options.class);
  options.setRunner(FlinkRunner.class);

  // Note: this sample uses the pre-2.0 Beam API (TextIO.Read / TextIO.Write).
  Pipeline p = Pipeline.create(options);
  p.apply("ReadLines", TextIO.Read.from(options.getInput()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.Write.to(options.getOutput()));

  p.run();
}
public static void main(String[] args) {
  Pipeline p = initializePipeline(args);
  KafkaOptions options = getOptions(p);

  PCollection<String> words = p.apply(Create.of("These", "are", "some", "words"));

  FlinkKafkaProducer08<String> kafkaSink =
      new FlinkKafkaProducer08<>(options.getKafkaTopic(),
          new SimpleStringSchema(), getKafkaProps(options));

  words.apply(Write.to(UnboundedFlinkSink.of(kafkaSink)));
  p.run();
}
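getKafkaProps is not shown; a minimal hypothetical version (the getBootstrapServers option name is an assumption, not part of the snippet):

// Hypothetical sketch of the getKafkaProps helper used above.
private static Properties getKafkaProps(KafkaOptions options) {
  Properties props = new Properties();
  props.setProperty("bootstrap.servers", options.getBootstrapServers());
  return props;
}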
public static void main(String[] args) throws Exception {
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline p = Pipeline.create(options);

  // The following two 'apply' calls create multiple inputs to our pipeline,
  // one for each of our two input sources.
  PCollection<TableRow> eventsTable =
      p.apply(BigQueryIO.readTableRows().from(GDELT_EVENTS_TABLE));
  PCollection<TableRow> countryCodes =
      p.apply(BigQueryIO.readTableRows().from(COUNTRY_CODES));

  PCollection<String> formattedResults = joinEvents(eventsTable, countryCodes);
  formattedResults.apply(TextIO.write().to(options.getOutput()));
  p.run().waitUntilFinish();
}
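joinEvents is where the actual join happens but is not part of the snippet; a hedged sketch: key both tables by country code, then join with CoGroupByKey. The field names ("ActionGeo_CountryCode", "SOURCEURL", "FIPSCC", "HumanName") are assumptions about the table schemas.

static PCollection<String> joinEvents(
    PCollection<TableRow> eventsTable, PCollection<TableRow> countryCodes) {
  final TupleTag<String> eventTag = new TupleTag<>();
  final TupleTag<String> countryTag = new TupleTag<>();

  // Key each event by its country code (field names are assumptions).
  PCollection<KV<String, String>> eventInfo = eventsTable.apply(
      "ExtractEventInfo",
      MapElements.into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.strings()))
          .via(row -> KV.of(
              (String) row.get("ActionGeo_CountryCode"),
              (String) row.get("SOURCEURL"))));

  // Key each country row by the same code so the two sources share a key space.
  PCollection<KV<String, String>> countryInfo = countryCodes.apply(
      "ExtractCountryInfo",
      MapElements.into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.strings()))
          .via(row -> KV.of(
              (String) row.get("FIPSCC"),
              (String) row.get("HumanName"))));

  // CoGroupByKey joins the two collections on the shared key.
  return KeyedPCollectionTuple.of(eventTag, eventInfo)
      .and(countryTag, countryInfo)
      .apply(CoGroupByKey.create())
      .apply("FormatJoined", MapElements.into(TypeDescriptors.strings()).via(kv -> {
        String country = kv.getValue().getOnly(countryTag, "none");
        StringBuilder lines = new StringBuilder();
        for (String url : kv.getValue().getAll(eventTag)) {
          lines.append(kv.getKey()).append(", ").append(country)
              .append(", ").append(url).append("\n");
        }
        return lines.toString();
      }));
}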
static void runTfIdf(Options options) throws Exception {
  Pipeline pipeline = Pipeline.create(options);
  pipeline.getCoderRegistry().registerCoderForClass(URI.class, StringDelegateCoder.of(URI.class));

  pipeline
      .apply(new ReadDocuments(listInputDocuments(options)))
      .apply(new ComputeTfIdf())
      .apply(new WriteTfIdf(options.getOutput()));

  pipeline.run().waitUntilFinish();
}
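The registerCoderForClass call is needed because Beam cannot infer a coder for java.net.URI; StringDelegateCoder encodes each URI via its string representation, which lets URI-keyed PCollections flow through the TF-IDF transforms.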
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args).withValidation()
      .as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);

  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run();
}
@Test
public void testTransformTranslatorMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline p = Pipeline.create(options);
  p.apply(Create.of(Arrays.asList(1, 2, 3))).apply(new TestTransform());

  thrown.expect(IllegalStateException.class);
  thrown.expectMessage(containsString("no translator registered"));
  DataflowPipelineTranslator.fromOptions(options)
      .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList());

  ArgumentCaptor<Job> jobCaptor = ArgumentCaptor.forClass(Job.class);
  Mockito.verify(mockJobs).create(eq(PROJECT_ID), eq(REGION_ID), jobCaptor.capture());
  assertValidJob(jobCaptor.getValue());
}