Experimental: a PTransform for reading from and writing to Google Cloud Spanner.
Reading from Cloud Spanner
To read from Cloud Spanner, apply the SpannerIO.Read transform. It returns a
PCollection of Struct, where each element represents an individual row returned
by the read operation. Both the Query and Read APIs are supported. See the Cloud
Spanner documentation for more information about reading.
To execute a query, specify it using SpannerIO.Read#withQuery(Statement) or
SpannerIO.Read#withQuery(String) during construction of the transform. For
example (the instance, database, and query are illustrative):

PCollection<Struct> rows = p.apply(
    SpannerIO.read()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withQuery("SELECT id, name, email FROM users"));
To use the Read API, specify a table name with SpannerIO.Read#withTable(String)
and the columns to read with SpannerIO.Read#withColumns(List). For example
(table and column names are illustrative):

PCollection<Struct> rows = p.apply(
    SpannerIO.read()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withTable("users")
        .withColumns("id", "name", "email"));
To read optimally using an index, specify the index name using
SpannerIO.Read#withIndex(String).
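An index-based read can be sketched as follows, assuming a hypothetical secondary index named "users_by_email" on the illustrative users table:

```java
// Read through the secondary index; the requested columns must be
// part of the index key, stored in the index, or covered by it.
PCollection<Struct> rows = p.apply(
    SpannerIO.read()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withTable("users")
        .withIndex("users_by_email")
        .withColumns("id", "email"));
```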
The transform is guaranteed to be executed on a consistent snapshot of data, utilizing the
power of read-only transactions. Staleness of data can be controlled using the
SpannerIO.Read#withTimestampBound or
SpannerIO.Read#withTimestamp(Timestamp) methods. Read more about transactions in the
Cloud Spanner documentation.
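For example, a read at a fixed staleness can be sketched as follows (the 15-second staleness is an arbitrary illustrative value):

```java
// Allow Cloud Spanner to serve data that is at most 15 seconds stale,
// which can reduce latency and load on the database.
PCollection<Struct> rows = p.apply(
    SpannerIO.read()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withQuery("SELECT id, name FROM users")
        .withTimestampBound(TimestampBound.ofExactStaleness(15, TimeUnit.SECONDS)));
```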
It is possible to read several PCollections within a single transaction. Apply the
SpannerIO#createTransaction() transform, which lazily creates a transaction. The
result of this transform can be passed to a read operation using
SpannerIO.Read#withTransaction(PCollectionView).
SpannerConfig spannerConfig = ...

// Create a transaction that all subsequent reads will share.
PCollectionView<Transaction> tx = p.apply(
    SpannerIO.createTransaction()
        .withSpannerConfig(spannerConfig)
        .withTimestampBound(TimestampBound.strong()));

PCollection<Struct> users = p.apply(
    SpannerIO.read()
        .withSpannerConfig(spannerConfig)
        .withQuery("SELECT name, email FROM users")
        .withTransaction(tx));

PCollection<Struct> tweets = p.apply(
    SpannerIO.read()
        .withSpannerConfig(spannerConfig)
        .withQuery("SELECT user, tweet, ts FROM tweets")
        .withTransaction(tx));
Writing to Cloud Spanner
The Cloud Spanner SpannerIO.Write transform writes to Cloud Spanner by executing a
collection of input row Mutations. The mutations are grouped into batches for
efficiency.
To configure the write transform, create an instance using #write() and then specify
the destination Cloud Spanner instance (Write#withInstanceId(String)) and destination
database (Write#withDatabaseId(String)). For example:
// Earlier in the pipeline, create a PCollection of Mutations to be written to Cloud Spanner.
PCollection<Mutation> mutations = ...;
// Write mutations.
SpannerWriteResult result = mutations.apply(
    "Write", SpannerIO.write()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId));
SpannerWriteResult
The SpannerWriteResult object contains the results of the transform,
including a PCollection of MutationGroups that failed to write, and a
PCollection that can be used as a completion signal.
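Consuming the result can be sketched as follows, assuming the write is configured with FailureMode.REPORT_FAILURES so that failed mutations are reported rather than failing the pipeline (downstreamInput is an illustrative name):

```java
SpannerWriteResult result = mutations.apply(
    SpannerIO.write()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withFailureMode(SpannerIO.FailureMode.REPORT_FAILURES));

// MutationGroups that could not be written, e.g. for logging or retry.
PCollection<MutationGroup> failed = result.getFailedMutations();

// Use the output PCollection as a completion signal for a downstream step.
downstreamInput.apply(Wait.on(result.getOutput()));
```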
Batching
To reduce the number of transactions sent to Spanner, the Mutations are
grouped into batches. The default maximum size of a batch is 1MB or 5000 mutated cells.
To override this, use
Write#withBatchSizeBytes(long) and
Write#withMaxNumMutations(long). Setting either to a small value or zero
disables batching.
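For example, the defaults can be overridden as follows (the 64KB and 1000-cell limits are arbitrary illustrative values):

```java
SpannerWriteResult result = mutations.apply(
    SpannerIO.write()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .withBatchSizeBytes(64 * 1024)  // at most 64KB of mutations per batch
        .withMaxNumMutations(1000));    // at most 1000 mutated cells per batch
```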
Note that the maximum
size of a single transaction is 20,000 mutated cells, including cells in indexes. If you
have a large number of indexes and are getting exceptions with the message
"INVALID_ARGUMENT: The transaction contains too many mutations", you will need to
specify a smaller value for MaxNumMutations.
The batches written are obtained by grouping enough Mutations from the
Bundle provided by Beam to form (by default) 1000 batches. This group of
Mutations is then sorted by Key, and the batches are created from the sorted group, so that
each batch has keys that are 'close' to each other to optimise write performance. This
grouping factor (number of batches) is controlled by the parameter
Write#withGroupingFactor(int).
Note that each worker will need enough memory to hold
GroupingFactor x MaxBatchSizeBytes of Mutations, so if you have a large
MaxBatchSizeBytes you may need to reduce
GroupingFactor.
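Reducing memory pressure can be sketched as follows (100 is an arbitrary illustrative value):

```java
SpannerWriteResult result = mutations.apply(
    SpannerIO.write()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        // Sort and group only 100 batches' worth of mutations at a time,
        // trading write locality for a smaller per-worker memory footprint.
        .withGroupingFactor(100));
```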
Database Schema Preparation
The Write transform reads the database schema on pipeline start. If the schema is created as
part of the same pipeline, this transform needs to wait until this has happened. Use
Write#withSchemaReadySignal(PCollection) to pass a signal
PCollection which will be used
with
Wait.OnSignal to prevent the schema from being read until it is ready. The Write
transform will be paused until the signal
PCollection is closed.
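A sketch, assuming an earlier pipeline step that creates the database schema and produces a signal PCollection on completion (ddlResult is an illustrative name):

```java
// ddlResult is the output of the pipeline step that creates the schema.
SpannerWriteResult result = mutations.apply(
    SpannerIO.write()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        // The schema is read only after ddlResult is closed.
        .withSchemaReadySignal(ddlResult));
```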
Transactions
The transform does not provide the same transactional guarantees as Cloud Spanner. In particular:
- Individual Mutations are submitted atomically, but all Mutations are not submitted in the
same transaction.
- A Mutation is applied at least once.
- If the pipeline was unexpectedly stopped, mutations that were already applied will not get
rolled back.
Use
MutationGroup with the
WriteGrouped transform to ensure
that a small set of mutations is bundled together. It is guaranteed that mutations in a
MutationGroup are submitted in the same transaction. Note that a MutationGroup must not exceed
the Spanner transaction limits.
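Constructing a MutationGroup can be sketched as follows, with illustrative mutations where an attendant delete must commit atomically with the primary insert (table and column names are assumptions):

```java
// The first mutation is the primary; all attendant mutations are
// submitted in the same transaction as the primary.
MutationGroup group = MutationGroup.create(
    Mutation.newInsertOrUpdateBuilder("users")
        .set("id").to(1L)
        .set("name").to("alice")
        .build(),
    Mutation.delete("pending_users", KeySet.singleKey(Key.of(1L))));
```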
// Earlier in the pipeline, create a PCollection of MutationGroups to be written to Cloud Spanner.
PCollection<MutationGroup> mutationGroups = ...;
// Write mutation groups.
SpannerWriteResult result = mutationGroups.apply(
    "Write", SpannerIO.write()
        .withInstanceId(instanceId)
        .withDatabaseId(databaseId)
        .grouped());
Streaming Support
SpannerIO.Write can be used as a streaming sink, however as with batch mode note that
the write order of individual
Mutation/
MutationGroup objects is not guaranteed.