public static void main(String[] args) { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); Pipeline p = Pipeline.create(options); String instanceId = options.getInstanceId(); String databaseId = options.getDatabaseId(); // [START spanner_dataflow_read] // Query for all the columns and rows in the specified Spanner table PCollection<Struct> records = p.apply( SpannerIO.read() .withInstanceId(instanceId) .withDatabaseId(databaseId) .withQuery("SELECT * FROM " + options.getTable())); // [END spanner_dataflow_read] PCollection<Long> tableEstimatedSize = records // Estimate the size of every row .apply(EstimateSize.create()) // Sum all the row sizes to get the total estimated size of the table .apply(Sum.longsGlobally()); // Write the total size to a file tableEstimatedSize .apply(ToString.elements()) .apply(TextIO.write().to(options.getOutput()).withoutSharding()); p.run().waitUntilFinish(); } }
.apply(ToString.elements()) .apply(TextIO.write().to(options.getOutput()).withoutSharding());
public static void main(String[] args) { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); Pipeline p = Pipeline.create(options); SpannerConfig spannerConfig = SpannerConfig.create() .withInstanceId(options.getInstanceId()) .withDatabaseId(options.getDatabaseId()); // [START spanner_dataflow_readall] PCollection<Struct> allRecords = p.apply(SpannerIO.read() .withSpannerConfig(spannerConfig) .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t" + ".table_catalog = '' AND t.table_schema = ''")).apply( MapElements.into(TypeDescriptor.of(ReadOperation.class)) .via((SerializableFunction<Struct, ReadOperation>) input -> { String tableName = input.getString(0); return ReadOperation.create().withQuery("SELECT * FROM " + tableName); })).apply(SpannerIO.readAll().withSpannerConfig(spannerConfig)); // [END spanner_dataflow_readall] PCollection<Long> dbEstimatedSize = allRecords.apply(EstimateSize.create()) .apply(Sum.longsGlobally()); dbEstimatedSize.apply(ToString.elements()).apply(TextIO.write().to(options.getOutput()) .withoutSharding()); p.run().waitUntilFinish(); }
@Test @Category(NeedsRunner.class) public void testToStringOf() { Integer[] ints = {1, 2, 3, 4, 5}; String[] strings = {"1", "2", "3", "4", "5"}; PCollection<Integer> input = p.apply(Create.of(Arrays.asList(ints))); PCollection<String> output = input.apply(ToString.elements()); PAssert.that(output).containsInAnyOrder(strings); p.run(); }
@Test @Category(NeedsRunner.class) public void testReadWatchForNewFiles() throws IOException, InterruptedException { final Path basePath = tempFolder.getRoot().toPath().resolve("readWatch"); basePath.toFile().mkdir(); p.apply(GenerateSequence.from(0).to(10).withRate(1, Duration.millis(100))) .apply( Window.<Long>into(FixedWindows.of(Duration.millis(150))) .withAllowedLateness(Duration.ZERO) .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))) .discardingFiredPanes()) .apply(ToString.elements()) .apply( TextIO.write() .to(basePath.resolve("data").toString()) .withNumShards(1) .withWindowedWrites()); PCollection<String> lines = p.apply( TextIO.read() .from(basePath.resolve("*").toString()) .watchForNewFiles( Duration.millis(100), Watch.Growth.afterTimeSinceNewOutput(Duration.standardSeconds(3)))); PAssert.that(lines).containsInAnyOrder("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); p.run(); } }