@Override
public PCollection<Row> expand(PCollection<? extends String> jsonStrings) {
  return jsonStrings
      .apply(
          ParDo.of(
              new DoFn<String, Row>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                  context.output(jsonToRow(objectMapper(), context.element()));
                }
              }))
      .setRowSchema(schema);
}
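// A self-contained sketch of the same JSON-to-Row pattern using Beam's built-in
// JsonToRow transform. The field names and sample payloads are illustrative
// assumptions, not taken from the snippet above.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.JsonToRow;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

public class JsonToRowExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    Schema schema = Schema.builder().addStringField("name").addInt32Field("age").build();

    PCollection<Row> rows =
        pipeline
            .apply(Create.of("{\"name\":\"alice\",\"age\":1}", "{\"name\":\"bob\",\"age\":2}"))
            // JsonToRow.withSchema attaches the schema itself, so no explicit
            // setRowSchema call is needed on the output.
            .apply(JsonToRow.withSchema(schema));

    pipeline.run().waitUntilFinish();
  }
}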
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  return begin
      .apply("MockedBoundedTable_Reader_" + COUNTER.incrementAndGet(), Create.of(rows))
      .setRowSchema(getSchema());
}
@Override
public PCollection<Row> expand(PCollection<KV<byte[], byte[]>> input) {
  return input
      .apply(
          "decodeRecord",
          ParDo.of(
              new DoFn<KV<byte[], byte[]>, Row>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  String rowInString = new String(c.element().getValue(), UTF_8);
                  for (Row row : csvLines2BeamRows(format, rowInString, schema)) {
                    c.output(row);
                  }
                }
              }))
      .setRowSchema(schema);
}
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 0,
      "Should not have received input for %s: %s",
      BeamValuesRel.class.getSimpleName(),
      pinput);

  if (tuples.isEmpty()) {
    throw new IllegalStateException("Values with empty tuples!");
  }

  Schema schema = CalciteUtils.toSchema(getRowType());
  List<Row> rows = tuples.stream().map(tuple -> tupleToRow(schema, tuple)).collect(toList());

  return pinput.getPipeline().begin().apply(Create.of(rows)).setRowSchema(schema);
}
@Override public PCollection<Row> expand(PCollection<String> input) { return input .apply( "csvToRow", FlatMapElements.into(TypeDescriptors.rows()) .via(s -> csvLines2BeamRows(csvFormat, s, schema))) .setRowSchema(schema); } }
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  // TODO: make this more generic.
  return begin
      .apply(BigQueryIO.read(BigQueryUtils.toBeamRow(schema)).from(tableSpec))
      .setRowSchema(getSchema());
}
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  // The set of rows where we run the correlated unnest for each row.
  PCollection<Row> outer = pinput.get(0);

  Schema joinedSchema = CalciteUtils.toSchema(rowType);

  return outer
      .apply(ParDo.of(new UnnestFn(joinedSchema, unnestIndex)))
      .setRowSchema(joinedSchema);
}
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  Schema outputSchema = getUnnestedSchema(inputSchema, getFieldNameFunction());

  return input
      .apply(
          ParDo.of(
              new DoFn<T, Row>() {
                @ProcessElement
                public void processElement(@Element Row row, OutputReceiver<Row> o) {
                  o.output(unnestRow(row, outputSchema));
                }
              }))
      .setRowSchema(outputSchema);
}
private PCollection<Row> queryFromOrderTables(String sql) {
  return tuple(
          "ORDER_DETAILS1",
          ORDER_DETAILS1.buildIOReader(pipeline.begin()).setRowSchema(SOURCE_ROW_TYPE),
          "ORDER_DETAILS2",
          ORDER_DETAILS2.buildIOReader(pipeline.begin()).setRowSchema(SOURCE_ROW_TYPE))
      .apply("join", SqlTransform.query(sql))
      .setRowSchema(RESULT_ROW_TYPE);
}
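// A runnable sketch of the same SqlTransform join pattern over two in-memory
// tables registered via a PCollectionTuple; the tag ids become the SQL table
// names. Schemas, values, and the query are illustrative assumptions.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;

public class SqlJoinExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    Schema orderSchema = Schema.builder().addInt32Field("id").addStringField("item").build();

    PCollection<Row> orders1 =
        pipeline
            .apply("Orders1", Create.of(Row.withSchema(orderSchema).addValues(1, "apple").build()))
            .setRowSchema(orderSchema);
    PCollection<Row> orders2 =
        pipeline
            .apply("Orders2", Create.of(Row.withSchema(orderSchema).addValues(1, "banana").build()))
            .setRowSchema(orderSchema);

    PCollection<Row> joined =
        PCollectionTuple.of(new TupleTag<>("ORDERS1"), orders1)
            .and(new TupleTag<>("ORDERS2"), orders2)
            .apply(
                "join",
                SqlTransform.query(
                    "SELECT o1.id, o2.item FROM ORDERS1 o1 JOIN ORDERS2 o2 ON o1.id = o2.id"));

    pipeline.run().waitUntilFinish();
  }
}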
@Override public PCollection<Row> expand(PCollectionList<Row> pinput) { checkArgument( pinput.size() == 1, "Wrong number of inputs for %s: %s", BeamCalcRel.class.getSimpleName(), pinput); PCollection<Row> upstream = pinput.get(0); BeamSqlExpressionExecutor executor = new BeamSqlFnExecutor(BeamCalcRel.this.getProgram()); Schema schema = CalciteUtils.toSchema(rowType); PCollection<Row> projectStream = upstream .apply(ParDo.of(new CalcFn(executor, CalciteUtils.toSchema(rowType)))) .setRowSchema(schema); projectStream.setRowSchema(CalciteUtils.toSchema(getRowType())); return projectStream; } }
@Override
public PCollection<Row> expand(PCollection<InputT> input) {
  SchemaAggregateFn.Inner<InputT> fn =
      schemaAggregateFn.withSchema(input.getSchema(), input.getToRowFunction());
  return input.apply(Combine.globally(fn)).setRowSchema(fn.getOutputSchema());
}
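// A sketch of the same schema-aware global aggregation pattern using Beam's
// Group transform, which builds on a schema aggregate fn much like the
// expansion above. The schema and field names are illustrative assumptions.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.transforms.Group;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

public class SchemaAggregateExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    Schema schema = Schema.builder().addStringField("user").addInt32Field("count").build();

    PCollection<Row> input =
        pipeline
            .apply(
                Create.of(
                    Row.withSchema(schema).addValues("a", 1).build(),
                    Row.withSchema(schema).addValues("b", 2).build()))
            .setRowSchema(schema);

    // Sums the "count" field across all rows; the output rows carry a schema
    // derived from the aggregation, so no explicit setRowSchema is needed.
    PCollection<Row> totals =
        input.apply(Group.<Row>globally().aggregateField("count", Sum.ofIntegers(), "total"));

    pipeline.run().waitUntilFinish();
  }
}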
@Override public PCollection<Row> expand(PCollectionList<Row> pinput) { checkArgument( pinput.size() == 1, "Wrong number of inputs for %s: %s", BeamUncollectRel.class.getSimpleName(), pinput); PCollection<Row> upstream = pinput.get(0); // Each row of the input contains a single array of things to be emitted; Calcite knows // what the row looks like Schema outputSchema = CalciteUtils.toSchema(getRowType()); PCollection<Row> uncollected = upstream.apply(ParDo.of(new UncollectDoFn(outputSchema))).setRowSchema(outputSchema); return uncollected; } }
@Override public PCollection<Row> expand(PCollection<String> input) { return input .apply( "linesToRows", MapElements.into(TypeDescriptors.rows()) .via(s -> Row.withSchema(SCHEMA).addValue(s).build())) .setRowSchema(SCHEMA); } }
@Override
public PCollection<Row> expand(PCollection<T> input) {
  Schema inputSchema = input.getSchema();
  verifyCompatibility(inputSchema);

  return input
      .apply(
          ParDo.of(
              new DoFn<T, Row>() {
                // TODO: This should be the same as resolved so that Beam knows which fields
                // are being accessed. Currently Beam only supports wildcard descriptors.
                // Once BEAM-4457 is fixed, fix this.
                @FieldAccess("filterFields")
                final FieldAccessDescriptor fieldAccessDescriptor =
                    FieldAccessDescriptor.withAllFields();

                @ProcessElement
                public void process(
                    @FieldAccess("filterFields") Row input, OutputReceiver<Row> r) {
                  Row output = castRow(input, inputSchema, outputSchema());
                  r.output(output);
                }
              }))
      .setRowSchema(outputSchema());
}
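// A usage sketch of the schema-cast pattern above via Beam's Cast transform,
// widening an int32 field to int64. The schemas and field name are
// illustrative assumptions.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.transforms.Cast;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;

public class CastRowExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    Schema narrow = Schema.builder().addInt32Field("count").build();
    Schema wide = Schema.builder().addInt64Field("count").build();

    PCollection<Row> input =
        pipeline
            .apply(Create.of(Row.withSchema(narrow).addValues(1).build()))
            .setRowSchema(narrow);

    // Cast.widening only permits lossless conversions such as int32 -> int64.
    PCollection<Row> widened = input.apply(Cast.widening(wide));

    pipeline.run().waitUntilFinish();
  }
}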
protected PCollection<Row> getFloorCeilingTestPCollection() {
  try {
    return TestBoundedTable.of(ROW_TYPE_THREE)
        .addRows(parseTimestampWithUTCTimeZone("1986-02-15 11:35:26"), 1.4)
        .buildIOReader(pipeline.begin())
        .setRowSchema(ROW_TYPE_THREE);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  TestStream.Builder<Row> values =
      TestStream.create(
          schema, SerializableFunctions.identity(), SerializableFunctions.identity());

  for (Pair<Duration, List<Row>> pair : timestampedRows) {
    values = values.advanceWatermarkTo(new Instant(0).plus(pair.getKey()));
    for (int i = 0; i < pair.getValue().size(); i++) {
      values =
          values.addElements(
              TimestampedValue.of(
                  pair.getValue().get(i),
                  new Instant(pair.getValue().get(i).getDateTime(timestampField))));
    }
  }

  return begin
      .apply(
          "MockedUnboundedTable_" + COUNTER.incrementAndGet(),
          values.advanceWatermarkToInfinity())
      .setRowSchema(getSchema());
}
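// A trimmed-down sketch of the TestStream pattern above: elements are added
// with explicit timestamps and the watermark is advanced between batches.
// The schema, values, and times are illustrative assumptions; TestStream
// requires a runner that supports it, such as the DirectRunner.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.RowCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.testing.TestStream;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TimestampedValue;
import org.joda.time.Duration;
import org.joda.time.Instant;

public class TestStreamRowExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    Schema schema = Schema.builder().addStringField("user").addInt32Field("count").build();

    TestStream<Row> stream =
        TestStream.create(RowCoder.of(schema))
            .advanceWatermarkTo(new Instant(0))
            .addElements(
                TimestampedValue.of(
                    Row.withSchema(schema).addValues("a", 1).build(), new Instant(1000)))
            .advanceWatermarkTo(new Instant(0).plus(Duration.standardMinutes(1)))
            .addElements(
                TimestampedValue.of(
                    Row.withSchema(schema).addValues("b", 2).build(), new Instant(61000)))
            .advanceWatermarkToInfinity();

    // As in the snippet above, the Row schema is attached after applying the stream.
    PCollection<Row> rows = pipeline.apply(stream).setRowSchema(schema);

    pipeline.run().waitUntilFinish();
  }
}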
private PCollection<Row> sideInputJoinHelper(
    JoinRelType joinType,
    PCollection<KV<Row, Row>> leftRows,
    PCollection<KV<Row, Row>> rightRows,
    Row rightNullRow,
    boolean swapped) {
  final PCollectionView<Map<Row, Iterable<Row>>> rowsView = rightRows.apply(View.asMultimap());

  Schema schema = CalciteUtils.toSchema(getRowType());
  return leftRows
      .apply(
          ParDo.of(
                  new BeamJoinTransforms.SideInputJoinDoFn(
                      joinType, rightNullRow, rowsView, swapped, schema))
              .withSideInputs(rowsView))
      .setRowSchema(schema);
}
protected PCollection<Row> getTestPCollection() {
  try {
    return TestBoundedTable.of(ROW_TYPE)
        .addRows(
            parseTimestampWithUTCTimeZone("1986-02-15 11:35:26"),
            (byte) 1,
            (short) 1,
            1,
            1L,
            1.0f,
            1.0,
            BigDecimal.ONE,
            (byte) 127,
            (short) 32767,
            2147483647,
            9223372036854775807L)
        .buildIOReader(pipeline.begin())
        .setRowSchema(ROW_TYPE);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  PCollectionTuple rowsWithDlq =
      begin
          .apply("readFromPubsub", readMessagesWithAttributes())
          .apply("parseMessageToRow", createParserParDo());
  rowsWithDlq.get(MAIN_TAG).setRowSchema(getSchema());

  if (useDlq()) {
    rowsWithDlq.get(DLQ_TAG).apply(writeMessagesToDlq());
  }

  return rowsWithDlq.get(MAIN_TAG);
}
@Test
@Category(NeedsRunner.class)
public void testMismatchingKeys() {
  PCollection<Row> pc1 =
      pipeline
          .apply(
              "Create1",
              Create.of(Row.withSchema(CG_SCHEMA_1).addValues("user1", 1, "us").build()))
          .setRowSchema(CG_SCHEMA_1);
  PCollection<Row> pc2 =
      pipeline
          .apply(
              "Create2",
              Create.of(Row.withSchema(CG_SCHEMA_1).addValues("user1", 9, "us").build()))
          .setRowSchema(CG_SCHEMA_1);

  TupleTag<Row> pc1Tag = new TupleTag<>("pc1");
  TupleTag<Row> pc2Tag = new TupleTag<>("pc2");

  // Joining a String field ("user") against an Int32 field ("count") yields
  // incompatible key schemas, which CoGroup rejects at expansion time.
  thrown.expect(IllegalStateException.class);
  PCollection<KV<Row, Row>> joined =
      PCollectionTuple.of(pc1Tag, pc1)
          .and(pc2Tag, pc2)
          .apply("CoGroup", CoGroup.byFieldNames(pc1Tag, "user").byFieldNames(pc2Tag, "count"));
  pipeline.run();
}
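// For contrast with the mismatched-key test above: a sketch where both inputs
// are joined on the same "user" field, using the CoGroup API vintage shown in
// the test. Schemas and values are illustrative assumptions.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.schemas.transforms.CoGroup;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.Row;
import org.apache.beam.sdk.values.TupleTag;

public class CoGroupMatchingKeysExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.create());

    Schema schema =
        Schema.builder()
            .addStringField("user")
            .addInt32Field("count")
            .addStringField("country")
            .build();

    PCollection<Row> pc1 =
        pipeline
            .apply("Create1", Create.of(Row.withSchema(schema).addValues("user1", 1, "us").build()))
            .setRowSchema(schema);
    PCollection<Row> pc2 =
        pipeline
            .apply("Create2", Create.of(Row.withSchema(schema).addValues("user1", 9, "us").build()))
            .setRowSchema(schema);

    TupleTag<Row> pc1Tag = new TupleTag<>("pc1");
    TupleTag<Row> pc2Tag = new TupleTag<>("pc2");

    // Both sides key on "user", so the key schemas are compatible and the
    // expansion succeeds, unlike the String-vs-Int32 mismatch above.
    PCollection<KV<Row, Row>> joined =
        PCollectionTuple.of(pc1Tag, pc1)
            .and(pc2Tag, pc2)
            .apply("CoGroup", CoGroup.byFieldNames(pc1Tag, "user").byFieldNames(pc2Tag, "user"));

    pipeline.run().waitUntilFinish();
  }
}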