/**
 * Builds a bounded {@link PCollection} containing the given rows, registered with
 * {@code SOURCE_SCHEMA} using identity to/from-Row conversions (elements already are Rows).
 */
private PCollection<Row> createPCollection(Pipeline pipeline, Row... rows) {
  List<Row> rowList = Arrays.asList(rows);
  return pipeline.apply(
      Create.of(rowList)
          .withSchema(
              SOURCE_SCHEMA,
              SerializableFunctions.identity(),
              SerializableFunctions.identity()));
}
/** Builds the shared two-row bounded input (int key plus string-array column). */
private PCollection<Row> pCollectionOf2Elements() {
  Row first =
      Row.withSchema(INPUT_SCHEMA)
          .addValues(1)
          .addArray(Arrays.asList("111", "222"))
          .build();
  Row second =
      Row.withSchema(INPUT_SCHEMA)
          .addValues(2)
          .addArray(Arrays.asList("33", "44", "55"))
          .build();
  return pipeline.apply(
      "boundedInput1",
      Create.of(first, second)
          .withSchema(
              INPUT_SCHEMA,
              SerializableFunctions.identity(),
              SerializableFunctions.identity()));
}
}
/** Verifies that {@code Create.withSchema} attaches a {@link SchemaCoder} to the output. */
@Test
public void testCreateExplicitSchema() {
  PCollection<String> output =
      p.apply(
          Create.of("a", "b", "c", "d")
              .withSchema(
                  STRING_SCHEMA,
                  // to-Row: wrap each string in a one-field Row
                  s -> Row.withSchema(STRING_SCHEMA).addValue(s).build(),
                  // from-Row: unwrap the single "field" column
                  r -> r.getString("field")));
  assertThat(output.getCoder(), instanceOf(SchemaCoder.class));
}
1, Row.withSchema(nestedSchema).addValues(312, "CC", 313).build()) .build()) .withSchema( inputType, SerializableFunctions.identity(), SerializableFunctions.identity()));
1, Row.withSchema(nestedSchema).addValues(312, "CC", 313).build()) .build()) .withSchema( inputType, SerializableFunctions.identity(), SerializableFunctions.identity()));
pipeline.apply( Create.of(Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build()) .withSchema( INPUT_ROW_SCHEMA, SerializableFunctions.identity(),
// Exercises @FieldAccess on a DoFn parameter: the DoFn receives each schema'd element as a
// Row restricted to the declared descriptor (all fields here) and formats it as "s:i".
@Test
@Category({ValidatesRunner.class, UsesSchema.class})
public void testFieldAccessSchemaPipeline() {
  List<MyPojo> pojoList =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  Schema schema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();
  PCollection<String> output =
      pipeline
          .apply(
              Create.of(pojoList)
                  .withSchema(
                      schema,
                      // to-Row conversion for MyPojo
                      o -> Row.withSchema(schema).addValues(o.stringField, o.integerField).build(),
                      // from-Row conversion back to MyPojo
                      r -> new MyPojo(r.getString("string_field"), r.getInt32("integer_field"))))
          .apply(
              ParDo.of(
                  new DoFn<MyPojo, String>() {
                    // Descriptor registered under id "foo", referenced by the @FieldAccess
                    // annotation on the process() parameter below.
                    @FieldAccess("foo")
                    final FieldAccessDescriptor fieldAccess = FieldAccessDescriptor.withAllFields();

                    @ProcessElement
                    public void process(@FieldAccess("foo") Row row, OutputReceiver<String> r) {
                      r.output(row.getString(0) + ":" + row.getInt32(1));
                    }
                  }));
  PAssert.that(output).containsInAnyOrder("a:1", "b:2", "c:3");
  pipeline.run();
}
@Test public void testUnnestNamedLiteral() { PCollection<Row> input = pipeline.apply( "boundedInput1", Create.empty(TypeDescriptor.of(Row.class)) .withSchema( INPUT_SCHEMA, SerializableFunctions.identity(), SerializableFunctions.identity())); // Because we have a multi-part FROM the DSL considers it multi-input TupleTag<Row> mainTag = new TupleTag<Row>("main") {}; PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input); Schema resultType = Schema.builder().addStringField("f_string").build(); PCollection<Row> result = inputTuple.apply( "sqlQuery", SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c']) AS t(f_string)")); PAssert.that(result) .containsInAnyOrder( Row.withSchema(resultType).addValues("a").build(), Row.withSchema(resultType).addValues("b").build(), Row.withSchema(resultType).addValues("c").build()); pipeline.run(); }
// CASTs a yyyyMMdd string to DATE by first rewriting it into ISO yyyy-MM-dd form via
// SUBSTRING/TRIM concatenation, then asserting the resulting DateTime value.
@Test
public void testCastToDate() {
  PCollection<Row> input =
      pipeline.apply(
          Create.of(Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build())
              .withSchema(
                  INPUT_ROW_SCHEMA,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));

  Schema resultType =
      Schema.builder().addInt32Field("f_int").addNullableField("f_date", DATETIME).build();

  PCollection<Row> result =
      input.apply(
          SqlTransform.query(
              // Rebuild "20181018" as "2018-10-18" before the CAST, since the ISO form is
              // what the DATE cast accepts.
              "SELECT f_int, \n"
                  + " CAST( \n"
                  + " SUBSTRING(TRIM(f_string) FROM 1 FOR 4) \n"
                  + " ||'-' \n"
                  + " ||SUBSTRING(TRIM(f_string) FROM 5 FOR 2) \n"
                  + " ||'-' \n"
                  + " ||SUBSTRING(TRIM(f_string) FROM 7 FOR 2) as DATE) \n"
                  + "FROM PCOLLECTION"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues(1, new DateTime(2018, 10, 18, 0, 0)).build());
  pipeline.run();
}
/** End-to-end check that a schema'd Create feeds a DoFn consuming elements as Rows. */
@Test
@Category({ValidatesRunner.class, UsesSchema.class})
public void testSimpleSchemaPipeline() {
  List<MyPojo> inputPojos =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  Schema pojoSchema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();

  PCollection<String> formatted =
      pipeline
          .apply(
              Create.of(inputPojos)
                  .withSchema(
                      pojoSchema,
                      // to-Row conversion for MyPojo
                      o ->
                          Row.withSchema(pojoSchema)
                              .addValues(o.stringField, o.integerField)
                              .build(),
                      // from-Row conversion back to MyPojo
                      r -> new MyPojo(r.getString("string_field"), r.getInt32("integer_field"))))
          .apply(
              ParDo.of(
                  new DoFn<MyPojo, String>() {
                    @ProcessElement
                    public void process(@Element Row row, OutputReceiver<String> out) {
                      out.output(row.getString(0) + ":" + row.getInt32(1));
                    }
                  }));

  PAssert.that(formatted).containsInAnyOrder("a:1", "b:2", "c:3");
  pipeline.run();
}
/**
 * Verifies that declaring a {@code @FieldAccess} descriptor referencing a field ("baad") that
 * does not exist in the element schema fails pipeline construction with
 * {@link IllegalArgumentException}.
 */
@Test
@Category({ValidatesRunner.class, UsesSchema.class})
public void testUnmatchedSchema() {
  List<MyPojo> pojoList =
      Lists.newArrayList(new MyPojo("a", 1), new MyPojo("b", 2), new MyPojo("c", 3));
  Schema schema =
      Schema.builder().addStringField("string_field").addInt32Field("integer_field").build();

  // Expect failure at graph-construction time, before the pipeline is run.
  thrown.expect(IllegalArgumentException.class);
  pipeline
      .apply(
          Create.of(pojoList)
              .withSchema(
                  schema,
                  // to-Row conversion for MyPojo
                  o -> Row.withSchema(schema).addValues(o.stringField, o.integerField).build(),
                  // from-Row conversion back to MyPojo
                  r -> new MyPojo(r.getString("string_field"), r.getInt32("integer_field"))))
      .apply(
          ParDo.of(
              new DoFn<MyPojo, Void>() {
                // final for consistency with the other @FieldAccess tests in this file: the
                // descriptor is a constant and should never be reassigned.
                @FieldAccess("a")
                final FieldAccessDescriptor fieldAccess =
                    FieldAccessDescriptor.withFieldNames("baad");

                @ProcessElement
                public void process(@FieldAccess("a") Row row) {}
              }));
}
/** CASTs a yyyyMMdd string column directly to DATE and checks the resulting value. */
@Test
public void testCastToDate2() {
  PCollection<Row> rows =
      pipeline.apply(
          Create.of(Row.withSchema(INPUT_ROW_SCHEMA).addValues(1).addValue("20181018").build())
              .withSchema(
                  INPUT_ROW_SCHEMA,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));

  Schema outputSchema =
      Schema.builder().addInt32Field("f_int").addNullableField("f_date", DATETIME).build();

  PCollection<Row> castResult =
      rows.apply(
          SqlTransform.query(
              "SELECT f_int, \n" + " CAST( \n" + " f_string AS DATE) \n" + "FROM PCOLLECTION"));

  Row expected =
      Row.withSchema(outputSchema).addValues(1, new DateTime(2018, 10, 18, 0, 0)).build();
  PAssert.that(castResult).containsInAnyOrder(expected);
  pipeline.run();
}
@Test public void testUnnestLiteral() { PCollection<Row> input = pipeline.apply( "boundedInput1", Create.empty(TypeDescriptor.of(Row.class)) .withSchema( INPUT_SCHEMA, SerializableFunctions.identity(), SerializableFunctions.identity())); // Because we have a multi-part FROM the DSL considers it multi-input TupleTag<Row> mainTag = new TupleTag<Row>("main") {}; PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input); Schema resultType = Schema.builder().addStringField("f_string").build(); PCollection<Row> result = inputTuple.apply( "sqlQuery", SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c'])")); PAssert.that(result) .containsInAnyOrder( Row.withSchema(resultType).addValues("a").build(), Row.withSchema(resultType).addValues("b").build(), Row.withSchema(resultType).addValues("c").build()); pipeline.run(); }
/** Seeds {@code boundedInput} with five six-column rows (three doubles, three ints). */
@Before
public void setUp() {
  Schema rowSchema =
      Schema.builder()
          .addDoubleField("f_double1")
          .addDoubleField("f_double2")
          .addDoubleField("f_double3")
          .addInt32Field("f_int1")
          .addInt32Field("f_int2")
          .addInt32Field("f_int3")
          .build();

  // One row per line: f_double1..3 followed by f_int1..3.
  List<Row> tableRows =
      TestUtils.RowsBuilder.of(rowSchema)
          .addRows(
              3.0, 1.0, 1.0, 3, 1, 0,
              4.0, 2.0, 2.0, 4, 2, 0,
              5.0, 3.0, 1.0, 5, 3, 0,
              6.0, 4.0, 2.0, 6, 4, 0,
              8.0, 4.0, 1.0, 8, 4, 0)
          .getRows();

  boundedInput =
      pipeline.apply(
          Create.of(tableRows)
              .withSchema(
                  rowSchema, SerializableFunctions.identity(), SerializableFunctions.identity()));
}
/**
 * Evaluates {@code SELECT <expr>} against a single dummy row and asserts the expression
 * evaluates to {@code true}, reporting the expression text on failure.
 */
@Override
public PDone expand(PBegin begin) {
  PCollection<Boolean> evaluated =
      begin
          .apply(
              Create.of(DUMMY_ROW)
                  .withSchema(
                      DUMMY_SCHEMA,
                      SerializableFunctions.identity(),
                      SerializableFunctions.identity()))
          .apply(SqlTransform.query("SELECT " + expr))
          .apply(MapElements.into(TypeDescriptors.booleans()).via(row -> row.getBoolean(0)));

  PAssert.that(evaluated)
      .satisfies(
          input -> {
            assertTrue("Test expression is false: " + expr, Iterables.getOnlyElement(input));
            return null;
          });
  return PDone.in(begin.getPipeline());
}
}
/** ELEMENT() over a single-element array column extracts that element. */
@Test
public void testSingleElement() throws Exception {
  Row singleRow =
      Row.withSchema(INPUT_SCHEMA).addValues(1).addArray(Arrays.asList("111")).build();

  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.of(singleRow)
              .withSchema(
                  INPUT_SCHEMA,
                  SerializableFunctions.identity(),
                  SerializableFunctions.identity()));

  Schema outputSchema = Schema.builder().addStringField("f_arrElem").build();

  PCollection<Row> result =
      input.apply("sqlQuery", SqlTransform.query("SELECT ELEMENT(f_stringArr) FROM PCOLLECTION"));

  PAssert.that(result).containsInAnyOrder(Row.withSchema(outputSchema).addValues("111").build());
  pipeline.run();
}
/** Builds the shared two-row input whose second column is an integer-valued map. */
private PCollection<Row> pCollectionOf2Elements() {
  Row rowOne =
      Row.withSchema(INPUT_ROW_TYPE)
          .addValues(1)
          .addValue(ImmutableMap.of("key11", 11, "key22", 22))
          .build();
  Row rowTwo =
      Row.withSchema(INPUT_ROW_TYPE)
          .addValues(2)
          .addValue(ImmutableMap.of("key33", 33, "key44", 44, "key55", 55))
          .build();
  return pipeline.apply(
      "boundedInput1",
      Create.of(rowOne, rowTwo)
          .withSchema(
              INPUT_ROW_TYPE,
              SerializableFunctions.identity(),
              SerializableFunctions.identity()));
}
}
/** Seeds {@code boundedInput} with rows containing nulls in the two nullable int columns. */
@Before
public void setUp() {
  Schema nullableSchema =
      Schema.builder()
          .addNullableField("f_int1", Schema.FieldType.INT32)
          .addNullableField("f_int2", Schema.FieldType.INT32)
          .addInt32Field("f_int3")
          .build();

  List<Row> testRows =
      TestUtils.RowsBuilder.of(nullableSchema)
          .addRows(1, 5, 1)
          .addRows(null, 1, 1)
          .addRows(2, 1, 1)
          .addRows(null, 1, 1)
          .addRows(null, null, 1)
          .addRows(null, null, 1)
          .addRows(3, 2, 1)
          .getRows();

  boundedInput =
      PBegin.in(pipeline)
          .apply(Create.of(testRows).withSchema(nullableSchema, identity(), identity()));
}
/** Seeds {@code boundedInput} with seven three-column rows (int, double, int). */
@Before
public void setUp() {
  Schema rowSchema =
      Schema.builder()
          .addInt32Field("f_int")
          .addDoubleField("f_double")
          .addInt32Field("f_int2")
          .build();

  // One row per line: f_int, f_double, f_int2.
  List<Row> tableRows =
      TestUtils.RowsBuilder.of(rowSchema)
          .addRows(
              1, 1.0, 0,
              4, 4.0, 0,
              7, 7.0, 0,
              13, 13.0, 0,
              5, 5.0, 0,
              10, 10.0, 0,
              17, 17.0, 0)
          .getRows();

  boundedInput =
      pipeline.apply(
          Create.of(tableRows)
              .withSchema(
                  rowSchema, SerializableFunctions.identity(), SerializableFunctions.identity()));
}
/** Round-trip: a Row carrying the POJO's schema converts back into an equal {@code POJO1}. */
@Test
@Category(NeedsRunner.class)
public void testFromRows() {
  PCollection<POJO1> converted =
      pipeline
          .apply(
              Create.of(EXPECTED_ROW1)
                  .withSchema(
                      EXPECTED_SCHEMA1,
                      SerializableFunctions.identity(),
                      SerializableFunctions.identity()))
          .apply(Convert.fromRows(POJO1.class));
  PAssert.that(converted).containsInAnyOrder(new POJO1());
  pipeline.run();
}