org.apache.beam.sdk.schemas.transforms.Group.byFieldIds(keyFieldsIds); org.apache.beam.sdk.schemas.transforms.Group.CombineFieldsByFields<Row> combined = null; for (FieldAggregation fieldAggregation : fieldAggregations) {
/**
 * Applies {@code Group.globally()} to four POJOs and checks the grouped output against the
 * original input elements through the {@code containsSingleIterable} helper.
 */
@Test
@Category(NeedsRunner.class)
public void testGroupGlobally() {
  Collection<POJO> input =
      ImmutableList.of(
          new POJO("key1", 1, "value1"),
          new POJO("key1", 1, "value2"),
          new POJO("key2", 2, "value3"),
          new POJO("key2", 2, "value4"));

  PCollection<POJO> created = pipeline.apply(Create.of(input));
  PCollection<Iterable<POJO>> allGrouped = created.apply(Group.<POJO>globally());

  PAssert.that(allGrouped).satisfies(result -> containsSingleIterable(input, result));
  pipeline.run();
}
@Test @Category(NeedsRunner.class) public void testOutputCoders() { Schema keySchema = Schema.builder().addStringField("field1").build(); PCollection<KV<Row, Iterable<POJO>>> grouped = pipeline .apply(Create.of(new POJO("key1", 1, "value1"))) .apply(Group.byFieldNames("field1")); // Make sure that the key has the right schema. PCollection<Row> keys = grouped.apply(Keys.create()); assertTrue(keys.getSchema().equivalent(keySchema)); // Make sure that the value has the right schema. PCollection<POJO> values = grouped.apply(Values.create()).apply(Flatten.iterables()); assertTrue(values.getSchema().equivalent(POJO_SCHEMA)); pipeline.run(); }
new OuterPOJO(new POJO("key2", 2L, "value3")), new OuterPOJO(new POJO("key2", 2L, "value4")))) .apply(Group.byFieldAccessDescriptor(groupKeys));
/**
 * Groups five POJOs by {@code field1}, aggregates each group with {@code Count.combineFn()}, and
 * expects a count of 2 for {@code "key1"} and 3 for {@code "key2"}.
 */
@Test
@Category(NeedsRunner.class)
public void testPerKeyAggregation() {
  // Expected output: one (key row, count) pair per distinct value of field1.
  Schema keySchema = Schema.builder().addStringField("field1").build();
  Row firstKey = Row.withSchema(keySchema).addValue("key1").build();
  Row secondKey = Row.withSchema(keySchema).addValue("key2").build();
  Collection<KV<Row, Long>> expectedCounts =
      ImmutableList.of(KV.of(firstKey, 2L), KV.of(secondKey, 3L));

  Collection<POJO> input =
      ImmutableList.of(
          new POJO("key1", 1, "value1"),
          new POJO("key1", 1, "value2"),
          new POJO("key2", 2, "value3"),
          new POJO("key2", 2, "value4"),
          new POJO("key2", 2, "value4"));

  PCollection<POJO> created = pipeline.apply(Create.of(input));
  PCollection<KV<Row, Long>> countsPerKey =
      created.apply(Group.<POJO>byFieldNames("field1").aggregate(Count.combineFn()));

  PAssert.that(countsPerKey).containsInAnyOrder(expectedCounts);
  pipeline.run();
}
.apply(Create.of(elements)) .apply( Group.<OuterAggregate>byFieldAccessDescriptor(field2Selector) .aggregateFields(field1Selector, Sum.ofLongs(), "field1_sum") .aggregateFields(field3Selector, Sum.ofIntegers(), "field3_sum")
/**
 * Aggregates four POJOs globally with {@code Count.combineFn()} and expects the single output
 * value {@code 4}.
 */
@Test
@Category(NeedsRunner.class)
public void testGlobalAggregation() {
  Collection<POJO> input =
      ImmutableList.of(
          new POJO("key1", 1, "value1"),
          new POJO("key1", 1, "value2"),
          new POJO("key2", 2, "value3"),
          new POJO("key2", 2, "value4"));

  PCollection<POJO> created = pipeline.apply(Create.of(input));
  PCollection<Long> total =
      created.apply(Group.<POJO>globally().aggregate(Count.combineFn()));

  PAssert.that(total).containsInAnyOrder(4L);
  pipeline.run();
}
/**
 * Groups four POJOs by {@code field1} and checks the result against two expected groups, one for
 * {@code "key1"} and one for {@code "key2"}, using the {@code containsKIterableVs} helper.
 */
@Test
@Category(NeedsRunner.class)
public void testGroupByOneField() {
  // Expected output: each key row paired with the POJOs that share that field1 value.
  Schema keySchema = Schema.builder().addStringField("field1").build();
  Row firstKey = Row.withSchema(keySchema).addValue("key1").build();
  Row secondKey = Row.withSchema(keySchema).addValue("key2").build();
  List<KV<Row, Collection<POJO>>> expected =
      ImmutableList.of(
          KV.of(
              firstKey,
              ImmutableList.of(new POJO("key1", 1L, "value1"), new POJO("key1", 2L, "value2"))),
          KV.of(
              secondKey,
              ImmutableList.of(new POJO("key2", 3L, "value3"), new POJO("key2", 4L, "value4"))));

  PCollection<POJO> input =
      pipeline.apply(
          Create.of(
              new POJO("key1", 1, "value1"),
              new POJO("key1", 2, "value2"),
              new POJO("key2", 3, "value3"),
              new POJO("key2", 4, "value4")));
  PCollection<KV<Row, Iterable<POJO>>> grouped =
      input.apply(Group.<POJO>byFieldNames("field1"));

  PAssert.that(grouped).satisfies(result -> containsKIterableVs(expected, result, new POJO[0]));
  pipeline.run();
}
/**
 * Feeds {@code field1} and {@code field2} of four {@code AggregatePojos} into a global
 * {@code MultipleFieldCombineFn} aggregation named {@code "field1+field2"} and expects a single
 * output row holding {@code 16}.
 */
@Test
@Category(NeedsRunner.class)
public void testAggregateByMultipleFields() {
  // Expected output: one row with a single INT64 field named after the aggregation.
  Schema outputSchema = Schema.builder().addInt64Field("field1+field2").build();
  Row expectedRow = Row.withSchema(outputSchema).addValues(16L).build();

  Collection<AggregatePojos> input =
      ImmutableList.of(
          new AggregatePojos(1, 1, 2),
          new AggregatePojos(2, 1, 3),
          new AggregatePojos(3, 2, 4),
          new AggregatePojos(4, 2, 5));
  List<String> fieldNames = Lists.newArrayList("field1", "field2");

  PCollection<AggregatePojos> created = pipeline.apply(Create.of(input));
  PCollection<Row> aggregated =
      created.apply(
          Group.<AggregatePojos>globally()
              .aggregateFields(fieldNames, new MultipleFieldCombineFn(), "field1+field2"));

  PAssert.that(aggregated).containsInAnyOrder(expectedRow);
  pipeline.run();
}
/**
 * Groups four POJOs by the pair ({@code field1}, {@code field2}) and checks the result against
 * two expected groups, keyed by {@code ("key1", 1)} and {@code ("key2", 2)}, using the
 * {@code containsKIterableVs} helper.
 */
@Test
@Category(NeedsRunner.class)
public void testGroupByMultiple() {
  // Expected output: each two-field key row paired with the POJOs sharing both field values.
  Schema keySchema = Schema.builder().addStringField("field1").addInt64Field("field2").build();
  Row firstKey = Row.withSchema(keySchema).addValues("key1", 1L).build();
  Row secondKey = Row.withSchema(keySchema).addValues("key2", 2L).build();
  List<KV<Row, Collection<POJO>>> expected =
      ImmutableList.of(
          KV.of(
              firstKey,
              ImmutableList.of(new POJO("key1", 1L, "value1"), new POJO("key1", 1L, "value2"))),
          KV.of(
              secondKey,
              ImmutableList.of(new POJO("key2", 2L, "value3"), new POJO("key2", 2L, "value4"))));

  PCollection<POJO> input =
      pipeline.apply(
          Create.of(
              new POJO("key1", 1, "value1"),
              new POJO("key1", 1, "value2"),
              new POJO("key2", 2, "value3"),
              new POJO("key2", 2, "value4")));
  PCollection<KV<Row, Iterable<POJO>>> grouped =
      input.apply(Group.<POJO>byFieldNames("field1", "field2"));

  PAssert.that(grouped).satisfies(result -> containsKIterableVs(expected, result, new POJO[0]));
  pipeline.run();
}
/**
 * Runs three global field aggregations over four {@code AggregatePojos}: a long sum of
 * {@code field1}, an integer sum of {@code field3}, and the largest single {@code field1}
 * value. Expects one output row holding {@code 10}, {@code 14} and the array {@code [4]}.
 */
@Test
@Category(NeedsRunner.class)
public void testGloballyWithSchemaAggregateFn() {
  // Expected output: one row with a field per aggregation, in the order they are declared.
  Schema aggregateSchema =
      Schema.builder()
          .addInt64Field("field1_sum")
          .addInt32Field("field3_sum")
          .addArrayField("field1_top", FieldType.INT64)
          .build();
  Row expectedRow = Row.withSchema(aggregateSchema).addValues(10L, 14).addArray(4L).build();

  Collection<AggregatePojos> input =
      ImmutableList.of(
          new AggregatePojos(1, 1, 2),
          new AggregatePojos(2, 1, 3),
          new AggregatePojos(3, 2, 4),
          new AggregatePojos(4, 2, 5));

  PCollection<AggregatePojos> created = pipeline.apply(Create.of(input));
  PCollection<Row> aggregated =
      created.apply(
          Group.<AggregatePojos>globally()
              .aggregateField("field1", Sum.ofLongs(), "field1_sum")
              .aggregateField("field3", Sum.ofIntegers(), "field3_sum")
              .aggregateField("field1", Top.largestLongsFn(1), "field1_top"));

  PAssert.that(aggregated).containsInAnyOrder(expectedRow);
  pipeline.run();
}
.apply(Create.of(elements)) .apply( Group.<AggregatePojos>byFieldNames("field2") .aggregateField("field1", Sum.ofLongs(), "field1_sum") .aggregateField("field3", Sum.ofIntegers(), "field3_sum")
.apply(Create.of(elements)) .apply( Group.<OuterAggregate>globally() .aggregateFields(field1Selector, Sum.ofLongs(), "field1_sum") .aggregateFields(field3Selector, Sum.ofIntegers(), "field3_sum")