@Override
public PCollection<KV<K, OutputT>> expand(PCollection<KV<K, InputT>> input) {
  // Pick the few-keys GroupByKey variant when the caller flagged a small key space.
  GroupByKey<K, InputT> groupByKey =
      fewKeys ? GroupByKey.createWithFewKeys() : GroupByKey.create();
  // Combine each key's grouped values with the configured CombineFn and side inputs.
  Combine.GroupedValues<K, InputT, OutputT> combineValues =
      Combine.<K, InputT, OutputT>groupedValues(fn, fnDisplayData).withSideInputs(sideInputs);
  return input.apply(groupByKey).apply(combineValues);
}
@Override
public PCollection<KV<Contig, Iterable<Read>>> expand(PCollection<Read> reads) {
  // Key every read via KeyReadsFn, then collect the reads sharing each Contig key.
  PCollection<KV<Contig, Read>> keyedReads = reads.apply("KeyReads", ParDo.of(new KeyReadsFn()));
  return keyedReads.apply(GroupByKey.<Contig, Read>create());
}
@Override
public PCollection<Iterable<InputT>> expand(PCollection<InputT> input) {
  // Put every element under the single null key so GroupByKey yields one
  // iterable per key (here, one overall), then discard the key again.
  PCollection<KV<Void, InputT>> keyed = input.apply(WithKeys.of((Void) null));
  PCollection<KV<Void, Iterable<InputT>>> grouped = keyed.apply(GroupByKey.create());
  return grouped.apply(Values.create());
}
}
@Test
public void testDisplayData() {
  GroupByKey<String, String> defaultGbk = GroupByKey.create();
  GroupByKey<String, String> fewKeysGbk = GroupByKey.createWithFewKeys();

  // The default transform registers no display items; the few-keys variant
  // advertises its "fewKeys" flag.
  assertThat(DisplayData.from(defaultGbk).items(), empty());
  assertThat(DisplayData.from(fewKeysGbk), hasDisplayItem("fewKeys", true));
}
/**
 * Tests that the translator is registered so the URN can be retrieved (the only thing you can
 * meaningfully do with a {@link GroupByKey}).
 */
@Test
public void testUrnRetrievable() throws Exception {
  String urn = PTransformTranslation.urnForTransform(GroupByKey.create());
  assertThat(urn, equalTo(GROUP_BY_KEY_TRANSFORM_URN));
}
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Tag each element with a key, group, and strip the values again; the
  // shuffle introduced by GroupByKey breaks fusion with surrounding steps.
  PCollection<KV<T, Integer>> keyed =
      input.apply("Break fusion mapper", ParDo.of(new DummyMapFn<T>()));
  return keyed.apply(GroupByKey.<T, Integer>create()).apply(Keys.<T>create());
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Tag each element with a key, group, and strip the values again; the
  // shuffle introduced by GroupByKey breaks fusion with surrounding steps.
  PCollection<KV<T, Integer>> keyed =
      input.apply("Break fusion mapper", ParDo.of(new DummyMapFn<T>()));
  return keyed.apply(GroupByKey.<T, Integer>create()).apply(Keys.<T>create());
}
/**
 * @param input PCollection of variants to process.
 * @return PCollection of variant-only Variant objects with calls from non-variant-segments
 *     merged into the SNP variants with which they overlap.
 */
@Override
public PCollection<Variant> expand(PCollection<Variant> input) {
  // Key each variant (BinVariantsFn produces a (String, Long) composite key),
  // gather the variants sharing a key, then combine each group.
  PCollection<KV<KV<String, Long>, Variant>> binned = input.apply(ParDo.of(new BinVariantsFn()));
  PCollection<KV<KV<String, Long>, Iterable<Variant>>> grouped =
      binned.apply(GroupByKey.<KV<String, Long>, Variant>create());
  return grouped.apply(ParDo.of(new CombineVariantsFn()));
}
/**
 * @param input PCollection of variants to process.
 * @return PCollection of variant-only Variant objects with calls from non-variant-segments
 *     merged into the SNP variants with which they overlap.
 */
@Override
public PCollection<Variant> expand(PCollection<Variant> input) {
  // Key each variant (BinVariantsFn produces a (String, Long) composite key),
  // gather the variants sharing a key, then combine each group.
  PCollection<KV<KV<String, Long>, Variant>> binned = input.apply(ParDo.of(new BinVariantsFn()));
  PCollection<KV<KV<String, Long>, Iterable<Variant>>> grouped =
      binned.apply(GroupByKey.<KV<String, Long>, Variant>create());
  return grouped.apply(ParDo.of(new CombineVariantsFn()));
}
@Override public PCollection<TableRow> expand(PCollection<KV<String, StationSpeed>> stationSpeed) { // Apply a GroupByKey transform to collect a list of all station // readings for a given route. PCollection<KV<String, Iterable<StationSpeed>>> timeGroup = stationSpeed.apply(GroupByKey.create()); // Analyze 'slowdown' over the route readings. PCollection<KV<String, RouteInfo>> stats = timeGroup.apply(ParDo.of(new GatherStats())); // Format the results for writing to BigQuery PCollection<TableRow> results = stats.apply(ParDo.of(new FormatStatsFn())); return results; } }
private PCollection<Result<DestinationT>> writeShardedRecords(
    PCollection<KV<ShardedKey<DestinationT>, TableRow>> shardedRecords,
    PCollectionView<String> tempFilePrefix) {
  // Gather all rows bound for the same sharded destination, then write each
  // group to files; the temp-file prefix is supplied as a side input.
  PCollection<KV<ShardedKey<DestinationT>, Iterable<TableRow>>> grouped =
      shardedRecords.apply("GroupByDestination", GroupByKey.create());
  return grouped
      .apply(
          "WriteGroupedRecords",
          ParDo.of(new WriteGroupedRecordsToFiles<DestinationT>(tempFilePrefix, maxFileSize))
              .withSideInputs(tempFilePrefix))
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
@Override public PCollection<Read> expand(PCollection<String> readGroupSetIds) { return readGroupSetIds.apply(ParDo.of(new CreateReadRequests())) // Force a shuffle operation here to break the fusion of these steps. // By breaking fusion, the work will be distributed to all available workers. .apply(GroupByKey.<Integer, StreamReadsRequest>create()) .apply(ParDo.of(new ConvergeStreamReadsRequestList())) .apply(new ReadStreamer(auth, ShardBoundary.Requirement.STRICT, fields)); }
@Override public PCollection<Read> expand(PCollection<String> readGroupSetIds) { return readGroupSetIds.apply(ParDo.of(new CreateReadRequests())) // Force a shuffle operation here to break the fusion of these steps. // By breaking fusion, the work will be distributed to all available workers. .apply(GroupByKey.<Integer, StreamReadsRequest>create()) .apply(ParDo.of(new ConvergeStreamReadsRequestList())) .apply(new ReadStreamer(auth, ShardBoundary.Requirement.STRICT, fields)); }
@Override
public PCollection<ElemT> expand(final PCollection<ElemT> input) {
  // Side-effect branch: collapse all elements under the single null key,
  // group, unwrap the values, and feed them to WriteView. The input itself
  // is returned unchanged.
  PCollection<KV<Void, ElemT>> keyed = input.apply(WithKeys.of((Void) null));
  keyed
      .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(new WriteView<>(view));
  return input;
}
}
@Override
public PCollection<KV<Integer, KV<KeyT, ValueT>>> expand(PCollection<KV<KeyT, ValueT>> input) {
  // Assign a task id to each record (config arrives as a side input), keeping
  // the original KV as the value.
  PCollection<KV<Integer, KV<KeyT, ValueT>>> tasked =
      input
          .apply(
              "AssignTask",
              ParDo.of(new AssignTaskFn<KeyT, ValueT>(configView)).withSideInputs(configView))
          .setTypeDescriptor(
              TypeDescriptors.kvs(TypeDescriptors.integers(), input.getTypeDescriptor()));
  // Group by task id, then flatten each group back into individual records.
  return tasked
      .apply("GroupByTaskId", GroupByKey.create())
      .apply("FlattenGroupedTasks", ParDo.of(new FlattenGroupedTasks<>()));
}
}
@Test
public void keyedInputWithoutKeyPreserving() {
  PCollection<KV<String, Iterable<Integer>>> gbkThenPardo =
      p.apply(Create.of(KV.of("hello", 42)))
          .apply(GroupByKey.create())
          .apply(ParDo.of(new IdentityFn<>()));

  p.traverseTopologically(visitor);

  // The trailing ParDo is not key-preserving, so its output must not be
  // reported among the keyed values.
  assertThat(visitor.getKeyedPValues(), not(hasItem(gbkThenPardo)));
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Group everything under a single empty-string key, then re-expand the
  // grouped iterables into individual elements.
  PCollection<KV<String, T>> keyed = input.apply(WithKeys.of(""));
  PCollection<KV<String, Iterable<T>>> grouped = keyed.apply(GroupByKey.create());
  return grouped.apply(Values.create()).apply(Flatten.iterables());
}
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // Apply the configured windowing first, then key, group, and un-key: the
  // GroupByKey forces a shuffle of the windowed elements.
  PCollection<T> windowed = input.apply(window);
  return windowed
      .apply(ParDo.of(new AddArbitraryKey<>()))
      .apply(GroupByKey.create())
      .apply(ParDo.of(new RemoveArbitraryKey<>()));
}
}
@Test
@Category(ValidatesRunner.class)
public void testGroupByKeyEmpty() {
  // Grouping an empty input must produce an empty output.
  List<KV<String, Integer>> noPairs = Arrays.asList();
  PCollection<KV<String, Integer>> input =
      p.apply(
          Create.of(noPairs)
              .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())));

  PCollection<KV<String, Iterable<Integer>>> output = input.apply(GroupByKey.create());
  PAssert.that(output).empty();

  p.run();
}
/** Creates a simple pipeline with a {@link Combine.GroupedValues}. */
private static TestPipeline createCombineGroupedValuesPipeline() {
  TestPipeline pipeline = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  PCollection<KV<String, Integer>> input =
      pipeline
          .apply(Create.of(KV.of("key", 1)))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));

  // Group by key, then combine each group's values with the test CombineFn.
  PCollection<KV<String, Iterable<Integer>>> grouped = input.apply(GroupByKey.create());
  grouped.apply(Combine.groupedValues(new SumCombineFn()));

  return pipeline;
}