@Override public PCollection<KV<URI, String>> expand(PBegin input) { Pipeline pipeline = input.getPipeline(); // Create one TextIO.Read transform for each document // and add its output to a PCollectionList PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline); // TextIO.Read supports: // - file: URIs and paths locally // - gs: URIs on the service for (final URI uri : uris) { String uriString; if (uri.getScheme().equals("file")) { uriString = new File(uri).getPath(); } else { uriString = uri.toString(); } PCollection<KV<URI, String>> oneUriToLines = pipeline .apply("TextIO.Read(" + uriString + ")", TextIO.Read.from(uriString)) .apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri)); urisToLines = urisToLines.and(oneUriToLines); } return urisToLines.apply(Flatten.<KV<URI, String>>pCollections()); } }
@Override public PCollection<T> expand(PCollection<ValueWithRecordId<T>> input) { return input .apply( WithKeys.of( (ValueWithRecordId<T> value) -> Arrays.hashCode(value.getId()) % NUM_RESHARD_KEYS) .withKeyType(TypeDescriptors.integers())) // Reshuffle will dedup based on ids in ValueWithRecordId by passing the data through // WindmillSink. .apply(Reshuffle.of()) .apply( "StripIds", ParDo.of( new DoFn<KV<Integer, ValueWithRecordId<T>>, T>() { @ProcessElement public void processElement(ProcessContext c) { c.output(c.element().getValue().getValue()); } })); } }
/**
 * Creates a {@code PTransform} that maps a {@code PCollection<V>} to a {@code
 * PCollection<KV<K, V>>} by pairing every input value with the same, fixed key.
 *
 * <p>The key class handed to the transform is the runtime class of {@code key}, or {@code
 * Void.class} when {@code key} is {@code null}.
 *
 * @param key the constant key to attach to every value; may be {@code null}
 * @return a transform that pairs each value with {@code key}
 */
@SuppressWarnings("unchecked")
public static <K, V> WithKeys<K, V> of(@Nullable final K key) {
  final Class<?> runtimeKeyClass;
  if (key == null) {
    // A null key carries no runtime class, so Void stands in for it.
    runtimeKeyClass = Void.class;
  } else {
    runtimeKeyClass = key.getClass();
  }
  return new WithKeys<>(value -> key, (Class<K>) runtimeKeyClass);
}
/**
 * Pairs every input element with a {@code null} key, groups by that key, and returns the
 * grouped values with the key removed.
 *
 * @param input the elements to group
 * @return the grouped elements as iterables
 */
@Override
public PCollection<Iterable<InputT>> expand(PCollection<InputT> input) {
  // Key with null -> group by key -> keep only the grouped values.
  PCollection<Iterable<InputT>> groupedValues =
      input
          .apply(WithKeys.of((Void) null))
          .apply(GroupByKey.create())
          .apply(Values.create());
  return groupedValues;
}
}
@Override public PCollection<T> expand(PCollection<T> in) { validateWindowStrategy(in.getWindowingStrategy()); WithKeys<IdT, T> withKeys = WithKeys.of(fn); if (representativeType != null) { withKeys = withKeys.withKeyType(representativeType);
/** * Return a {@link WithKeys} that is like this one with the specified key type descriptor. * * <p>For use with lambdas in Java 8, either this method must be called with an appropriate type * descriptor or {@link PCollection#setCoder(Coder)} must be called on the output {@link * PCollection}. */ public WithKeys<K, V> withKeyType(TypeDescriptor<K> keyType) { // Safe cast @SuppressWarnings("unchecked") Class<K> rawType = (Class<K>) keyType.getRawType(); return new WithKeys<>(fn, rawType); }
@Override public PCollection<KV<URI, String>> expand(PBegin input) { Pipeline pipeline = input.getPipeline(); // Create one TextIO.Read transform for each document // and add its output to a PCollectionList PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline); // TextIO.Read supports: // - file: URIs and paths locally // - gs: URIs on the service for (final URI uri : uris) { String uriString; if ("file".equals(uri.getScheme())) { uriString = new File(uri).getPath(); } else { uriString = uri.toString(); } PCollection<KV<URI, String>> oneUriToLines = pipeline .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString)) .apply("WithKeys(" + uriString + ")", WithKeys.of(uri)) .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of())); urisToLines = urisToLines.and(oneUriToLines); } return urisToLines.apply(Flatten.pCollections()); } }
/**
 * Checks that applying {@code WithKeys} built from {@code LengthAsKey} together with an explicit
 * {@code Integer} key type produces exactly the pairs in {@code WITH_KEYS}.
 */
@Test
@Category(NeedsRunner.class)
public void testWithKeysWithUnneededWithKeyTypeSucceeds() {
  PCollection<String> words =
      p.apply(Create.of(Arrays.asList(COLLECTION)).withCoder(StringUtf8Coder.of()));

  // Build the transform under test separately so the explicit key type is easy to spot.
  WithKeys<Integer, String> keyByLength =
      WithKeys.of(new LengthAsKey()).withKeyType(TypeDescriptor.of(Integer.class));

  PCollection<KV<Integer, String>> keyedWords = words.apply(keyByLength);

  PAssert.that(keyedWords).containsInAnyOrder(WITH_KEYS);
  p.run();
}
/**
 * Creates a {@code PTransform} that maps a {@code PCollection<V>} to a {@code
 * PCollection<KV<K, V>>} by pairing every value with the key that the supplied {@code
 * SerializableFunction} computes from it.
 *
 * <p>If using a lambda in Java 8, {@link #withKeyType(TypeDescriptor)} must be called on the
 * result {@link PTransform}.
 *
 * @param fn computes the key for a value; must not be {@code null}
 * @return a transform that keys each value with {@code fn}'s result
 */
public static <K, V> WithKeys<K, V> of(SerializableFunction<V, K> fn) {
  final String nullFunctionMessage =
      "WithKeys constructed with null function. Did you mean WithKeys.of((Void) null)?";
  checkNotNull(fn, nullFunctionMessage);

  // No key class is supplied at this point; null is passed in its place.
  return new WithKeys<>(fn, null);
}
/**
 * Pairs every input element with the constant key {@code "DummyKey"} and passes the keyed
 * elements through a {@code LimitFn} constructed with {@code getCount()}.
 *
 * @param input the elements to process
 * @return the output of the {@code LimitFn} step
 */
@Override
public PCollection<T> expand(PCollection<T> input) {
  Coder<T> elementCoder = input.getCoder();

  // The keyed collection's coder is set explicitly from the input's element coder.
  PCollection<KV<String, T>> keyedByDummy =
      input
          .apply(WithKeys.of("DummyKey"))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), elementCoder));

  PCollection<T> limited = keyedByDummy.apply(ParDo.of(new LimitFn<T>(getCount())));
  return limited;
}
}
/**
 * Reifies the window of every input element, pairs each reified value with the constant key
 * {@code 0}, groups by that key, and returns the grouped values.
 *
 * <p>Before grouping, the keyed collection is re-windowed with an {@code IdentityWindowFn} built
 * from the original window fn's window coder, a {@code Never.ever()} trigger, the input's
 * allowed lateness, and discarding fired panes. After grouping, the output's windowing strategy
 * is set back to the input's.
 *
 * @param input the collection whose elements are gathered
 * @return the grouped, window-reified elements
 */
@Override
public PCollection<Iterable<ValueInSingleWindow<T>>> expand(PCollection<T> input) {
  // Captured up front so its window coder can be handed to the IdentityWindowFn below.
  WindowFn<?, ?> originalWindowFn = input.getWindowingStrategy().getWindowFn();

  return input
      .apply(Reify.windows())
      // Constant key 0 for every element.
      .apply(
          WithKeys.<Integer, ValueInSingleWindow<T>>of(0)
              .withKeyType(new TypeDescriptor<Integer>() {}))
      .apply(
          Window.into(
                  new IdentityWindowFn<KV<Integer, ValueInSingleWindow<T>>>(
                      originalWindowFn.windowCoder()))
              .triggering(Never.ever())
              .withAllowedLateness(input.getWindowingStrategy().getAllowedLateness())
              .discardingFiredPanes())
      // all values have the same key so they all appear as a single output element
      .apply(GroupByKey.create())
      .apply(Values.create())
      // Put the input's windowing strategy back on the output.
      .setWindowingStrategyInternal(input.getWindowingStrategy());
}
}
/**
 * Builds the fixture: {@code created} holds the integers 1, 2, 3 and {@code downstream} is
 * {@code created} with every element paired with the key {@code "foo"}.
 */
@Before
public void setup() {
  // Source collection of three integers.
  created = p.apply(Create.of(1, 2, 3));
  // Derived collection: each element of the source keyed by the constant "foo".
  downstream = created.apply(WithKeys.of("foo"));
}
/**
 * Checks that a method-reference key function combined with an explicit {@code Integer} key type
 * keys each numeric string by its parsed integer value.
 */
@Test
@Category(NeedsRunner.class)
public void withLambdaAndTypeDescriptorShouldSucceed() {
  PCollection<String> numerals = p.apply(Create.of("1234", "3210", "0", "-12"));

  // Key function is Integer::valueOf; the key type is supplied explicitly via withKeyType.
  WithKeys<Integer, String> keyByParsedValue =
      WithKeys.of((SerializableFunction<String, Integer>) Integer::valueOf)
          .withKeyType(TypeDescriptor.of(Integer.class));

  PCollection<KV<Integer, String>> keyedNumerals = numerals.apply(keyByParsedValue);

  PAssert.that(keyedNumerals)
      .containsInAnyOrder(
          KV.of(1234, "1234"), KV.of(0, "0"), KV.of(-12, "-12"), KV.of(3210, "3210"));

  p.run();
}
/**
 * Groups all input elements under a single {@code null} key, hands the grouped values to a
 * {@code WriteView} for {@code view}, and returns the input unchanged.
 *
 * @param input the elements backing the view
 * @return {@code input} itself
 */
@Override
public PCollection<ElemT> expand(final PCollection<ElemT> input) {
  // Null key + explicit KV coder, then group and drop the key again.
  PCollection<Iterable<ElemT>> groupedElements =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()))
          .apply(GroupByKey.create())
          .apply(Values.create());

  // The result of writing the view is not used; the caller gets the original input back.
  groupedElements.apply(new WriteView<>(view));

  return input;
}
}
.withCoder(StringUtf8Coder.of())) .apply( WithKeys.<String, String>of(input12 -> input12) .withKeyType(TypeDescriptors.strings())) .apply("ReifyOriginalTimestamps", Reify.timestampsInValue());
/**
 * Re-windows the input into the global window, pairs every element with the constant key {@code
 * "dummyKey"}, runs a {@code StatefulPredicateCheck} over the keyed elements, and writes the
 * resulting strings to the Pubsub topic at {@code resultTopicPath.getPath()}.
 *
 * @param input the events to check
 * @return the output of the Pubsub write
 */
@Override
public POutput expand(PCollection<? extends T> input) {
  return input
      // assign a dummy key and global window,
      // this is needed to accumulate all observed events in the same state cell
      .apply(Window.into(new GlobalWindows()))
      .apply(WithKeys.of("dummyKey"))
      // The check is constructed from the coder, formatter and success predicate.
      .apply(
          "checkAllEventsForSuccess",
          ParDo.of(new StatefulPredicateCheck<>(coder, formatter, successPredicate)))
      // signal the success/failure to the result topic
      .apply("publishSuccess", PubsubIO.writeStrings().to(resultTopicPath.getPath()));
}
}
/**
 * Generates the sequence 0..20000 with timestamps of ten times each value, windows it into
 * one-minute fixed windows that fire once past the end of the window, groups everything under a
 * {@code null} key, and asserts that gathering all panes yields no more than one pane per
 * window.
 */
@Test
@Category(NeedsRunner.class)
public void singlePaneSingleReifiedPane() {
  PCollection<Iterable<ValueInSingleWindow<Iterable<Long>>>> accumulatedPanes =
      p.apply(GenerateSequence.from(0).to(20000))
          .apply(WithTimestamps.of(number -> new Instant(number * 10)))
          .apply(
              Window.<Long>into(FixedWindows.of(Duration.standardMinutes(1)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .withAllowedLateness(Duration.ZERO)
                  .discardingFiredPanes())
          .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {}))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(GatherAllPanes.globally());

  PAssert.that(accumulatedPanes)
      .satisfies(
          gathered -> {
            for (Iterable<ValueInSingleWindow<Iterable<Long>>> panesOfWindow : gathered) {
              if (Iterables.size(panesOfWindow) <= 1) {
                // Zero or one pane for this window is acceptable; check the next one.
                continue;
              }
              fail("Expected all windows to have exactly one pane, got " + panesOfWindow);
              return null;
            }
            return null;
          });

  p.run();
}
/**
 * Combines the whole input by pairing every element with a {@code null} key, running a per-key
 * combine built from {@code fn}, and dropping the key from the result.
 *
 * <p>Side inputs are attached to the combine when any are configured, and hot-key fanout is used
 * when {@code fanout} is at least 2. When {@code insertDefault} is set, the output must be
 * windowed compatibly with {@code GlobalWindows} and is passed through {@code
 * insertDefaultValueIfEmpty}.
 *
 * @param input the elements to combine
 * @return the combined output
 * @throws IllegalStateException if {@code insertDefault} is set and the output's window fn is
 *     not compatible with {@code GlobalWindows}
 */
@Override
public PCollection<OutputT> expand(PCollection<InputT> input) {
  PCollection<KV<Void, InputT>> keyedInput =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), input.getCoder()));

  Combine.PerKey<Void, InputT, OutputT> keyedCombine = Combine.fewKeys(fn, fnDisplayData);
  if (!sideInputs.isEmpty()) {
    keyedCombine = keyedCombine.withSideInputs(sideInputs);
  }

  final PCollection<KV<Void, OutputT>> combinedByKey;
  if (fanout < 2) {
    combinedByKey = keyedInput.apply(keyedCombine);
  } else {
    combinedByKey = keyedInput.apply(keyedCombine.withHotKeyFanout(fanout));
  }

  PCollection<OutputT> unkeyed = combinedByKey.apply(Values.create());

  if (!insertDefault) {
    return unkeyed;
  }

  // A default value can only be inserted when the output is compatible with global windowing.
  if (!unkeyed.getWindowingStrategy().getWindowFn().isCompatible(new GlobalWindows())) {
    throw new IllegalStateException(fn.getIncompatibleGlobalWindowErrorMessage());
  }
  return insertDefaultValueIfEmpty(unkeyed);
}
.withAllowedLateness(Duration.ZERO) .discardingFiredPanes()) .apply(WithKeys.<Void, Long>of((Void) null).withKeyType(new TypeDescriptor<Void>() {})) .apply(GroupByKey.create()) .apply(Values.create())