/** Verifies that repeated {@code withSideInputs(...)} calls accumulate rather than replace. */
@Test
@Category(ValidatesRunner.class)
public void testParDoWithSideInputsIsCumulative() {
  List<Integer> mainInputs = Arrays.asList(3, -42, 666);

  PCollectionView<Integer> firstView =
      pipeline
          .apply("CreateSideInput1", Create.of(11))
          .apply("ViewSideInput1", View.asSingleton());
  PCollectionView<Integer> unreadView =
      pipeline
          .apply("CreateSideInputUnread", Create.of(-3333))
          .apply("ViewSideInputUnread", View.asSingleton());
  PCollectionView<Integer> secondView =
      pipeline
          .apply("CreateSideInput2", Create.of(222))
          .apply("ViewSideInput2", View.asSingleton());

  // Each withSideInputs(...) call below must add to the set already registered,
  // not overwrite it, for the DoFn to see both firstView and secondView.
  PCollection<String> results =
      pipeline
          .apply(Create.of(mainInputs))
          .apply(
              ParDo.of(new TestDoFn(Arrays.asList(firstView, secondView), Arrays.asList()))
                  .withSideInputs(firstView)
                  .withSideInputs(unreadView)
                  .withSideInputs(secondView));

  PAssert.that(results)
      .satisfies(ParDoTest.HasExpectedOutput.forInput(mainInputs).andSideInputs(11, 222));

  pipeline.run();
}
// Fragment (tail of an enclosing .apply(...) whose opening is outside this view —
// TODO confirm against the full file): runs FilterRowDoFn over the input, splitting
// records between the main "flow" output and the tagged "reject" output.
ParDo.of(new FilterRowDoFn(properties)).withOutputTags(flowOutput, TupleTagList.of(rejectOutput)));
// Register both result PCollections under their link names so downstream
// components can look them up by name.
ctx.putPCollectionByLinkName(flowLink, outputTuples.get(flowOutput));
ctx.putPCollectionByLinkName(rejectLink, outputTuples.get(rejectOutput));
/** The replacement transform must surface the very same DoFn instance as the original ParDo. */
@Test
public void getReplacementTransformGetFn() {
  DoFn<Integer, Long> fn = new ToLongFn();
  ParDo.SingleOutput<Integer, Long> original = ParDo.of(fn);
  PCollection<? extends Integer> input = pipeline.apply(Create.of(1, 2, 3));

  AppliedPTransform<
          PCollection<? extends Integer>, PCollection<Long>, ParDo.SingleOutput<Integer, Long>>
      application =
          AppliedPTransform.of(
              "original", input.expand(), input.apply(original).expand(), original, pipeline);

  PTransformReplacement<PCollection<? extends Integer>, PCollection<Long>> replacement =
      factory.getReplacementTransform(application);
  ParDoSingle<Integer, Long> replacementParDo =
      (ParDoSingle<Integer, Long>) replacement.getTransform();

  // The replacement must wrap the identical fn, not a copy.
  assertThat(replacementParDo.getFn(), equalTo(original.getFn()));
  assertThat(replacementParDo.getFn(), equalTo(fn));
}
/**
 * Splits incoming Datastore entities: entities that have a key pass through on
 * {@code successTag()}; keyless entities are serialized into an ErrorMessage and
 * emitted on {@code failureTag()} as JSON.
 */
@Override
public PCollectionTuple expand(PCollection<Entity> entities) {
  return entities.apply(
      "DetectInvalidEntities",
      ParDo.of(
              new DoFn<Entity, Entity>() {
                // Built once per DoFn instance in @Setup rather than per element.
                private EntityJsonPrinter entityJsonPrinter;

                @Setup
                public void setup() {
                  entityJsonPrinter = new EntityJsonPrinter();
                }

                @ProcessElement
                public void processElement(ProcessContext c) throws IOException {
                  Entity entity = c.element();
                  if (entity.hasKey()) {
                    // Valid entity: pass through on the main (success) output.
                    c.output(entity);
                  } else {
                    // Keyless entities cannot be written; report them on the failure tag
                    // with the serialized entity attached for debugging.
                    ErrorMessage errorMessage =
                        ErrorMessage.newBuilder()
                            .setMessage("Datastore Entity Without Key")
                            .setData(entityJsonPrinter.print(entity))
                            .build();
                    c.output(failureTag(), errorMessage.toJson());
                  }
                }
              })
          .withOutputTags(successTag(), TupleTagList.of(failureTag())));
}
}
// Fragment (continuation of a chain whose start is outside this view): applies
// TestDoFn reading sideInput1 and sideInput2 (sideInputUnread is registered but,
// per the TestDoFn constructor args, not read), producing a main output plus one
// additional tagged output.
.apply(
    ParDo.of(new TestDoFn(Arrays.asList(sideInput1, sideInput2), Arrays.asList()))
        .withSideInputs(sideInput1)
        .withSideInputs(sideInputUnread)
        .withSideInputs(sideInput2)
        .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
// Fragment (continuation of a chain whose start is outside this view): same shape as
// the sibling snippet — TestDoFn consumes sideInput1 and sideInput2, sideInputUnread
// is registered but unread, and output is split between a main and an additional tag.
.apply(
    ParDo.of(new TestDoFn(Arrays.asList(sideInput1, sideInput2), Arrays.asList()))
        .withSideInputs(sideInput1)
        .withSideInputs(sideInputUnread)
        .withSideInputs(sideInput2)
        .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
// Fragment: a no-op @ProcessElement body closing an anonymous DoFn, wired with a
// fresh main TupleTag and no additional output tags, then passed together with a
// new Pipeline and `components` to an enclosing call that starts outside this view.
public void process(ProcessContext ctxt) {}
})
    .withOutputTags(new TupleTag<>(), TupleTagList.empty()),
Pipeline.create(),
components);
/** Tagged outputs emitted via {@code outputWithTimestamp} must carry the supplied timestamps. */
@Test
@Category(NeedsRunner.class)
public void testParDoTaggedOutputWithTimestamp() {
  PCollection<Integer> source = pipeline.apply(Create.of(Arrays.asList(3, 42, 6)));

  final TupleTag<Integer> mainTag = new TupleTag<Integer>("main") {};
  final TupleTag<Integer> additionalTag = new TupleTag<Integer>("additional") {};

  // Every element is emitted on the additional tag only, stamped with its own
  // value interpreted as epoch millis.
  PCollection<String> formatted =
      source
          .apply(
              ParDo.of(
                      new DoFn<Integer, Integer>() {
                        @ProcessElement
                        public void processElement(
                            @Element Integer element, MultiOutputReceiver r) {
                          r.get(additionalTag)
                              .outputWithTimestamp(element, new Instant(element.longValue()));
                        }
                      })
                  .withOutputTags(mainTag, TupleTagList.of(additionalTag)))
          .get(additionalTag)
          .apply(ParDo.of(new TestShiftTimestampDoFn<>(Duration.ZERO, Duration.ZERO)))
          .apply(ParDo.of(new TestFormatTimestampDoFn<>()));

  PAssert.that(formatted)
      .containsInAnyOrder(
          "processing: 3, timestamp: 3",
          "processing: 42, timestamp: 42",
          "processing: 6, timestamp: 6");
  pipeline.run();
}
/** The replacement transform must preserve all side inputs registered on the original. */
@Test
public void getReplacementTransformGetSideInputs() {
  PCollectionView<Long> longView =
      pipeline
          .apply("LongSideInputVals", Create.of(-1L, -2L, -4L))
          .apply("SideLongView", Sum.longsGlobally().asSingletonView());
  PCollectionView<List<String>> stringsView =
      pipeline
          .apply("StringSideInputVals", Create.of("foo", "bar", "baz"))
          .apply("SideStringsView", View.asList());

  ParDo.SingleOutput<Integer, Long> original =
      ParDo.of(new ToLongFn()).withSideInputs(longView, stringsView);
  PCollection<? extends Integer> input = pipeline.apply(Create.of(1, 2, 3));

  AppliedPTransform<
          PCollection<? extends Integer>, PCollection<Long>, ParDo.SingleOutput<Integer, Long>>
      application =
          AppliedPTransform.of(
              "original", input.expand(), input.apply(original).expand(), original, pipeline);

  PTransformReplacement<PCollection<? extends Integer>, PCollection<Long>> replacement =
      factory.getReplacementTransform(application);
  ParDoSingle<Integer, Long> replacementParDo =
      (ParDoSingle<Integer, Long>) replacement.getTransform();

  // Order is irrelevant; both registered views must be present.
  assertThat(replacementParDo.getSideInputs(), containsInAnyOrder(stringsView, longView));
}
/**
 * Creates a singleton side-input view holding the temp-file prefix used for BigQuery
 * load files. The root is {@code customGcsTempLocation} when set, otherwise the
 * pipeline's --tempLocation; the job id side input scopes the prefix per job.
 */
private PCollectionView<String> createTempFilePrefixView(
    Pipeline p, final PCollectionView<String> jobIdView) {
  return p.apply(Create.of(""))
      .apply(
          "GetTempFilePrefix",
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void getTempFilePrefix(ProcessContext c) {
                      String tempLocationRoot;
                      if (customGcsTempLocation != null) {
                        // An explicit user-provided temp location takes precedence.
                        tempLocationRoot = customGcsTempLocation.get();
                      } else {
                        tempLocationRoot = c.getPipelineOptions().getTempLocation();
                      }
                      // Scope temp files under a job-specific subdirectory so that
                      // concurrent jobs do not collide.
                      String tempLocation =
                          resolveTempLocation(
                              tempLocationRoot, "BigQueryWriteTemp", c.sideInput(jobIdView));
                      LOG.info(
                          "Writing BigQuery temporary files to {} before loading them.",
                          tempLocation);
                      c.output(tempLocation);
                    }
                  })
              .withSideInputs(jobIdView))
      .apply("TempFilePrefixView", View.asSingleton());
}
/** A Combine.perKey result viewed as a map must be readable as a side input. */
@Test
@Category(ValidatesRunner.class)
public void testCombinedMapSideInput() {
  final PCollectionView<Map<String, Integer>> sums =
      pipeline
          .apply("CreateSideInput", Create.of(KV.of("a", 1), KV.of("a", 20), KV.of("b", 3)))
          .apply("SumIntegers", Combine.perKey(Sum.ofIntegers()))
          .apply(View.asMap());

  PCollection<KV<String, Integer>> joined =
      pipeline
          .apply("CreateMainInput", Create.of("apple", "banana", "blackberry"))
          .apply(
              "Output",
              ParDo.of(
                      new DoFn<String, KV<String, Integer>>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          // Pair each word with the summed value keyed by its first letter.
                          String word = c.element();
                          c.output(KV.of(word, c.sideInput(sums).get(word.substring(0, 1))));
                        }
                      })
                  .withSideInputs(sums));

  // "a" sums to 21 (1 + 20); "b" stays 3 and matches both b-words.
  PAssert.that(joined)
      .containsInAnyOrder(KV.of("apple", 21), KV.of("banana", 3), KV.of("blackberry", 3));
  pipeline.run();
}
@Test @Category(ValidatesRunner.class) public void testEmptyMultimapSideInput() throws Exception { final PCollectionView<Map<String, Iterable<Integer>>> view = pipeline .apply( "CreateEmptyView", Create.empty(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()))) .apply(View.asMultimap()); PCollection<Integer> results = pipeline .apply("Create1", Create.of(1)) .apply( "OutputSideInputs", ParDo.of( new DoFn<Integer, Integer>() { @ProcessElement public void processElement(ProcessContext c) { assertTrue(c.sideInput(view).isEmpty()); assertTrue(c.sideInput(view).entrySet().isEmpty()); assertFalse(c.sideInput(view).entrySet().iterator().hasNext()); c.output(c.element()); } }) .withSideInputs(view)); // Pass at least one value through to guarantee that DoFn executes. PAssert.that(results).containsInAnyOrder(1); pipeline.run(); }
/** Map side inputs must work even when the map's key coder is non-deterministic. */
@Test
@Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class})
public void testMapSideInputWithNonDeterministicKeyCoder() {
  final PCollectionView<Map<String, Integer>> lookup =
      pipeline
          .apply(
              "CreateSideInput",
              Create.of(KV.of("a", 1), KV.of("b", 3))
                  .withCoder(KvCoder.of(new NonDeterministicStringCoder(), VarIntCoder.of())))
          .apply(View.asMap());

  PCollection<KV<String, Integer>> keyed =
      pipeline
          .apply("CreateMainInput", Create.of("apple", "banana", "blackberry"))
          .apply(
              "OutputSideInputs",
              ParDo.of(
                      new DoFn<String, KV<String, Integer>>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          // Look up each word by its first letter.
                          String word = c.element();
                          c.output(KV.of(word, c.sideInput(lookup).get(word.substring(0, 1))));
                        }
                      })
                  .withSideInputs(lookup));

  PAssert.that(keyed)
      .containsInAnyOrder(KV.of("apple", 1), KV.of("banana", 3), KV.of("blackberry", 3));
  pipeline.run();
}
/** Applying the same multi-output ParDo twice yields distinct but equally-tagged tuples. */
@Test
public void testMultiOutputAppliedMultipleTimesDifferentOutputs() {
  pipeline.enableAbandonedNodeEnforcement(false);
  PCollection<Long> source = pipeline.apply(GenerateSequence.from(0));

  TupleTag<Long> mainTag = new TupleTag<>();
  final TupleTag<String> stringTag = new TupleTag<>();
  final TupleTag<Integer> intTag = new TupleTag<>();

  DoFn<Long, Long> fn =
      new DoFn<Long, Long>() {
        @ProcessElement
        public void processElement(ProcessContext ctx, @Element Long element) {
          // Emit the element on all three outputs in its three representations.
          ctx.output(element);
          ctx.output(stringTag, Long.toString(element));
          ctx.output(intTag, element.intValue());
        }
      };
  ParDo.MultiOutput<Long, Long> parDo =
      ParDo.of(fn).withOutputTags(mainTag, TupleTagList.of(stringTag).and(intTag));

  PCollectionTuple first = source.apply("first", parDo);
  PCollectionTuple second = source.apply("second", parDo);

  // Two applications are distinct nodes but expose the same tag set.
  assertThat(first, not(equalTo(second)));
  assertThat(first.getAll().keySet(), Matchers.containsInAnyOrder(mainTag, stringTag, intTag));
  assertThat(second.getAll().keySet(), Matchers.containsInAnyOrder(mainTag, stringTag, intTag));
}
/**
 * Exercises a splittable DoFn reading a windowed singleton side input: main elements
 * are in 2ms fixed windows, the side input in 4ms fixed windows, so per the expected
 * outputs elements 0-3 observe "a" and elements 4-7 observe "b".
 */
private void testWindowedSideInput(IsBounded bounded) {
  PCollection<Integer> mainInput =
      p.apply(
              "main",
              Create.timestamped(
                  TimestampedValue.of(0, new Instant(0)),
                  TimestampedValue.of(1, new Instant(1)),
                  TimestampedValue.of(2, new Instant(2)),
                  TimestampedValue.of(3, new Instant(3)),
                  TimestampedValue.of(4, new Instant(4)),
                  TimestampedValue.of(5, new Instant(5)),
                  TimestampedValue.of(6, new Instant(6)),
                  TimestampedValue.of(7, new Instant(7))))
          .apply("window 2", Window.into(FixedWindows.of(Duration.millis(2))));

  PCollectionView<String> sideInput =
      p.apply(
              "side",
              Create.timestamped(
                  TimestampedValue.of("a", new Instant(0)),
                  TimestampedValue.of("b", new Instant(4))))
          .apply("window 4", Window.into(FixedWindows.of(Duration.millis(4))))
          .apply("singleton", View.asSingleton());

  // sdfWithSideInput is a sibling helper building the DoFn under test for the given
  // boundedness; it formats "<side value>:<element>" per the assertions below.
  PCollection<String> res =
      mainInput.apply(ParDo.of(sdfWithSideInput(bounded, sideInput)).withSideInputs(sideInput));

  PAssert.that(res).containsInAnyOrder("a:0", "a:1", "a:2", "a:3", "b:4", "b:5", "b:6", "b:7");
  p.run();
}
/** A PCollection of KVs viewed via View.asMap must be readable as a map side input. */
@Test
@Category(ValidatesRunner.class)
public void testMapSideInput() {
  final PCollectionView<Map<String, Integer>> lookup =
      pipeline
          .apply("CreateSideInput", Create.of(KV.of("a", 1), KV.of("b", 3)))
          .apply(View.asMap());

  PCollection<KV<String, Integer>> keyed =
      pipeline
          .apply("CreateMainInput", Create.of("apple", "banana", "blackberry"))
          .apply(
              "OutputSideInputs",
              ParDo.of(
                      new DoFn<String, KV<String, Integer>>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          // Look up each word by its first letter.
                          String word = c.element();
                          c.output(KV.of(word, c.sideInput(lookup).get(word.substring(0, 1))));
                        }
                      })
                  .withSideInputs(lookup));

  PAssert.that(keyed)
      .containsInAnyOrder(KV.of("apple", 1), KV.of("banana", 3), KV.of("blackberry", 3));
  pipeline.run();
}
/**
 * Loads each sharded batch of temporary files into its destination table, then
 * garbage-collects the temp files once the loads have committed. Returns the
 * per-table load results from the main output tag.
 */
@Override
public PCollection<KV<TableDestination, String>> expand(
    PCollection<KV<ShardedKey<DestinationT>, List<String>>> input) {
  // Main output: completed table loads; temporaryFilesTag: files eligible for cleanup.
  PCollectionTuple writeTablesOutputs =
      input.apply(
          ParDo.of(new WriteTablesDoFn())
              .withSideInputs(sideInputs)
              .withOutputTags(mainOutputTag, TupleTagList.of(temporaryFilesTag)));

  // Garbage collect temporary files.
  // We mustn't start garbage collecting files until we are assured that the WriteTablesDoFn has
  // succeeded in loading those files and won't be retried. Otherwise, we might fail part of the
  // way through deleting temporary files, and retry WriteTablesDoFn. This will then fail due
  // to missing files, causing either the entire workflow to fail or get stuck (depending on how
  // the runner handles persistent failures).
  writeTablesOutputs
      .get(temporaryFilesTag)
      .setCoder(StringUtf8Coder.of())
      // Key everything to a single (null) key so the GroupByKey below acts as a barrier.
      .apply(WithKeys.of((Void) null))
      .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
      .apply(
          Window.<KV<Void, String>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(ParDo.of(new GarbageCollectTemporaryFiles()));

  return writeTablesOutputs.get(mainOutputTag);
}
/** Flattening two outputs of one multi-output ParDo must not disturb either branch. */
@Test
@Category(ValidatesRunner.class)
public void testFlattenMultiplePCollectionsHavingMultipleConsumers() {
  PCollection<String> words = p.apply(Create.of("AA", "BBB", "CC"));
  final TupleTag<String> evenLengthTag = new TupleTag<String>() {};
  final TupleTag<String> oddLengthTag = new TupleTag<String>() {};

  // Route each word by the parity of its length: even -> main output, odd -> tagged.
  PCollectionTuple routed =
      words.apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      String word = c.element();
                      if (word.length() % 2 == 0) {
                        c.output(word);
                      } else {
                        c.output(oddLengthTag, word);
                      }
                    }
                  })
              .withOutputTags(evenLengthTag, TupleTagList.of(oddLengthTag)));

  PCollection<String> evenLength = routed.get(evenLengthTag);
  PCollection<String> oddLength = routed.get(oddLengthTag);
  PCollection<String> merged =
      PCollectionList.of(evenLength).and(oddLength).apply(Flatten.pCollections());

  // The merged collection and both originals remain independently consumable.
  PAssert.that(merged).containsInAnyOrder("AA", "BBB", "CC");
  PAssert.that(evenLength).containsInAnyOrder("AA", "CC");
  PAssert.that(oddLength).containsInAnyOrder("BBB");
  p.run();
}
/**
 * Parses each FailsafeElement's JSON payload into a TableRow. Successful conversions
 * go to {@code successTag()}; failures are re-wrapped with the error message and stack
 * trace and emitted on {@code failureTag()}.
 */
@Override
public PCollectionTuple expand(PCollection<FailsafeElement<T, String>> failsafeElements) {
  return failsafeElements.apply(
      "JsonToTableRow",
      ParDo.of(
              new DoFn<FailsafeElement<T, String>, TableRow>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                  FailsafeElement<T, String> element = context.element();
                  String json = element.getPayload();
                  try {
                    TableRow row = convertJsonToTableRow(json);
                    context.output(row);
                  } catch (Exception e) {
                    // Preserve the original element plus diagnostics so the caller
                    // can route it to a dead-letter destination.
                    context.output(
                        failureTag(),
                        FailsafeElement.of(element)
                            .setErrorMessage(e.getMessage())
                            .setStacktrace(Throwables.getStackTraceAsString(e)));
                  }
                }
              })
          .withOutputTags(successTag(), TupleTagList.of(failureTag())));
}
}
/** Reading a singleton view over an empty PCollection must fail at execution time. */
@Test
@Category(NeedsRunner.class)
public void testEmptySingletonSideInput() throws Exception {
  final PCollectionView<Integer> emptyView =
      pipeline
          .apply("CreateEmptyIntegers", Create.empty(VarIntCoder.of()))
          .apply(View.asSingleton());

  pipeline
      .apply("Create123", Create.of(1, 2, 3))
      .apply(
          "OutputSideInputs",
          ParDo.of(
                  new DoFn<Integer, Integer>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      // Touching the empty singleton is what triggers the failure.
                      c.output(c.sideInput(emptyView));
                    }
                  })
              .withSideInputs(emptyView));

  // The failure surfaces as a PipelineExecutionException caused by NoSuchElementException.
  thrown.expect(PipelineExecutionException.class);
  thrown.expectCause(isA(NoSuchElementException.class));
  thrown.expectMessage("Empty");
  thrown.expectMessage("PCollection");
  thrown.expectMessage("singleton");
  pipeline.run();
}