/**
 * {@inheritDoc}.
 *
 * <p>The input {@link PCollectionList} that is constructed will have the same values in the same
 * order as the iteration order of the values of {@code inputs}.
 */
private PCollectionList<T> getInput(Map<TupleTag<?>, PValue> inputs, Pipeline p) {
  PCollectionList<T> pCollections = PCollectionList.empty(p);
  for (PValue input : inputs.values()) {
    // Safe by construction: every input value of this transform is a PCollection<T>.
    @SuppressWarnings("unchecked")
    PCollection<T> pcollection = (PCollection<T>) input;
    pCollections = pCollections.and(pcollection);
  }
  return pCollections;
}
/** Transforms the inputs into a PInput. */
private static PCollectionList<Row> buildPCollectionList(
    List<RelNode> inputRels, Pipeline pipeline, Map<Integer, PCollection<Row>> cache) {
  // No input rels: hand back an empty list bound to the same pipeline.
  if (inputRels.isEmpty()) {
    return PCollectionList.empty(pipeline);
  }
  // Convert every input rel to its PCollection (consulting the cache) and bundle the results.
  return PCollectionList.of(
      inputRels.stream()
          .map(rel -> BeamSqlRelUtils.toPCollection(pipeline, (BeamRelNode) rel, cache))
          .collect(Collectors.toList()));
}
@Override
public PTransformReplacement<PCollectionList<T>, PCollection<T>> getReplacementTransform(
    AppliedPTransform<PCollectionList<T>, PCollection<T>, PCollections<T>> transform) {
  // A Flatten being replaced by this override must have no inputs at all.
  boolean hasNoInputs = transform.getInputs().isEmpty();
  checkArgument(
      hasNoInputs,
      "Unexpected nonempty input %s for %s",
      transform.getInputs(),
      getClass().getSimpleName());
  // Replace with an explicit empty PCollectionList feeding the empty-create composite.
  return PTransformReplacement.of(
      PCollectionList.empty(transform.getPipeline()), new CreateEmptyFromList<T>());
}
@Override
public PCollection<T> expand(PCollectionList<T> input) {
  // Count how many times each distinct PCollection appears in the input list.
  Map<PCollection<T>, Integer> instances = new HashMap<>();
  for (PCollection<T> pCollection : input.getAll()) {
    // merge() replaces the original double-lookup (get, null check, put).
    instances.merge(pCollection, 1, Integer::sum);
  }
  PCollectionList<T> output = PCollectionList.empty(input.getPipeline());
  for (Map.Entry<PCollection<T>, Integer> instanceEntry : instances.entrySet()) {
    if (instanceEntry.getValue().equals(1)) {
      // Unique inputs pass through untouched.
      output = output.and(instanceEntry.getKey());
    } else {
      // Duplicated inputs are multiplied via a ParDo emitting each element N times,
      // so the final Flatten never consumes the same PCollection more than once.
      String duplicationName = String.format("Multiply %s", instanceEntry.getKey().getName());
      PCollection<T> duplicated =
          instanceEntry
              .getKey()
              .apply(duplicationName, ParDo.of(new DuplicateFn<>(instanceEntry.getValue())));
      output = output.and(duplicated);
    }
  }
  return output.apply(Flatten.pCollections());
}
}
@Override public PCollection<KV<URI, String>> expand(PBegin input) { Pipeline pipeline = input.getPipeline(); // Create one TextIO.Read transform for each document // and add its output to a PCollectionList PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline); // TextIO.Read supports: // - file: URIs and paths locally // - gs: URIs on the service for (final URI uri : uris) { String uriString; if (uri.getScheme().equals("file")) { uriString = new File(uri).getPath(); } else { uriString = uri.toString(); } PCollection<KV<URI, String>> oneUriToLines = pipeline .apply("TextIO.Read(" + uriString + ")", TextIO.Read.from(uriString)) .apply("WithKeys(" + uriString + ")", WithKeys.<URI, String>of(uri)); urisToLines = urisToLines.and(oneUriToLines); } return urisToLines.apply(Flatten.<KV<URI, String>>pCollections()); } }
@Override public PCollection<KV<URI, String>> expand(PBegin input) { Pipeline pipeline = input.getPipeline(); // Create one TextIO.Read transform for each document // and add its output to a PCollectionList PCollectionList<KV<URI, String>> urisToLines = PCollectionList.empty(pipeline); // TextIO.Read supports: // - file: URIs and paths locally // - gs: URIs on the service for (final URI uri : uris) { String uriString; if ("file".equals(uri.getScheme())) { uriString = new File(uri).getPath(); } else { uriString = uri.toString(); } PCollection<KV<URI, String>> oneUriToLines = pipeline .apply("TextIO.Read(" + uriString + ")", TextIO.read().from(uriString)) .apply("WithKeys(" + uriString + ")", WithKeys.of(uri)) .setCoder(KvCoder.of(StringDelegateCoder.of(URI.class), StringUtf8Coder.of())); urisToLines = urisToLines.and(oneUriToLines); } return urisToLines.apply(Flatten.pCollections()); } }
@Override
public PCollection<T> expand(PBegin input) {
  // Flattening an empty list yields an empty PCollection on the same pipeline.
  return PCollectionList.<T>empty(input.getPipeline()).apply(Flatten.pCollections());
}
}
@Override public PCollectionList<T> expand(PCollection<T> in) { final TupleTagList outputTags = partitionDoFn.getOutputTags(); PCollectionTuple outputs = in.apply(ParDo.of(partitionDoFn).withOutputTags(new TupleTag<Void>() {}, outputTags)); PCollectionList<T> pcs = PCollectionList.empty(in.getPipeline()); Coder<T> coder = in.getCoder(); for (TupleTag<?> outputTag : outputTags.getAll()) { // All the tuple tags are actually TupleTag<T> // And all the collections are actually PCollection<T> @SuppressWarnings("unchecked") TupleTag<T> typedOutputTag = (TupleTag<T>) outputTag; pcs = pcs.and(outputs.get(typedOutputTag).setCoder(coder)); } return pcs; }
@Test
@Category(NeedsRunner.class)
public void testOverride() {
  PCollectionList<Long> empty = PCollectionList.empty(pipeline);
  PCollection<Long> emptyFlattened =
      empty.apply(
          factory
              .getReplacementTransform(
                  AppliedPTransform.of(
                      // The application under test has no inputs; name it accordingly
                      // (the previous "nonEmptyInput" label was a misleading copy-paste).
                      "emptyInput",
                      Collections.emptyMap(),
                      Collections.emptyMap(),
                      Flatten.pCollections(),
                      pipeline))
              .getTransform());

  PAssert.that(emptyFlattened).empty();
  pipeline.run();
}
}
@Test @Category(NeedsRunner.class) public void testFlattenNoListsNoCoder() { // not ValidatesRunner because it should fail at pipeline construction time anyhow. thrown.expect(IllegalStateException.class); thrown.expectMessage("Unable to return a default Coder"); PCollectionList.<ClassWithoutCoder>empty(p).apply(Flatten.pCollections()); p.run(); }
@Test
@Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class})
public void testUnboundedSourceSplits() throws Exception {
  long numElements = 1000;
  int numSplits = 10;

  UnboundedSource<Long, ?> initial = CountingSource.unbounded();
  List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);

  // Read each split as a bounded sub-pipeline and flatten them back together.
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  int splitIndex = 0;
  for (UnboundedSource<Long, ?> split : splits) {
    pcollections =
        pcollections.and(
            p.apply("split" + splitIndex, Read.from(split).withMaxNumRecords(elementsPerSplit)));
    splitIndex++;
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
@Test
public void getRootTransformsContainsEmptyFlatten() {
  PCollections<String> flatten = Flatten.pCollections();
  PCollectionList<String> noInputs = PCollectionList.empty(p);
  PCollection<String> flattened = noInputs.apply(flatten);
  flattened.setCoder(StringUtf8Coder.of());
  p.traverseTopologically(visitor);

  DirectGraph graph = visitor.getGraph();
  // An input-free Flatten is a root of the graph — and the only one in this pipeline.
  assertThat(
      graph.getRootTransforms(),
      Matchers.containsInAnyOrder(new Object[] {graph.getProducer(flattened)}));
  AppliedPTransform<?, ?, ?> onlyRoot = Iterables.getOnlyElement(graph.getRootTransforms());
  assertThat((Object) onlyRoot.getTransform(), equalTo(flatten));
  assertThat(onlyRoot.getInputs().entrySet(), emptyIterable());
  assertThat(onlyRoot.getOutputs(), equalTo(flattened.expand()));
}
@Test @Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class}) public void testBoundedSourceSplits() throws Exception { long numElements = 1000; long numSplits = 10; long splitSizeBytes = numElements * 8 / numSplits; // 8 bytes per long element. BoundedSource<Long> initial = CountingSource.upTo(numElements); List<? extends BoundedSource<Long>> splits = initial.split(splitSizeBytes, p.getOptions()); assertEquals("Expected exact splitting", numSplits, splits.size()); // Assemble all the splits into one flattened PCollection, also verify their sizes. PCollectionList<Long> pcollections = PCollectionList.empty(p); for (int i = 0; i < splits.size(); ++i) { BoundedSource<Long> split = splits.get(i); pcollections = pcollections.and(p.apply("split" + i, Read.from(split))); assertEquals( "Expected even splitting", splitSizeBytes, split.getEstimatedSizeBytes(p.getOptions())); } PCollection<Long> input = pcollections.apply(Flatten.pCollections()); addCountingAsserts(input, numElements); p.run(); }
@Test
@Category(ValidatesRunner.class)
public void testFlattenPCollectionsEmpty() {
  // Flattening zero collections must still produce an (empty) PCollection;
  // no coder can be inferred from members, so it is set explicitly.
  PCollectionList<String> nothing = PCollectionList.empty(p);
  PCollection<String> output =
      nothing.apply(Flatten.pCollections()).setCoder(StringUtf8Coder.of());
  PAssert.that(output).empty();
  p.run();
}
@Test
public void testFlattenInMemoryEvaluatorWithEmptyPCollectionList() throws Exception {
  PCollectionList<Integer> list = PCollectionList.empty(p);

  PCollection<Integer> flattened = list.apply(Flatten.pCollections());
  flattened.setCoder(VarIntCoder.of());

  EvaluationContext evaluationContext = mock(EvaluationContext.class);
  when(evaluationContext.createBundle(flattened))
      .thenReturn(bundleFactory.createBundle(flattened));

  FlattenEvaluatorFactory factory = new FlattenEvaluatorFactory(evaluationContext);
  AppliedPTransform<?, ?, ?> flattenedProducer = DirectGraphs.getProducer(flattened);
  TransformEvaluator<Integer> emptyEvaluator =
      factory.forApplication(
          flattenedProducer,
          bundleFactory.createRootBundle().commit(BoundedWindow.TIMESTAMP_MAX_VALUE));

  // Finishing the bundle of an empty flatten yields exactly one, empty, output bundle.
  TransformResult<Integer> result = emptyEvaluator.finishBundle();
  CommittedBundle<?> outputBundle =
      Iterables.getOnlyElement(result.getOutputBundles()).commit(Instant.now());
  assertThat(outputBundle.getElements(), emptyIterable());
  assertThat(
      result.getTransform(), Matchers.<AppliedPTransform<?, ?, ?>>equalTo(flattenedProducer));
}
}
@Test
@Category(ValidatesRunner.class)
public void testFlattenPCollectionsEmptyThenParDo() {
  // An empty flatten followed by a pass-through ParDo still produces no elements.
  PCollectionList<String> nothing = PCollectionList.empty(p);
  PCollection<String> output =
      nothing
          .apply(Flatten.pCollections())
          .setCoder(StringUtf8Coder.of())
          .apply(ParDo.of(new IdentityFn<>()));
  PAssert.that(output).empty();
  p.run();
}
@Test @Category(NeedsRunner.class) public void testUnboundedSourceRateSplits() throws Exception { int elementsPerPeriod = 10; Duration period = Duration.millis(5); long numElements = 1000; int numSplits = 10; UnboundedCountingSource initial = CountingSource.createUnboundedFrom(0).withRate(elementsPerPeriod, period); List<? extends UnboundedSource<Long, ?>> splits = initial.split(numSplits, p.getOptions()); assertEquals("Expected exact splitting", numSplits, splits.size()); long elementsPerSplit = numElements / numSplits; assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits); PCollectionList<Long> pcollections = PCollectionList.empty(p); for (int i = 0; i < splits.size(); ++i) { pcollections = pcollections.and( p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))); } PCollection<Long> input = pcollections.apply(Flatten.pCollections()); addCountingAsserts(input, numElements); Instant startTime = Instant.now(); p.run(); Instant endTime = Instant.now(); // 500 ms if the readers are all initialized in parallel; 5000 ms if they are evaluated serially long expectedMinimumMillis = (numElements * period.getMillis()) / elementsPerPeriod; assertThat(expectedMinimumMillis, lessThan(endTime.getMillis() - startTime.getMillis())); }
@Test public void testUnboundedSourceSplits() throws Exception { int numElements = 1000; int numSplits = 10; // Coders must be specified explicitly here due to the way the transform // is used in the test. UnboundedSource<KafkaRecord<Integer, Long>, ?> initial = mkKafkaReadTransform(numElements, null) .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of()) .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of()) .makeSource(); List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits = initial.split(numSplits, p.getOptions()); assertEquals("Expected exact splitting", numSplits, splits.size()); long elementsPerSplit = numElements / numSplits; assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits); PCollectionList<Long> pcollections = PCollectionList.empty(p); for (int i = 0; i < splits.size(); ++i) { pcollections = pcollections.and( p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit)) .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>())) .apply("collection " + i, Values.create())); } PCollection<Long> input = pcollections.apply(Flatten.pCollections()); addCountingAsserts(input, numElements); p.run(); }
@Test
@Category(ValidatesRunner.class)
public void testEmptyFlattenAsSideInput() {
  // Materialize an empty flatten as an iterable side input.
  final PCollectionView<Iterable<String>> view =
      PCollectionList.<String>empty(p)
          .apply(Flatten.pCollections())
          .setCoder(StringUtf8Coder.of())
          .apply(View.asIterable());

  // A single void element drives one DoFn invocation that copies the side input through.
  PCollection<String> output =
      p.apply(Create.of((Void) null).withCoder(VoidCoder.of()))
          .apply(
              ParDo.of(
                      new DoFn<Void, String>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          for (String element : c.sideInput(view)) {
                            c.output(element);
                          }
                        }
                      })
                  .withSideInputs(view));

  PAssert.that(output).empty();
  p.run();
}
@Test
public void testEquals() {
  Pipeline p = TestPipeline.create();
  PCollection<String> first = p.apply("Meta", Create.of("foo", "bar"));
  PCollection<String> second = p.apply("Pythonic", Create.of("spam, ham"));
  PCollection<String> third = p.apply("Syntactic", Create.of("eggs", "baz"));

  EqualsTester tester = new EqualsTester();
  // NOTE(review): the groups below look deliberately disabled rather than forgotten —
  // TODO confirm why they were commented out before removing or re-enabling them.
  //    tester.addEqualityGroup(PCollectionList.empty(p), PCollectionList.empty(p));
  //    tester.addEqualityGroup(PCollectionList.of(first).and(second));

  // Constructors should all produce equivalent
  tester.addEqualityGroup(
      PCollectionList.of(first).and(second).and(third),
      PCollectionList.of(first).and(second).and(third),
      //        PCollectionList.<String>empty(p).and(first).and(second).and(third),
      //        PCollectionList.of(ImmutableList.of(first, second, third)),
      //        PCollectionList.of(first).and(ImmutableList.of(second, third)),
      PCollectionList.of(ImmutableList.of(first, second)).and(third));

  // Order is considered
  tester.addEqualityGroup(PCollectionList.of(first).and(third).and(second));
  // Lists on different pipelines are never equal, even when both are empty.
  tester.addEqualityGroup(PCollectionList.empty(TestPipeline.create()));

  tester.testEquals();
}
}