/** Indicates whether the given pipeline has any unbounded PCollections. */ private static boolean hasUnboundedPCollections(RunnerApi.Pipeline pipeline) { checkNotNull(pipeline); Collection<RunnerApi.PCollection> pCollecctions = pipeline.getComponents().getPcollectionsMap().values(); // Assume that all PCollections are consumed at some point in the pipeline. return pCollecctions .stream() .anyMatch(pc -> pc.getIsBounded() == RunnerApi.IsBounded.Enum.UNBOUNDED); } }
/** Indicates whether the given pipeline has any unbounded PCollections. */ private static boolean hasUnboundedPCollections(RunnerApi.Pipeline pipeline) { checkNotNull(pipeline); Collection<RunnerApi.PCollection> pCollecctions = pipeline.getComponents().getPcollectionsMap().values(); // Assume that all PCollections are consumed at some point in the pipeline. return pCollecctions .stream() .anyMatch(pc -> pc.getIsBounded() == RunnerApi.IsBounded.Enum.UNBOUNDED); } }
/**
 * Rejects a PTransform by throwing an exception that names its URN and id.
 *
 * @param id id of the PTransform, looked up in the pipeline's components
 * @param pipeline pipeline proto containing the transform
 * @param context translation context; not read by this method
 * @throws IllegalArgumentException reporting the transform's URN and id
 */
private void urnNotFound(
    String id,
    RunnerApi.Pipeline pipeline,
    FlinkStreamingPortablePipelineTranslator.TranslationContext context) {
  String urn = pipeline.getComponents().getTransformsOrThrow(id).getSpec().getUrn();
  String message = String.format("Unknown type of URN %s for PTransform with id %s.", urn, id);
  throw new IllegalArgumentException(message);
}
@Override public FlinkPortablePipelineTranslator.Executor translate( BatchTranslationContext context, RunnerApi.Pipeline pipeline) { // Use a QueryablePipeline to traverse transforms topologically. QueryablePipeline p = QueryablePipeline.forTransforms( pipeline.getRootTransformIdsList(), pipeline.getComponents()); for (PipelineNode.PTransformNode transform : p.getTopologicallyOrderedTransforms()) { urnToTransformTranslator .getOrDefault( transform.getTransform().getSpec().getUrn(), FlinkBatchPortablePipelineTranslator::urnNotFound) .translate(transform, pipeline, context); } // Ensure that side effects are performed for unconsumed DataSets. for (DataSet<?> dataSet : context.getDanglingDataSets()) { dataSet.output(new DiscardingOutputFormat<>()).name("DiscardingOutput"); } return context; }
/**
 * Builds a pipeline with one unbounded and one bounded read and checks that exactly those two
 * reads are reported as root transforms, each without inputs.
 */
@Test
public void rootTransforms() {
  Pipeline pipeline = Pipeline.create();
  pipeline
      .apply("UnboundedRead", Read.from(CountingSource.unbounded()))
      .apply(Window.into(FixedWindows.of(Duration.millis(5L))))
      .apply(Count.perElement());
  pipeline.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));

  Components components = PipelineTranslation.toProto(pipeline).getComponents();
  QueryablePipeline queryable = QueryablePipeline.forPrimitivesIn(components);

  assertThat(queryable.getRootTransforms(), hasSize(2));
  for (PTransformNode root : queryable.getRootTransforms()) {
    assertThat(
        "Root transforms should have no inputs",
        root.getTransform().getInputsCount(),
        equalTo(0));
    assertThat(
        "Only added source reads to the pipeline",
        root.getTransform().getSpec().getUrn(),
        equalTo(PTransformTranslation.READ_TRANSFORM_URN));
  }
}
@Override public void translate(BatchTranslationContext context, RunnerApi.Pipeline pipeline) { // Use a QueryablePipeline to traverse transforms topologically. QueryablePipeline p = QueryablePipeline.forTransforms( pipeline.getRootTransformIdsList(), pipeline.getComponents()); for (PipelineNode.PTransformNode transform : p.getTopologicallyOrderedTransforms()) { urnToTransformTranslator .getOrDefault( transform.getTransform().getSpec().getUrn(), FlinkBatchPortablePipelineTranslator::urnNotFound) .translate(transform, pipeline, context); } // Ensure that side effects are performed for unconsumed DataSets. for (DataSet<?> dataSet : context.getDanglingDataSets()) { dataSet.output(new DiscardingOutputFormat<>()); } }
/**
 * Checks that a bounded read reports the Java SDK harness environment while a flatten of its
 * output reports no environment at all.
 */
@Test
public void getEnvironmentWithEnvironment() {
  Pipeline pipeline = Pipeline.create();
  PCollection<Long> readOutput =
      pipeline.apply("BoundedRead", Read.from(CountingSource.upTo(100L)));
  PCollectionList.of(readOutput)
      .and(readOutput)
      .and(readOutput)
      .apply("flatten", Flatten.pCollections());

  Components components = PipelineTranslation.toProto(pipeline).getComponents();
  QueryablePipeline queryable = QueryablePipeline.forPrimitivesIn(components);

  PTransformNode readNode =
      PipelineNode.pTransform("BoundedRead", components.getTransformsOrThrow("BoundedRead"));
  PTransformNode flattenNode =
      PipelineNode.pTransform("flatten", components.getTransformsOrThrow("flatten"));

  // The read carries an environment...
  assertThat(queryable.getEnvironment(readNode).isPresent(), is(true));
  assertThat(
      queryable.getEnvironment(readNode).get(),
      equalTo(Environments.JAVA_SDK_HARNESS_ENVIRONMENT));
  // ...whereas the flatten does not.
  assertThat(queryable.getEnvironment(flattenNode).isPresent(), is(false));
}
private GreedyPipelineFuser(Pipeline p) { // Validate that the original pipeline is well-formed. PipelineValidator.validate(p); this.pipeline = QueryablePipeline.forPrimitivesIn(p.getComponents()); Set<PTransformNode> unfusedRootNodes = new LinkedHashSet<>(); NavigableSet<CollectionConsumer> rootConsumers = new TreeSet<>(); for (PTransformNode pTransformNode : pipeline.getRootTransforms()) { // This will usually be a single node, the downstream of an Impulse, but may be of any size DescendantConsumers descendants = getRootConsumers(pTransformNode); unfusedRootNodes.addAll(descendants.getUnfusedNodes()); rootConsumers.addAll(descendants.getFusibleConsumers()); } this.fusedPipeline = fusePipeline(unfusedRootNodes, groupSiblings(rootConsumers)); }
/**
 * Creates a graph backed by a {@link QueryablePipeline} built from the root transform ids and
 * components of the given pipeline.
 *
 * @param p the pipeline proto to wrap
 */
private PortableGraph(RunnerApi.Pipeline p) {
  RunnerApi.Components components = p.getComponents();
  this.queryablePipeline =
      QueryablePipeline.forTransforms(p.getRootTransformIdsList(), components);
}
/**
 * Translates an Impulse transform into a source named "Impulse" and registers the resulting
 * stream under the transform's single output.
 *
 * @param id id of the Impulse PTransform in the pipeline's components
 * @param pipeline pipeline proto containing the transform
 * @param context streaming translation context that supplies the execution environment and
 *     pipeline options and receives the new data stream
 */
private void translateImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform impulse = pipeline.getComponents().getTransformsOrThrow(id);

  // The source emits byte arrays as windowed values in the global window.
  TypeInformation<WindowedValue<byte[]>> outputType =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));

  // The source is kept alive unless the options ask to shut sources down on final watermark.
  boolean shutdownOnFinalWatermark =
      context.getPipelineOptions().isShutdownSourcesOnFinalWatermark();
  ImpulseSourceFunction impulseFn = new ImpulseSourceFunction(!shutdownOnFinalWatermark);

  SingleOutputStreamOperator<WindowedValue<byte[]>> impulseStream =
      context.getExecutionEnvironment().addSource(impulseFn, "Impulse").returns(outputType);

  String outputId = Iterables.getOnlyElement(impulse.getOutputsMap().values());
  context.addDataStream(outputId, impulseStream);
}
/**
 * Always fails: looks up the URN of the identified PTransform and reports it as unknown.
 *
 * @param id id of the PTransform, resolved against the pipeline's components
 * @param pipeline pipeline proto that holds the transform
 * @param context translation context; unused here
 * @throws IllegalArgumentException carrying the URN and the transform id in its message
 */
private void urnNotFound(
    String id,
    RunnerApi.Pipeline pipeline,
    FlinkStreamingPortablePipelineTranslator.TranslationContext context) {
  RunnerApi.PTransform unknownTransform = pipeline.getComponents().getTransformsOrThrow(id);
  String unknownUrn = unknownTransform.getSpec().getUrn();
  throw new IllegalArgumentException(
      String.format("Unknown type of URN %s for PTransform with id %s.", unknownUrn, id));
}
/**
 * Translates a Reshuffle by rebalancing the transform's single input stream and registering the
 * result under its single output.
 *
 * @param id id of the Reshuffle PTransform in the pipeline's components
 * @param pipeline pipeline proto containing the transform
 * @param context streaming translation context holding the input stream and receiving the output
 */
private <K, V> void translateReshuffle(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform reshuffle = pipeline.getComponents().getTransformsOrThrow(id);

  String inputId = Iterables.getOnlyElement(reshuffle.getInputsMap().values());
  DataStream<WindowedValue<KV<K, V>>> input = context.getDataStreamOrThrow(inputId);

  String outputId = Iterables.getOnlyElement(reshuffle.getOutputsMap().values());
  context.addDataStream(outputId, input.rebalance());
}
/**
 * Translates an Impulse transform into a source and registers the resulting stream under the
 * transform's single output.
 *
 * @param id id of the Impulse PTransform in the pipeline's components
 * @param pipeline pipeline proto containing the transform
 * @param context streaming translation context that supplies the execution environment and
 *     pipeline options and receives the new data stream
 */
private void translateImpulse(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform impulse = pipeline.getComponents().getTransformsOrThrow(id);

  // The source emits byte arrays as windowed values in the global window.
  TypeInformation<WindowedValue<byte[]>> outputType =
      new CoderTypeInformation<>(
          WindowedValue.getFullCoder(ByteArrayCoder.of(), GlobalWindow.Coder.INSTANCE));

  // The source is kept alive unless the options ask to shut sources down on final watermark.
  boolean shutdownOnFinalWatermark =
      context.getPipelineOptions().isShutdownSourcesOnFinalWatermark();
  ImpulseSourceFunction impulseFn = new ImpulseSourceFunction(!shutdownOnFinalWatermark);

  SingleOutputStreamOperator<WindowedValue<byte[]>> impulseStream =
      context.getExecutionEnvironment().addSource(impulseFn).returns(outputType);

  String outputId = Iterables.getOnlyElement(impulse.getOutputsMap().values());
  context.addDataStream(outputId, impulseStream);
}
/**
 * Validates a pipeline proto: every root transform id must be present in the pipeline's
 * components, after which the components themselves are validated.
 *
 * @param p the pipeline proto to validate
 */
public static void validate(RunnerApi.Pipeline p) {
  Components components = p.getComponents();
  // Each declared root must resolve to a known transform.
  p.getRootTransformIdsList()
      .forEach(
          rootId ->
              checkArgument(
                  components.containsTransforms(rootId),
                  "Root transform id %s is unknown",
                  rootId));
  validateComponents("pipeline", components);
}
/**
 * Translates each transform of the pipeline in topological order using the translator registered
 * for its URN.
 *
 * @param context streaming translation context that receives the translated transforms
 * @param pipeline pipeline proto to translate
 * @return the same {@code context} that was passed in
 */
@Override
public FlinkPortablePipelineTranslator.Executor translate(
    StreamingTranslationContext context, RunnerApi.Pipeline pipeline) {
  QueryablePipeline queryable =
      QueryablePipeline.forTransforms(
          pipeline.getRootTransformIdsList(), pipeline.getComponents());
  for (PipelineNode.PTransformNode node : queryable.getTopologicallyOrderedTransforms()) {
    String urn = node.getTransform().getSpec().getUrn();
    // URNs without a registered translator fall through to urnNotFound.
    urnToTransformTranslator
        .getOrDefault(urn, this::urnNotFound)
        .translate(node.getId(), pipeline, context);
  }
  return context;
}
/**
 * Returns a copy of the pipeline in which every PTransform whose URN is in {@code knownUrns} has
 * had its descendants removed.
 *
 * @param pipeline the pipeline proto to trim; it is read but not modified
 * @param knownUrns URNs of the transforms whose descendants should be removed
 * @return the trimmed pipeline built from a builder copy of {@code pipeline}
 */
private RunnerApi.Pipeline makeKnownUrnsPrimitives(
    RunnerApi.Pipeline pipeline, Set<String> knownUrns) {
  RunnerApi.Pipeline.Builder trimmedPipeline = pipeline.toBuilder();
  // Iterate over the original pipeline's transforms; only the builder copy is mutated.
  for (String ptransformId : pipeline.getComponents().getTransformsMap().keySet()) {
    if (knownUrns.contains(
        pipeline.getComponents().getTransformsOrThrow(ptransformId).getSpec().getUrn())) {
      // Pass the id as a logging argument so it fills the {} placeholder instead of being
      // concatenated after it.
      LOG.debug("Removing descendants of known PTransform {}", ptransformId);
      removeDescendants(trimmedPipeline, ptransformId);
    }
  }
  return trimmedPipeline.build();
}
/**
 * Translates each transform of the pipeline in topological order using the translator registered
 * for its URN.
 *
 * @param context streaming translation context that receives the translated transforms
 * @param pipeline pipeline proto to translate
 */
@Override
public void translate(StreamingTranslationContext context, RunnerApi.Pipeline pipeline) {
  QueryablePipeline queryable =
      QueryablePipeline.forTransforms(
          pipeline.getRootTransformIdsList(), pipeline.getComponents());
  for (PipelineNode.PTransformNode node : queryable.getTopologicallyOrderedTransforms()) {
    String urn = node.getTransform().getSpec().getUrn();
    // URNs without a registered translator fall through to urnNotFound.
    urnToTransformTranslator
        .getOrDefault(urn, this::urnNotFound)
        .translate(node.getId(), pipeline, context);
  }
}
/**
 * Builds a {@link QueryablePipeline} from the root transform IDs and the components of the given
 * {@link Pipeline}.
 *
 * @param p the pipeline proto to query
 * @return the result of {@code forTransforms} for the pipeline's root ids and components
 */
public static QueryablePipeline forPipeline(RunnerApi.Pipeline p) {
  RunnerApi.Components components = p.getComponents();
  return forTransforms(p.getRootTransformIdsList(), components);
}
/**
 * Checks that, for a pipeline made of a read followed by a multi-output ParDo, the primitive
 * transform ids equal the full set of transform ids in the translated components.
 */
@Test
public void retainOnlyPrimitivesWithOnlyPrimitivesUnchanged() {
  Pipeline pipeline = Pipeline.create();
  pipeline
      .apply("Read", Read.from(CountingSource.unbounded()))
      .apply(
          "multi-do",
          ParDo.of(new TestFn()).withOutputTags(new TupleTag<>(), TupleTagList.empty()));

  Components translated = PipelineTranslation.toProto(pipeline).getComponents();
  Collection<String> primitiveIds = QueryablePipeline.getPrimitiveTransformIds(translated);

  assertThat(primitiveIds, equalTo(translated.getTransformsMap().keySet()));
}
/**
 * Handles a Reshuffle transform: the stream registered for its only input is rebalanced and
 * stored in the context under its only output.
 *
 * @param id id of the Reshuffle PTransform in the pipeline's components
 * @param pipeline pipeline proto containing the transform
 * @param context streaming translation context holding the input stream and receiving the output
 */
private <K, V> void translateReshuffle(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform reshuffleTransform = pipeline.getComponents().getTransformsOrThrow(id);

  String sourceCollectionId =
      Iterables.getOnlyElement(reshuffleTransform.getInputsMap().values());
  DataStream<WindowedValue<KV<K, V>>> upstream =
      context.getDataStreamOrThrow(sourceCollectionId);

  String targetCollectionId =
      Iterables.getOnlyElement(reshuffleTransform.getOutputsMap().values());
  context.addDataStream(targetCollectionId, upstream.rebalance());
}