byte[] serializedProtoPipeline = jobSpecification.getPipelineProto().toByteArray();
DataflowPackage stagedPipeline =
    options.getStager().stageToFile(serializedProtoPipeline, PIPELINE_FILE_NAME);

Job newJob = jobSpecification.getJob();
// Build the job handle from the submission result; "jobResult" comes from the
// elided job-submission call, and the DataflowPipelineJob wiring below is an
// assumed reconstruction of the truncated constructor call.
DataflowPipelineJob dataflowPipelineJob =
    new DataflowPipelineJob(
        DataflowClient.create(options),
        jobResult.getId(),
        options,
        jobSpecification.getStepNames());
/** Translates a {@link Pipeline} into a {@code JobSpecification}. */
public JobSpecification translate(
    Pipeline pipeline, DataflowRunner runner, List<DataflowPackage> packages) {
  // Capture the sdkComponents for lookup during step translation.
  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.JAVA_SDK_HARNESS_ENVIRONMENT);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);

  LOG.debug("Portable pipeline proto:\n{}", TextFormat.printToString(pipelineProto));

  Translator translator = new Translator(pipeline, runner, sdkComponents);
  Job result = translator.translate(packages);
  return new JobSpecification(
      result, pipelineProto, Collections.unmodifiableMap(translator.stepNames));
}
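// A minimal usage sketch of the entry point above, assembled only from calls the
// tests below exercise (nothing beyond those APIs is assumed):
//
//   DataflowPipelineOptions options = buildPipelineOptions();
//   Pipeline pipeline = Pipeline.create(options);
//   DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
//   DataflowRunner runner = DataflowRunner.fromOptions(options);
//   runner.replaceTransforms(pipeline);
//   JobSpecification spec = translator.translate(pipeline, runner, Collections.emptyList());
//   Job job = spec.getJob();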
Job job = translator.translate(pipeline, runner, Collections.emptyList()).getJob();
assertAllStepOutputsHaveUniqueIds(job);
Job job = result.getJob();
Components componentsProto = result.getPipelineProto().getComponents();
RehydratedComponents components = RehydratedComponents.forComponents(componentsProto);
// "fn" is the transform id of the step under test, defined in elided surrounding code.
RunnerApi.PTransform spkTransform = componentsProto.getTransformsOrThrow(fn);
/** This tests a few corner cases that should not crash. */
@Test
public void testGoodWildcards() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline pipeline = Pipeline.create(options);
  DataflowPipelineTranslator t = DataflowPipelineTranslator.fromOptions(options);

  applyRead(pipeline, "gs://bucket/foo");
  applyRead(pipeline, "gs://bucket/foo/");
  applyRead(pipeline, "gs://bucket/foo/*");
  applyRead(pipeline, "gs://bucket/foo/?");
  applyRead(pipeline, "gs://bucket/foo/[0-9]");
  applyRead(pipeline, "gs://bucket/foo/*baz*");
  applyRead(pipeline, "gs://bucket/foo/*baz?");
  applyRead(pipeline, "gs://bucket/foo/[0-9]baz?");
  applyRead(pipeline, "gs://bucket/foo/baz/*");
  applyRead(pipeline, "gs://bucket/foo/baz/*wonka*");
  applyRead(pipeline, "gs://bucket/foo/*baz/wonka*");
  applyRead(pipeline, "gs://bucket/foo*/baz");
  applyRead(pipeline, "gs://bucket/foo?/baz");
  applyRead(pipeline, "gs://bucket/foo[0-9]/baz");

  // Check that translation doesn't fail.
  JobSpecification jobSpecification =
      t.translate(pipeline, DataflowRunner.fromOptions(options), Collections.emptyList());
  assertAllStepOutputsHaveUniqueIds(jobSpecification.getJob());
}
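// The applyRead helper used above is not part of this excerpt; a minimal sketch
// of its assumed shape (a named TextIO read of the given path):
private void applyRead(Pipeline pipeline, String path) {
  pipeline.apply("Read(" + path + ")", TextIO.read().from(path));
}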
@Test
public void testMaxNumWorkersIsPassedWhenNoAlgorithmIsSet() throws IOException {
  final DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType noScaling = null;
  DataflowPipelineOptions options = buildPipelineOptions();
  options.setMaxNumWorkers(42);
  options.setAutoscalingAlgorithm(noScaling);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(
      job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getAlgorithm());
  assertEquals(
      42,
      job.getEnvironment()
          .getWorkerPools()
          .get(0)
          .getAutoscalingSettings()
          .getMaxNumWorkers()
          .intValue());
}
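// For contrast with the null-algorithm case above, a hedged sketch of opting in
// to autoscaling; the THROUGHPUT_BASED constant is assumed from
// DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType:
//
//   options.setAutoscalingAlgorithm(
//       DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType.THROUGHPUT_BASED);
//   options.setMaxNumWorkers(42);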
/**
 * Test that in translation the name for a collection (in this case just a Create output) is
 * overridden to be what the Dataflow service expects.
 */
@Test
public void testNamesOverridden() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  options.setStreaming(false);
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply("Jazzy", Create.of(3)).setName("foobizzle");
  runner.replaceTransforms(pipeline);

  Job job = translator.translate(pipeline, runner, Collections.emptyList()).getJob();

  // The Create step
  Step step = job.getSteps().get(0);

  // This is the name that is "set by the user" that the Dataflow translator must override
  String userSpecifiedName =
      getString(
          Structs.getListOfMaps(step.getProperties(), PropertyNames.OUTPUT_INFO, null).get(0),
          PropertyNames.USER_NAME);

  // This is the calculated name that must actually be used
  String calculatedName = getString(step.getProperties(), PropertyNames.USER_NAME) + ".out0";

  assertThat(userSpecifiedName, equalTo(calculatedName));
}
@Test
public void testToSingletonTranslationWithIsmSideInput() throws Exception {
  // A "change detector" test that makes sure the translation
  // of getting a PCollectionView<T> does not change
  // in bad ways during refactor

  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1)).apply(View.asSingleton());
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  Job job = translator.translate(pipeline, runner, Collections.emptyList()).getJob();
  assertAllStepOutputsHaveUniqueIds(job);

  List<Step> steps = job.getSteps();
  assertEquals(9, steps.size());

  @SuppressWarnings("unchecked")
  List<Map<String, Object>> toIsmRecordOutputs =
      (List<Map<String, Object>>) steps.get(7).getProperties().get(PropertyNames.OUTPUT_INFO);
  assertTrue(
      Structs.getBoolean(Iterables.getOnlyElement(toIsmRecordOutputs), "use_indexed_format"));

  Step collectionToSingletonStep = steps.get(8);
  assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind());
}
@Test
public void testScalingAlgorithmNone() throws IOException {
  final DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType noScaling =
      DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType.NONE;

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setAutoscalingAlgorithm(noScaling);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(
      "AUTOSCALING_ALGORITHM_NONE",
      job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getAlgorithm());
  assertEquals(
      0,
      job.getEnvironment()
          .getWorkerPools()
          .get(0)
          .getAutoscalingSettings()
          .getMaxNumWorkers()
          .intValue());
}
@Test
public void testScalingAlgorithmMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  // Autoscaling settings are always set.
  assertNull(
      job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getAlgorithm());
  assertEquals(
      0,
      job.getEnvironment()
          .getWorkerPools()
          .get(0)
          .getAutoscalingSettings()
          .getMaxNumWorkers()
          .intValue());
}
@Test
public void testToIterableTranslationWithIsmSideInput() throws Exception {
  // A "change detector" test that makes sure the translation
  // of getting a PCollectionView<Iterable<T>> does not change
  // in bad ways during refactor

  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1, 2, 3)).apply(View.asIterable());

  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  Job job = translator.translate(pipeline, runner, Collections.emptyList()).getJob();
  assertAllStepOutputsHaveUniqueIds(job);

  List<Step> steps = job.getSteps();
  assertEquals(3, steps.size());

  @SuppressWarnings("unchecked")
  List<Map<String, Object>> toIsmRecordOutputs =
      (List<Map<String, Object>>) steps.get(1).getProperties().get(PropertyNames.OUTPUT_INFO);
  assertTrue(
      Structs.getBoolean(Iterables.getOnlyElement(toIsmRecordOutputs), "use_indexed_format"));

  Step collectionToSingletonStep = steps.get(2);
  assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind());
}
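// A hedged sketch: other View transforms (e.g. a map-valued side input) would be
// applied the same way as the singleton and iterable cases above; View.asMap and
// KV are standard Beam APIs, but the exact step layout asserted above is not
// assumed to carry over:
//
//   pipeline.apply(Create.of(KV.of("k", 1))).apply(View.asMap());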
@Test
public void testMultiGraphPipelineSerialization() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  Pipeline p = Pipeline.create(options);

  PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));

  input.apply(new UnrelatedOutputCreator());
  input.apply(new UnboundOutputCreator());

  DataflowPipelineTranslator t =
      DataflowPipelineTranslator.fromOptions(
          PipelineOptionsFactory.as(DataflowPipelineOptions.class));

  // Check that translation doesn't fail.
  JobSpecification jobSpecification =
      t.translate(p, DataflowRunner.fromOptions(options), Collections.emptyList());
  assertAllStepOutputsHaveUniqueIds(jobSpecification.getJob());
}
@Test
public void testWorkerMachineTypeConfig() throws IOException {
  final String testMachineType = "test-machine-type";

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setWorkerMachineType(testMachineType);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());

  WorkerPool workerPool = job.getEnvironment().getWorkerPools().get(0);
  assertEquals(testMachineType, workerPool.getMachineType());
}
@Test
public void testNetworkConfig() throws IOException {
  final String testNetwork = "test-network";

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setNetwork(testNetwork);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(testNetwork, job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
@Test
public void testDiskSizeGbConfig() throws IOException {
  final Integer diskSizeGb = 1234;

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setDiskSizeGb(diskSizeGb);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(diskSizeGb, job.getEnvironment().getWorkerPools().get(0).getDiskSizeGb());
}
@Test
public void testZoneConfig() throws IOException {
  final String testZone = "test-zone-1";

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setZone(testZone);

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertEquals(testZone, job.getEnvironment().getWorkerPools().get(0).getZone());
}
@Test
public void testNetworkConfigMissing() throws IOException {
  DataflowPipelineOptions options = buildPipelineOptions();

  Pipeline p = buildPipeline(options);
  p.traverseTopologically(new RecordingPipelineVisitor());
  Job job =
      DataflowPipelineTranslator.fromOptions(options)
          .translate(p, DataflowRunner.fromOptions(options), Collections.emptyList())
          .getJob();

  assertEquals(1, job.getEnvironment().getWorkerPools().size());
  assertNull(job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
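// Taken together, the worker-pool knobs exercised by the tests above can be
// combined on a single options instance before translation; the setters are the
// ones used above, and the values are the tests' own placeholder examples:
//
//   DataflowPipelineOptions options = buildPipelineOptions();
//   options.setWorkerMachineType("test-machine-type");
//   options.setNetwork("test-network");
//   options.setDiskSizeGb(1234);
//   options.setZone("test-zone-1");
//   options.setMaxNumWorkers(42);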