if (dataflowOptions.getAppName() == null) {
  missing.add("appName");
}

PathValidator validator = dataflowOptions.getPathValidator();
String gcpTempLocation;
try {
  gcpTempLocation = dataflowOptions.getGcpTempLocation();
} catch (Exception e) {
  throw new IllegalArgumentException(
      "DataflowRunner requires gcpTempLocation, "
          + "but failed to retrieve a value from PipelineOptions",
      e);
}
validator.validateOutputFilePrefixSupported(gcpTempLocation);

String stagingLocation;
try {
  stagingLocation = dataflowOptions.getStagingLocation();
} catch (Exception e) {
  throw new IllegalArgumentException(
      "DataflowRunner requires stagingLocation, "
          + "but failed to retrieve a value from PipelineOptions",
      e);
}
validator.validateOutputFilePrefixSupported(stagingLocation);

if (!isNullOrEmpty(dataflowOptions.getSaveProfilesToGcs())) {
  validator.validateOutputFilePrefixSupported(dataflowOptions.getSaveProfilesToGcs());
}

if (dataflowOptions.getFilesToStage() == null) {
  dataflowOptions.setFilesToStage(
      detectClassPathResourcesToStage(DataflowRunner.class.getClassLoader()));
  if (dataflowOptions.getFilesToStage().isEmpty()) {
    throw new IllegalArgumentException("No files to stage have been found.");
  } else {
    LOG.info(
        "PipelineOptions.filesToStage was not specified. "
            + "Defaulting to files from the classpath: will stage {} files. "
            + "Enable logging at DEBUG level to see which files will be staged.",
        dataflowOptions.getFilesToStage().size());
  }
}
LOG.debug("Classpath elements: {}", dataflowOptions.getFilesToStage());
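A minimal sketch (not from the source) of how this validation surfaces to callers: constructing a runner with no resolvable gcpTempLocation should fail fast. The project name below is a placeholder.

// Hypothetical illustration: DataflowRunner.fromOptions fails fast when no
// gcpTempLocation can be resolved ("my-project" is a placeholder).
DataflowPipelineOptions opts = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
opts.setRunner(DataflowRunner.class);
opts.setProject("my-project");
try {
  DataflowRunner.fromOptions(opts);
} catch (IllegalArgumentException expected) {
  // Expected: neither gcpTempLocation nor a usable default was available.
}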
options.setRunner(DataflowRunner.class);
options.setProject(projectId);
options.setStagingLocation(workingBucket);
options.setTempLocation(workingBucket + "/temp");
options.setGcpCredential(credentials);
options.setServiceAccount(accountEmail);
options.setMaxNumWorkers(maxNumWorkers);
options.setDiskSizeGb(diskSizeGb);
options.setWorkerMachineType(machineType);
options.setAutoscalingAlgorithm(AutoscalingAlgorithmType.THROUGHPUT_BASED);
options.setZone(zone);
options.setStreaming(isStreaming);
options.setJobName(pipelineName);

Pipeline pipeline = Pipeline.create(options);
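Once the options are wired up and the pipeline constructed, launching it is a one-liner. A minimal sketch (the transforms applied before run() are whatever the surrounding job defines):

// Sketch: launch the configured pipeline; for batch jobs, block until completion.
PipelineResult result = pipeline.run();
if (!isStreaming) {
  result.waitUntilFinish();
}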
public static String getGcloudCancelCommand(DataflowPipelineOptions options, String jobId) {
  // If using a different Dataflow API than default, prefix command with an API override.
  String dataflowApiOverridePrefix = "";
  String apiUrl = options.getDataflowClient().getBaseUrl();
  if (!apiUrl.equals(Dataflow.DEFAULT_BASE_URL)) {
    dataflowApiOverridePrefix = String.format("%s=%s ", ENDPOINT_OVERRIDE_ENV_VAR, apiUrl);
  }

  // Assemble cancel command from optional prefix and project/job parameters.
  return String.format(
      "%s%s jobs --project=%s cancel --region=%s %s",
      dataflowApiOverridePrefix,
      GCLOUD_DATAFLOW_PREFIX,
      options.getProject(),
      options.getRegion(),
      jobId);
}
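Assuming GCLOUD_DATAFLOW_PREFIX expands to the usual gcloud dataflow invocation (the constant is defined elsewhere in the class), a hedged usage sketch:

// Hypothetical usage: with the default endpoint the result resembles
// "gcloud dataflow jobs --project=<project> cancel --region=<region> <jobId>".
String cancelCommand = getGcloudCancelCommand(options, jobId);
LOG.info("To cancel the job, run:\n{}", cancelCommand);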
/**
 * Stages {@link DataflowPipelineOptions#getFilesToStage()}, which defaults to every file on the
 * classpath unless overridden, as well as {@link
 * DataflowPipelineDebugOptions#getOverrideWindmillBinary()} if specified.
 *
 * @see #stageFiles(List)
 */
@Override
public List<DataflowPackage> stageDefaultFiles() {
  checkNotNull(options.getStagingLocation());

  String windmillBinary =
      options.as(DataflowPipelineDebugOptions.class).getOverrideWindmillBinary();
  String dataflowWorkerJar = options.getDataflowWorkerJar();
  List<String> filesToStage = options.getFilesToStage();

  if (windmillBinary != null) {
    filesToStage.add("windmill_main=" + windmillBinary);
  }
  if (dataflowWorkerJar != null && !dataflowWorkerJar.isEmpty()) {
    filesToStage.add("dataflow-worker.jar=" + dataflowWorkerJar);
  }
  return stageFiles(filesToStage);
}
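The windmill_main= and dataflow-worker.jar= entries use the stagedName=localPath alias convention, where the part before = becomes the staged file's name. A minimal sketch of setting such an entry from user code (the jar name and path are placeholders):

// Sketch: stage a local jar under a fixed name via the "stagedName=localPath"
// convention ("/tmp/my-udfs.jar" is a placeholder path).
List<String> files = new ArrayList<>(options.getFilesToStage());
files.add("my-udfs.jar=/tmp/my-udfs.jar");
options.setFilesToStage(files);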
/**
 * Helper to tweak default pipeline options for import/export jobs.
 *
 * @param opts the options to adjust
 * @return the adjusted PipelineOptions
 */
public static PipelineOptions tweakOptions(PipelineOptions opts) {
  if (!DataflowRunner.class.isAssignableFrom(opts.getRunner())) {
    return opts;
  }
  DataflowPipelineOptions dataflowOpts = opts.as(DataflowPipelineOptions.class);

  // By default, Dataflow allocates 250 GB local disks; that's not necessary here. Lower it
  // unless the user requested an explicit size.
  if (dataflowOpts.getDiskSizeGb() == 0) {
    dataflowOpts.setDiskSizeGb(25);
  }
  return dataflowOpts;
}
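A minimal sketch of applying this helper to options parsed from command-line arguments (args is assumed to come from main):

// Sketch: apply the disk-size tweak right after parsing command-line options.
PipelineOptions parsed = PipelineOptionsFactory.fromArgs(args).withValidation().create();
Pipeline pipeline = Pipeline.create(tweakOptions(parsed));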
logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
if (containsUnboundedPCollection(pipeline)) {
  options.setStreaming(true);
}

List<DataflowPackage> packages = options.getStager().stageDefaultFiles();
DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
maybeRegisterDebuggee(dataflowOptions, requestId);

LOG.info("Staging pipeline description to {}", options.getStagingLocation());
byte[] serializedProtoPipeline = jobSpecification.getPipelineProto().toByteArray();
DataflowPackage stagedPipeline =
    options.getStager().stageToFile(serializedProtoPipeline, PIPELINE_FILE_NAME);
dataflowOptions.setPipelineUrl(stagedPipeline.getLocation());

if (!isNullOrEmpty(dataflowOptions.getDataflowWorkerJar())) {
  List<String> experiments =
      dataflowOptions.getExperiments() == null
          ? new ArrayList<>()
          : new ArrayList<>(dataflowOptions.getExperiments());
  experiments.add("use_staged_dataflow_worker_jar");
  dataflowOptions.setExperiments(experiments);
}

if (!isNullOrEmpty(options.getGcpTempLocation())) {
  newJob
      .getEnvironment()
      .setTempStoragePrefix(
          dataflowOptions.getPathValidator().verifyPath(options.getGcpTempLocation()));
}
@Test
public void testToString() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setJobName("TestJobName");
  options.setProject("test-project");
  options.setTempLocation("gs://test/temp/location");
  options.setGcpCredential(new TestCredential());
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setRunner(DataflowRunner.class);
  assertEquals("DataflowRunner#testjobname", DataflowRunner.fromOptions(options).toString());
}
private static DataflowPipelineOptions buildPipelineOptions(String... args) throws IOException {
  GcsUtil mockGcsUtil = mock(GcsUtil.class);
  when(mockGcsUtil.expand(any(GcsPath.class)))
      .then(invocation -> ImmutableList.of((GcsPath) invocation.getArguments()[0]));
  when(mockGcsUtil.bucketAccessible(any(GcsPath.class))).thenReturn(true);

  DataflowPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setGcpCredential(new TestCredential());
  options.setJobName("some-job-name");
  options.setProject("some-project");
  options.setRegion("some-region");
  options.setTempLocation(GcsPath.fromComponents("somebucket", "some/path").toString());
  options.setFilesToStage(new ArrayList<>());
  options.setGcsUtil(mockGcsUtil);
  return options;
}
options.setAppName(DataflowRunnerTest.class.getSimpleName());
options.setJobName("some-job-name");

Map<String, Object> optionsMap = (Map<String, Object>) sdkPipelineOptions.get("options");
assertThat(optionsMap, hasEntry("appName", (Object) options.getAppName()));
assertThat(optionsMap, hasEntry("project", (Object) options.getProject()));
assertThat(
    optionsMap,
    hasEntry("pathValidatorClass", (Object) options.getPathValidatorClass().getName()));
assertThat(optionsMap, hasEntry("runner", (Object) options.getRunner().getName()));
assertThat(optionsMap, hasEntry("jobName", (Object) options.getJobName()));
assertThat(optionsMap, hasEntry("tempLocation", (Object) options.getTempLocation()));
assertThat(optionsMap, hasEntry("stagingLocation", (Object) options.getStagingLocation()));
assertThat(
    optionsMap,
    hasEntry("stableUniqueNames", (Object) options.getStableUniqueNames().toString()));
assertThat(optionsMap, hasEntry("streaming", (Object) options.isStreaming()));
assertThat(
    optionsMap,
    hasEntry(
        "numberOfWorkerHarnessThreads", (Object) options.getNumberOfWorkerHarnessThreads()));
job.setName(options.getJobName().toLowerCase());
if (options.isStreaming()) {
  job.setType("JOB_TYPE_STREAMING");
} else {
  job.setType("JOB_TYPE_BATCH");
}

workerPool.setDiskType(options.getWorkerDiskType());
if (options.getWorkerMachineType() != null) {
  workerPool.setMachineType(options.getWorkerMachineType());
}
if (options.getUsePublicIps() != null) {
  if (options.getUsePublicIps()) {
    workerPool.setIpConfiguration("WORKER_IP_PUBLIC");
  } else {
    workerPool.setIpConfiguration("WORKER_IP_PRIVATE");
  }
}
workerPool.setNumWorkers(options.getNumWorkers());
if (options.getLabels() != null) {
  job.setLabels(options.getLabels());
}
if (options.isStreaming() && !hasExperiment(options, "enable_windmill_service")) {
  // Use a separate data disk for streaming when the Windmill service is not enabled.
  Disk disk = new Disk();
  disk.setDiskType(options.getWorkerDiskType());
  workerPool.setDataDisks(Collections.singletonList(disk));
}
if (!isNullOrEmpty(options.getZone())) {
  workerPool.setZone(options.getZone());
}
options.setFilesToStage(
    ImmutableList.of(
        temp1.getAbsolutePath(), overridePackageName + "=" + temp2.getAbsolutePath()));
options.setStagingLocation(VALID_STAGING_BUCKET);
options.setTempLocation(VALID_TEMP_BUCKET);
options.setTempDatasetId(cloudDataflowDataset);
options.setProject(PROJECT_ID);
options.setRegion(REGION_ID);
options.setJobName("job");
options.setDataflowClient(buildMockDataflow());
options.setGcsUtil(mockGcsUtil);
options.setGcpCredential(new TestCredential());
private static void injectMessages(BigtablePubsubOptions options) {
  String inputFile = options.getInputFile();
  String topic = options.getPubsubTopic();

  DataflowPipelineOptions copiedOptions = options.as(DataflowPipelineOptions.class);
  copiedOptions.setStreaming(false);
  copiedOptions.setNumWorkers(INJECTORNUMWORKERS);
  copiedOptions.setJobName(copiedOptions.getJobName() + "-injector");

  Pipeline injectorPipeline = Pipeline.create(copiedOptions);
  injectorPipeline
      .apply(TextIO.read().from(inputFile))
      .apply(ParDo.of(new FilterEmptyStringsFn()))
      .apply(PubsubIO.writeStrings().to(topic));
  injectorPipeline.run().waitUntilFinish();
}
private Pipeline createTestStreamingRunner() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setStreaming(true);
  options.setProject("someproject");
  options.setGcpTempLocation("gs://staging");
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setDataflowClient(dataflow);
  return Pipeline.create(options);
}
@Test
public void testStagingLocation() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setPathValidatorClass(NoopPathValidator.class);
  options.setTempLocation("gs://temp_location");
  options.setStagingLocation("gs://staging_location");
  assertEquals("gs://temp_location", options.getGcpTempLocation());
  assertEquals("gs://staging_location", options.getStagingLocation());
}
@Test
public void testJobNameIsSet() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setJobName("TestJobName");
  assertEquals("TestJobName", options.getJobName());
}
@Test
public void testGcsUploadBufferSizeUnchangedWhenNotDefault() throws IOException {
  int gcsUploadBufferSizeBytes = 12345678;
  DataflowPipelineOptions batchOptions = buildPipelineOptions();
  batchOptions.setGcsUploadBufferSizeBytes(gcsUploadBufferSizeBytes);
  batchOptions.setRunner(DataflowRunner.class);
  Pipeline.create(batchOptions);
  assertEquals(gcsUploadBufferSizeBytes, batchOptions.getGcsUploadBufferSizeBytes().intValue());

  DataflowPipelineOptions streamingOptions = buildPipelineOptions();
  streamingOptions.setStreaming(true);
  streamingOptions.setGcsUploadBufferSizeBytes(gcsUploadBufferSizeBytes);
  streamingOptions.setRunner(DataflowRunner.class);
  Pipeline.create(streamingOptions);
  assertEquals(
      gcsUploadBufferSizeBytes, streamingOptions.getGcsUploadBufferSizeBytes().intValue());
}
@Test
public void testUserNameIsNotSet() {
  resetDateTimeProviderRule.setDateTimeFixed("2014-12-08T19:07:06.698Z");
  System.getProperties().remove("user.name");
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setAppName("TestApplication");
  List<String> nameComponents = Splitter.on('-').splitToList(options.getJobName());
  assertEquals(4, nameComponents.size());
  assertEquals("testapplication", nameComponents.get(0));
  assertEquals("", nameComponents.get(1));
  assertEquals("1208190706", nameComponents.get(2));
  // Verify the last component is a hex integer (unsigned).
  Long.parseLong(nameComponents.get(3), 16);
  assertTrue(options.getJobName().length() <= 40);
}
/**
 * Test that in translation the name for a collection (in this case just a Create output) is
 * overridden to be what the Dataflow service expects.
 */
@Test
public void testNamesOverridden() throws Exception {
  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  options.setStreaming(false);
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply("Jazzy", Create.of(3)).setName("foobizzle");
  runner.replaceTransforms(pipeline);

  Job job = translator.translate(pipeline, runner, Collections.emptyList()).getJob();

  // The Create step
  Step step = job.getSteps().get(0);

  // This is the name that is "set by the user" that the Dataflow translator must override
  String userSpecifiedName =
      getString(
          Structs.getListOfMaps(step.getProperties(), PropertyNames.OUTPUT_INFO, null).get(0),
          PropertyNames.USER_NAME);

  // This is the calculated name that must actually be used
  String calculatedName = getString(step.getProperties(), PropertyNames.USER_NAME) + ".out0";

  assertThat(userSpecifiedName, equalTo(calculatedName));
}
@Test
public void testInvalidNumberOfWorkerHarnessThreads() throws IOException {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  FileSystems.setDefaultPipelineOptions(options);
  options.setRunner(DataflowRunner.class);
  options.setProject("foo-12345");
  options.setGcpTempLocation(VALID_TEMP_BUCKET);
  options.setGcsUtil(mockGcsUtil);

  options.as(DataflowPipelineDebugOptions.class).setNumberOfWorkerHarnessThreads(-1);

  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("Number of worker harness threads");
  thrown.expectMessage("Please make sure the value is non-negative.");

  DataflowRunner.fromOptions(options);
}
@Test
public void testGcsUploadBufferSizeIsSetForStreamingWhenDefault() throws IOException {
  DataflowPipelineOptions streamingOptions = buildPipelineOptions();
  streamingOptions.setStreaming(true);
  streamingOptions.setRunner(DataflowRunner.class);
  Pipeline p = Pipeline.create(streamingOptions);

  // Instantiation of a runner prior to run() currently has a side effect of mutating the options.
  // This could be tested by DataflowRunner.fromOptions(streamingOptions) but would not ensure
  // that the pipeline itself had the expected options set.
  p.run();

  assertEquals(
      DataflowRunner.GCS_UPLOAD_BUFFER_SIZE_BYTES_DEFAULT,
      streamingOptions.getGcsUploadBufferSizeBytes().intValue());
}