@Override
public PCollection<KV<TableDestination, TableRow>> expand(
    PCollection<KV<DestinationT, TableRow>> input) {
  List<PCollectionView<?>> sideInputs = Lists.newArrayList();
  sideInputs.addAll(dynamicDestinations.getSideInputs());
  return input.apply(ParDo.of(new CreateTablesFn()).withSideInputs(sideInputs));
}
@Test
public void getMainInputSingleOutputSideInputs() {
  AppliedPTransform<PCollection<Long>, ?, ?> application =
      AppliedPTransform.of(
          "application",
          ImmutableMap.<TupleTag<?>, PValue>builder()
              .put(new TupleTag<Long>(), mainInput)
              .put(sideInput.getTagInternal(), sideInput.getPCollection())
              .build(),
          Collections.singletonMap(new TupleTag<Long>(), output),
          ParDo.of(new TestDoFn()).withSideInputs(sideInput),
          pipeline);
  PCollection<Long> input = PTransformReplacements.getSingletonMainInput(application);
  assertThat(input, equalTo(mainInput));
}
@Override
public PCollection<KV<Integer, float[]>> expand(
    final PCollection<KV<Integer, float[]>> itemMatrix) {
  // Make the item-matrix view.
  final PCollectionView<Map<Integer, float[]>> itemMatrixView =
      itemMatrix
          .apply(GroupByKey.create())
          .apply(ParDo.of(new UngroupSingleVectorList()))
          .apply(View.asMap());
  // Get the new user matrix.
  final PCollectionView<Map<Integer, float[]>> userMatrixView =
      parsedUserData
          .apply(
              ParDo.of(new CalculateNextMatrix(numFeatures, lambda, itemMatrixView))
                  .withSideInputs(itemMatrixView))
          .apply(GroupByKey.create())
          .apply(ParDo.of(new UngroupSingleVectorList()))
          .apply(View.asMap());
  // Return the new item matrix.
  return parsedItemData.apply(
      ParDo.of(new CalculateNextMatrix(numFeatures, lambda, userMatrixView))
          .withSideInputs(userMatrixView));
}
@Test
public void getMainInputNoMainInputsThrows() {
  ImmutableMap<TupleTag<?>, PValue> inputs =
      ImmutableMap.<TupleTag<?>, PValue>builder()
          .put(sideInput.getTagInternal(), sideInput.getPCollection())
          .build();
  AppliedPTransform<PCollection<Long>, ?, ?> application =
      AppliedPTransform.of(
          "application",
          inputs,
          Collections.singletonMap(new TupleTag<Long>(), output),
          ParDo.of(new TestDoFn()).withSideInputs(sideInput),
          pipeline);
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("No main input");
  PTransformReplacements.getSingletonMainInput(application);
}
@Override
public PCollection<Mutation> expand(PCollection<KV<String, String>> input) {
  return input.apply(
      ParDo.of(
              new DoFn<KV<String, String>, Mutation>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  KV<String, String> kv = c.element();
                  Ddl ddl = c.sideInput(ddlView);
                  String tableName = kv.getKey();
                  Table table = ddl.table(tableName);
                  SerializableFunction<GenericRecord, Mutation> parseFn =
                      new AvroRecordConverter(table);
                  AvroSource<Mutation> source =
                      AvroSource.from(kv.getValue())
                          .withParseFn(parseFn, SerializableCoder.of(Mutation.class));
                  try {
                    BoundedSource.BoundedReader<Mutation> reader =
                        source.createReader(c.getPipelineOptions());
                    for (boolean more = reader.start(); more; more = reader.advance()) {
                      c.output(reader.getCurrent());
                    }
                  } catch (IOException e) {
                    throw new RuntimeException(e);
                  }
                }
              })
          .withSideInputs(ddlView));
}
@Test
public void getMainInputExtraMainInputsThrows() {
  PCollection<Long> notInParDo = pipeline.apply("otherPCollection", Create.of(1L, 2L, 3L));
  ImmutableMap<TupleTag<?>, PValue> inputs =
      ImmutableMap.<TupleTag<?>, PValue>builder()
          .putAll(mainInput.expand())
          // Not represented as an input to the ParDo
          .put(new TupleTag<Long>(), notInParDo)
          .put(sideInput.getTagInternal(), sideInput.getPCollection())
          .build();
  AppliedPTransform<PCollection<Long>, ?, ?> application =
      AppliedPTransform.of(
          "application",
          inputs,
          Collections.singletonMap(new TupleTag<Long>(), output),
          ParDo.of(new TestDoFn()).withSideInputs(sideInput),
          pipeline);
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("multiple inputs");
  thrown.expectMessage("not additional inputs");
  thrown.expectMessage(mainInput.toString());
  thrown.expectMessage(notInParDo.toString());
  PTransformReplacements.getSingletonMainInput(application);
}
PCollection<Double> meanTemps = rows.apply(ParDo.of(new ExtractTempFn()));
PCollectionView<Double> globalMeanTemp =
    meanTemps.apply(Mean.globally()).apply(View.asSingleton());
PCollection<TableRow> monthFilteredRows =
    rows.apply(ParDo.of(new FilterSingleMonthDataFn(monthFilter)));
monthFilteredRows.apply(
    "ParseAndFilter",
    ParDo.of(
            new DoFn<TableRow, TableRow>() {
              @ProcessElement
              public void processElement(ProcessContext c) {
                // ... (body elided in the original search result; it reads the
                // global mean via c.sideInput(globalMeanTemp) and filters rows)
              }
            })
        .withSideInputs(globalMeanTemp));
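A hedged, self-contained sketch of the elided filter step above: it keeps rows whose temperature is below the global mean. The field name "mean_temp" and the comparison direction are assumptions; only the side-input plumbing is taken from the snippet.

// Sketch only: the field name and comparison are assumptions, not from the snippet.
PCollection<TableRow> belowMean =
    monthFilteredRows.apply(
        "FilterBelowGlobalMean",
        ParDo.of(
                new DoFn<TableRow, TableRow>() {
                  @ProcessElement
                  public void process(ProcessContext c) {
                    double mean = c.sideInput(globalMeanTemp);
                    double temp = Double.parseDouble((String) c.element().get("mean_temp"));
                    if (temp < mean) {
                      c.output(c.element());
                    }
                  }
                })
            .withSideInputs(globalMeanTemp));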
@Override
public PCollection<KV<Integer, float[]>> expand(
    final PCollection<KV<Integer, float[]>> itemMatrix) {
  // Parse the training data for users.
  final PCollection<KV<Integer, KV<int[], float[]>>> parsedUserData =
      rawData
          .apply(ParDo.of(new AlternatingLeastSquare.ParseLine(true)))
          .apply(Combine.perKey(new AlternatingLeastSquare.TrainingDataCombiner()));
  // Make the item-matrix view.
  final PCollectionView<Map<Integer, float[]>> itemMatrixView = itemMatrix.apply(View.asMap());
  // Get the new user matrix.
  final PCollectionView<Map<Integer, float[]>> userMatrixView =
      parsedUserData
          .apply(
              ParDo.of(
                      new AlternatingLeastSquare.CalculateNextMatrix(
                          numFeatures, lambda, itemMatrixView))
                  .withSideInputs(itemMatrixView))
          .apply(View.asMap());
  // Return the new item matrix.
  return parsedItemData.apply(
      ParDo.of(
              new AlternatingLeastSquare.CalculateNextMatrix(numFeatures, lambda, userMatrixView))
          .withSideInputs(userMatrixView));
}
@Override
public PCollection<KV<Integer, KV<KeyT, ValueT>>> expand(PCollection<KV<KeyT, ValueT>> input) {
  return input
      .apply(
          "AssignTask",
          ParDo.of(new AssignTaskFn<KeyT, ValueT>(configView)).withSideInputs(configView))
      .setTypeDescriptor(
          TypeDescriptors.kvs(TypeDescriptors.integers(), input.getTypeDescriptor()))
      .apply("GroupByTaskId", GroupByKey.create())
      .apply("FlattenGroupedTasks", ParDo.of(new FlattenGroupedTasks<>()));
}
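The body of AssignTaskFn is not visible in this result; judging by the types, it tags each record with an integer task id derived from the side-input config so that GroupByKey can bucket records per task. A hedged sketch of what such a DoFn could look like; the Integer-valued view and the hashing scheme are assumptions, not the real implementation:

// Hedged sketch; the real AssignTaskFn's logic is not shown in the snippet.
static class AssignTaskFn<K, V> extends DoFn<KV<K, V>, KV<Integer, KV<K, V>>> {
  private final PCollectionView<Integer> numTasksView; // assumption: config carries a task count

  AssignTaskFn(PCollectionView<Integer> numTasksView) {
    this.numTasksView = numTasksView;
  }

  @ProcessElement
  public void process(ProcessContext c) {
    int numTasks = c.sideInput(numTasksView);
    // Bucket each record by key hash so the later GroupByKey groups per task.
    int taskId = Math.floorMod(c.element().getKey().hashCode(), numTasks);
    c.output(KV.of(taskId, c.element()));
  }
}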
@Override
public PCollection<KV<String, String>> expand(PCollection<KV<String, String>> input) {
  // Reparallelize mimics the fusion-breaking behavior used in JdbcIO.
  PCollectionView<Iterable<KV<String, String>>> empty =
      input
          .apply("Consume", Filter.by(SerializableFunctions.constant(false)))
          .apply(View.asIterable());
  PCollection<KV<String, String>> materialized =
      input.apply(
          "Identity",
          ParDo.of(
                  new DoFn<KV<String, String>, KV<String, String>>() {
                    @ProcessElement
                    public void processElement(ProcessContext context) {
                      context.output(context.element());
                    }
                  })
              .withSideInputs(empty));
  return materialized.apply(Reshuffle.viaRandomKey());
}
private PCollection<Result<DestinationT>> writeShardedRecords(
    PCollection<KV<ShardedKey<DestinationT>, TableRow>> shardedRecords,
    PCollectionView<String> tempFilePrefix) {
  return shardedRecords
      .apply("GroupByDestination", GroupByKey.create())
      .apply(
          "WriteGroupedRecords",
          ParDo.of(new WriteGroupedRecordsToFiles<DestinationT>(tempFilePrefix, maxFileSize))
              .withSideInputs(tempFilePrefix))
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  // See https://issues.apache.org/jira/browse/BEAM-2803
  // We use a combined approach to "break fusion" here:
  // (see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion)
  // 1) force the data to be materialized by passing it as a side input to an identity fn,
  // then 2) reshuffle it with a random key. Initial materialization provides some parallelism
  // and ensures that data to be shuffled can be generated in parallel, while reshuffling
  // provides perfect parallelism.
  // In most cases where a "fusion break" is needed, a simple reshuffle would be sufficient.
  // The current approach is necessary only to support the particular case of JdbcIO where
  // a single query may produce many gigabytes of query results.
  PCollectionView<Iterable<T>> empty =
      input
          .apply("Consume", Filter.by(SerializableFunctions.constant(false)))
          .apply(View.asIterable());
  PCollection<T> materialized =
      input.apply(
          "Identity",
          ParDo.of(
                  new DoFn<T, T>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      c.output(c.element());
                    }
                  })
              .withSideInputs(empty));
  return materialized.apply(Reshuffle.viaRandomKey());
}
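A minimal usage sketch of this fusion-breaking transform. The enclosing class name is not visible in the search result, so Reparallelize is an assumed name for the PTransform whose expand() appears above, and a no-arg constructor is assumed:

// Hedged usage sketch; Reparallelize is an assumed name for the transform above.
PCollection<String> queryResults =
    p.apply("Source", Create.of("row-1", "row-2", "row-3")); // stand-in for a JDBC read
PCollection<String> redistributed =
    queryResults.apply("BreakFusion", new Reparallelize<String>());
// Downstream steps now run with shuffle-level parallelism instead of being
// fused onto the single bundle produced by the source.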
@Override
public PCollection<KV<DestinationT, TableRow>> expand(PCollection<T> input) {
  return input.apply(
      ParDo.of(
              new DoFn<T, KV<DestinationT, TableRow>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  // ... (body elided in the original search result; it maps each
                  // element to its destination and TableRow)
                }
              })
          .withSideInputs(dynamicDestinations.getSideInputs()));
}
@Override
public PCollection<Ddl> expand(PBegin p) {
  return p.apply("Create empty", Create.of((Void) null))
      .apply(
          "Read Information Schema",
          ParDo.of(new ReadInformationSchemaFn(spannerConfig, tx)).withSideInputs(tx));
}
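The Create.of((Void) null) call above is the usual run-exactly-once idiom: it yields a single-element PCollection so the downstream DoFn fires once per pipeline. A minimal sketch of the same idiom with an illustrative DoFn body:

// Hedged sketch of the run-once idiom; the DoFn body here is illustrative.
PCollection<String> once =
    p.apply("Impulse", Create.of((Void) null))
        .apply(
            "RunOnce",
            ParDo.of(
                new DoFn<Void, String>() {
                  @ProcessElement
                  public void process(ProcessContext c) {
                    // Executes exactly once because the input has one element.
                    c.output("initialized");
                  }
                }));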
@Override
public PCollection<OutputT> expand(PCollection<KV<K, InputT>> input) {
  DoFn<KV<K, InputT>, OutputT> fn = originalParDo.getFn();
  verifyFnIsStateful(fn);
  DataflowRunner.verifyStateSupported(fn);
  DataflowRunner.verifyStateSupportForWindowingStrategy(input.getWindowingStrategy());

  if (isFnApi) {
    return input
        .apply(GroupByKey.create())
        .apply(ParDo.of(new ExpandGbkFn<>()))
        .apply(originalParDo);
  }

  PTransform<
          PCollection<? extends KV<K, Iterable<KV<Instant, WindowedValue<KV<K, InputT>>>>>>,
          PCollection<OutputT>>
      statefulParDo =
          ParDo.of(new BatchStatefulDoFn<>(fn)).withSideInputs(originalParDo.getSideInputs());

  return input.apply(new GbkBeforeStatefulParDo<>()).apply(statefulParDo);
}
@Override
public PCollection<KV<K, V>> expand(PCollection<K> input) {
  return input
      .apply(
          ParDo.of(
                  new DoFn<K, KV<K, V>>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      c.output(KV.of(c.element(), c.sideInput(view)));
                    }
                  })
              .withSideInputs(view))
      .setCoder(KvCoder.of(input.getCoder(), coder));
}
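A minimal usage sketch of the pattern above: pair every element with a singleton side-input value. PairWithView is an assumed name for the enclosing transform, and its constructor arguments mirror the view and coder fields the expand() references:

// Hedged usage sketch; PairWithView is an assumed name for the transform above.
PCollection<String> config = p.apply("Config", Create.of("prod"));
PCollectionView<String> configView = config.apply(View.asSingleton());

PCollection<String> keys = p.apply("Keys", Create.of("a", "b", "c"));
PCollection<KV<String, String>> keyed =
    keys.apply(
        "PairWithConfig",
        new PairWithView<String, String>(configView, StringUtf8Coder.of()));
// Each output element is KV.of(key, "prod").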
private PCollectionView<String> createTempFilePrefixView(
    Pipeline p, final PCollectionView<String> jobIdView) {
  return p.apply(Create.of(""))
      .apply(
          "GetTempFilePrefix",
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void getTempFilePrefix(ProcessContext c) {
                      String tempLocationRoot;
                      if (customGcsTempLocation != null) {
                        tempLocationRoot = customGcsTempLocation.get();
                      } else {
                        tempLocationRoot = c.getPipelineOptions().getTempLocation();
                      }
                      String tempLocation =
                          resolveTempLocation(
                              tempLocationRoot, "BigQueryWriteTemp", c.sideInput(jobIdView));
                      LOG.info(
                          "Writing BigQuery temporary files to {} before loading them.",
                          tempLocation);
                      c.output(tempLocation);
                    }
                  })
              .withSideInputs(jobIdView))
      .apply("TempFilePrefixView", View.asSingleton());
}
@Override
public PCollection<OutputT> expand(PCollection<? extends InputT> input) {
  checkArgument(fn != null, ".via() is required");
  return input.apply(
      "FlatMap",
      ParDo.of(
              new DoFn<InputT, OutputT>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  // ... (body elided in the original search result; it applies fn
                  // to the element and emits each result)
                }
              })
          .withSideInputs(fn.getRequirements().getSideInputs()));
}
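The ".via() is required" check suggests this is the expand() behind Beam's FlatMapElements builder. Assuming that reading, a standard usage example, where lines is taken to be a PCollection<String>:

// Split lines into words; the lambda is the `fn` wrapped by the DoFn above.
PCollection<String> words =
    lines.apply(
        "SplitWords",
        FlatMapElements.into(TypeDescriptors.strings())
            .via((String line) -> Arrays.asList(line.split("\\s+"))));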
userScores.apply("UserSum", Sum.integersPerKey()); sumScores.apply(Values.create()).apply(Mean.<Integer>globally().asSingletonView()); ParDo .of( new DoFn<KV<String, Integer>, KV<String, Integer>>() { private final Counter numSpammerUsers = .withSideInputs(globalMeanScore)); return filtered;