private Pair<JavaRDD<M>,JavaRDD<M>> splitTrainTest(JavaRDD<M> newData, JavaRDD<M> pastData) {
  Objects.requireNonNull(newData);
  if (testFraction <= 0.0) {
    // No test split requested: all data, new and past, is training data
    return new Pair<>(pastData == null ? newData : newData.union(pastData), null);
  }
  if (testFraction >= 1.0) {
    // All new data is test data; only past data (if any) trains
    return new Pair<>(pastData, newData);
  }
  if (empty(newData)) {
    return new Pair<>(pastData, null);
  }
  Pair<JavaRDD<M>,JavaRDD<M>> newTrainTest = splitNewDataToTrainTest(newData);
  JavaRDD<M> newTrainData = newTrainTest.getFirst();
  return new Pair<>(pastData == null ? newTrainData : newTrainData.union(pastData),
                    newTrainTest.getSecond());
}
KMeansPMMLUtils.validatePMMLVsSchema(model, inputSchema);
JavaRDD<Vector> evalData =
    parsedToVectorRDD(trainData.union(testData).map(MLFunctions.PARSE_FN));
List<ClusterInfo> clusterInfoList = KMeansPMMLUtils.read(model);
JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples);
@Override
public void publishAdditionalModelData(JavaSparkContext sparkContext,
                                       PMML pmml,
                                       JavaRDD<String> newData,
                                       JavaRDD<String> pastData,
                                       Path modelParentPath,
                                       TopicProducer<String,String> modelUpdateTopic) {
  // Send item updates first, before users. That way, user-based endpoints like
  // /recommend may return 404 for longer, but once they do respond, the results
  // will be more complete.
  log.info("Sending item / Y data as model updates");
  String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<String,float[]> productRDD =
      readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));
  String updateBroker = modelUpdateTopic.getUpdateBroker();
  String topic = modelUpdateTopic.getTopic();
  // For now, there is no use in sending known users for each item
  productRDD.foreachPartition(new EnqueueFeatureVecsFn("Y", updateBroker, topic));

  log.info("Sending user / X data as model updates");
  String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
  JavaPairRDD<String,float[]> userRDD =
      readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
  if (noKnownItems) {
    userRDD.foreachPartition(new EnqueueFeatureVecsFn("X", updateBroker, topic));
  } else {
    log.info("Sending known item data with model updates");
    JavaRDD<String[]> allData =
        (pastData == null ? newData : newData.union(pastData)).map(MLFunctions.PARSE_FN);
    JavaPairRDD<String,Collection<String>> knownItems = knownsRDD(allData, true);
    userRDD.join(knownItems).foreachPartition(
        new EnqueueFeatureVecsAndKnownItemsFn("X", updateBroker, topic));
  }
}
JavaRDD<LabeledPoint> trainingData = spamTrainingData.union(nonSpamTrainingData);
@Override @SuppressWarnings("unchecked") public JavaRDD<?> translate(Union operator, SparkExecutorContext context) { final List<JavaRDD<?>> inputs = context.getInputs(operator); if (inputs.size() < 2) { throw new IllegalStateException("Union operator needs at least 2 inputs"); } return inputs .stream() .reduce( (l, r) -> ((JavaRDD<Object>) l) .union((JavaRDD<Object>) r) .setName(operator.getName())) .orElseThrow(() -> new IllegalArgumentException("Unable to reduce inputs.")); } }
@SuppressWarnings("unchecked") @Override public SparkCollection<T> union(SparkCollection<T> other) { return wrap(rdd.union((JavaRDD<T>) other.getUnderlying())); }
@Override
public JavaRDD<T> toRDD(JavaSparkContext sc) {
  return this.one.toRDD(sc).union(this.two.toRDD(sc));
}
@Override
public SparkStream<T> union(@NonNull MStream<T> other) {
  if (other.isReusable() && other.isEmpty()) {
    return this;
  } else if (isReusable() && this.isEmpty()) {
    return new SparkStream<>(other);
  } else if (other instanceof SparkStream) {
    return new SparkStream<>(rdd.union(Cast.<SparkStream<T>>as(other).rdd));
  }
  SparkStream<T> stream = new SparkStream<>(other);
  return new SparkStream<>(rdd.union(stream.rdd));
}
PairFunction<Integer, Integer, Integer> mapToTuple =
    (PairFunction<Integer, Integer, Integer>) i -> new Tuple2<>(i, i);
return rdd1.union(rdd2).mapToPair(mapToTuple).join(prdd3);
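A self-contained version of the union/mapToPair/join fragment above; the inputs are made up here to show the join semantics, and sc is again an assumed JavaSparkContext:

JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2));
JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(3, 4));
JavaPairRDD<Integer, String> prdd3 = sc.parallelizePairs(
    Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(3, "b")));
JavaPairRDD<Integer, Tuple2<Integer, String>> joined = rdd1.union(rdd2)
    .mapToPair((PairFunction<Integer, Integer, Integer>) i -> new Tuple2<>(i, i))
    .join(prdd3);
// joined -> (1,(1,"a")) and (3,(3,"b")); keys missing from either side drop out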
PairFunction<Integer, Integer, Integer> mapToTuple =
    (Integer i) -> new Tuple2<>(i, i);
  return rdd1.union(rdd2).mapToPair(mapToTuple).join(prdd3);
});
JavaTestUtils.attachTestOutputStream(transformed2);
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
    ChannelInstance[] inputs,
    ChannelInstance[] outputs,
    SparkExecutor sparkExecutor,
    OptimizationContext.OperatorContext operatorContext) {
  assert inputs.length == this.getNumInputs();
  assert outputs.length == this.getNumOutputs();

  RddChannel.Instance input0 = (RddChannel.Instance) inputs[0];
  RddChannel.Instance input1 = (RddChannel.Instance) inputs[1];
  RddChannel.Instance output = (RddChannel.Instance) outputs[0];

  final JavaRDD<Type> inputRdd0 = input0.provideRdd();
  final JavaRDD<Type> inputRdd1 = input1.provideRdd();
  // union is lazy: this only extends the RDD lineage, nothing executes yet
  final JavaRDD<Type> outputRdd = inputRdd0.union(inputRdd1);
  this.name(outputRdd);
  output.accept(outputRdd, sparkExecutor);

  return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
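A side note on the behavior the operator above relies on, shown with made-up data (sc is an assumed JavaSparkContext): union is a narrow transformation in Spark, so the result simply carries the partitions of both inputs, and no work runs until an action:

JavaRDD<String> a = sc.parallelize(Arrays.asList("x", "y"), 2);
JavaRDD<String> b = sc.parallelize(Arrays.asList("z"), 3);
JavaRDD<String> u = a.union(b);  // no shuffle and no job yet
// u.getNumPartitions() == 5; a job runs only on an action such as u.count()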