/**
 * Records the newest Kafka offsets seen in this batch.
 *
 * @param javaRDD RDD whose underlying RDD must be an instance of {@code HasOffsetRanges},
 *  such as {@code KafkaRDD}
 */
@Override
public void call(JavaRDD<T> javaRDD) {
  OffsetRange[] offsetRanges = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();
  Map<Pair<String,Integer>,Long> updatedOffsets = new HashMap<>(offsetRanges.length);
  for (OffsetRange offsetRange : offsetRanges) {
    // Key each topic/partition to the end ("until") offset of its range
    Pair<String,Integer> topicPartition =
        new Pair<>(offsetRange.topic(), offsetRange.partition());
    updatedOffsets.put(topicPartition, offsetRange.untilOffset());
  }
  log.info("Updating offsets: {}", updatedOffsets);
  KafkaUtils.setOffsets(inputTopicLockMaster, group, updatedOffsets);
}
/**
 * Trains a k-means clustering model on the parsed input and renders it as PMML.
 *
 * @param sparkContext active Spark Context
 * @param trainData training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  // First hyperparameter is the cluster count; must be at least 2 to be meaningful
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> vectors = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel model = KMeans.train(
      vectors.rdd(), numClusters, maxIterations, numberOfRuns, initializationStrategy);

  return kMeansModelToPMML(model, fetchClusterCountsFromModel(vectors, model));
}
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Actual ratings keyed by (user, product)
  JavaPairRDD<Tuple2<Integer,Integer>,Double> actualByUserProduct = testData.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  // MatrixFactorizationModel.predict wants an untyped Scala RDD of (user, product) pairs;
  // the double cast through RDD<?> is the standard workaround for the erased signature
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> userProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) actualByUserProduct.keys().rdd();

  JavaRDD<Rating> predicted = testData.wrapRDD(mfModel.predict(userProducts));
  JavaPairRDD<Tuple2<Integer,Integer>,Double> predictedByUserProduct = predicted.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  // Join predicted to actual on (user, product) and average the squared differences
  double meanSquaredError = predictedByUserProduct
      .join(actualByUserProduct)
      .values()
      .mapToDouble(pair -> {
        double delta = pair._1() - pair._2();
        return delta * delta;
      })
      .mean();
  return Math.sqrt(meanSquaredError);
}
// Fit the logistic regression model on the underlying Scala RDD of the training data.
LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
/**
 * Default implementation which randomly splits new data into train/test sets.
 * This handles the case where {@link #getTestFraction()} is not 0 or 1.
 *
 * @param newData data that has arrived in the current input batch
 * @return a {@link Pair} of train, test {@link RDD}s.
 */
protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) {
  double[] weights = { 1.0 - testFraction, testFraction };
  // randomSplit returns the RDDs in weight order: index 0 = train, index 1 = test
  RDD<M>[] splits = newData.rdd().randomSplit(weights, RandomManager.getRandom().nextLong());
  return new Pair<>(newData.wrapRDD(splits[0]), newData.wrapRDD(splits[1]));
}
/**
 * Verifies that the Scala RDD underlying a JavaRDD can be retagged with the element
 * class and collected into a correctly typed array.
 */
@Test
public void collectUnderlyingScalaRDD() {
  List<SomeCustomClass> items = new ArrayList<>();
  for (int i = 0; i != 100; ++i) {
    items.add(new SomeCustomClass());
  }
  JavaRDD<SomeCustomClass> javaRdd = sc.parallelize(items);
  // retag supplies the ClassTag so collect() can build a SomeCustomClass[]
  Object collected = javaRdd.rdd().retag(SomeCustomClass.class).collect();
  assertEquals(items.size(), ((SomeCustomClass[]) collected).length);
}
/**
 * Verifies that the Scala RDD underlying a JavaRDD can be retagged with the element
 * class and collected into a correctly typed array.
 */
@Test
public void collectUnderlyingScalaRDD() {
  int count = 100;
  List<SomeCustomClass> items = new ArrayList<>(count);
  while (items.size() < count) {
    items.add(new SomeCustomClass());
  }
  JavaRDD<SomeCustomClass> javaRdd = sc.parallelize(items);
  // retag supplies the ClassTag so collect() can build a SomeCustomClass[]
  SomeCustomClass[] collected =
      (SomeCustomClass[]) javaRdd.rdd().retag(SomeCustomClass.class).collect();
  assertEquals(items.size(), collected.length);
}
/**
 * Verifies that the Scala RDD underlying a JavaRDD can be retagged with the element
 * class and collected into a correctly typed array.
 */
@Test
public void collectUnderlyingScalaRDD() {
  List<SomeCustomClass> source = new ArrayList<>();
  for (int count = 0; count < 100; count++) {
    source.add(new SomeCustomClass());
  }
  JavaRDD<SomeCustomClass> parallelized = sc.parallelize(source);
  // retag supplies the ClassTag so collect() can build a SomeCustomClass[]
  SomeCustomClass[] gathered =
      (SomeCustomClass[]) parallelized.rdd().retag(SomeCustomClass.class).collect();
  assertEquals(source.size(), gathered.length);
}
// Cache the underlying Scala ratings RDD before training, since ALS makes
// multiple passes over it.
RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
trainingRatingDataRDD.cache();
MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
// Train a Naive Bayes model on the underlying Scala RDD; 1.0 is the
// additive (Laplace) smoothing parameter.
final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
// Train a Naive Bayes model on the underlying Scala RDD; 1.0 is the
// additive (Laplace) smoothing parameter.
final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
// Tail of a chained expression begun above this view: unwrap to the Scala RDD
// and take its mean. NOTE(review): the head of the chain is not visible here.
).rdd()).mean();
// Fit the logistic regression model on the underlying Scala RDD of the training data.
LogisticRegressionModel model = logisticRegression.run(trainingData.rdd());
// Fit the logistic regression model on the underlying Scala RDD of the training data.
LogisticRegressionModel model = learner.run(trainingData.rdd());
// Continuation of a learner call chain begun above this view: train on the
// underlying Scala RDD of the training data.
.run(training.rdd());
// Parallelize the JSON strings (presumably one JSON document per element — verify
// against jsonData's producer) and let Spark SQL infer a schema from them.
JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
// Parallelize the JSON strings (presumably one JSON document per element — verify
// against jsonData's producer) and let Spark SQL infer a schema from them.
JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
/**
 * Exercises the static NaiveBayes.train entry points — both the default-lambda
 * overload and the explicit-lambda overload — and checks that each resulting
 * model classifies all training points correctly.
 */
@Test
public void runUsingStaticMethods() {
  JavaRDD<LabeledPoint> testRDD = jsc.parallelize(POINTS, 2).cache();

  NaiveBayesModel defaultLambdaModel = NaiveBayes.train(testRDD.rdd());
  Assert.assertEquals(POINTS.size(), validatePrediction(POINTS, defaultLambdaModel));

  NaiveBayesModel halfLambdaModel = NaiveBayes.train(testRDD.rdd(), 0.5);
  Assert.assertEquals(POINTS.size(), validatePrediction(POINTS, halfLambdaModel));
}
@Test public void testPredictJavaRDD() { JavaRDD<LabeledPoint> examples = jsc.parallelize(POINTS, 2).cache(); NaiveBayesModel model = NaiveBayes.train(examples.rdd()); JavaRDD<Vector> vectors = examples.map(LabeledPoint::features); JavaRDD<Double> predictions = model.predict(vectors); // Should be able to get the first prediction. predictions.first(); }
/**
 * Exercises the static ALS.trainImplicit entry point on generated implicit-feedback
 * ratings and validates the resulting model's predictions.
 */
@Test
public void runImplicitALSUsingStaticMethods() {
  int features = 1;
  int iterations = 15;
  int users = 80;
  int products = 160;
  // Generated triple: (ratings, true preferences, confidences)
  Tuple3<List<Rating>, double[], double[]> testData =
      ALSSuite.generateRatingsAsJava(users, products, features, 0.7, true, false);

  JavaRDD<Rating> ratings = jsc.parallelize(testData._1());
  MatrixFactorizationModel model = ALS.trainImplicit(ratings.rdd(), features, iterations);
  validatePrediction(model, users, products, testData._2(), 0.4, true, testData._3());
}