/**
 * Records the newest Kafka offsets seen in this batch.
 *
 * @param javaRDD RDD whose underlying RDD must be an instance of {@code HasOffsetRanges},
 *  such as {@code KafkaRDD}
 */
@Override
public void call(JavaRDD<T> javaRDD) {
  OffsetRange[] offsetRanges = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();
  Map<Pair<String,Integer>,Long> updatedOffsets = new HashMap<>(offsetRanges.length);
  for (OffsetRange offsetRange : offsetRanges) {
    // Key each topic/partition to the end ("until") offset of its range
    Pair<String,Integer> topicPartition =
        new Pair<>(offsetRange.topic(), offsetRange.partition());
    updatedOffsets.put(topicPartition, offsetRange.untilOffset());
  }
  log.info("Updating offsets: {}", updatedOffsets);
  KafkaUtils.setOffsets(inputTopicLockMaster, group, updatedOffsets);
}
/**
 * Trains a k-means clustering model on the parsed input and renders it as PMML.
 *
 * @param sparkContext active Spark Context
 * @param trainData training data on which to build a model
 * @param hyperParameters ordered list of hyper parameter values to use in building model
 * @param candidatePath directory where additional model files can be written
 * @return a {@link PMML} representation of a model trained on the given data
 */
@Override
public PMML buildModel(JavaSparkContext sparkContext,
                       JavaRDD<String> trainData,
                       List<?> hyperParameters,
                       Path candidatePath) {
  // First hyperparameter is the cluster count; must be at least 2 to be meaningful
  int numClusters = (Integer) hyperParameters.get(0);
  Preconditions.checkArgument(numClusters > 1);
  log.info("Building KMeans Model with {} clusters", numClusters);

  JavaRDD<Vector> vectors = parsedToVectorRDD(trainData.map(MLFunctions.PARSE_FN));
  KMeansModel model = KMeans.train(
      vectors.rdd(), numClusters, maxIterations, numberOfRuns, initializationStrategy);

  return kMeansModelToPMML(model, fetchClusterCountsFromModel(vectors, model));
}
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Actual ratings keyed by (user, product)
  JavaPairRDD<Tuple2<Integer,Integer>,Double> actualByUserProduct = testData.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  // MatrixFactorizationModel.predict wants an untyped Scala RDD of (user, product) pairs;
  // the double cast through RDD<?> is the standard workaround for the erased signature
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> userProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) actualByUserProduct.keys().rdd();

  JavaRDD<Rating> predicted = testData.wrapRDD(mfModel.predict(userProducts));
  JavaPairRDD<Tuple2<Integer,Integer>,Double> predictedByUserProduct = predicted.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  // Join predicted to actual on (user, product) and average the squared differences
  double meanSquaredError = predictedByUserProduct
      .join(actualByUserProduct)
      .values()
      .mapToDouble(pair -> {
        double delta = pair._1() - pair._2();
        return delta * delta;
      })
      .mean();
  return Math.sqrt(meanSquaredError);
}
// Fit the logistic regression model on the underlying Scala RDD of the training data.
LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
/**
 * Default implementation which randomly splits new data into train/test sets.
 * This handles the case where {@link #getTestFraction()} is not 0 or 1.
 *
 * @param newData data that has arrived in the current input batch
 * @return a {@link Pair} of train, test {@link RDD}s.
 */
protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) {
  double[] weights = { 1.0 - testFraction, testFraction };
  // randomSplit returns the RDDs in weight order: index 0 = train, index 1 = test
  RDD<M>[] splits = newData.rdd().randomSplit(weights, RandomManager.getRandom().nextLong());
  return new Pair<>(newData.wrapRDD(splits[0]), newData.wrapRDD(splits[1]));
}
/**
 * Verifies that the Scala RDD underlying a JavaRDD can be retagged with the element
 * class and collected into a correctly typed array.
 */
@Test
public void collectUnderlyingScalaRDD() {
  List<SomeCustomClass> items = new ArrayList<>();
  for (int i = 0; i != 100; ++i) {
    items.add(new SomeCustomClass());
  }
  JavaRDD<SomeCustomClass> javaRdd = sc.parallelize(items);
  // retag supplies the ClassTag so collect() can build a SomeCustomClass[]
  Object collected = javaRdd.rdd().retag(SomeCustomClass.class).collect();
  assertEquals(items.size(), ((SomeCustomClass[]) collected).length);
}
/**
 * Verifies that the Scala RDD underlying a JavaRDD can be retagged with the element
 * class and collected into a correctly typed array.
 */
@Test
public void collectUnderlyingScalaRDD() {
  int count = 100;
  List<SomeCustomClass> items = new ArrayList<>(count);
  while (items.size() < count) {
    items.add(new SomeCustomClass());
  }
  JavaRDD<SomeCustomClass> javaRdd = sc.parallelize(items);
  // retag supplies the ClassTag so collect() can build a SomeCustomClass[]
  SomeCustomClass[] collected =
      (SomeCustomClass[]) javaRdd.rdd().retag(SomeCustomClass.class).collect();
  assertEquals(items.size(), collected.length);
}
/**
 * Verifies that the Scala RDD underlying a JavaRDD can be retagged with the element
 * class and collected into a correctly typed array.
 */
@Test
public void collectUnderlyingScalaRDD() {
  List<SomeCustomClass> source = new ArrayList<>();
  for (int count = 0; count < 100; count++) {
    source.add(new SomeCustomClass());
  }
  JavaRDD<SomeCustomClass> parallelized = sc.parallelize(source);
  // retag supplies the ClassTag so collect() can build a SomeCustomClass[]
  SomeCustomClass[] gathered =
      (SomeCustomClass[]) parallelized.rdd().retag(SomeCustomClass.class).collect();
  assertEquals(source.size(), gathered.length);
}
// Cache the underlying Scala ratings RDD before training, since ALS makes
// multiple passes over it.
RDD<Rating> trainingRatingDataRDD = trainRatingData.rdd();
trainingRatingDataRDD.cache();
MatrixFactorizationModel model = als.run(trainingRatingDataRDD);
// Train a Naive Bayes model on the underlying Scala RDD; 1.0 is the
// additive (Laplace) smoothing parameter.
final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
// Train a Naive Bayes model on the underlying Scala RDD; 1.0 is the
// additive (Laplace) smoothing parameter.
final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);
// Tail of a chained expression begun above this view: unwrap to the Scala RDD
// and take its mean. NOTE(review): the head of the chain is not visible here.
).rdd()).mean();
// Fit the logistic regression model on the underlying Scala RDD of the training data.
LogisticRegressionModel model = logisticRegression.run(trainingData.rdd());
// Fit the logistic regression model on the underlying Scala RDD of the training data.
LogisticRegressionModel model = learner.run(trainingData.rdd());
// Continuation of a learner call chain begun above this view: train on the
// underlying Scala RDD of the training data.
.run(training.rdd());
// Parallelize the JSON strings (presumably one JSON document per element — verify
// against jsonData's producer) and let Spark SQL infer a schema from them.
JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
// Parallelize the JSON strings (presumably one JSON document per element — verify
// against jsonData's producer) and let Spark SQL infer a schema from them.
JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
/**
 * Exercises the static NaiveBayes.train entry points — both the default-lambda
 * overload and the explicit-lambda overload — and checks that each resulting
 * model classifies all training points correctly.
 */
@Test
public void runUsingStaticMethods() {
  JavaRDD<LabeledPoint> testRDD = jsc.parallelize(POINTS, 2).cache();

  NaiveBayesModel defaultLambdaModel = NaiveBayes.train(testRDD.rdd());
  Assert.assertEquals(POINTS.size(), validatePrediction(POINTS, defaultLambdaModel));

  NaiveBayesModel halfLambdaModel = NaiveBayes.train(testRDD.rdd(), 0.5);
  Assert.assertEquals(POINTS.size(), validatePrediction(POINTS, halfLambdaModel));
}
@Test public void testPredictJavaRDD() { JavaRDD<LabeledPoint> examples = jsc.parallelize(POINTS, 2).cache(); NaiveBayesModel model = NaiveBayes.train(examples.rdd()); JavaRDD<Vector> vectors = examples.map(LabeledPoint::features); JavaRDD<Double> predictions = model.predict(vectors); // Should be able to get the first prediction. predictions.first(); }
/**
 * Exercises the static ALS.trainImplicit entry point on generated implicit-feedback
 * ratings and validates the resulting model's predictions.
 */
@Test
public void runImplicitALSUsingStaticMethods() {
  int features = 1;
  int iterations = 15;
  int users = 80;
  int products = 160;
  // Generated triple: (ratings, true preferences, confidences)
  Tuple3<List<Rating>, double[], double[]> testData =
      ALSSuite.generateRatingsAsJava(users, products, features, 0.7, true, false);

  JavaRDD<Rating> ratings = jsc.parallelize(testData._1());
  MatrixFactorizationModel model = ALS.trainImplicit(ratings.rdd(), features, iterations);
  validatePrediction(model, users, products, testData._2(), 0.4, true, testData._3());
}