/** * Default implementation which randomly splits new data into train/test sets. * This handles the case where {@link #getTestFraction()} is not 0 or 1. * * @param newData data that has arrived in the current input batch * @return a {@link Pair} of train, test {@link RDD}s. */ protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) { RDD<M>[] testTrainRDDs = newData.rdd().randomSplit( new double[]{1.0 - testFraction, testFraction}, RandomManager.getRandom().nextLong()); return new Pair<>(newData.wrapRDD(testTrainRDDs[0]), newData.wrapRDD(testTrainRDDs[1])); }
private static JavaPairRDD<Integer,Iterable<Rating>> predictAll( MatrixFactorizationModel mfModel, JavaRDD<Rating> data, JavaPairRDD<Integer,Integer> userProducts) { @SuppressWarnings("unchecked") RDD<Tuple2<Object,Object>> userProductsRDD = (RDD<Tuple2<Object,Object>>) (RDD<?>) userProducts.rdd(); return data.wrapRDD(mfModel.predict(userProductsRDD)).groupBy(Rating::user); }
/** * Computes root mean squared error of {@link Rating#rating()} versus predicted value. */ static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) { JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues = testData.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())); @SuppressWarnings("unchecked") RDD<Tuple2<Object,Object>> testUserProducts = (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd(); JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts)); double mse = predictions.mapToPair( rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()) ).join(testUserProductValues).values().mapToDouble(valuePrediction -> { double diff = valuePrediction._1() - valuePrediction._2(); return diff * diff; }).mean(); return Math.sqrt(mse); }
/** * Default implementation which randomly splits new data into train/test sets. * This handles the case where {@link #getTestFraction()} is not 0 or 1. * * @param newData data that has arrived in the current input batch * @return a {@link Pair} of train, test {@link RDD}s. */ protected Pair<JavaRDD<M>,JavaRDD<M>> splitNewDataToTrainTest(JavaRDD<M> newData) { RDD<M>[] testTrainRDDs = newData.rdd().randomSplit( new double[]{1.0 - testFraction, testFraction}, RandomManager.getRandom().nextLong()); return new Pair<>(newData.wrapRDD(testTrainRDDs[0]), newData.wrapRDD(testTrainRDDs[1])); }