/**
 * Checks that {@code foldByKey} with a zero value of 0 and an addition function
 * yields the per-key sum of the values.
 */
@Test
public void foldByKey() {
  // Values are chosen so that the sum for key k is k itself (k = 1, 2, 3).
  List<Tuple2<Integer, Integer>> keyedValues = Arrays.asList(
    new Tuple2<>(2, 1),
    new Tuple2<>(2, 1),
    new Tuple2<>(1, 1),
    new Tuple2<>(3, 2),
    new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> input = sc.parallelizePairs(keyedValues);
  JavaPairRDD<Integer, Integer> totals = input.foldByKey(0, (acc, value) -> acc + value);

  for (int key = 1; key <= 3; key++) {
    Assert.assertEquals(key, totals.lookup(key).get(0).intValue());
  }
}
/**
 * Checks that {@code foldByKey} with a zero value of 0 and an addition function
 * yields the per-key sum of the values.
 */
@Test
public void foldByKey() {
  // Values are chosen so that the sum for key k is k itself (k = 1, 2, 3).
  List<Tuple2<Integer, Integer>> keyedValues = Arrays.asList(
    new Tuple2<>(2, 1),
    new Tuple2<>(2, 1),
    new Tuple2<>(1, 1),
    new Tuple2<>(3, 2),
    new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> input = sc.parallelizePairs(keyedValues);
  JavaPairRDD<Integer, Integer> totals = input.foldByKey(0, (acc, value) -> acc + value);

  for (int key = 1; key <= 3; key++) {
    Assert.assertEquals(key, totals.lookup(key).get(0).intValue());
  }
}
/**
 * Checks that {@code foldByKey} with a zero value of 0 and an addition function
 * yields the per-key sum of the values.
 */
@Test
public void foldByKey() {
  // Values are chosen so that the sum for key k is k itself (k = 1, 2, 3).
  List<Tuple2<Integer, Integer>> keyedValues = Arrays.asList(
    new Tuple2<>(2, 1),
    new Tuple2<>(2, 1),
    new Tuple2<>(1, 1),
    new Tuple2<>(3, 2),
    new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> input = sc.parallelizePairs(keyedValues);
  JavaPairRDD<Integer, Integer> totals = input.foldByKey(0, (acc, value) -> acc + value);

  for (int key = 1; key <= 3; key++) {
    Assert.assertEquals(key, totals.lookup(key).get(0).intValue());
  }
}
/**
 * Verifies that folding by key with zero value 0 and addition produces
 * the sum of all values recorded under each key.
 */
@SuppressWarnings("unchecked")
@Test
public void foldByKey() {
  List<Tuple2<Integer, Integer>> entries = Arrays.asList(
    new Tuple2<>(2, 1),
    new Tuple2<>(2, 1),
    new Tuple2<>(1, 1),
    new Tuple2<>(3, 2),
    new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> source = sc.parallelizePairs(entries);
  JavaPairRDD<Integer, Integer> perKeyTotals =
    source.foldByKey(0, (runningTotal, addend) -> runningTotal + addend);

  // key 1 -> 1
  int totalForOne = perKeyTotals.lookup(1).get(0).intValue();
  assertEquals(1, totalForOne);

  // key 2 -> 1 + 1
  int totalForTwo = perKeyTotals.lookup(2).get(0).intValue();
  assertEquals(2, totalForTwo);

  // key 3 -> 2 + 1
  int totalForThree = perKeyTotals.lookup(3).get(0).intValue();
  assertEquals(3, totalForThree);
}
/**
 * Verifies that folding by key with zero value 0 and addition produces
 * the sum of all values recorded under each key.
 */
@SuppressWarnings("unchecked")
@Test
public void foldByKey() {
  List<Tuple2<Integer, Integer>> entries = Arrays.asList(
    new Tuple2<>(2, 1),
    new Tuple2<>(2, 1),
    new Tuple2<>(1, 1),
    new Tuple2<>(3, 2),
    new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> source = sc.parallelizePairs(entries);
  JavaPairRDD<Integer, Integer> perKeyTotals =
    source.foldByKey(0, (runningTotal, addend) -> runningTotal + addend);

  // key 1 -> 1
  int totalForOne = perKeyTotals.lookup(1).get(0).intValue();
  assertEquals(1, totalForOne);

  // key 2 -> 1 + 1
  int totalForTwo = perKeyTotals.lookup(2).get(0).intValue();
  assertEquals(2, totalForTwo);

  // key 3 -> 2 + 1
  int totalForThree = perKeyTotals.lookup(3).get(0).intValue();
  assertEquals(3, totalForThree);
}
/**
 * Verifies that folding by key with zero value 0 and addition produces
 * the sum of all values recorded under each key.
 */
@SuppressWarnings("unchecked")
@Test
public void foldByKey() {
  List<Tuple2<Integer, Integer>> entries = Arrays.asList(
    new Tuple2<>(2, 1),
    new Tuple2<>(2, 1),
    new Tuple2<>(1, 1),
    new Tuple2<>(3, 2),
    new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> source = sc.parallelizePairs(entries);
  JavaPairRDD<Integer, Integer> perKeyTotals =
    source.foldByKey(0, (runningTotal, addend) -> runningTotal + addend);

  // key 1 -> 1
  int totalForOne = perKeyTotals.lookup(1).get(0).intValue();
  assertEquals(1, totalForOne);

  // key 2 -> 1 + 1
  int totalForTwo = perKeyTotals.lookup(2).get(0).intValue();
  assertEquals(2, totalForTwo);

  // key 3 -> 2 + 1
  int totalForThree = perKeyTotals.lookup(3).get(0).intValue();
  assertEquals(3, totalForThree);
}
} else { aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
/** * Combines {@link Rating}s with the same user/item into one, with score as the sum of * all of the scores. */ private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) { JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples = original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())); JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated; if (implicit) { // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since // they don't guarantee the delete elements are properly handled aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN); } else { // For non-implicit, last wins. aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next); } JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN = aggregated.filter(kv -> !Double.isNaN(kv._2())); if (logStrength) { return noNaN.map(userProductScore -> new Rating( userProductScore._1()._1(), userProductScore._1()._2(), Math.log1p(userProductScore._2() / epsilon))); } else { return noNaN.map(userProductScore -> new Rating( userProductScore._1()._1(), userProductScore._1()._2(), userProductScore._2())); } }
// Fold the values of each key into their product, starting from 1.
pairsRDD.foldByKey(1, (product, factor) -> product * factor);
} else { aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);