/**
 * Builds a {@link MatrixFactorizationModel} from the feature data that a PMML document points to.
 *
 * <p>The values of the PMML extensions named {@code "X"} and {@code "Y"} are resolved as children
 * of {@code modelParentPath} and loaded with {@code readFeaturesRDD}; "X" supplies the user
 * features and "Y" the product features. The model rank is the length of the first user
 * feature vector.
 *
 * <p>NOTE(review): relies on the user feature RDD having at least one element, since the rank is
 * read from {@code first()} -- confirm callers guarantee this.
 *
 * @param sparkContext context passed to {@code readFeaturesRDD}
 * @param pmml document whose "X" and "Y" extension values name the feature locations
 * @param modelParentPath parent path against which both extension values are resolved
 * @param bUserIDToIndex broadcast user ID to index mapping, passed to {@code readAndConvertFeatureRDD}
 * @param bItemIDToIndex broadcast item ID to index mapping, passed to {@code readAndConvertFeatureRDD}
 * @return model constructed from the rank and the two converted feature RDDs
 */
private static MatrixFactorizationModel pmmlToMFModel(JavaSparkContext sparkContext,
                                                      PMML pmml,
                                                      Path modelParentPath,
                                                      Broadcast<Map<String,Integer>> bUserIDToIndex,
                                                      Broadcast<Map<String,Integer>> bItemIDToIndex) {
  // Look up both locations before touching any data, as the extension values drive everything else.
  String userFeaturesLocation = AppPMMLUtils.getExtensionValue(pmml, "X");
  String productFeaturesLocation = AppPMMLUtils.getExtensionValue(pmml, "Y");

  Path userFeaturesPath = new Path(modelParentPath, userFeaturesLocation);
  JavaPairRDD<String,float[]> userFeatures = readFeaturesRDD(sparkContext, userFeaturesPath);

  Path productFeaturesPath = new Path(modelParentPath, productFeaturesLocation);
  JavaPairRDD<String,float[]> productFeatures = readFeaturesRDD(sparkContext, productFeaturesPath);

  // The rank is not stored separately here; it is read off the first user vector.
  float[] firstUserVector = userFeatures.first()._2();
  int rank = firstUserVector.length;

  return new MatrixFactorizationModel(
      rank,
      readAndConvertFeatureRDD(userFeatures, bUserIDToIndex),
      readAndConvertFeatureRDD(productFeatures, bItemIDToIndex));
}
/**
 * Verifies that the first element of the cartesian product of a string RDD and a double RDD
 * pairs the first string with the first double.
 */
@Test
public void cartesian() {
  JavaDoubleRDD numbers = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));
  JavaRDD<String> greetings = sc.parallelize(Arrays.asList("Hello", "World"));

  // Strings are on the left side of the product, so they become the tuple's first component.
  JavaPairRDD<String, Double> product = greetings.cartesian(numbers);

  Tuple2<String, Double> expectedFirst = new Tuple2<>("Hello", 1.0);
  Tuple2<String, Double> actualFirst = product.first();
  assertEquals(expectedFirst, actualFirst);
}
@Test public void sortByKey() { List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(0, 4)); pairs.add(new Tuple2<>(3, 2)); pairs.add(new Tuple2<>(-1, 1)); JavaPairRDD<Integer, Integer> rdd = sc.parallelizePairs(pairs); // Default comparator JavaPairRDD<Integer, Integer> sortedRDD = rdd.sortByKey(); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); List<Tuple2<Integer, Integer>> sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); // Custom comparator sortedRDD = rdd.sortByKey(Collections.reverseOrder(), false); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); }
@Test public void sortByKey() { List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(0, 4)); pairs.add(new Tuple2<>(3, 2)); pairs.add(new Tuple2<>(-1, 1)); JavaPairRDD<Integer, Integer> rdd = sc.parallelizePairs(pairs); // Default comparator JavaPairRDD<Integer, Integer> sortedRDD = rdd.sortByKey(); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); List<Tuple2<Integer, Integer>> sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); // Custom comparator sortedRDD = rdd.sortByKey(Collections.reverseOrder(), false); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); }
/**
 * Verifies that the first element of the cartesian product of a string RDD and a double RDD
 * pairs the first string with the first double.
 */
@Test
public void cartesian() {
  JavaDoubleRDD numbers = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));
  JavaRDD<String> greetings = sc.parallelize(Arrays.asList("Hello", "World"));

  // Strings are on the left side of the product, so they become the tuple's first component.
  JavaPairRDD<String, Double> product = greetings.cartesian(numbers);

  Tuple2<String, Double> expectedFirst = new Tuple2<>("Hello", 1.0);
  Tuple2<String, Double> actualFirst = product.first();
  assertEquals(expectedFirst, actualFirst);
}
/**
 * Verifies that the first element of the cartesian product of a string RDD and a double RDD
 * pairs the first string with the first double.
 */
@Test
public void cartesian() {
  JavaDoubleRDD numbers = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));
  JavaRDD<String> greetings = sc.parallelize(Arrays.asList("Hello", "World"));

  // Strings are on the left side of the product, so they become the tuple's first component.
  JavaPairRDD<String, Double> product = greetings.cartesian(numbers);

  Tuple2<String, Double> expectedFirst = new Tuple2<>("Hello", 1.0);
  Tuple2<String, Double> actualFirst = product.first();
  assertEquals(expectedFirst, actualFirst);
}
@Test public void sortByKey() { List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(0, 4)); pairs.add(new Tuple2<>(3, 2)); pairs.add(new Tuple2<>(-1, 1)); JavaPairRDD<Integer, Integer> rdd = sc.parallelizePairs(pairs); // Default comparator JavaPairRDD<Integer, Integer> sortedRDD = rdd.sortByKey(); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); List<Tuple2<Integer, Integer>> sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); // Custom comparator sortedRDD = rdd.sortByKey(Collections.reverseOrder(), false); assertEquals(new Tuple2<>(-1, 1), sortedRDD.first()); sortedPairs = sortedRDD.collect(); assertEquals(new Tuple2<>(0, 4), sortedPairs.get(1)); assertEquals(new Tuple2<>(3, 2), sortedPairs.get(2)); }
/**
 * Verifies {@code leftOuterJoin}: the joined result has five rows, and the first row whose
 * right-hand value is absent belongs to key 3, which only exists in the left RDD.
 */
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> left = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)));
  JavaPairRDD<Integer, Character> right = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')));

  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined =
      left.leftOuterJoin(right).collect();
  int joinedCount = joined.size();
  Assert.assertEquals(5, joinedCount);

  // Keep only the rows for which the right side contributed no value.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> unmatched =
      left.leftOuterJoin(right).filter(row -> {
        Optional<Character> rightValue = row._2()._2();
        return !rightValue.isPresent();
      });
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched = unmatched.first();
  int unmatchedKey = firstUnmatched._1().intValue();
  Assert.assertEquals(3, unmatchedKey);
}
/**
 * Verifies {@code leftOuterJoin}: the joined result has five rows, and the first row whose
 * right-hand value is absent belongs to key 3, which only exists in the left RDD.
 */
@SuppressWarnings("unchecked")
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> left = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)));
  JavaPairRDD<Integer, Character> right = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')));

  List<Tuple2<Integer,Tuple2<Integer,Optional<Character>>>> joined =
      left.leftOuterJoin(right).collect();
  int joinedCount = joined.size();
  assertEquals(5, joinedCount);

  // Keep only the rows for which the right side contributed no value.
  JavaPairRDD<Integer,Tuple2<Integer,Optional<Character>>> unmatched =
      left.leftOuterJoin(right).filter(row -> {
        Optional<Character> rightValue = row._2()._2();
        return !rightValue.isPresent();
      });
  Tuple2<Integer,Tuple2<Integer,Optional<Character>>> firstUnmatched = unmatched.first();
  int unmatchedKey = firstUnmatched._1().intValue();
  assertEquals(3, unmatchedKey);
}
/**
 * Verifies {@code leftOuterJoin}: the joined result has five rows, and the first row whose
 * right-hand value is absent belongs to key 3, which only exists in the left RDD.
 */
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> left = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)));
  JavaPairRDD<Integer, Character> right = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')));

  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined =
      left.leftOuterJoin(right).collect();
  int joinedCount = joined.size();
  Assert.assertEquals(5, joinedCount);

  // Keep only the rows for which the right side contributed no value.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> unmatched =
      left.leftOuterJoin(right).filter(row -> {
        Optional<Character> rightValue = row._2()._2();
        return !rightValue.isPresent();
      });
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched = unmatched.first();
  int unmatchedKey = firstUnmatched._1().intValue();
  Assert.assertEquals(3, unmatchedKey);
}
/**
 * Verifies {@code leftOuterJoin}: the joined result has five rows, and the first row whose
 * right-hand value is absent belongs to key 3, which only exists in the left RDD.
 */
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> left = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)));
  JavaPairRDD<Integer, Character> right = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')));

  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined =
      left.leftOuterJoin(right).collect();
  int joinedCount = joined.size();
  Assert.assertEquals(5, joinedCount);

  // Keep only the rows for which the right side contributed no value.
  JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> unmatched =
      left.leftOuterJoin(right).filter(row -> {
        Optional<Character> rightValue = row._2()._2();
        return !rightValue.isPresent();
      });
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched = unmatched.first();
  int unmatchedKey = firstUnmatched._1().intValue();
  Assert.assertEquals(3, unmatchedKey);
}
/**
 * Verifies {@code leftOuterJoin}: the joined result has five rows, and the first row whose
 * right-hand value is absent belongs to key 3, which only exists in the left RDD.
 */
@SuppressWarnings("unchecked")
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> left = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)));
  JavaPairRDD<Integer, Character> right = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')));

  List<Tuple2<Integer,Tuple2<Integer,Optional<Character>>>> joined =
      left.leftOuterJoin(right).collect();
  int joinedCount = joined.size();
  assertEquals(5, joinedCount);

  // Keep only the rows for which the right side contributed no value.
  JavaPairRDD<Integer,Tuple2<Integer,Optional<Character>>> unmatched =
      left.leftOuterJoin(right).filter(row -> {
        Optional<Character> rightValue = row._2()._2();
        return !rightValue.isPresent();
      });
  Tuple2<Integer,Tuple2<Integer,Optional<Character>>> firstUnmatched = unmatched.first();
  int unmatchedKey = firstUnmatched._1().intValue();
  assertEquals(3, unmatchedKey);
}
/**
 * Verifies {@code leftOuterJoin}: the joined result has five rows, and the first row whose
 * right-hand value is absent belongs to key 3, which only exists in the left RDD.
 */
@SuppressWarnings("unchecked")
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> left = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)));
  JavaPairRDD<Integer, Character> right = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')));

  List<Tuple2<Integer,Tuple2<Integer,Optional<Character>>>> joined =
      left.leftOuterJoin(right).collect();
  int joinedCount = joined.size();
  assertEquals(5, joinedCount);

  // Keep only the rows for which the right side contributed no value.
  JavaPairRDD<Integer,Tuple2<Integer,Optional<Character>>> unmatched =
      left.leftOuterJoin(right).filter(row -> {
        Optional<Character> rightValue = row._2()._2();
        return !rightValue.isPresent();
      });
  Tuple2<Integer,Tuple2<Integer,Optional<Character>>> firstUnmatched = unmatched.first();
  int unmatchedKey = firstUnmatched._1().intValue();
  assertEquals(3, unmatchedKey);
}
/**
 * Verifies {@code flatMap}, {@code flatMapToPair} and {@code flatMapToDouble} by splitting two
 * sentences on single spaces. The sentences contain 2 and 9 space-separated tokens, so each
 * resulting RDD is expected to hold 11 elements, the first derived from "Hello".
 */
@Test
public void flatMap() {
  JavaRDD<String> rdd = sc.parallelize(Arrays.asList("Hello World!",
    "The quick brown fox jumps over the lazy dog."));

  JavaRDD<String> words = rdd.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
  Assert.assertEquals("Hello", words.first());
  Assert.assertEquals(11, words.count());

  // Each word is emitted as a (word, word) pair.
  JavaPairRDD<String, String> pairs = rdd.flatMapToPair(s -> {
    List<Tuple2<String, String>> pairs2 = new LinkedList<>();
    for (String word : s.split(" ")) {
      pairs2.add(new Tuple2<>(word, word));
    }
    return pairs2.iterator();
  });
  Assert.assertEquals(new Tuple2<>("Hello", "Hello"), pairs.first());
  Assert.assertEquals(11, pairs.count());

  // Each word is emitted as its length.
  JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> {
    List<Double> lengths = new LinkedList<>();
    for (String word : s.split(" ")) {
      lengths.add((double) word.length());
    }
    return lengths.iterator();
  });
  Assert.assertEquals(5.0, doubles.first(), 0.01);
  // Count the double RDD itself; the pair RDD's count was already checked above.
  Assert.assertEquals(11, doubles.count());
}
/**
 * Verifies {@code flatMap}, {@code flatMapToPair} and {@code flatMapToDouble} by splitting two
 * sentences on single spaces. The sentences contain 2 and 9 space-separated tokens, so each
 * resulting RDD is expected to hold 11 elements, the first derived from "Hello".
 */
@Test
public void flatMap() {
  JavaRDD<String> rdd = sc.parallelize(Arrays.asList("Hello World!",
    "The quick brown fox jumps over the lazy dog."));

  JavaRDD<String> words = rdd.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
  assertEquals("Hello", words.first());
  assertEquals(11, words.count());

  // Each word is emitted as a (word, word) pair.
  JavaPairRDD<String, String> pairsRDD = rdd.flatMapToPair(s -> {
    List<Tuple2<String, String>> pairs = new LinkedList<>();
    for (String word : s.split(" ")) {
      pairs.add(new Tuple2<>(word, word));
    }
    return pairs.iterator();
  });
  assertEquals(new Tuple2<>("Hello", "Hello"), pairsRDD.first());
  assertEquals(11, pairsRDD.count());

  // Each word is emitted as its length.
  JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> {
    List<Double> lengths = new LinkedList<>();
    for (String word : s.split(" ")) {
      lengths.add((double) word.length());
    }
    return lengths.iterator();
  });
  assertEquals(5.0, doubles.first(), 0.01);
  // Count the double RDD itself; the pair RDD's count was already checked above.
  assertEquals(11, doubles.count());
}
/**
 * Verifies {@code flatMap}, {@code flatMapToPair} and {@code flatMapToDouble} by splitting two
 * sentences on single spaces. The sentences contain 2 and 9 space-separated tokens, so each
 * resulting RDD is expected to hold 11 elements, the first derived from "Hello".
 */
@Test
public void flatMap() {
  JavaRDD<String> rdd = sc.parallelize(Arrays.asList("Hello World!",
    "The quick brown fox jumps over the lazy dog."));

  JavaRDD<String> words = rdd.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
  assertEquals("Hello", words.first());
  assertEquals(11, words.count());

  // Each word is emitted as a (word, word) pair.
  JavaPairRDD<String, String> pairsRDD = rdd.flatMapToPair(s -> {
    List<Tuple2<String, String>> pairs = new LinkedList<>();
    for (String word : s.split(" ")) {
      pairs.add(new Tuple2<>(word, word));
    }
    return pairs.iterator();
  });
  assertEquals(new Tuple2<>("Hello", "Hello"), pairsRDD.first());
  assertEquals(11, pairsRDD.count());

  // Each word is emitted as its length.
  JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> {
    List<Double> lengths = new LinkedList<>();
    for (String word : s.split(" ")) {
      lengths.add((double) word.length());
    }
    return lengths.iterator();
  });
  assertEquals(5.0, doubles.first(), 0.01);
  // Count the double RDD itself; the pair RDD's count was already checked above.
  assertEquals(11, doubles.count());
}
/**
 * Verifies {@code flatMap}, {@code flatMapToPair} and {@code flatMapToDouble} by splitting two
 * sentences on single spaces. The sentences contain 2 and 9 space-separated tokens, so each
 * resulting RDD is expected to hold 11 elements, the first derived from "Hello".
 */
@Test
public void flatMap() {
  JavaRDD<String> rdd = sc.parallelize(Arrays.asList("Hello World!",
    "The quick brown fox jumps over the lazy dog."));

  JavaRDD<String> words = rdd.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
  Assert.assertEquals("Hello", words.first());
  Assert.assertEquals(11, words.count());

  // Each word is emitted as a (word, word) pair.
  JavaPairRDD<String, String> pairs = rdd.flatMapToPair(s -> {
    List<Tuple2<String, String>> pairs2 = new LinkedList<>();
    for (String word : s.split(" ")) {
      pairs2.add(new Tuple2<>(word, word));
    }
    return pairs2.iterator();
  });
  Assert.assertEquals(new Tuple2<>("Hello", "Hello"), pairs.first());
  Assert.assertEquals(11, pairs.count());

  // Each word is emitted as its length.
  JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> {
    List<Double> lengths = new LinkedList<>();
    for (String word : s.split(" ")) {
      lengths.add((double) word.length());
    }
    return lengths.iterator();
  });
  Assert.assertEquals(5.0, doubles.first(), 0.01);
  // Count the double RDD itself; the pair RDD's count was already checked above.
  Assert.assertEquals(11, doubles.count());
}
/**
 * Verifies that double, pair and plain RDDs persisted with {@code StorageLevel.DISK_ONLY()}
 * still return the expected data: the doubles sum to 20, the first pair's value is "a", and the
 * first integer is 1.
 */
@SuppressWarnings("unchecked")
@Test
public void persist() {
  // Double RDD
  JavaDoubleRDD persistedDoubles = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0))
      .persist(StorageLevel.DISK_ONLY());
  double total = persistedDoubles.sum();
  assertEquals(20, total, 0.1);

  // Pair RDD
  List<Tuple2<Integer, String>> keyedStrings = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  JavaPairRDD<Integer, String> persistedPairs = sc.parallelizePairs(keyedStrings)
      .persist(StorageLevel.DISK_ONLY());
  String firstValue = persistedPairs.first()._2();
  assertEquals("a", firstValue);

  // Plain RDD
  JavaRDD<Integer> persistedInts = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5))
      .persist(StorageLevel.DISK_ONLY());
  int firstInt = persistedInts.first().intValue();
  assertEquals(1, firstInt);
}
/**
 * Verifies that double, pair and plain RDDs persisted with {@code StorageLevel.DISK_ONLY()}
 * still return the expected data: the doubles sum to 20, the first pair's value is "a", and the
 * first integer is 1.
 */
@SuppressWarnings("unchecked")
@Test
public void persist() {
  // Double RDD
  JavaDoubleRDD persistedDoubles = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0))
      .persist(StorageLevel.DISK_ONLY());
  double total = persistedDoubles.sum();
  assertEquals(20, total, 0.1);

  // Pair RDD
  List<Tuple2<Integer, String>> keyedStrings = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  JavaPairRDD<Integer, String> persistedPairs = sc.parallelizePairs(keyedStrings)
      .persist(StorageLevel.DISK_ONLY());
  String firstValue = persistedPairs.first()._2();
  assertEquals("a", firstValue);

  // Plain RDD
  JavaRDD<Integer> persistedInts = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5))
      .persist(StorageLevel.DISK_ONLY());
  int firstInt = persistedInts.first().intValue();
  assertEquals(1, firstInt);
}
/**
 * Verifies that double, pair and plain RDDs persisted with {@code StorageLevel.DISK_ONLY()}
 * still return the expected data: the doubles sum to 20, the first pair's value is "a", and the
 * first integer is 1.
 */
@SuppressWarnings("unchecked")
@Test
public void persist() {
  // Double RDD
  JavaDoubleRDD persistedDoubles = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0))
      .persist(StorageLevel.DISK_ONLY());
  double total = persistedDoubles.sum();
  assertEquals(20, total, 0.1);

  // Pair RDD
  List<Tuple2<Integer, String>> keyedStrings = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  JavaPairRDD<Integer, String> persistedPairs = sc.parallelizePairs(keyedStrings)
      .persist(StorageLevel.DISK_ONLY());
  String firstValue = persistedPairs.first()._2();
  assertEquals("a", firstValue);

  // Plain RDD
  JavaRDD<Integer> persistedInts = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5))
      .persist(StorageLevel.DISK_ONLY());
  int firstInt = persistedInts.first().intValue();
  assertEquals(1, firstInt);
}