/**
 * Returns the IP addresses whose request counts pass the
 * {@code IpCountGreaterThan10} predicate (by its name, counts above 10 —
 * confirm against that class's definition).
 *
 * @param ipAddressCount pairs of (IP address, occurrence count)
 * @return the qualifying IP addresses only (counts dropped)
 */
public static final JavaRDD<String> filterIPAddress(JavaPairRDD<String, Long> ipAddressCount) {
    // Filter first, then project away the counts.
    JavaPairRDD<String, Long> frequent = ipAddressCount.filter(new IpCountGreaterThan10());
    return frequent.keys();
}
public static void main(String[] args) throws Exception { if (args.length != 2) { throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile"); } String master = args[0]; String inputFile = args[1]; JavaSparkContext sc = new JavaSparkContext( master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS")); JavaRDD<String> input = sc.textFile(inputFile); PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() { @Override public Tuple2<String, String> call(String x) { return new Tuple2(x.split(" ")[0], x); } }; Function<Tuple2<String, String>, Boolean> longWordFilter = new Function<Tuple2<String, String>, Boolean>() { @Override public Boolean call(Tuple2<String, String> input) { return (input._2().length() < 20); } }; JavaPairRDD<String, String> rdd = input.mapToPair(keyData); JavaPairRDD<String, String> result = rdd.filter(longWordFilter); Map<String, String> resultMap = result.collectAsMap(); for (Entry<String, String> entry : resultMap.entrySet()) { System.out.println(entry.getKey() + ":" + entry.getValue()); } } }
// Keep only entries whose rating is strictly above the threshold. filter() is
// a lazy transformation returning a new RDD, so the reference is reassigned.
timestampRatingRDD = timestampRatingRDD.filter(timestampRating -> timestampRating._2().rating() > theThreshold);
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    Assert.assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    Assert.assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    Assert.assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    Assert.assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@SuppressWarnings("unchecked") // generic varargs to Arrays.asList
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    Assert.assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    Assert.assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@SuppressWarnings("unchecked") // generic varargs to Arrays.asList
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@SuppressWarnings("unchecked") // generic varargs to Arrays.asList
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    assertEquals(3, firstUnmatched._1().intValue());
}
// NOTE(review): filter() is a transformation returning a NEW RDD; the result is
// discarded here, so this statement does not change `aggregated`. Capture the
// return value (e.g. assign it to a variable) if the NaN-free RDD is needed.
aggregated.filter(kv -> !Double.isNaN(kv._2()));
/** * Combines {@link Rating}s with the same user/item into one, with score as the sum of * all of the scores. */ private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) { JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples = original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())); JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated; if (implicit) { // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since // they don't guarantee the delete elements are properly handled aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN); } else { // For non-implicit, last wins. aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next); } JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN = aggregated.filter(kv -> !Double.isNaN(kv._2())); if (logStrength) { return noNaN.map(userProductScore -> new Rating( userProductScore._1()._1(), userProductScore._1()._2(), Math.log1p(userProductScore._2() / epsilon))); } else { return noNaN.map(userProductScore -> new Rating( userProductScore._1()._1(), userProductScore._1()._2(), userProductScore._2())); } }
anagrams.filter((Tuple2<String, Map<String, Integer>> entry) -> { Map<String, Integer> map = entry._2; if (map.size() > 1) {
= anagramsAsSet.filter((Tuple2<String, Set<String>> entry) -> { Set<String> set = entry._2; if (set.size() > 1) {
for (int i = 0; i < centroids.size(); i++) { final int index = i; List<Tuple2<String, Vector>> samples = data.filter(new Function<Tuple2<String, Vector>, Boolean>() { @Override public Boolean call(Tuple2<String, Vector> in) throws Exception {
double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) {
double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) {
// Fraction of (prediction, label) pairs that agree exactly (equals() on boxed
// Doubles); the cast to double forces floating-point division so accuracy is
// in [0.0, 1.0].
double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2())) .count() / (double) test.count();
extracted.filter((Tuple2<Tuple3<String, String, String>, LogStatistics> s) -> { Tuple3<String, String, String> t3 = s._1; return (t3._1() != null); // exclude Tuple3(null,null,null)
extracted.filter(new Function< Tuple2<Tuple3<String, String, String>, LogStatistics>, Boolean