/**
 * Returns the IP addresses whose request counts pass the
 * {@code IpCountGreaterThan10} predicate (by its name, counts above 10 —
 * confirm against that class's definition).
 *
 * @param ipAddressCount pairs of (IP address, occurrence count)
 * @return the qualifying IP addresses only (counts dropped)
 */
public static final JavaRDD<String> filterIPAddress(JavaPairRDD<String, Long> ipAddressCount) {
    // Filter first, then project away the counts.
    JavaPairRDD<String, Long> frequent = ipAddressCount.filter(new IpCountGreaterThan10());
    return frequent.keys();
}
public static void main(String[] args) throws Exception { if (args.length != 2) { throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile"); } String master = args[0]; String inputFile = args[1]; JavaSparkContext sc = new JavaSparkContext( master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS")); JavaRDD<String> input = sc.textFile(inputFile); PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() { @Override public Tuple2<String, String> call(String x) { return new Tuple2(x.split(" ")[0], x); } }; Function<Tuple2<String, String>, Boolean> longWordFilter = new Function<Tuple2<String, String>, Boolean>() { @Override public Boolean call(Tuple2<String, String> input) { return (input._2().length() < 20); } }; JavaPairRDD<String, String> rdd = input.mapToPair(keyData); JavaPairRDD<String, String> result = rdd.filter(longWordFilter); Map<String, String> resultMap = result.collectAsMap(); for (Entry<String, String> entry : resultMap.entrySet()) { System.out.println(entry.getKey() + ":" + entry.getValue()); } } }
// Keep only entries whose rating is strictly above the threshold. filter() is
// a lazy transformation returning a new RDD, so the reference is reassigned.
timestampRatingRDD = timestampRatingRDD.filter(timestampRating -> timestampRating._2().rating() > theThreshold);
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    Assert.assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    Assert.assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    Assert.assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    Assert.assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@SuppressWarnings("unchecked") // generic varargs to Arrays.asList
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    Assert.assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    Assert.assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@SuppressWarnings("unchecked") // generic varargs to Arrays.asList
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    assertEquals(3, firstUnmatched._1().intValue());
}
/**
 * Verifies JavaPairRDD.leftOuterJoin: every key of the left RDD appears in
 * the result (5 pairs total), and key 3 — absent from the right RDD — comes
 * back with an empty Optional.
 */
@SuppressWarnings("unchecked") // generic varargs to Arrays.asList
@Test
public void leftOuterJoin() {
    JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 1),
        new Tuple2<>(1, 2),
        new Tuple2<>(2, 1),
        new Tuple2<>(3, 1)
    ));
    JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 'x'),
        new Tuple2<>(2, 'y'),
        new Tuple2<>(2, 'z'),
        new Tuple2<>(4, 'w')
    ));
    // Compute the join once and reuse it; the original re-ran leftOuterJoin
    // (an extra shuffle) just to locate the unmatched key.
    JavaPairRDD<Integer, Tuple2<Integer, Optional<Character>>> joinedRdd =
        rdd1.leftOuterJoin(rdd2);
    List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined = joinedRdd.collect();
    assertEquals(5, joined.size());
    Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
        joinedRdd.filter(tup -> !tup._2()._2().isPresent()).first();
    assertEquals(3, firstUnmatched._1().intValue());
}
// NOTE(review): filter() is a transformation returning a NEW RDD; the result is
// discarded here, so this statement does not change `aggregated`. Capture the
// return value (e.g. assign it to a variable) if the NaN-free RDD is needed.
aggregated.filter(kv -> !Double.isNaN(kv._2()));
/** * Combines {@link Rating}s with the same user/item into one, with score as the sum of * all of the scores. */ private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) { JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples = original.mapToPair(rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())); JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated; if (implicit) { // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since // they don't guarantee the delete elements are properly handled aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN); } else { // For non-implicit, last wins. aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next); } JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN = aggregated.filter(kv -> !Double.isNaN(kv._2())); if (logStrength) { return noNaN.map(userProductScore -> new Rating( userProductScore._1()._1(), userProductScore._1()._2(), Math.log1p(userProductScore._2() / epsilon))); } else { return noNaN.map(userProductScore -> new Rating( userProductScore._1()._1(), userProductScore._1()._2(), userProductScore._2())); } }
anagrams.filter((Tuple2<String, Map<String, Integer>> entry) -> { Map<String, Integer> map = entry._2; if (map.size() > 1) {
= anagramsAsSet.filter((Tuple2<String, Set<String>> entry) -> { Set<String> set = entry._2; if (set.size() > 1) {
for (int i = 0; i < centroids.size(); i++) { final int index = i; List<Tuple2<String, Vector>> samples = data.filter(new Function<Tuple2<String, Vector>, Boolean>() { @Override public Boolean call(Tuple2<String, Vector> in) throws Exception {
double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) {
double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) {
// Fraction of (prediction, label) pairs that agree exactly (equals() on boxed
// Doubles); the cast to double forces floating-point division so accuracy is
// in [0.0, 1.0].
double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2())) .count() / (double) test.count();
extracted.filter((Tuple2<Tuple3<String, String, String>, LogStatistics> s) -> { Tuple3<String, String, String> t3 = s._1; return (t3._1() != null); // exclude Tuple3(null,null,null)
extracted.filter(new Function< Tuple2<Tuple3<String, String, String>, LogStatistics>, Boolean