/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
      original.mapToPair(rating ->
          new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
  }

  JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
      aggregated.filter(kv -> !Double.isNaN(kv._2()));

  if (logStrength) {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        Math.log1p(userProductScore._2() / epsilon)));
  } else {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        userProductScore._2()));
  }
}
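A minimal local sketch (not from the original source) illustrating the aggregation described in the Javadoc above on a tiny in-memory dataset: duplicate (user, item) pairs are collapsed into one Rating whose score is the sum. It uses reduceByKey for brevity, so it omits the NaN-based "delete" handling of MLFunctions.SUM_WITH_NAN; the class and app names are invented for the example.

// Hypothetical, self-contained driver; not part of the original code.
import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.recommendation.Rating;
import scala.Tuple2;

public final class AggregateScoresSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "AggregateScoresSketch")) {
      JavaRDD<Rating> ratings = sc.parallelize(Arrays.asList(
          new Rating(1, 10, 1.0),
          new Rating(1, 10, 2.0),   // same user/item: scores should be summed
          new Rating(2, 20, 5.0)));

      // Key by (user, item), sum the scores, and rebuild one Rating per key
      JavaRDD<Rating> aggregated = ratings
          .mapToPair(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()))
          .reduceByKey(Double::sum)
          .map(kv -> new Rating(kv._1()._1(), kv._1()._2(), kv._2()));

      aggregated.collect().forEach(r ->
          System.out.println(r.user() + "," + r.product() + "," + r.rating()));
      // Expected output (order may vary): 1,10,3.0 and 2,20,5.0
    }
  }
}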
/**
 * Spark job to check whether Spark executors can recognize Alluxio filesystem.
 *
 * @param sc current JavaSparkContext
 * @param reportWriter save user-facing messages to a generated file
 * @return Spark job result
 */
private Status runSparkJob(JavaSparkContext sc, PrintWriter reportWriter) {
  // Generate a list of integers for testing
  List<Integer> nums = IntStream.rangeClosed(1, mPartitions).boxed().collect(Collectors.toList());
  JavaRDD<Integer> dataSet = sc.parallelize(nums, mPartitions);

  // Run a Spark job to check whether Spark executors can recognize Alluxio
  JavaPairRDD<Status, String> extractedStatus = dataSet
      .mapToPair(s -> new Tuple2<>(CheckerUtils.performIntegrationChecks(),
          CheckerUtils.getLocalAddress()));

  // Merge the IP addresses that can/cannot recognize Alluxio
  JavaPairRDD<Status, String> mergeStatus = extractedStatus.reduceByKey(
      (a, b) -> a.contains(b) ? a : (b.contains(a) ? b : a + " " + b),
      (mPartitions < 10 ? 1 : mPartitions / 10));

  mSparkJobResult = mergeStatus.collect();

  Map<Status, List<String>> resultMap = new HashMap<>();
  for (Tuple2<Status, String> op : mSparkJobResult) {
    List<String> addresses = resultMap.getOrDefault(op._1, new ArrayList<>());
    addresses.add(op._2);
    resultMap.put(op._1, addresses);
  }

  return CheckerUtils.printNodesResults(resultMap, reportWriter);
}
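A standalone sketch (not Alluxio code) of the address-merging reduce used above, applied to plain strings instead of CheckerUtils results: per status key, host addresses are concatenated unless one string already contains the other, so duplicate hosts are not repeated. The class name and sample hosts are invented.

// Hypothetical illustration of the contains-based merge; not part of the original job.
import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public final class MergeAddressesSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "MergeAddressesSketch")) {
      JavaPairRDD<String, String> statusToHost = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>("SUCCESS", "host-a"),
          new Tuple2<>("SUCCESS", "host-b"),
          new Tuple2<>("SUCCESS", "host-a"),   // duplicate host, should not be repeated
          new Tuple2<>("FAILURE", "host-c")));

      // Concatenate addresses per key, skipping ones already contained in the accumulator
      JavaPairRDD<String, String> merged = statusToHost.reduceByKey(
          (a, b) -> a.contains(b) ? a : (b.contains(a) ? b : a + " " + b));

      merged.collect().forEach(kv -> System.out.println(kv._1() + " -> " + kv._2()));
      // e.g. SUCCESS -> host-a host-b, FAILURE -> host-c
    }
  }
}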
// NOTE: the enclosing method's name and its first parameter are assumed here; only the
// two Broadcast parameters and the body below appear in the original fragment.
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<? extends Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<? extends Map<String,Integer>> bItemIDToIndex) {
  JavaPairRDD<Long,Rating> timestampRatingRDD = parsedRDD.mapToPair(tokens -> {
    try {
      // Assumed input layout: tokens[0] = user ID, tokens[1] = item ID,
      // tokens[2] = rating value, tokens[3] = timestamp
      return new Tuple2<>(
          Long.valueOf(tokens[3]),
          new Rating(bUserIDToIndex.value().get(tokens[0]),
                     bItemIDToIndex.value().get(tokens[1]),
                     Double.parseDouble(tokens[2])));
    } catch (NumberFormatException e) {
      // Malformed numeric field; propagate the failure
      throw e;
    }
  });

  // Apply time decay to older ratings
  double factor = decayFactor;
  long now = System.currentTimeMillis();
  timestampRatingRDD = timestampRatingRDD.mapToPair(timestampRating -> {
    long timestamp = timestampRating._1();
    return new Tuple2<>(timestamp, decayRating(timestampRating._2(), timestamp, now, factor));
  });

  // Drop ratings whose decayed strength falls below the threshold
  // (theThreshold is assumed to be configured elsewhere; its declaration is not in the fragment)
  timestampRatingRDD = timestampRatingRDD.filter(timestampRating ->
      timestampRating._2().rating() > theThreshold);

  return timestampRatingRDD.sortByKey().values();
}
public static Map<String,Integer> countDistinctOtherWords(JavaPairRDD<String,String> data) {
  return data.values().flatMapToPair(line -> {
      Set<String> distinctTokens = new HashSet<>(Arrays.asList(line.split(" ")));
      return distinctTokens.stream().flatMap(a ->
          distinctTokens.stream().filter(b -> !a.equals(b)).map(b -> new Tuple2<>(a, b))
      ).iterator();
    }).distinct().mapValues(a -> 1).reduceByKey((c1, c2) -> c1 + c2).collectAsMap();
}
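A hypothetical usage sketch for the method above on a two-line dataset. CooccurrenceUtils is an assumed holder class for countDistinctOtherWords; the keys of the input pairs are ignored because only the values (the lines) are read.

// Hypothetical usage example; class and identifiers are invented for illustration.
import java.util.Arrays;
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public final class CooccurrenceSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "CooccurrenceSketch")) {
      // Keys ("doc1", "doc2") are ignored; only the space-separated values matter
      JavaPairRDD<String, String> data = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>("doc1", "a b c"),
          new Tuple2<>("doc2", "a b")));

      Map<String, Integer> counts = CooccurrenceUtils.countDistinctOtherWords(data);
      // "a" co-occurs with "b" and "c" -> 2; "b" with "a" and "c" -> 2; "c" with "a" and "b" -> 2
      System.out.println(counts);
    }
  }
}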
private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD,
                                                       boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap?
  return new HashMap<>(reverseIDLookup);
}
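A self-contained sketch (not from the original source) of the same ID-to-index technique used in buildIDIndexMapping: distinct string IDs are sorted for a deterministic order, zipWithIndex assigns dense integer indices, and the result is copied into a plain HashMap.

// Hypothetical, standalone illustration of the zipWithIndex mapping technique.
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public final class IdIndexSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "IdIndexSketch")) {
      JavaRDD<String> ids = sc.parallelize(Arrays.asList("u3", "u1", "u2", "u1"));

      // distinct + sort gives a stable ordering; zipWithIndex assigns 0..n-1
      Map<String, Integer> idToIndex = new HashMap<>(
          ids.distinct()
             .sortBy(s -> s, true, ids.getNumPartitions())
             .zipWithIndex()
             .mapValues(Long::intValue)
             .collectAsMap());

      System.out.println(idToIndex); // u1 -> 0, u2 -> 1, u3 -> 2
    }
  }
}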
THE_LOGGER.info("queryInputPath=" + queryInputPath); THE_LOGGER.info("savedModelPath=" + savedModelPath); JavaRDD<String> query = context.textFile(queryInputPath); final LogisticRegressionModel model = LogisticRegressionModel.load(context.sc(), savedModelPath); JavaPairRDD<String, Double> classifications = query.mapToPair( new PairFunction<String, String, Double>() { @Override THE_LOGGER.info("classification="+classification); return new Tuple2<String, Double>(patientID, classification); Iterable<Tuple2<String, Double>> predictions = classifications.collect(); for (Tuple2<String, Double> pair : predictions) { THE_LOGGER.info("query: patientID="+pair._1); THE_LOGGER.info("prediction="+pair._2); context.stop();
THE_LOGGER.info("--- queryDataPath=" + queryDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> queryRDD = context.textFile(queryDataPath); JavaRDD<Vector> query = Util.createFeatureVector(queryRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); = query.mapToPair(new PairFunction<Vector, Vector, Double>() { @Override public Tuple2<Vector, Double> call(Vector v) { Iterable<Tuple2<Vector, Double>> predictions = predictionAndLabel.collect(); for (Tuple2<Vector, Double> p : predictions) { THE_LOGGER.info("input: "+ p._1); THE_LOGGER.info("prediction: "+ p._2); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair((LabeledPoint p) -> new Tuple2<Double, Double>(model.predict(p.features()), p.label())); double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2())) .count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
public static void main(String[] args) throws Exception {
  JavaSparkContext context = new JavaSparkContext();

  THE_LOGGER.info("Number of data records " + data.count());
  THE_LOGGER.info("Done selecting initial centroids: " + centroids.size());

  // One clustering iteration: assign points to their closest centroid,
  // group them, and recompute the centroids from each group.
  JavaPairRDD<Integer, Vector> closest = getClosest(data, centroids);
  JavaPairRDD<Integer, Iterable<Vector>> pointsGroup = closest.groupByKey();
  Map<Integer, Vector> newCentroids = getNewCentroids(pointsGroup);
  // Assumed surrounding loop over the recomputed centroids (not present in the fragment):
  for (Map.Entry<Integer, Vector> t : newCentroids.entrySet()) {
    centroids.set(t.getKey(), t.getValue());
  }
  THE_LOGGER.info("Finished iteration (delta = " + tempDist + ")");

  // Print a few sample articles from each cluster
  for (int i = 0; i < centroids.size(); i++) {
    final int index = i;
    List<Tuple2<String, Vector>> samples =
        data.filter(new Function<Tuple2<String, Vector>, Boolean>() {
          @Override
          public Boolean call(Tuple2<String, Vector> in) throws Exception {
            return Util.closestPoint(in._2(), centroids) == index;
          }
        }).take(numArticles);
    // Assumed inner loop over the sampled articles (not present in the fragment):
    for (Tuple2<String, Vector> sample : samples) {
      THE_LOGGER.info(sample._1());
      THE_LOGGER.info("");
    }
  }
}
THE_LOGGER.info("--- testDataPath=" + testDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy); context.close();
@SuppressWarnings("unchecked") @Test public void mapOnPairRDD() { JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1,2,3,4)); JavaPairRDD<Integer, Integer> rdd2 = rdd1.mapToPair(i -> new Tuple2<>(i, i % 2)); JavaPairRDD<Integer, Integer> rdd3 = rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1())); assertEquals(Arrays.asList( new Tuple2<>(1, 1), new Tuple2<>(0, 2), new Tuple2<>(1, 3), new Tuple2<>(0, 4)), rdd3.collect()); }
@Test
public void groupBy() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));
  // The predicate selects even numbers, so lookup(true) returns the evens.
  Function<Integer, Boolean> isEven = x -> x % 2 == 0;
  JavaPairRDD<Boolean, Iterable<Integer>> oddsAndEvens = rdd.groupBy(isEven);
  assertEquals(2, oddsAndEvens.count());
  assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
  assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds

  oddsAndEvens = rdd.groupBy(isEven, 1);
  assertEquals(2, oddsAndEvens.count());
  assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
  assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds
}
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)
  ));
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')
  ));
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined =
      rdd1.leftOuterJoin(rdd2).collect();
  Assert.assertEquals(5, joined.size());
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
      rdd1.leftOuterJoin(rdd2).filter(tup -> !tup._2()._2().isPresent()).first();
  Assert.assertEquals(3, firstUnmatched._1().intValue());
}
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x))
      .cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
System.out.println("Properties: " + System.getProperties()); SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob"); sparkConf.setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> dataSet = jsc.textFile(JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string)).repartition(4); dataSet = dataSet.filter(clientFilter); }).persist(StorageLevel.MEMORY_AND_DISK()); List<String> clientList = pairs.keys().distinct().collect(); Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() { JavaPairRDD<String, ActionData> filtered_by_client = pairs.filter(new Function<Tuple2<String, ActionData>, Boolean>() { JavaPairRDD<String, ActionData> nonZeroUserIds = filtered_by_client.filter(new Function<Tuple2<String, ActionData>, Boolean>() { JavaPairRDD<String, Integer> userIdLookupRDD = nonZeroUserIds.mapToPair(new PairFunction<Tuple2<String, ActionData>, String, Integer>() { Map<String, Integer> userIdLookupMap = userIdLookupRDD.collectAsMap(); Map<String, Integer> userIdLookupMap_wrapped = new HashMap<String, Integer>(userIdLookupMap); final Broadcast<Map<String, Integer>> broadcastVar = jsc.broadcast(userIdLookupMap_wrapped); JavaRDD<String> json_only_with_zeros = filtered_by_client.map(new Function<Tuple2<String, ActionData>, String>() {
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("basic log query");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  // Use the log file given on the command line when present; otherwise fall back
  // to the bundled example records (the choice between the two is assumed here).
  JavaRDD<String> logs = args.length > 0
      ? sc.textFile(args[0])
      : sc.parallelize(EXAMPLE_LOGS);

  JavaPairRDD<Tuple3<String, String, String>, LogStatistics> extracted =
      logs.mapToPair((String logRecord) -> {
        String[] tokens = logRecord.split(",");
        Tuple3<String, String, String> key = Util.createKey(tokens);
        LogStatistics value = Util.createLogStatistics(tokens);
        return new Tuple2<Tuple3<String, String, String>, LogStatistics>(key, value);
      });

  JavaPairRDD<Tuple3<String, String, String>, LogStatistics> filtered =
      extracted.filter((Tuple2<Tuple3<String, String, String>, LogStatistics> s) -> {
        Tuple3<String, String, String> t3 = s._1;
        return (t3._1() != null); // exclude Tuple3(null,null,null)
      });

  JavaPairRDD<Tuple3<String, String, String>, LogStatistics> counts =
      filtered.reduceByKey((LogStatistics stats, LogStatistics stats2) -> stats.merge(stats2));

  List<Tuple2<Tuple3<String, String, String>, LogStatistics>> output = counts.collect();
  for (Tuple2<?, ?> t : output) {
    System.out.println(t._1() + "\t" + t._2());
  }
}
@SuppressWarnings("unchecked") @Test public void hadoopFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }