/**
 * Combines {@link Rating}s with the same user/item into one, with score as the sum of
 * all of the scores.
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> tuples =
      original.mapToPair(rating ->
          new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));

  JavaPairRDD<Tuple2<Integer,Integer>,Double> aggregated;
  if (implicit) {
    // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
    // they don't guarantee the delete elements are properly handled
    aggregated = tuples.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
  } else {
    // For non-implicit, last wins.
    aggregated = tuples.foldByKey(Double.NaN, (current, next) -> next);
  }

  JavaPairRDD<Tuple2<Integer,Integer>,Double> noNaN =
      aggregated.filter(kv -> !Double.isNaN(kv._2()));

  if (logStrength) {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        Math.log1p(userProductScore._2() / epsilon)));
  } else {
    return noNaN.map(userProductScore -> new Rating(
        userProductScore._1()._1(),
        userProductScore._1()._2(),
        userProductScore._2()));
  }
}
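A minimal local sketch (not from the original source) illustrating the aggregation described in the Javadoc above on a tiny in-memory dataset: duplicate (user, item) pairs are collapsed into one Rating whose score is the sum. It uses reduceByKey for brevity, so it omits the NaN-based "delete" handling of MLFunctions.SUM_WITH_NAN; the class and app names are invented for the example.

// Hypothetical, self-contained driver; not part of the original code.
import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.recommendation.Rating;
import scala.Tuple2;

public final class AggregateScoresSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "AggregateScoresSketch")) {
      JavaRDD<Rating> ratings = sc.parallelize(Arrays.asList(
          new Rating(1, 10, 1.0),
          new Rating(1, 10, 2.0),   // same user/item: scores should be summed
          new Rating(2, 20, 5.0)));

      // Key by (user, item), sum the scores, and rebuild one Rating per key
      JavaRDD<Rating> aggregated = ratings
          .mapToPair(r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()))
          .reduceByKey(Double::sum)
          .map(kv -> new Rating(kv._1()._1(), kv._1()._2(), kv._2()));

      aggregated.collect().forEach(r ->
          System.out.println(r.user() + "," + r.product() + "," + r.rating()));
      // Expected output (order may vary): 1,10,3.0 and 2,20,5.0
    }
  }
}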
/**
 * Spark job to check whether Spark executors can recognize Alluxio filesystem.
 *
 * @param sc current JavaSparkContext
 * @param reportWriter save user-facing messages to a generated file
 * @return Spark job result
 */
private Status runSparkJob(JavaSparkContext sc, PrintWriter reportWriter) {
  // Generate a list of integers for testing
  List<Integer> nums = IntStream.rangeClosed(1, mPartitions).boxed().collect(Collectors.toList());
  JavaRDD<Integer> dataSet = sc.parallelize(nums, mPartitions);

  // Run a Spark job to check whether Spark executors can recognize Alluxio
  JavaPairRDD<Status, String> extractedStatus = dataSet
      .mapToPair(s -> new Tuple2<>(CheckerUtils.performIntegrationChecks(),
          CheckerUtils.getLocalAddress()));

  // Merge the IP addresses that can/cannot recognize Alluxio
  JavaPairRDD<Status, String> mergeStatus = extractedStatus.reduceByKey(
      (a, b) -> a.contains(b) ? a : (b.contains(a) ? b : a + " " + b),
      (mPartitions < 10 ? 1 : mPartitions / 10));

  mSparkJobResult = mergeStatus.collect();

  Map<Status, List<String>> resultMap = new HashMap<>();
  for (Tuple2<Status, String> op : mSparkJobResult) {
    List<String> addresses = resultMap.getOrDefault(op._1, new ArrayList<>());
    addresses.add(op._2);
    resultMap.put(op._1, addresses);
  }

  return CheckerUtils.printNodesResults(resultMap, reportWriter);
}
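A standalone sketch (not Alluxio code) of the address-merging reduce used above, applied to plain strings instead of CheckerUtils results: per status key, host addresses are concatenated unless one string already contains the other, so duplicate hosts are not repeated. The class name and sample hosts are invented.

// Hypothetical illustration of the contains-based merge; not part of the original job.
import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public final class MergeAddressesSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "MergeAddressesSketch")) {
      JavaPairRDD<String, String> statusToHost = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>("SUCCESS", "host-a"),
          new Tuple2<>("SUCCESS", "host-b"),
          new Tuple2<>("SUCCESS", "host-a"),   // duplicate host, should not be repeated
          new Tuple2<>("FAILURE", "host-c")));

      // Concatenate addresses per key, skipping ones already contained in the accumulator
      JavaPairRDD<String, String> merged = statusToHost.reduceByKey(
          (a, b) -> a.contains(b) ? a : (b.contains(a) ? b : a + " " + b));

      merged.collect().forEach(kv -> System.out.println(kv._1() + " -> " + kv._2()));
      // e.g. SUCCESS -> host-a host-b, FAILURE -> host-c
    }
  }
}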
// NOTE: the enclosing method's name and its first parameter are assumed here; only the
// two Broadcast parameters and the body below appear in the original fragment.
private JavaRDD<Rating> parsedToRatingRDD(JavaRDD<String[]> parsedRDD,
                                          Broadcast<? extends Map<String,Integer>> bUserIDToIndex,
                                          Broadcast<? extends Map<String,Integer>> bItemIDToIndex) {
  JavaPairRDD<Long,Rating> timestampRatingRDD = parsedRDD.mapToPair(tokens -> {
    try {
      // Assumed input layout: tokens[0] = user ID, tokens[1] = item ID,
      // tokens[2] = rating value, tokens[3] = timestamp
      return new Tuple2<>(
          Long.valueOf(tokens[3]),
          new Rating(bUserIDToIndex.value().get(tokens[0]),
                     bItemIDToIndex.value().get(tokens[1]),
                     Double.parseDouble(tokens[2])));
    } catch (NumberFormatException e) {
      // Malformed numeric field; propagate the failure
      throw e;
    }
  });

  // Apply time decay to older ratings
  double factor = decayFactor;
  long now = System.currentTimeMillis();
  timestampRatingRDD = timestampRatingRDD.mapToPair(timestampRating -> {
    long timestamp = timestampRating._1();
    return new Tuple2<>(timestamp, decayRating(timestampRating._2(), timestamp, now, factor));
  });

  // Drop ratings whose decayed strength falls below the threshold
  // (theThreshold is assumed to be configured elsewhere; its declaration is not in the fragment)
  timestampRatingRDD = timestampRatingRDD.filter(timestampRating ->
      timestampRating._2().rating() > theThreshold);

  return timestampRatingRDD.sortByKey().values();
}
public static Map<String,Integer> countDistinctOtherWords(JavaPairRDD<String,String> data) {
  return data.values().flatMapToPair(line -> {
      Set<String> distinctTokens = new HashSet<>(Arrays.asList(line.split(" ")));
      return distinctTokens.stream().flatMap(a ->
          distinctTokens.stream().filter(b -> !a.equals(b)).map(b -> new Tuple2<>(a, b))
      ).iterator();
    }).distinct().mapValues(a -> 1).reduceByKey((c1, c2) -> c1 + c2).collectAsMap();
}
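A hypothetical usage sketch for the method above on a two-line dataset. CooccurrenceUtils is an assumed holder class for countDistinctOtherWords; the keys of the input pairs are ignored because only the values (the lines) are read.

// Hypothetical usage example; class and identifiers are invented for illustration.
import java.util.Arrays;
import java.util.Map;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public final class CooccurrenceSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "CooccurrenceSketch")) {
      // Keys ("doc1", "doc2") are ignored; only the space-separated values matter
      JavaPairRDD<String, String> data = sc.parallelizePairs(Arrays.asList(
          new Tuple2<>("doc1", "a b c"),
          new Tuple2<>("doc2", "a b")));

      Map<String, Integer> counts = CooccurrenceUtils.countDistinctOtherWords(data);
      // "a" co-occurs with "b" and "c" -> 2; "b" with "a" and "c" -> 2; "c" with "a" and "b" -> 2
      System.out.println(counts);
    }
  }
}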
private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD,
                                                       boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap?
  return new HashMap<>(reverseIDLookup);
}
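A self-contained sketch (not from the original source) of the same ID-to-index technique used in buildIDIndexMapping: distinct string IDs are sorted for a deterministic order, zipWithIndex assigns dense integer indices, and the result is copied into a plain HashMap.

// Hypothetical, standalone illustration of the zipWithIndex mapping technique.
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public final class IdIndexSketch {
  public static void main(String[] args) {
    try (JavaSparkContext sc = new JavaSparkContext("local[*]", "IdIndexSketch")) {
      JavaRDD<String> ids = sc.parallelize(Arrays.asList("u3", "u1", "u2", "u1"));

      // distinct + sort gives a stable ordering; zipWithIndex assigns 0..n-1
      Map<String, Integer> idToIndex = new HashMap<>(
          ids.distinct()
             .sortBy(s -> s, true, ids.getNumPartitions())
             .zipWithIndex()
             .mapValues(Long::intValue)
             .collectAsMap());

      System.out.println(idToIndex); // u1 -> 0, u2 -> 1, u3 -> 2
    }
  }
}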
THE_LOGGER.info("queryInputPath=" + queryInputPath); THE_LOGGER.info("savedModelPath=" + savedModelPath); JavaRDD<String> query = context.textFile(queryInputPath); final LogisticRegressionModel model = LogisticRegressionModel.load(context.sc(), savedModelPath); JavaPairRDD<String, Double> classifications = query.mapToPair( new PairFunction<String, String, Double>() { @Override THE_LOGGER.info("classification="+classification); return new Tuple2<String, Double>(patientID, classification); Iterable<Tuple2<String, Double>> predictions = classifications.collect(); for (Tuple2<String, Double> pair : predictions) { THE_LOGGER.info("query: patientID="+pair._1); THE_LOGGER.info("prediction="+pair._2); context.stop();
THE_LOGGER.info("--- queryDataPath=" + queryDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> queryRDD = context.textFile(queryDataPath); JavaRDD<Vector> query = Util.createFeatureVector(queryRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); = query.mapToPair(new PairFunction<Vector, Vector, Double>() { @Override public Tuple2<Vector, Double> call(Vector v) { Iterable<Tuple2<Vector, Double>> predictions = predictionAndLabel.collect(); for (Tuple2<Vector, Double> p : predictions) { THE_LOGGER.info("input: "+ p._1); THE_LOGGER.info("prediction: "+ p._2); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair((LabeledPoint p) -> new Tuple2<Double, Double>(model.predict(p.features()), p.label())); double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2())) .count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
public static void main(String[] args) throws Exception {
  JavaSparkContext context = new JavaSparkContext();

  THE_LOGGER.info("Number of data records " + data.count());
  THE_LOGGER.info("Done selecting initial centroids: " + centroids.size());

  // One clustering iteration: assign points to their closest centroid,
  // group them, and recompute the centroids from each group.
  JavaPairRDD<Integer, Vector> closest = getClosest(data, centroids);
  JavaPairRDD<Integer, Iterable<Vector>> pointsGroup = closest.groupByKey();
  Map<Integer, Vector> newCentroids = getNewCentroids(pointsGroup);
  // Assumed surrounding loop over the recomputed centroids (not present in the fragment):
  for (Map.Entry<Integer, Vector> t : newCentroids.entrySet()) {
    centroids.set(t.getKey(), t.getValue());
  }
  THE_LOGGER.info("Finished iteration (delta = " + tempDist + ")");

  // Print a few sample articles from each cluster
  for (int i = 0; i < centroids.size(); i++) {
    final int index = i;
    List<Tuple2<String, Vector>> samples =
        data.filter(new Function<Tuple2<String, Vector>, Boolean>() {
          @Override
          public Boolean call(Tuple2<String, Vector> in) throws Exception {
            return Util.closestPoint(in._2(), centroids) == index;
          }
        }).take(numArticles);
    // Assumed inner loop over the sampled articles (not present in the fragment):
    for (Tuple2<String, Vector> sample : samples) {
      THE_LOGGER.info(sample._1());
      THE_LOGGER.info("");
    }
  }
}
THE_LOGGER.info("--- testDataPath=" + testDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy); context.close();
@SuppressWarnings("unchecked") @Test public void mapOnPairRDD() { JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1,2,3,4)); JavaPairRDD<Integer, Integer> rdd2 = rdd1.mapToPair(i -> new Tuple2<>(i, i % 2)); JavaPairRDD<Integer, Integer> rdd3 = rdd2.mapToPair(in -> new Tuple2<>(in._2(), in._1())); assertEquals(Arrays.asList( new Tuple2<>(1, 1), new Tuple2<>(0, 2), new Tuple2<>(1, 3), new Tuple2<>(0, 4)), rdd3.collect()); }
@Test
public void groupBy() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));
  // The predicate selects even numbers, so lookup(true) returns the evens.
  Function<Integer, Boolean> isEven = x -> x % 2 == 0;
  JavaPairRDD<Boolean, Iterable<Integer>> oddsAndEvens = rdd.groupBy(isEven);
  assertEquals(2, oddsAndEvens.count());
  assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
  assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds

  oddsAndEvens = rdd.groupBy(isEven, 1);
  assertEquals(2, oddsAndEvens.count());
  assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
  assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds
}
@Test
public void leftOuterJoin() {
  JavaPairRDD<Integer, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 1),
      new Tuple2<>(1, 2),
      new Tuple2<>(2, 1),
      new Tuple2<>(3, 1)
  ));
  JavaPairRDD<Integer, Character> rdd2 = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>(1, 'x'),
      new Tuple2<>(2, 'y'),
      new Tuple2<>(2, 'z'),
      new Tuple2<>(4, 'w')
  ));
  List<Tuple2<Integer, Tuple2<Integer, Optional<Character>>>> joined =
      rdd1.leftOuterJoin(rdd2).collect();
  Assert.assertEquals(5, joined.size());
  Tuple2<Integer, Tuple2<Integer, Optional<Character>>> firstUnmatched =
      rdd1.leftOuterJoin(rdd2).filter(tup -> !tup._2()._2().isPresent()).first();
  Assert.assertEquals(3, firstUnmatched._1().intValue());
}
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x))
      .cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
@SuppressWarnings("unchecked") @Test public void lookup() { JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList( new Tuple2<>("Apples", "Fruit"), new Tuple2<>("Oranges", "Fruit"), new Tuple2<>("Oranges", "Citrus") )); assertEquals(2, categories.lookup("Oranges").size()); assertEquals(2, Iterables.size(categories.groupByKey().lookup("Oranges").get(0))); }
System.out.println("Properties: " + System.getProperties()); SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob"); sparkConf.setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> dataSet = jsc.textFile(JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string)).repartition(4); dataSet = dataSet.filter(clientFilter); }).persist(StorageLevel.MEMORY_AND_DISK()); List<String> clientList = pairs.keys().distinct().collect(); Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() { JavaPairRDD<String, ActionData> filtered_by_client = pairs.filter(new Function<Tuple2<String, ActionData>, Boolean>() { JavaPairRDD<String, ActionData> nonZeroUserIds = filtered_by_client.filter(new Function<Tuple2<String, ActionData>, Boolean>() { JavaPairRDD<String, Integer> userIdLookupRDD = nonZeroUserIds.mapToPair(new PairFunction<Tuple2<String, ActionData>, String, Integer>() { Map<String, Integer> userIdLookupMap = userIdLookupRDD.collectAsMap(); Map<String, Integer> userIdLookupMap_wrapped = new HashMap<String, Integer>(userIdLookupMap); final Broadcast<Map<String, Integer>> broadcastVar = jsc.broadcast(userIdLookupMap_wrapped); JavaRDD<String> json_only_with_zeros = filtered_by_client.map(new Function<Tuple2<String, ActionData>, String>() {
public static void main(String[] args) {
  SparkConf sparkConf = new SparkConf().setAppName("basic log query");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  // Use the log file given on the command line when present; otherwise fall back
  // to the bundled example records (the choice between the two is assumed here).
  JavaRDD<String> logs = args.length > 0
      ? sc.textFile(args[0])
      : sc.parallelize(EXAMPLE_LOGS);

  JavaPairRDD<Tuple3<String, String, String>, LogStatistics> extracted =
      logs.mapToPair((String logRecord) -> {
        String[] tokens = logRecord.split(",");
        Tuple3<String, String, String> key = Util.createKey(tokens);
        LogStatistics value = Util.createLogStatistics(tokens);
        return new Tuple2<Tuple3<String, String, String>, LogStatistics>(key, value);
      });

  JavaPairRDD<Tuple3<String, String, String>, LogStatistics> filtered =
      extracted.filter((Tuple2<Tuple3<String, String, String>, LogStatistics> s) -> {
        Tuple3<String, String, String> t3 = s._1;
        return (t3._1() != null); // exclude Tuple3(null,null,null)
      });

  JavaPairRDD<Tuple3<String, String, String>, LogStatistics> counts =
      filtered.reduceByKey((LogStatistics stats, LogStatistics stats2) -> stats.merge(stats2));

  List<Tuple2<Tuple3<String, String, String>, LogStatistics>> output = counts.collect();
  for (Tuple2<?, ?> t : output) {
    System.out.println(t._1() + "\t" + t._2());
  }
}
@SuppressWarnings("unchecked") @Test public void hadoopFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }