@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
System.out.println("Properties: " + System.getProperties()); SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob"); sparkConf.setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> dataSet = jsc.textFile(JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string)).repartition(4); dataSet = dataSet.filter(clientFilter); JavaPairRDD<String, ActionData> pairs = dataSet.mapToPair(new PairFunction<String, String, ActionData>() { }).persist(StorageLevel.MEMORY_AND_DISK()); List<String> clientList = pairs.keys().distinct().collect(); Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() { JavaRDD<String> json_only = json_only_with_zeros.filter(new Function<String, Boolean>() { json_only.saveAsTextFile(outputPath, org.apache.hadoop.io.compress.GzipCodec.class); } else { json_only.saveAsTextFile(outputPath); long json_only_count = json_only.count(); clientDetailZeroQueue.add(new ClientDetail(currentClient, json_only_with_zeros.count() - json_only_count)); clientDetailQueue.add(new ClientDetail(currentClient, json_only_count));
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(
          new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
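// Hedged usage sketch (not from the source): `trainData`, `testData`, and the
// ALS hyperparameters below are illustrative assumptions. It trains an MLlib
// ALS model and evaluates it with the rmse() helper above.
MatrixFactorizationModel mfModel =
    ALS.train(trainData.rdd(), 10 /* rank */, 10 /* iterations */, 0.01 /* lambda */);
System.out.println("RMSE = " + rmse(mfModel, testData));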
/**
 * @param trainPointData data to cluster
 * @param model trained KMeans Model
 * @return map of ClusterId, count of points associated with the clusterId
 */
private static Map<Integer,Long> fetchClusterCountsFromModel(
    JavaRDD<? extends Vector> trainPointData, KMeansModel model) {
  return trainPointData.map(model::predict).countByValue();
}
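// Hedged usage sketch (names assumed, not from the source): cluster a
// JavaRDD<Vector> `points` with MLlib KMeans, then count membership per cluster.
KMeansModel model = KMeans.train(points.rdd(), 5 /* k */, 20 /* maxIterations */);
for (Map.Entry<Integer, Long> e : fetchClusterCountsFromModel(points, model).entrySet()) {
  System.out.println("cluster " + e.getKey() + ": " + e.getValue() + " points");
}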
private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD,
                                                       boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap?
  return new HashMap<>(reverseIDLookup);
}
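// Hedged usage sketch (input shape assumed, not from the source): each record
// is "userID,itemID,...", so token 0 is the user ID and token 1 the item ID.
JavaRDD<String[]> parsedRDD = lines.map(line -> line.split(","));
Map<String,Integer> userIDToIndex = buildIDIndexMapping(parsedRDD, true);
Map<String,Integer> itemIDToIndex = buildIDIndexMapping(parsedRDD, false);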
THE_LOGGER.info("queryInputPath=" + queryInputPath); THE_LOGGER.info("savedModelPath=" + savedModelPath); JavaRDD<String> query = context.textFile(queryInputPath); final LogisticRegressionModel model = LogisticRegressionModel.load(context.sc(), savedModelPath); JavaPairRDD<String, Double> classifications = query.mapToPair( new PairFunction<String, String, Double>() { @Override THE_LOGGER.info("classification="+classification); return new Tuple2<String, Double>(patientID, classification); Iterable<Tuple2<String, Double>> predictions = classifications.collect(); for (Tuple2<String, Double> pair : predictions) { THE_LOGGER.info("query: patientID="+pair._1); THE_LOGGER.info("prediction="+pair._2); context.stop();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- queryDataPath=" + queryDataPath); JavaRDD<String> queryRDD = context.textFile(queryDataPath); JavaRDD<Vector> query = Util.createFeatureVector(queryRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); = query.mapToPair((Vector v) -> { return new Tuple2<Vector, Double>(v, prediction); }); Iterable<Tuple2<Vector, Double>> predictions = predictionAndLabel.collect(); for (Tuple2<Vector, Double> p : predictions) { THE_LOGGER.info("input: " + p._1); THE_LOGGER.info("prediction: " + p._2); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair((LabeledPoint p) -> new Tuple2<Double, Double>(model.predict(p.features()), p.label())); double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2())) .count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
THE_LOGGER.info("--- trainingPath=" + trainingPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> trainingRDD = context.textFile(trainingPath); JavaRDD<LabeledPoint> training = Util.createLabeledPointRDD(trainingRDD); final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0); model.save(context.sc(), savedModelPath); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
THE_LOGGER.info("--- testDataPath=" + testDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy); context.close();
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("basic log query"); JavaSparkContext sc = new JavaSparkContext(sparkConf); logs = sc.textFile(args[0]); logs = sc.parallelize(EXAMPLE_LOGS); logs.mapToPair(new PairFunction<String, Tuple3<String, String, String>, LogStatistics>() { @Override public Tuple2<Tuple3<String, String, String>, LogStatistics> call(String logRecord) { extracted.filter(new Function< Tuple2<Tuple3<String, String, String>, LogStatistics>, Boolean filtered.reduceByKey(new Function2<LogStatistics, LogStatistics, LogStatistics>() { @Override public LogStatistics call(LogStatistics stats, LogStatistics stats2) { List<Tuple2<Tuple3<String, String, String>, LogStatistics>> output = counts.collect(); for (Tuple2<?,?> t : output) { System.out.println(t._1() + "\t" + t._2());
@Test
public void flatMap() {
  JavaRDD<String> rdd = sc.parallelize(Arrays.asList("Hello World!",
      "The quick brown fox jumps over the lazy dog."));
  JavaRDD<String> words = rdd.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
  Assert.assertEquals("Hello", words.first());
  Assert.assertEquals(11, words.count());

  JavaPairRDD<String, String> pairs = rdd.flatMapToPair(s -> {
    List<Tuple2<String, String>> pairs2 = new LinkedList<>();
    for (String word : s.split(" ")) {
      pairs2.add(new Tuple2<>(word, word));
    }
    return pairs2.iterator();
  });
  Assert.assertEquals(new Tuple2<>("Hello", "Hello"), pairs.first());
  Assert.assertEquals(11, pairs.count());

  JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> {
    List<Double> lengths = new LinkedList<>();
    for (String word : s.split(" ")) {
      lengths.add((double) word.length());
    }
    return lengths.iterator();
  });
  Assert.assertEquals(5.0, doubles.first(), 0.01);
  Assert.assertEquals(11, doubles.count());
}
final Accumulator<Integer> count = sc.accumulator(0);
rdd.foreach(new VoidFunction<String>() {
  public void call(String line) {
    if (line.contains("KK6JKQ")) {
      count.add(1);
    }
  }
});

// blankLines is an Accumulator<Integer> whose declaration is elided in the source.
JavaRDD<String> callSigns = rdd.flatMap(new FlatMapFunction<String, String>() {
  public Iterable<String> call(String line) {
    if (line.equals("")) {
      blankLines.add(1);
    }
    return Arrays.asList(line.split(" "));
  }
});
callSigns.saveAsTextFile(outputDir + "/callsigns");
System.out.println("Blank lines: " + blankLines.value());

JavaRDD<String> validCallSigns = callSigns.filter(new Function<String, Boolean>() {
  public Boolean call(String callSign) {
    Pattern p = Pattern.compile("\\A\\d?\\p{Alpha}{1,2}\\d{1,4}\\p{Alpha}{1,3}\\Z");
    return p.matcher(callSign).matches();
  }
});

JavaPairRDD<String, Integer> contactCounts = validCallSigns.mapToPair(
    new PairFunction<String, String, Integer>() {
      public Tuple2<String, Integer> call(String callSign) {
        return new Tuple2<>(callSign, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      public Integer call(Integer x, Integer y) {
        return x + y;
      }
    });

JavaPairRDD<String, CallLog[]> contactsContactLists = validCallSigns.mapPartitionsToPair(
    new PairFlatMapFunction<Iterator<String>, String, CallLog[]>() {
      public Iterable<Tuple2<String, CallLog[]>> call(Iterator<String> input) {
        // Body elided in the source: performs one external lookup per partition
        // and emits a (callSign, CallLog[]) pair for each call sign.
      }
    });
SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); JavaSparkContext sc = new JavaSparkContext(conf); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath)); SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf)); totalCount = encodedBaseRDD.count(); allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<String> stringData = sc.textFile(filename);
JavaRDD<List<Writable>> parsedInputData =
    stringData.filter((x) -> !x.isEmpty()).map(new StringToWritablesFunction(rr));

List<String> inputDataCollected = stringData.collect();
System.out.println("\n\n---- Original Data ----");
for (String s : inputDataCollected) System.out.println("'" + s + "'");

JavaRDD<String> processedAsString = parsedInputData.map(new WritablesToStringFunction(","));
List<String> inputDataParsed = processedAsString.collect();
System.out.println("\n\n---- Parsed Data ----");
for (String s : inputDataParsed) System.out.println("'" + s + "'");

// The transform step that produces processedData from parsedInputData is
// elided in the source.
processedAsString = processedData.map(new WritablesToStringFunction(","));
inputDataParsed = processedAsString.collect();
System.out.println("\n\n---- Parsed and filtered data ----");
for (String s : inputDataParsed) System.out.println(s);
Class[] kryoClassArray = new Class[] {
    Class.forName("scala.collection.mutable.WrappedArray$ofRef")
};
SparkConf conf = new SparkConf()
    .setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

try (JavaSparkContext sc = new JavaSparkContext(conf)) {
  KylinSparkJobListener jobListener = new KylinSparkJobListener();
  sc.sc().addSparkListener(jobListener);
  HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath));
  final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());

  JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1);
  JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(
      new MergeDictAndStatsFunction(cubeName, metaUrl, segmentId,
          StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));
  colToDictPathRDD.coalesce(1, false).saveAsNewAPIHadoopFile(
      dictOutputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample"); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaRDD<String> spam = sc.textFile("files/spam.txt"); JavaRDD<String> ham = sc.textFile("files/ham.txt"); JavaRDD<LabeledPoint> positiveExamples = spam.map(new Function<String, LabeledPoint>() { @Override public LabeledPoint call(String email) { return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" ")))); JavaRDD<LabeledPoint> negativeExamples = ham.map(new Function<String, LabeledPoint>() { @Override public LabeledPoint call(String email) { return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" ")))); JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples); trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm. LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
@Test
public void collectAsMapWithIntArrayValues() {
  // Regression test for SPARK-1040
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1));
  JavaPairRDD<Integer, int[]> pairRDD = rdd.mapToPair(x -> new Tuple2<>(x, new int[]{x}));
  pairRDD.collect();       // Works fine
  pairRDD.collectAsMap();  // Used to crash with ClassCastException
}