System.out.println("Properties: " + System.getProperties()); SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob"); sparkConf.setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); Configuration hadoopConf = jsc.hadoopConfiguration(); hadoopConf.set("fs.s3.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem"); if (cmdLineArgs.aws_access_key_id != null && !"".equals(cmdLineArgs.aws_access_key_id)) JavaRDD<String> dataSet = jsc.textFile(JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string)).repartition(4); dataSet = dataSet.filter(clientFilter); JavaPairRDD<String, ActionData> pairs = dataSet.mapToPair(new PairFunction<String, String, ActionData>() { }).persist(StorageLevel.MEMORY_AND_DISK()); Map<String, Integer> userIdLookupMap = userIdLookupRDD.collectAsMap(); Map<String, Integer> userIdLookupMap_wrapped = new HashMap<String, Integer>(userIdLookupMap); final Broadcast<Map<String, Integer>> broadcastVar = jsc.broadcast(userIdLookupMap_wrapped); JavaRDD<String> json_only_with_zeros = filtered_by_client.map(new Function<Tuple2<String, ActionData>, String>() { jsc.stop(); System.out.println(String.format("--- finished GroupActionsJob date[%s] unixDays[%s] ---", cmdLineArgs.input_date_string, unixDays));
Class.forName("scala.collection.mutable.WrappedArray$ofRef") }; SparkConf conf = new SparkConf().setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); try (JavaSparkContext sc = new JavaSparkContext(conf)) { KylinSparkJobListener jobListener = new KylinSparkJobListener(); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath)); final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); final KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl); JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1); JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(new MergeDictAndStatsFunction(cubeName, metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf)); colToDictPathRDD.coalesce(1, false).saveAsNewAPIHadoopFile(dictOutputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
public void run(String master, String csv1, String csv2) throws Exception {
    JavaSparkContext sc = new JavaSparkContext(
        master, "basicjoincsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));

    JavaRDD<String> csvFile1 = sc.textFile(csv1);
    JavaRDD<String> csvFile2 = sc.textFile(csv2);
    JavaPairRDD<Integer, String[]> keyedRDD1 = csvFile1.mapToPair(new ParseLine());
    // bug fix: the second keyed RDD must come from csvFile2, not csvFile1
    JavaPairRDD<Integer, String[]> keyedRDD2 = csvFile2.mapToPair(new ParseLine());

    JavaPairRDD<Integer, Tuple2<String[], String[]>> result = keyedRDD1.join(keyedRDD2);
    List<Tuple2<Integer, Tuple2<String[], String[]>>> resultCollection = result.collect();
  }
}
public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
      master = args[0];
    } else {
      master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(
        master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    // fold() needs a zero value for the operation (0 for addition)
    Integer result = rdd.fold(0, new Function2<Integer, Integer, Integer>() {
      public Integer call(Integer x, Integer y) { return x + y; }
    });
    System.out.println(result);
  }
}
public static void main(String[] args) throws Exception {
    String master = args[0];
    JavaSparkContext sc = new JavaSparkContext(
        master, "wordcount", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> rdd = sc.textFile(args[1]);
    JavaPairRDD<String, Integer> counts = rdd
        .flatMap(new FlatMapFunction<String, String>() {
          public Iterable<String> call(String x) { return Arrays.asList(x.split(" ")); }
        })
        .mapToPair(new PairFunction<String, String, Integer>() {
          public Tuple2<String, Integer> call(String x) { return new Tuple2<String, Integer>(x, 1); }
        })
        .reduceByKey(new Function2<Integer, Integer, Integer>() {
          public Integer call(Integer x, Integer y) { return x + y; }
        });
    counts.saveAsTextFile(args[2]);
  }
}
THE_LOGGER.info("queryInputPath=" + queryInputPath); THE_LOGGER.info("savedModelPath=" + savedModelPath); JavaRDD<String> query = context.textFile(queryInputPath); final LogisticRegressionModel model = LogisticRegressionModel.load(context.sc(), savedModelPath); JavaPairRDD<String, Double> classifications = query.mapToPair( new PairFunction<String, String, Double>() { @Override THE_LOGGER.info("classification="+classification); return new Tuple2<String, Double>(patientID, classification); Iterable<Tuple2<String, Double>> predictions = classifications.collect(); for (Tuple2<String, Double> pair : predictions) { THE_LOGGER.info("query: patientID="+pair._1); THE_LOGGER.info("prediction="+pair._2); context.stop();
THE_LOGGER.info("--- queryDataPath=" + queryDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> queryRDD = context.textFile(queryDataPath); JavaRDD<Vector> query = Util.createFeatureVector(queryRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); = query.mapToPair(new PairFunction<Vector, Vector, Double>() { @Override public Tuple2<Vector, Double> call(Vector v) { Iterable<Tuple2<Vector, Double>> predictions = predictionAndLabel.collect(); for (Tuple2<Vector, Double> p : predictions) { THE_LOGGER.info("input: "+ p._1); THE_LOGGER.info("prediction: "+ p._2); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair((LabeledPoint p) -> new Tuple2<Double, Double>(model.predict(p.features()), p.label())); double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2())) .count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy); context.close();
THE_LOGGER.info("--- trainingPath=" + trainingPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> trainingRDD = context.textFile(trainingPath); JavaRDD<LabeledPoint> training = Util.createLabeledPointRDD(trainingRDD); final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0); model.save(context.sc(), savedModelPath); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy); context.close();
public static void main(String[] args) throws Exception {
    JavaSparkContext context = new JavaSparkContext();
    // loading of `data` (a JavaPairRDD<String, Vector>) and selection of the initial
    // `centroids` list were elided in the source
    THE_LOGGER.info("Number of data records " + data.count());
    THE_LOGGER.info("Done selecting initial centroids: " + centroids.size());

    // one k-means iteration (the enclosing convergence loop was elided in the source):
    // assign each point to its closest centroid, group by centroid, recompute centroids
    JavaPairRDD<Integer, Vector> closest = getClosest(data, centroids);
    JavaPairRDD<Integer, Iterable<Vector>> pointsGroup = closest.groupByKey();
    Map<Integer, Vector> newCentroids = getNewCentroids(pointsGroup);
    // inside an elided loop over newCentroids entries `t`:
    centroids.set(t.getKey(), t.getValue());
    THE_LOGGER.info("Finished iteration (delta = " + tempDist + ")");

    // sample a few points from each cluster
    for (int i = 0; i < centroids.size(); i++) {
      final int index = i;
      // note: filter() returns a JavaRDD; the take()/collect() call that produced this
      // List, and the loop over `sample`, were elided in the source
      List<Tuple2<String, Vector>> samples = data.filter(new Function<Tuple2<String, Vector>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, Vector> in) throws Exception {
          // predicate elided: keep points assigned to cluster `index`
        }
      });
      THE_LOGGER.info(sample._1());
    }
    THE_LOGGER.info("");
    context.stop();
    System.exit(0);
}
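// A minimal self-contained sketch of the assign/recompute step the fragment above
// performs, using plain 1-D "vectors" for brevity. All names and data here are
// illustrative, not from the original job.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class KMeansStepSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(
        new SparkConf().setAppName("KMeansStepSketch").setMaster("local"));
    JavaRDD<Double> data = sc.parallelize(Arrays.asList(1.0, 2.0, 9.0, 10.0, 11.0));
    final List<Double> centroids = new ArrayList<>(Arrays.asList(0.0, 5.0));

    for (int iter = 0; iter < 5; iter++) {
      // assignment step: key each point by the index of its closest centroid
      JavaPairRDD<Integer, Double> closest = data.mapToPair(x -> {
        int best = 0;
        for (int i = 1; i < centroids.size(); i++) {
          if (Math.abs(x - centroids.get(i)) < Math.abs(x - centroids.get(best))) {
            best = i;
          }
        }
        return new Tuple2<>(best, x);
      });
      // update step: each new centroid is the mean of its assigned points
      Map<Integer, Tuple2<Double, Integer>> sums = closest
          .mapValues(x -> new Tuple2<>(x, 1))
          .reduceByKey((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()))
          .collectAsMap();
      for (Map.Entry<Integer, Tuple2<Double, Integer>> e : sums.entrySet()) {
        centroids.set(e.getKey(), e.getValue()._1() / e.getValue()._2());
      }
    }
    System.out.println("centroids = " + centroids);
    sc.stop();
  }
}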
THE_LOGGER.info("--- testDataPath=" + testDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy); context.close();
@Test
public void groupByOnPairRDD() {
    // Regression test for SPARK-4459
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));
    // renamed from the misleading `areOdd`: the predicate is true when both halves are
    // even, which is what the "Evens"/"Odds" assertions below expect
    Function<Tuple2<Integer, Integer>, Boolean> areEven =
        x -> (x._1() % 2 == 0) && (x._2() % 2 == 0);
    JavaPairRDD<Integer, Integer> pairRDD = rdd.zip(rdd);
    JavaPairRDD<Boolean, Iterable<Tuple2<Integer, Integer>>> oddsAndEvens = pairRDD.groupBy(areEven);
    assertEquals(2, oddsAndEvens.count());
    assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
    assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds

    oddsAndEvens = pairRDD.groupBy(areEven, 1);
    assertEquals(2, oddsAndEvens.count());
    assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
    assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds
}
@Test
public void groupBy() {
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 1, 2, 3, 5, 8, 13));
    // renamed from the misleading `isOdd`: the predicate is true for even numbers,
    // which is what the "Evens"/"Odds" assertions below expect
    Function<Integer, Boolean> isEven = x -> x % 2 == 0;
    JavaPairRDD<Boolean, Iterable<Integer>> oddsAndEvens = rdd.groupBy(isEven);
    assertEquals(2, oddsAndEvens.count());
    assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
    assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds

    oddsAndEvens = rdd.groupBy(isEven, 1);
    assertEquals(2, oddsAndEvens.count());
    assertEquals(2, Iterables.size(oddsAndEvens.lookup(true).get(0)));   // Evens
    assertEquals(5, Iterables.size(oddsAndEvens.lookup(false).get(0)));  // Odds
}
@Test
public void map() {
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
    doubles.collect();
    JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
    pairs.collect();
    JavaRDD<String> strings = rdd.map(Object::toString).cache();
    strings.collect();
}
String outputDir = args[3];
JavaSparkContext sc = new JavaSparkContext(
    sparkMaster, "ChapterSixExample", System.getenv("SPARK_HOME"), System.getenv("JARS"));
JavaRDD<String> rdd = sc.textFile(inputFile);

// accumulator counting lines that mention a specific call sign
final Accumulator<Integer> count = sc.accumulator(0);
rdd.foreach(new VoidFunction<String>() {
  public void call(String line) {
    if (line.contains("KK6JKQ")) {
      count.add(1);
    }
  }
});

// accumulator counting blank lines while splitting lines into call signs
final Accumulator<Integer> blankLines = sc.accumulator(0);
JavaRDD<String> callSigns = rdd.flatMap(new FlatMapFunction<String, String>() {
  public Iterable<String> call(String line) {
    if (line.equals("")) {
      blankLines.add(1);
    }
    return Arrays.asList(line.split(" "));
  }
});

// per-sign validation counters and a broadcast lookup table; the validation and
// distance-computation steps that use them were elided in the source
final Accumulator<Integer> validSignCount = sc.accumulator(0);
final Accumulator<Integer> invalidSignCount = sc.accumulator(0);
final Broadcast<String[]> signPrefixes = sc.broadcast(loadCallSignTable());
sc.addFile(distScript);

// keep only distances within three standard deviations of the mean
// (`distanceDoubles`, `mean`, and `stddev` come from the elided steps above)
JavaDoubleRDD reasonableDistances = distanceDoubles.filter(new Function<Double, Boolean>() {
  public Boolean call(Double x) {
    return (Math.abs(x - mean) < 3 * stddev);
  }
});
System.out.println(StringUtils.join(reasonableDistances.collect(), ","));
sc.stop();
System.exit(0);
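// Note: Accumulator and sc.accumulator(...) above are the pre-2.0 API. On Spark 2.x the
// same counting pattern uses LongAccumulator; a minimal sketch, assuming Spark 2.x:
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

public class AccumulatorSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(
        new SparkConf().setAppName("AccumulatorSketch").setMaster("local"));

    LongAccumulator blankLines = sc.sc().longAccumulator("blankLines");
    sc.parallelize(Arrays.asList("a b", "", "c", ""))
      .foreach(line -> { if (line.isEmpty()) blankLines.add(1); });

    // accumulator values are only reliable after an action has run; updates made
    // inside transformations can be re-applied if a task is retried
    System.out.println("blank lines = " + blankLines.value());
    sc.stop();
  }
}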
SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); JavaSparkContext sc = new JavaSparkContext(conf); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath)); SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl); .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf)); totalCount = encodedBaseRDD.count(); allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
SparkConf conf = new SparkConf().setMaster(master).setAppName("basicavgwithkyro");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", AvgRegistrator.class.getName());
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));

// restored the truncated body; `combine` and `initial` were elided in the source
// (see the sketch below for their likely shape)
Function2<AvgCount, Integer, AvgCount> addAndCount =
    new Function2<AvgCount, Integer, AvgCount>() {
      @Override
      public AvgCount call(AvgCount a, Integer x) {
        a.total_ += x;
        a.num_ += 1;
        return a;
      }
    };

AvgCount result = rdd.aggregate(initial, addAndCount, combine);
System.out.println(result.avg());
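// The AvgCount helper plus the elided `combine` and `initial` pieces look roughly like
// this, inferred from the shape of the aggregate() call above (field names are
// assumptions). Wrapped as a complete runnable example:
import java.io.Serializable;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class AvgAggregateSketch {
  public static class AvgCount implements Serializable {
    public int total_;
    public int num_;
    public AvgCount(int total, int num) { total_ = total; num_ = num; }
    public float avg() { return total_ / (float) num_; }
  }

  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(
        new SparkConf().setAppName("AvgAggregateSketch").setMaster("local"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));

    // aggregate() takes a zero value, a function folding an element into a partial
    // result, and a function merging two partial results
    AvgCount initial = new AvgCount(0, 0);
    AvgCount result = rdd.aggregate(initial,
        (AvgCount a, Integer x) -> { a.total_ += x; a.num_ += 1; return a; },
        (AvgCount a, AvgCount b) -> { a.total_ += b.total_; a.num_ += b.num_; return a; });
    System.out.println(result.avg()); // 2.5

    sc.stop();
  }
}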
// key positively-rated data by (user, item); the assignment target was dropped in the
// source but is named by the broadcast step below
JavaPairRDD<Integer, Integer> positiveUserProducts =
    positiveData.mapToPair(rating -> new Tuple2<>(rating.user(), rating.product()));

// broadcast all known item IDs so each task can sample negative items locally
Broadcast<List<Integer>> allItemIDsBC =
    sparkContext.broadcast(positiveUserProducts.values().distinct().collect());

// for each user, sample items outside the positive set as negatives
positiveUserProducts.groupByKey().flatMapToPair(
    new PairFlatMapFunction<Tuple2<Integer, Iterable<Integer>>, Integer, Integer>() {
      private final RandomGenerator random = RandomManager.getRandom();
      // call(...) body elided in the source
    });

// from a separately elided AUC step: count a pair as correct when the positive item
// out-scores the sampled negative item
for (Rating positive : t._1()) {
  for (Rating negative : t._2()) {
    if (positive.rating() > negative.rating()) {
      correct++;
    }
  }
}
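// A minimal sketch of the negative-sampling shape used above: for each user, emit one
// random item ID that is not in the user's positive set. Names, data, and the sampling
// policy here are illustrative, not the original implementation; this assumes the
// Spark 2.x Java API, where the flatMapToPair function returns an Iterator.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;

public class NegativeSamplingSketch {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(
        new SparkConf().setAppName("NegativeSamplingSketch").setMaster("local"));

    JavaPairRDD<Integer, Integer> positiveUserProducts = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>(1, 10), new Tuple2<>(1, 11), new Tuple2<>(2, 10), new Tuple2<>(3, 12)));
    final Broadcast<List<Integer>> allItemIDsBC =
        sc.broadcast(positiveUserProducts.values().distinct().collect());

    JavaPairRDD<Integer, Integer> negatives = positiveUserProducts.groupByKey()
        .flatMapToPair(userAndItems -> {
          Set<Integer> positives = new HashSet<>();
          for (Integer item : userAndItems._2()) {
            positives.add(item);
          }
          List<Tuple2<Integer, Integer>> out = new ArrayList<>();
          Random random = new Random();
          List<Integer> allItems = allItemIDsBC.value();
          // retry a few times; a user who has rated every item yields no negative
          for (int attempt = 0; attempt < 10 && out.isEmpty(); attempt++) {
            Integer candidate = allItems.get(random.nextInt(allItems.size()));
            if (!positives.contains(candidate)) {
              out.add(new Tuple2<>(userAndItems._1(), candidate));
            }
          }
          return out.iterator();
        });
    System.out.println(negatives.collect());
    sc.stop();
  }
}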
SparkConf conf = new SparkConf(); conf.setMaster("local[*]"); conf.setAppName("DataVec Example"); JavaSparkContext sc = new JavaSparkContext(conf); JavaRDD<String> stringData = sc.textFile(filename); JavaRDD<List<Writable>> parsedInputData = stringData.filter((x) -> !x.isEmpty()).map(new StringToWritablesFunction(rr)); List<String> inputDataCollected = stringData.collect(); System.out.println("\n\n---- Original Data ----"); for(String s : inputDataCollected) System.out.println("'" + s + "'");