// Adapted from an ALS evaluation pipeline; the original snippet was truncated,
// so the flatMapToPair body and the operations leading to .mean() are marked
// as elided rather than invented. The assignment of the first mapToPair result
// is reconstructed from its later use as positiveUserProducts.
JavaPairRDD<Integer,Integer> positiveUserProducts =
    positiveData.mapToPair(rating -> new Tuple2<>(rating.user(), rating.product()));
Broadcast<List<Integer>> allItemIDsBC =
    sparkContext.broadcast(positiveUserProducts.values().distinct().collect());
positiveUserProducts.groupByKey().flatMapToPair(
    new PairFlatMapFunction<Tuple2<Integer,Iterable<Integer>>,Integer,Integer>() {
      private final RandomGenerator random = RandomManager.getRandom();
      // ... (function body elided in the original)
    })
    // ... (intermediate operations elided in the original; flatMapToPair alone
    // yields a JavaPairRDD, which has no mean())
    .mean();
@SuppressWarnings("unchecked") @Test public void intersection() { List<Integer> ints1 = Arrays.asList(1, 10, 2, 3, 4, 5); List<Integer> ints2 = Arrays.asList(1, 6, 2, 3, 7, 8); JavaRDD<Integer> s1 = sc.parallelize(ints1); JavaRDD<Integer> s2 = sc.parallelize(ints2); JavaRDD<Integer> intersections = s1.intersection(s2); assertEquals(3, intersections.count()); JavaRDD<Integer> empty = sc.emptyRDD(); JavaRDD<Integer> emptyIntersection = empty.intersection(s2); assertEquals(0, emptyIntersection.count()); List<Double> doubles = Arrays.asList(1.0, 2.0); JavaDoubleRDD d1 = sc.parallelizeDoubles(doubles); JavaDoubleRDD d2 = sc.parallelizeDoubles(doubles); JavaDoubleRDD dIntersection = d1.intersection(d2); assertEquals(2, dIntersection.count()); List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(1, 2)); pairs.add(new Tuple2<>(3, 4)); JavaPairRDD<Integer, Integer> p1 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> p2 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> pIntersection = p1.intersection(p2); assertEquals(2, pIntersection.count()); }
// Fused fragments from a chapter example, reconstructed into three coherent
// steps. Declarations that were elided in the original (sparkMaster, inputFile,
// blankLines, distanceStrings) are noted rather than invented.
String outputDir = args[3];
JavaSparkContext sc = new JavaSparkContext(
    sparkMaster, "ChapterSixExample",
    System.getenv("SPARK_HOME"), System.getenv("JARS"));
JavaRDD<String> rdd = sc.textFile(inputFile);

// Count lines containing a given call sign using an accumulator.
final Accumulator<Integer> count = sc.accumulator(0);
rdd.foreach(new VoidFunction<String>() {
  public void call(String line) {
    if (line.contains("KK6JKQ")) {
      count.add(1);
    }
  }
});

// Split lines into call signs, counting blank lines along the way;
// blankLines is an accumulator declared in the elided portion.
JavaRDD<String> callSigns = rdd.flatMap(
  new FlatMapFunction<String, String>() {
    public Iterable<String> call(String line) {
      if (line.equals("")) {
        blankLines.add(1);
      }
      return Arrays.asList(line.split(" "));
    }
  });
callSigns.saveAsTextFile(outputDir + "/callsigns");
System.out.println("Blank lines: " + blankLines.value());

// Parse distances and keep only values within three standard deviations
// of the mean; distanceStrings comes from the elided portion.
JavaDoubleRDD distanceDoubles = distanceStrings.mapToDouble(
  new DoubleFunction<String>() {
    public double call(String value) {
      return Double.parseDouble(value);
    }
  });
final StatCounter stats = distanceDoubles.stats();
final Double stddev = stats.stdev();
final Double mean = stats.mean();
JavaDoubleRDD reasonableDistances = distanceDoubles.filter(
  new Function<Double, Boolean>() {
    public Boolean call(Double x) {
      return Math.abs(x - mean) < 3 * stddev;
    }
  });
System.out.println(StringUtils.join(reasonableDistances.collect(), ","));
static JavaDoubleRDD removeOutliers(JavaDoubleRDD rdd) {
  final StatCounter summaryStats = rdd.stats();
  final Double stddev = Math.sqrt(summaryStats.variance());
  return rdd.filter(new Function<Double, Boolean>() {
    public Boolean call(Double x) {
      return Math.abs(x - summaryStats.mean()) < 3 * stddev;
    }
  });
}
public static final @Nullable Tuple4<Long, Long, Long, Long> contentSizeStats(
    JavaRDD<ApacheAccessLog> accessLogRDD) {
  JavaDoubleRDD contentSizes =
      accessLogRDD.mapToDouble(new GetContentSize()).cache();
  long count = contentSizes.count();
  if (count == 0) {
    return null;
  }
  Object ordering = Ordering.natural();
  @SuppressWarnings("unchecked")
  final Comparator<Double> cmp = (Comparator<Double>) ordering;
  return new Tuple4<>(count,
      contentSizes.reduce(new SumReducer()).longValue(),
      contentSizes.min(cmp).longValue(),
      contentSizes.max(cmp).longValue());
}
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs =
      rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
@SuppressWarnings("unchecked") @Test public void sparkContextUnion() { // Union of non-specialized JavaRDDs List<String> strings = Arrays.asList("Hello", "World"); JavaRDD<String> s1 = sc.parallelize(strings); JavaRDD<String> s2 = sc.parallelize(strings); // Varargs JavaRDD<String> sUnion = sc.union(s1, s2); assertEquals(4, sUnion.count()); // List List<JavaRDD<String>> list = new ArrayList<>(); list.add(s2); sUnion = sc.union(s1, list); assertEquals(4, sUnion.count()); // Union of JavaDoubleRDDs List<Double> doubles = Arrays.asList(1.0, 2.0); JavaDoubleRDD d1 = sc.parallelizeDoubles(doubles); JavaDoubleRDD d2 = sc.parallelizeDoubles(doubles); JavaDoubleRDD dUnion = sc.union(d1, d2); assertEquals(4, dUnion.count()); // Union of JavaPairRDDs List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(1, 2)); pairs.add(new Tuple2<>(3, 4)); JavaPairRDD<Integer, Integer> p1 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> p2 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> pUnion = sc.union(p1, p2); assertEquals(4, pUnion.count()); }
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicLoadJson [sparkMaster] [cassandraHost]");
  }
  String sparkMaster = args[0];
  String cassandraHost = args[1];
  SparkConf conf = new SparkConf(true)
      .set("spark.cassandra.connection.host", cassandraHost);
  JavaSparkContext sc = new JavaSparkContext(
      sparkMaster, "basicquerycassandra", conf);
  // Read the entire table as an RDD. Assumes the table was created as:
  //   CREATE TABLE test.kv(key text PRIMARY KEY, value int);
  JavaRDD<CassandraRow> data = javaFunctions(sc).cassandraTable("test", "kv");
  // Print some basic stats.
  System.out.println(data.mapToDouble(new DoubleFunction<CassandraRow>() {
    public double call(CassandraRow row) {
      return row.getInt("value");
    }
  }).stats());
  // Write some basic data to Cassandra.
  ArrayList<KeyValue> input = new ArrayList<>();
  input.add(KeyValue.newInstance("mostmagic", 3));
  JavaRDD<KeyValue> kvRDD = sc.parallelize(input);
  javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv");
}

// The class body was truncated in the original; a minimal completion is
// sketched here, with field names assumed to match the kv table's columns.
public static class KeyValue implements Serializable {
  private String key;
  private Integer value;

  public static KeyValue newInstance(String key, Integer value) {
    KeyValue kv = new KeyValue();
    kv.key = key;
    kv.value = value;
    return kv;
  }

  public String getKey() { return key; }
  public Integer getValue() { return value; }
  public void setKey(String key) { this.key = key; }
  public void setValue(Integer value) { this.value = value; }
}
public static void main(String[] args) throws Exception {
  String master;
  if (args.length > 0) {
    master = args[0];
  } else {
    master = "local";
  }
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmaptodouble",
      System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  // Square each element, producing a JavaDoubleRDD.
  JavaDoubleRDD result = rdd.mapToDouble(
    new DoubleFunction<Integer>() {
      public double call(Integer x) {
        double y = (double) x;
        return y * y;
      }
    });
  System.out.println(StringUtils.join(result.collect(), ","));
}
final LinearRegressionModel model =
    LinearRegressionModel.load(context.sc(), savedModelPath);
JavaRDD<String> testRDD = context.textFile(testDataPath);
// "test" (testRDD parsed into LabeledPoints) is built in an elided portion of
// the original; the function bodies below were also elided and are filled in
// following the standard MLlib linear regression example.
JavaRDD<Tuple2<Double, Double>> valuesAndPreds = test.map(
  new Function<LabeledPoint, Tuple2<Double, Double>>() {
    @Override
    public Tuple2<Double, Double> call(LabeledPoint point) {
      double prediction = model.predict(point.features());
      return new Tuple2<>(prediction, point.label());
    }
  });
double MSE = new JavaDoubleRDD(valuesAndPreds.map(
  new Function<Tuple2<Double, Double>, Object>() {
    @Override
    public Object call(Tuple2<Double, Double> pair) {
      return Math.pow(pair._1() - pair._2(), 2.0);
    }
  }).rdd()).mean();
context.close();
data.cache();
JavaRDD<List<Writable>> fmSeq = data.flatMap(new SequenceFlatMapFunction());
DataAnalysis da = analyze(schema, fmSeq);
JavaRDD<Integer> seqLengths = data.map(new SequenceLengthFunction());
seqLengths.cache();
SequenceLengthAnalysisCounter counter = new SequenceLengthAnalysisCounter();
// The first branch of this conditional was elided in the original snippet;
// only the histogram branches survive. nBuckets, maxHistogramBuckets, and
// hist are declared in the elided portion.
if (/* elided condition */) {
  // ... elided ...
} else if (nBuckets < maxHistogramBuckets) {
  JavaDoubleRDD drdd = seqLengths.mapToDouble(new IntToDoubleFunction());
  hist = drdd.histogram(nBuckets);
} else {
  JavaDoubleRDD drdd = seqLengths.mapToDouble(new IntToDoubleFunction());
  hist = drdd.histogram(maxHistogramBuckets);
}
seqLengths.unpersist();
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating ->
          new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(rating ->
      new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
    ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
      double diff = valuePrediction._1() - valuePrediction._2();
      return diff * diff;
    }).mean();
  return Math.sqrt(mse);
}
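// For reference (standard definition, not from the snippet): over the n
// observed (user, product) pairs with actual rating r and prediction rhat,
//   RMSE = sqrt( (1/n) * sum (r - rhat)^2 )
// which is exactly the sqrt(mse) returned above.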
// The start of this assignment was truncated in the original:
//   ... ? null : new AgeClassifyModelWrapper(classifyModel);
JavaRDD<String> data =
    spark.sparkContext().textFile(dataIn, 8).toJavaRDD().cache();
// The map and filter bodies below were elided in the original snippet.
JavaRDD<Row> samples = data.map(
  new Function<String, Row>() {
    public Row call(String s) throws IOException {
      // ... elided ...
    }
  }).cache();
JavaRDD<Row> validSamples = samples.filter(
  new Function<Row, Boolean>() {
    @Override
    // ... elided ...
  }).cache();
// Mean absolute error over (actual, predicted) pairs; valuesAndPreds is built
// in an elided portion of the original.
double MAE = new JavaDoubleRDD(valuesAndPreds.map(
  new Function<Tuple2<Double, Double>, Object>() {
    public Object call(Tuple2<Double, Double> pair) {
      return Math.abs(pair._1() - pair._2());
    }
  }).rdd()).mean();
// Variant of the previous fragment from the same source; the assignment that
// ends in .cache() was truncated at the start of the original snippet.
JavaRDD<Row> samples = data.map(
  new Function<String, Row>() {
    public Row call(String s) {
      // ... elided ...
    }
  }).cache();
JavaRDD<Row> validSamples = samples.filter(
    /* ... predicate elided ... */
  ).cache();
double MAE = new JavaDoubleRDD(valuesAndPreds.map(
  new Function<Tuple2<Double, Double>, Object>() {
    public Object call(Tuple2<Double, Double> pair) {
      return Math.abs(pair._1() - pair._2());
    }
  }).rdd()).mean();
JavaRDD<Vector> vectors = valuesAndPreds.map(
    // ... truncated in the original
JavaSparkContext jsc = new JavaSparkContext(conf);
// infinispanRDD and buckets are defined in the surrounding (elided) code.
JavaDoubleRDD javaDoubleRDD =
    infinispanRDD.values().mapToDouble(Temperature::getValue);
Double meanTemp = javaDoubleRDD.mean();
System.out.printf("\nAVERAGE TEMPERATURE: %f C\n", meanTemp);
Double stdDev = javaDoubleRDD.sampleStdev();
System.out.printf("STD DEVIATION: %f C\n", stdDev);
long[] histogram = javaDoubleRDD.histogram(buckets);
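// histogram(double[]) expects sorted bucket boundaries and returns one count
// per bucket. A minimal sketch; the boundary values here are illustrative,
// not from the source:
double[] buckets = {-20.0, 0.0, 20.0, 40.0};  // [-20,0), [0,20), [20,40]
long[] counts = javaDoubleRDD.histogram(buckets);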
public static void main(String[] args) {
  String master;
  if (args.length > 0) {
    master = args[0];
  } else {
    master = "local";
  }
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmap",
      System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaDoubleRDD input = sc.parallelizeDoubles(Arrays.asList(
      1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 1000.0));
  JavaDoubleRDD result = removeOutliers(input);
  System.out.println(StringUtils.join(result.collect(), ","));
}
// removeOutliers is the method shown in the earlier snippet above; its
// truncated duplicate header has been dropped here.
@Test
public void javaDoubleRDD() {
  JavaDoubleRDD rdd = sc.parallelizeDoubles(
      Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));
  JavaDoubleRDD distinct = rdd.distinct();
  assertEquals(5, distinct.count());
  JavaDoubleRDD filter = rdd.filter(x -> x > 2.0);
  assertEquals(3, filter.count());
  JavaDoubleRDD union = rdd.union(rdd);
  assertEquals(12, union.count());
  union = union.cache();
  assertEquals(12, union.count());

  assertEquals(20, rdd.sum(), 0.01);
  StatCounter stats = rdd.stats();
  assertEquals(20, stats.sum(), 0.01);
  assertEquals(20 / 6.0, rdd.mean(), 0.01);
  assertEquals(6.22222, rdd.variance(), 0.01);
  assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
  assertEquals(7.46667, rdd.sampleVariance(), 0.01);
  assertEquals(2.49444, rdd.stdev(), 0.01);
  assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
  assertEquals(2.73252, rdd.sampleStdev(), 0.01);

  rdd.first();
  rdd.take(5);
}
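// Sanity check of the expected constants (worked arithmetic, not from the
// source): for {1, 1, 2, 3, 5, 8}, sum = 20 and mean = 20/6 ~ 3.3333; the sum
// of squared deviations is ~37.3333, giving population variance 37.3333/6
// ~ 6.2222 and sample variance 37.3333/5 ~ 7.4667, whose square roots are
// the stdev values 2.4944 and 2.7325 asserted above.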
public static void main(String[] args) {
  fillHazelcastMapWithUsers();
  SparkConf conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("Create RDD From Hazelcast")
      .set("hazelcast.server.addresses", "127.0.0.1:5701")
      .set("spark.driver.host", "127.0.0.1");
  JavaSparkContext sparkContext = new JavaSparkContext(conf);
  HazelcastSparkContext hazelcastSparkContext =
      new HazelcastSparkContext(sparkContext);
  HazelcastJavaRDD<String, User> usersRdd =
      hazelcastSparkContext.fromHazelcastMap("users");
  Double averageAge = usersRdd.flatMapToDouble(
    new DoubleFlatMapFunction<Tuple2<String, User>>() {
      @Override
      public Iterator<Double> call(Tuple2<String, User> entry) throws Exception {
        return singletonList((double) entry._2().getAge()).iterator();
      }
    }).mean();
  System.out.println("Average user age = " + averageAge);
}
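// Since each entry maps to exactly one value, mapToDouble is the more direct
// choice. A sketch, assuming HazelcastJavaRDD exposes the standard
// JavaPairRDD/JavaRDDLike API:
Double averageAge2 = usersRdd
    .mapToDouble(entry -> (double) entry._2().getAge())
    .mean();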
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation);
 *  higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  // Intra-cluster distance is mean distance to centroid
  double maxIntraClusterDistance =
      fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();
  // Inter-cluster distance is distance between centroids
  double minInterClusterDistance = Double.POSITIVE_INFINITY;
  List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
  DistanceFn<double[]> distanceFn = getDistanceFn();
  for (int i = 0; i < clusters.size(); i++) {
    double[] centerI = clusters.get(i).getCenter();
    // Distances are symmetric, hence d(i,j) == d(j,i)
    for (int j = i + 1; j < clusters.size(); j++) {
      double[] centerJ = clusters.get(j).getCenter();
      minInterClusterDistance = Math.min(minInterClusterDistance,
          distanceFn.applyAsDouble(centerI, centerJ));
    }
  }
  return minInterClusterDistance / maxIntraClusterDistance;
}
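// In symbols (standard definition, matching the code above): with centroids
// c_i and mean distance-to-centroid D_k within cluster k,
//   Dunn = min_{i != j} d(c_i, c_j) / max_k D_k
// so compact, well-separated clusterings score higher.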
/**
 * Implementation which splits based solely on time. It will return approximately
 * the latest {@link #getTestFraction()} of input, ordered by timestamp, as test
 * data, and the earlier remainder as new training data.
 */
@Override
protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
  // Rough approximation; assumes timestamps are fairly evenly distributed
  StatCounter maxMin = newData.mapToDouble(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();
  long minTime = (long) maxMin.min();
  long maxTime = (long) maxMin.max();
  log.info("New data timestamp range: {} - {}", minTime, maxTime);
  long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime));
  log.info("Splitting at timestamp {}", approxTestTrainBoundary);
  JavaRDD<String> newTrainData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary);
  JavaRDD<String> testData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary);
  return new Pair<>(newTrainData, testData);
}
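// Quick check of the boundary arithmetic (illustrative values, not from the
// source): minTime = 1_000, maxTime = 11_000, testFraction = 0.1 gives
// boundary = 11_000 - 0.1 * (11_000 - 1_000) = 10_000, so records stamped
// >= 10_000 (roughly the latest 10%) become test data.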