public static void main(String[] args) throws Exception {
  String master;
  if (args.length > 0) {
    master = args[0];
  } else {
    master = "local";
  }
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmaptodouble", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  // Square each element, producing a JavaDoubleRDD of primitive doubles
  JavaDoubleRDD result = rdd.mapToDouble(
      new DoubleFunction<Integer>() {
        public double call(Integer x) {
          double y = (double) x;
          return y * y;
        }
      });
  System.out.println(StringUtils.join(result.collect(), ","));
}
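// A minimal modern-style sketch (an addition, not from the original source):
// the same squaring job written with SparkConf and a lambda, which is how a
// current Spark application would typically be set up. The class name
// BasicMapToDoubleLambda is an assumption for illustration.
public final class BasicMapToDoubleLambda {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf()
        .setMaster(args.length > 0 ? args[0] : "local")
        .setAppName("basicmaptodouble");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      JavaDoubleRDD result = sc.parallelize(Arrays.asList(1, 2, 3, 4))
          .mapToDouble(x -> (double) x * x);
      System.out.println(StringUtils.join(result.collect(), ","));
    }
  }
}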
@Override
double evaluate(JavaRDD<Vector> evalData) {
  // Sum of squared distances to assigned cluster centroids; lower is better
  return fetchClusterMetrics(evalData).values()
      .mapToDouble(ClusterMetric::getSumSquaredDist)
      .sum();
}
public static final @Nullable Tuple4<Long, Long, Long, Long> contentSizeStats(
    JavaRDD<ApacheAccessLog> accessLogRDD) {
  JavaDoubleRDD contentSizes = accessLogRDD.mapToDouble(new GetContentSize()).cache();
  long count = contentSizes.count();
  if (count == 0) {
    return null;
  }
  // Ordering.natural() is a serializable Comparator, which Spark's min/max require;
  // assigning it directly avoids the unchecked cast through Object
  final Comparator<Double> cmp = Ordering.natural();
  return new Tuple4<>(count,
      contentSizes.reduce(new SumReducer()).longValue(),
      contentSizes.min(cmp).longValue(),
      contentSizes.max(cmp).longValue());
}
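// Hedged usage sketch for contentSizeStats above. The input path and the
// ApacheAccessLog.parseFromLogLine parser are assumptions about the
// surrounding codebase, shown only to illustrate the call and the Tuple4
// unpacking (count, sum, min, max).
JavaRDD<ApacheAccessLog> accessLogs =
    sc.textFile("access.log").map(ApacheAccessLog::parseFromLogLine);
Tuple4<Long, Long, Long, Long> stats = contentSizeStats(accessLogs);
if (stats != null) {
  System.out.println("Count: " + stats._1() + ", Sum: " + stats._2()
      + ", Min: " + stats._3() + ", Max: " + stats._4());
}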
static double rmse(DecisionForest forest, JavaRDD<Example> examples) {
  // Mean squared difference between predicted and actual target values
  double mse = examples.mapToDouble(example -> {
    NumericPrediction prediction = (NumericPrediction) forest.predict(example);
    NumericFeature target = (NumericFeature) example.getTarget();
    double diff = prediction.getPrediction() - target.getValue();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
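// Hedged usage sketch: evaluate held-out RMSE with the method above.
// randomSplit is a standard JavaRDD method; "allExamples" and "forest"
// (a DecisionForest trained on the first split elsewhere) are assumed names.
JavaRDD<Example>[] parts = allExamples.randomSplit(new double[] {0.9, 0.1}, 123L);
double heldOutRMSE = rmse(forest, parts[1]);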
/**
 * @param evalData data for evaluation
 * @return the Dunn Index of a given clustering
 *  (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  // Intra-cluster distance is mean distance to centroid
  double maxIntraClusterDistance =
      fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max();
  // Inter-cluster distance is distance between centroids
  double minInterClusterDistance = Double.POSITIVE_INFINITY;
  List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values());
  DistanceFn<double[]> distanceFn = getDistanceFn();
  for (int i = 0; i < clusters.size(); i++) {
    double[] centerI = clusters.get(i).getCenter();
    // Distances are symmetric, hence d(i,j) == d(j,i)
    for (int j = i + 1; j < clusters.size(); j++) {
      double[] centerJ = clusters.get(j).getCenter();
      minInterClusterDistance =
          Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ));
    }
  }
  return minInterClusterDistance / maxIntraClusterDistance;
}
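// Hedged sketch of a distance function of the shape evaluate() uses above:
// plain Euclidean distance between two centroids. Written as a static helper
// rather than a DistanceFn to avoid assuming that interface's exact declaration.
static double euclideanDistance(double[] a, double[] b) {
  double sumSq = 0.0;
  for (int i = 0; i < a.length; i++) {
    double d = a[i] - b[i];
    sumSq += d * d;
  }
  return Math.sqrt(sumSq);
}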
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage: BasicLoadJson [sparkMaster] [cassandraHost]");
  }
  String sparkMaster = args[0];
  String cassandraHost = args[1];
  SparkConf conf = new SparkConf(true)
      .set("spark.cassandra.connection.host", cassandraHost);
  JavaSparkContext sc = new JavaSparkContext(sparkMaster, "basicquerycassandra", conf);
  // Entire table as an RDD;
  // assumes the table was created as: CREATE TABLE test.kv(key text PRIMARY KEY, value int);
  JavaRDD<CassandraRow> data = javaFunctions(sc).cassandraTable("test", "kv");
  // Print some basic stats (count, mean, stdev, max, min)
  System.out.println(data.mapToDouble(new DoubleFunction<CassandraRow>() {
    public double call(CassandraRow row) {
      return row.getInt("value");
    }
  }).stats());
  // Write some basic data to Cassandra
  ArrayList<KeyValue> input = new ArrayList<KeyValue>();
  input.add(KeyValue.newInstance("mostmagic", 3));
  JavaRDD<KeyValue> kvRDD = sc.parallelize(input);
  javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv");
}

public static class KeyValue implements Serializable {
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Key each actual rating by its (user, product) pair
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating ->
          new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  // Join predictions with actuals on (user, product) and average the squared error
  double mse = predictions.mapToPair(rating ->
      new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
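// Hedged usage sketch for rmse above: train an ALS model on 90% of ratings and
// score the rest. ALS.train and JavaRDD.toRDD are standard Spark MLlib Java API
// calls; the variable names and hyperparameters (rank 10, 10 iterations,
// lambda 0.01) are illustrative assumptions.
JavaRDD<Rating>[] parts = allRatings.randomSplit(new double[] {0.9, 0.1}, 123L);
MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(parts[0]), 10, 10, 0.01);
double heldOutRMSE = rmse(model, parts[1]);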
/**
 * Implementation which splits based solely on time. It will return approximately
 * the earliest (1 - {@link #getTestFraction()}) fraction of input, ordered by timestamp,
 * as new training data, and the most recent {@link #getTestFraction()} as test data.
 */
@Override
protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
  // Rough approximation; assumes timestamps are fairly evenly distributed
  StatCounter maxMin = newData.mapToDouble(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();
  long minTime = (long) maxMin.min();
  long maxTime = (long) maxMin.max();
  log.info("New data timestamp range: {} - {}", minTime, maxTime);
  // E.g. with testFraction 0.1, minTime 1000 and maxTime 2000, the boundary is
  // 2000 - 0.1 * (2000 - 1000) = 1900, so the newest ~10% of lines become test data
  long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime));
  log.info("Splitting at timestamp {}", approxTestTrainBoundary);
  JavaRDD<String> newTrainData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary);
  JavaRDD<String> testData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary);
  return new Pair<>(newTrainData, testData);
}
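// Hedged usage sketch: calling the splitter and logging the resulting sizes.
// Pair's getFirst()/getSecond() accessors are assumptions about the Pair
// class used in this codebase, and "newLines" is an assumed input RDD.
Pair<JavaRDD<String>,JavaRDD<String>> trainTest = splitNewDataToTrainTest(newLines);
log.info("Train/test sizes: {} / {}",
    trainTest.getFirst().count(), trainTest.getSecond().count());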
@Test
public void zip() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x);
  JavaPairRDD<Integer, Double> zipped = rdd.zip(doubles);
  zipped.count();
}
@Test
public void zip() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(Integer::doubleValue);
  JavaPairRDD<Integer, Double> zipped = rdd.zip(doubles);
  zipped.count();
}
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(Integer::doubleValue).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}