/**
 * Returns the elements of {@code rdd} that lie within three standard deviations of the mean.
 *
 * <p>The mean and standard deviation come from a single {@code stats()} pass over the input.
 * The comparison is inclusive, so an input whose standard deviation is zero (one element, or
 * all elements equal) is returned unchanged instead of being filtered down to nothing.
 *
 * @param rdd the values to filter
 * @return an RDD holding only the values {@code x} with {@code |x - mean| <= 3 * stddev}
 */
static JavaDoubleRDD removeOutliers(JavaDoubleRDD rdd) {
  final StatCounter summaryStats = rdd.stats();
  // Capture plain doubles so the filter closes over two numbers, not the StatCounter itself.
  final double mean = summaryStats.mean();
  final double maxDeviation = 3 * Math.sqrt(summaryStats.variance());
  return rdd.filter(new Function<Double, Boolean>() {
    @Override
    public Boolean call(Double x) {
      // Inclusive bound: with a strict '<' a zero deviation would reject every element.
      return Math.abs(x - mean) <= maxDeviation;
    }
  });
}
}
/**
 * Reads the {@code test.kv} Cassandra table as an RDD, prints summary statistics of its
 * {@code value} column, then writes one sample row back to the same table.
 *
 * @param args {@code args[0]} is the Spark master, {@code args[1]} is the Cassandra host
 * @throws Exception if exactly two arguments are not supplied
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    // NOTE(review): the usage text names BasicLoadJson while the Spark application name
    // below is "basicquerycassandra" - confirm which program name is intended.
    throw new Exception("Usage BasicLoadJson [sparkMaster] [cassandraHost]");
  }
  String sparkMaster = args[0];
  String cassandraHost = args[1];
  SparkConf conf = new SparkConf(true)
      .set("spark.cassandra.connection.host", cassandraHost);
  JavaSparkContext sc = new JavaSparkContext(sparkMaster, "basicquerycassandra", conf);
  try {
    // entire table as an RDD
    // assumes your table test was created as
    // CREATE TABLE test.kv(key text PRIMARY KEY, value int);
    JavaRDD<CassandraRow> data = javaFunctions(sc).cassandraTable("test", "kv");
    // print some basic stats
    System.out.println(data.mapToDouble(new DoubleFunction<CassandraRow>() {
      public double call(CassandraRow row) {
        return row.getInt("value");
      }
    }).stats());
    // write some basic data to Cassandra
    ArrayList<KeyValue> input = new ArrayList<KeyValue>();
    input.add(KeyValue.newInstance("mostmagic", 3));
    JavaRDD<KeyValue> kvRDD = sc.parallelize(input);
    javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv");
  } finally {
    // Stop the context even when the read or write fails, so it is not left running.
    sc.stop();
  }
}

public static class KeyValue implements Serializable {
/** * Implementation which splits based solely on time. It will return approximately * the earliest {@link #getTestFraction()} of input, ordered by timestamp, as new training * data and the rest as test data. */ @Override protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) { // Rough approximation; assumes timestamps are fairly evenly distributed StatCounter maxMin = newData.mapToDouble(line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats(); long minTime = (long) maxMin.min(); long maxTime = (long) maxMin.max(); log.info("New data timestamp range: {} - {}", minTime, maxTime); long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime)); log.info("Splitting at timestamp {}", approxTestTrainBoundary); JavaRDD<String> newTrainData = newData.filter( line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary); JavaRDD<String> testData = newData.filter( line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary); return new Pair<>(newTrainData, testData); }
// Excerpt: the first statement parses `value` as a double and closes an anonymous class and
// call that open above this excerpt (not visible here). After that, the statistics of
// distanceDoubles are computed with one stats() call, and both the standard deviation and
// the mean are read from that single result.
return Double.parseDouble(value); }}); final StatCounter stats = distanceDoubles.stats(); final Double stddev = stats.stdev(); final Double mean = stats.mean();
/**
 * Exercises {@code JavaDoubleRDD} on the fixed sample 1, 1, 2, 3, 5, 8: distinct, filter,
 * union, cache, and the numeric summaries (sum, mean, variance and standard deviation in
 * both their population and sample forms), checked on the RDD and on its {@code StatCounter}.
 */
@Test
public void javaDoubleRDD() {
  JavaDoubleRDD rdd = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));

  JavaDoubleRDD distinct = rdd.distinct();
  assertEquals(5, distinct.count());
  JavaDoubleRDD filter = rdd.filter(x -> x > 2.0);
  assertEquals(3, filter.count());

  JavaDoubleRDD union = rdd.union(rdd);
  assertEquals(12, union.count());
  // Caching must not change the contents.
  union = union.cache();
  assertEquals(12, union.count());

  assertEquals(20, rdd.sum(), 0.01);
  StatCounter stats = rdd.stats();
  assertEquals(20, stats.sum(), 0.01);
  assertEquals(20/6.0, rdd.mean(), 0.01);
  // Check the StatCounter's mean as well, instead of asserting rdd.mean() a second time.
  assertEquals(20/6.0, stats.mean(), 0.01);

  // variance()/stdev() are expected to agree with the population variants.
  assertEquals(6.22222, rdd.variance(), 0.01);
  assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
  assertEquals(7.46667, rdd.sampleVariance(), 0.01);
  assertEquals(2.49444, rdd.stdev(), 0.01);
  assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
  assertEquals(2.73252, rdd.sampleStdev(), 0.01);

  // Results are not inspected; these calls only need to complete without throwing.
  rdd.first();
  rdd.take(5);
}
/**
 * Exercises {@code JavaDoubleRDD} on the fixed sample 1, 1, 2, 3, 5, 8: distinct, filter,
 * union, cache, and the numeric summaries (sum, mean, variance and standard deviation in
 * both their population and sample forms), checked on the RDD and on its {@code StatCounter}.
 */
@Test
public void javaDoubleRDD() {
  JavaDoubleRDD rdd = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));

  JavaDoubleRDD distinct = rdd.distinct();
  assertEquals(5, distinct.count());
  JavaDoubleRDD filter = rdd.filter(x -> x > 2.0);
  assertEquals(3, filter.count());

  JavaDoubleRDD union = rdd.union(rdd);
  assertEquals(12, union.count());
  // Caching must not change the contents.
  union = union.cache();
  assertEquals(12, union.count());

  assertEquals(20, rdd.sum(), 0.01);
  StatCounter stats = rdd.stats();
  assertEquals(20, stats.sum(), 0.01);
  assertEquals(20/6.0, rdd.mean(), 0.01);
  // Check the StatCounter's mean as well, instead of asserting rdd.mean() a second time.
  assertEquals(20/6.0, stats.mean(), 0.01);

  // variance()/stdev() are expected to agree with the population variants.
  assertEquals(6.22222, rdd.variance(), 0.01);
  assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
  assertEquals(7.46667, rdd.sampleVariance(), 0.01);
  assertEquals(2.49444, rdd.stdev(), 0.01);
  assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
  assertEquals(2.73252, rdd.sampleStdev(), 0.01);

  // Results are not inspected; these calls only need to complete without throwing.
  rdd.first();
  rdd.take(5);
}
/**
 * Exercises {@code JavaDoubleRDD} on the fixed sample 1, 1, 2, 3, 5, 8: distinct, filter,
 * union, cache, and the numeric summaries (sum, mean, variance and standard deviation in
 * both their population and sample forms), checked on the RDD and on its {@code StatCounter}.
 */
@Test
public void javaDoubleRDD() {
  JavaDoubleRDD rdd = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));

  JavaDoubleRDD distinct = rdd.distinct();
  assertEquals(5, distinct.count());
  JavaDoubleRDD filter = rdd.filter(x -> x > 2.0);
  assertEquals(3, filter.count());

  JavaDoubleRDD union = rdd.union(rdd);
  assertEquals(12, union.count());
  // Caching must not change the contents.
  union = union.cache();
  assertEquals(12, union.count());

  assertEquals(20, rdd.sum(), 0.01);
  StatCounter stats = rdd.stats();
  assertEquals(20, stats.sum(), 0.01);
  assertEquals(20/6.0, rdd.mean(), 0.01);
  // Check the StatCounter's mean as well, instead of asserting rdd.mean() a second time.
  assertEquals(20/6.0, stats.mean(), 0.01);

  // variance()/stdev() are expected to agree with the population variants.
  assertEquals(6.22222, rdd.variance(), 0.01);
  assertEquals(rdd.variance(), rdd.popVariance(), 1e-14);
  assertEquals(7.46667, rdd.sampleVariance(), 0.01);
  assertEquals(2.49444, rdd.stdev(), 0.01);
  assertEquals(rdd.stdev(), rdd.popStdev(), 1e-14);
  assertEquals(2.73252, rdd.sampleStdev(), 0.01);

  // Results are not inspected; these calls only need to complete without throwing.
  rdd.first();
  rdd.take(5);
}
// Print the summary statistics of `result`, using the string form returned by toString().
System.out.println(result.stats().toString());
public static void main(String[] args) { //Sample test data - All numbers from 1 to 99999 List<Double> testData = IntStream.range(1, 100000).mapToDouble(d -> d).collect(ArrayList::new, ArrayList::add, ArrayList::addAll); JavaDoubleRDD rdd = sc.parallelizeDoubles(testData); LOGGER.info("Mean: " + rdd.mean()); //For efficiency, use StatCounter if more than one stats are required. StatCounter statCounter = rdd.stats(); LOGGER.info("Using StatCounter"); LOGGER.info("Count: " + statCounter.count()); LOGGER.info("Min: " + statCounter.min()); LOGGER.info("Max: " + statCounter.max()); LOGGER.info("Sum: " + statCounter.sum()); LOGGER.info("Mean: " + statCounter.mean()); LOGGER.info("Variance: " + statCounter.variance()); LOGGER.info("Stdev: " + statCounter.stdev()); } }