/**
 * Squares the integers 1 through 4 with a Spark {@code map} and prints the
 * collected results joined by commas.
 *
 * @param args optional; when present, {@code args[0]} is used as the Spark master,
 *             otherwise {@code "local"} is used
 * @throws Exception declared by the original signature and kept for compatibility
 */
public static void main(String[] args) throws Exception {
  String master = args.length > 0 ? args[0] : "local";
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaRDD<Integer> result = rdd.map(
        new Function<Integer, Integer>() {
          @Override
          public Integer call(Integer x) {
            return x * x;
          }
        });
    System.out.println(StringUtils.join(result.collect(), ","));
  } finally {
    // Stop the context even when the job throws, so it is not left running.
    sc.stop();
  }
}
// Closes the enclosing type, whose header lies outside this excerpt.
}
/**
 * Sums the elements of each partition via {@code mapPartitions} and checks that
 * [1, 2, 3, 4] parallelized with a slice count of 2 collects to "[3, 7]".
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> sumsPerPartition = numbers.mapPartitions(values -> {
    int total = 0;
    for (; values.hasNext(); ) {
      int value = values.next();
      total = total + value;
    }
    // Each partition contributes exactly one element: its own sum.
    return Collections.singletonList(total).iterator();
  });
  String collected = sumsPerPartition.collect().toString();
  assertEquals("[3, 7]", collected);
}
/**
 * Builds 100,000 integers cycling through 100 distinct values, parallelizes them
 * with a slice count of 10, and asserts that {@code countApproxDistinct(0.05)}
 * deviates from 100 by at most 10% in relative terms.
 */
@Test
public void countApproxDistinct() {
  int size = 100;
  List<Integer> arrayData = new ArrayList<>();
  int next = 0;
  while (next < 100000) {
    arrayData.add(next % size);
    next++;
  }
  JavaRDD<Integer> simpleRdd = sc.parallelize(arrayData, 10);
  long estimate = simpleRdd.countApproxDistinct(0.05);
  // Dividing by (size * 1.0) forces floating-point division.
  double relativeError = Math.abs((estimate - size) / (size * 1.0));
  assertTrue(relativeError <= 0.1);
}
/**
 * Sums the elements of each partition via {@code mapPartitionsWithIndex} (the
 * index argument is not used by the callback) and checks that [1, 2, 3, 4]
 * parallelized with a slice count of 2 collects to "[3, 7]".
 */
@Test
public void mapPartitionsWithIndex() {
  JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> sumsPerPartition = numbers.mapPartitionsWithIndex((partitionIndex, values) -> {
    int total = 0;
    for (; values.hasNext(); ) {
      int value = values.next();
      total = total + value;
    }
    // Each partition contributes exactly one element: its own sum.
    return Collections.singletonList(total).iterator();
  }, false);
  String collected = sumsPerPartition.collect().toString();
  assertEquals("[3, 7]", collected);
}
/**
 * Verifies the accuracy of {@code countApproxDistinct(0.05)} on an RDD of 100,000
 * integers that contains exactly 100 distinct values (0..99), parallelized with
 * a slice count of 10. The relative error must not exceed 0.1.
 */
@Test
public void countApproxDistinct() {
  int size = 100;
  List<Integer> arrayData = new ArrayList<>();
  for (int i = 0; i != 100000; ++i) {
    int bucket = i % size;
    arrayData.add(bucket);
  }
  JavaRDD<Integer> simpleRdd = sc.parallelize(arrayData, 10);
  long approxCount = simpleRdd.countApproxDistinct(0.05);
  // (size * 1.0) keeps the division in floating point.
  double deviation = (approxCount - size) / (size * 1.0);
  assertTrue(Math.abs(deviation) <= 0.1);
}
/**
 * Squares the integers 1 through 4 with a Spark {@code map}, drops every squared
 * value equal to 1 with a {@code filter}, and prints the remaining values joined
 * by commas.
 *
 * @param args optional; when present, {@code args[0]} is used as the Spark master,
 *             otherwise {@code "local"} is used
 * @throws Exception declared by the original signature and kept for compatibility
 */
public static void main(String[] args) throws Exception {
  String master = args.length > 0 ? args[0] : "local";
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaRDD<Integer> squared = rdd.map(
        new Function<Integer, Integer>() {
          @Override
          public Integer call(Integer x) {
            return x * x;
          }
        });
    JavaRDD<Integer> result = squared.filter(
        new Function<Integer, Boolean>() {
          @Override
          public Boolean call(Integer x) {
            // Comparing Integer with an int literal unboxes, so this is a value comparison.
            return x != 1;
          }
        });
    System.out.println(StringUtils.join(result.collect(), ","));
  } finally {
    // Stop the context even when the job throws, so it is not left running.
    sc.stop();
  }
}
// Closes the enclosing type, whose header lies outside this excerpt.
}
/**
 * Reduces every partition of an RDD to its sum with {@code mapPartitions} and
 * asserts that [1, 2, 3, 4], parallelized with a slice count of 2, collects to
 * "[3, 7]".
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> perPartitionTotals = input.mapPartitions(elements -> {
    int accumulated = 0;
    for (; elements.hasNext(); ) {
      int element = elements.next();
      accumulated = accumulated + element;
    }
    // Emit a single-element iterator holding this partition's sum.
    return Collections.singletonList(accumulated).iterator();
  });
  String rendered = perPartitionTotals.collect().toString();
  Assert.assertEquals("[3, 7]", rendered);
}
/**
 * Fills a list with 100,000 values of the form {@code i % 100}, parallelizes it
 * with a slice count of 10, and asserts that the estimate returned by
 * {@code countApproxDistinct(0.05)} has a relative error of at most 0.1 against
 * the true distinct count of 100.
 */
@Test
public void countApproxDistinct() {
  int size = 100;
  List<Integer> arrayData = new ArrayList<>();
  int counter = 0;
  do {
    arrayData.add(counter % size);
    counter += 1;
  } while (counter < 100000);
  JavaRDD<Integer> simpleRdd = sc.parallelize(arrayData, 10);
  long distinctEstimate = simpleRdd.countApproxDistinct(0.05);
  // Multiplying size by 1.0 makes the quotient a double rather than a long.
  double error = Math.abs((distinctEstimate - size) / (size * 1.0));
  boolean withinTolerance = error <= 0.1;
  assertTrue(withinTolerance);
}
/**
 * Exercises {@code mapPartitions} with a callback that folds a partition into
 * its sum; [1, 2, 3, 4] parallelized with a slice count of 2 is expected to
 * collect to "[3, 7]".
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> source = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> totals = source.mapPartitions(items -> {
    int running = 0;
    for (; items.hasNext(); ) {
      int item = items.next();
      running = running + item;
    }
    // One output element per partition: the sum of its inputs.
    return Collections.singletonList(running).iterator();
  });
  String observed = totals.collect().toString();
  assertEquals("[3, 7]", observed);
}
/**
 * Trains two ridge regression models through the static
 * {@code RidgeRegressionWithSGD.train} entry point on the first half of the
 * generated data, once with a last argument of 0.0 and once with 0.1, and
 * asserts that the second model has the lower prediction error on the second half.
 */
@Test
public void runRidgeRegressionUsingStaticMethods() {
  int numExamples = 50;
  int numFeatures = 20;
  List<LabeledPoint> generated = generateRidgeData(2 * numExamples, numFeatures, 10.0);

  // First half is used for training, second half for validation.
  List<LabeledPoint> trainingHalf = generated.subList(0, numExamples);
  List<LabeledPoint> validationHalf = generated.subList(numExamples, 2 * numExamples);
  JavaRDD<LabeledPoint> trainingRdd = jsc.parallelize(trainingHalf);

  RidgeRegressionModel plainModel =
      RidgeRegressionWithSGD.train(trainingRdd.rdd(), 200, 1.0, 0.0);
  double plainError = predictionError(validationHalf, plainModel);

  RidgeRegressionModel penalizedModel =
      RidgeRegressionWithSGD.train(trainingRdd.rdd(), 200, 1.0, 0.1);
  double penalizedError = predictionError(validationHalf, penalizedModel);

  Assert.assertTrue(penalizedError < plainError);
}
// Closes the enclosing type, whose header lies outside this excerpt.
}
/**
 * Exercises {@code mapPartitionsWithIndex} with a callback that ignores the
 * partition index and folds each partition into its sum; [1, 2, 3, 4]
 * parallelized with a slice count of 2 is expected to collect to "[3, 7]".
 */
@Test
public void mapPartitionsWithIndex() {
  JavaRDD<Integer> source = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> totals = source.mapPartitionsWithIndex((ignoredIndex, items) -> {
    int running = 0;
    for (; items.hasNext(); ) {
      int item = items.next();
      running = running + item;
    }
    // One output element per partition: the sum of its inputs.
    return Collections.singletonList(running).iterator();
  }, false);
  String observed = totals.collect().toString();
  assertEquals("[3, 7]", observed);
}
/**
 * Compares two models produced by the static {@code RidgeRegressionWithSGD.train}
 * method, which differ only in the final argument (0.0 versus 0.1). Both are
 * trained on the first 50 generated points and scored on the next 50; the model
 * trained with 0.1 must have the strictly smaller prediction error.
 */
@Test
public void runRidgeRegressionUsingStaticMethods() {
  int numExamples = 50;
  int numFeatures = 20;
  int totalPoints = 2 * numExamples;
  List<LabeledPoint> allPoints = generateRidgeData(totalPoints, numFeatures, 10.0);

  JavaRDD<LabeledPoint> trainSet = jsc.parallelize(allPoints.subList(0, numExamples));
  List<LabeledPoint> holdOut = allPoints.subList(numExamples, 2 * numExamples);

  // Baseline: last train argument is 0.0.
  RidgeRegressionModel baseline = RidgeRegressionWithSGD.train(trainSet.rdd(), 200, 1.0, 0.0);
  double baselineErr = predictionError(holdOut, baseline);

  // Candidate: identical call except the last argument is 0.1.
  RidgeRegressionModel candidate = RidgeRegressionWithSGD.train(trainSet.rdd(), 200, 1.0, 0.1);
  double candidateErr = predictionError(holdOut, candidate);

  boolean candidateIsBetter = candidateErr < baselineErr;
  Assert.assertTrue(candidateIsBetter);
}
// Closes the enclosing type, whose header lies outside this excerpt.
}
/**
 * Checks that draining {@code toLocalIterator()} of an RDD built from
 * [1, 2, 3, 4] yields a list equal to the original input.
 */
@Test
public void toLocalIterator() {
  List<Integer> expected = Arrays.asList(1, 2, 3, 4);
  JavaRDD<Integer> distributed = sc.parallelize(expected);
  // Materialize the iterator into a list so it can be compared with equals().
  List<Integer> drained = Lists.newArrayList(distributed.toLocalIterator());
  assertEquals(expected, drained);
}
/**
 * Trains a {@code LassoModel} through the static {@code LassoWithSGD.train}
 * method on 10,000 generated points (generator arguments 42 and 0.1), then
 * validates it on a second generated set (generator arguments 17 and 0.1).
 * The count returned by {@code validatePrediction} must exceed four fifths of
 * the number of points.
 */
@Test
public void runLassoUsingStaticMethods() {
  int nPoints = 10000;
  double intercept = 0.0;
  double[] weights = {-1.5, 1.0e-2};

  List<LabeledPoint> trainingPoints =
      LinearDataGenerator.generateLinearInputAsList(intercept, weights, nPoints, 42, 0.1);
  JavaRDD<LabeledPoint> trainingRdd = jsc.parallelize(trainingPoints, 2).cache();

  List<LabeledPoint> validationPoints =
      LinearDataGenerator.generateLinearInputAsList(intercept, weights, nPoints, 17, 0.1);

  LassoModel fitted = LassoWithSGD.train(trainingRdd.rdd(), 100, 1.0, 0.01, 1.0);

  int accurateCount = validatePrediction(validationPoints, fitted);
  double requiredMinimum = nPoints * 4.0 / 5.0;
  Assert.assertTrue(accurateCount > requiredMinimum);
}
/**
 * Sums [-5, -4, -3, -2, -1, 1, 2, 3, 4] (parallelized with a slice count of 10)
 * using {@code treeReduce} at every depth from 1 through 10 and asserts that the
 * result is -5 each time.
 */
@Test
public void treeReduce() {
  JavaRDD<Integer> values = sc.parallelize(Arrays.asList(-5, -4, -3, -2, -1, 1, 2, 3, 4), 10);
  Function2<Integer, Integer, Integer> plus = Integer::sum;
  int depth = 1;
  while (depth <= 10) {
    // Unbox into an int so assertEquals compares primitive values.
    int total = values.treeReduce(plus, depth);
    assertEquals(-5, total);
    depth++;
  }
}
/**
 * Checks that {@code top(2)} on an RDD of [1, 2, 3, 4] returns [4, 3].
 */
@Test
public void top() {
  List<Integer> expected = Arrays.asList(4, 3);
  JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  List<Integer> largestTwo = numbers.top(2);
  assertEquals(expected, largestTwo);
}
/**
 * Forms the cartesian product of a string RDD ("Hello", "World") with a double
 * RDD (1.0, 1.0, 2.0, 3.0, 5.0, 8.0) and asserts that the first pair of the
 * result is ("Hello", 1.0).
 */
@Test
public void cartesian() {
  JavaDoubleRDD doubles = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));
  JavaRDD<String> words = sc.parallelize(Arrays.asList("Hello", "World"));
  JavaPairRDD<String, Double> product = words.cartesian(doubles);
  Tuple2<String, Double> expectedFirst = new Tuple2<>("Hello", 1.0);
  assertEquals(expectedFirst, product.first());
}
/**
 * Resets the {@code foreachCalls} counter, runs {@code foreach} over a
 * two-element RDD with a callback that increments the counter, and asserts the
 * counter ends at 2.
 *
 * <p>NOTE(review): despite the method name, the callback here is a lambda rather
 * than an anonymous class — confirm whether that is intentional.
 */
@Test
public void foreachWithAnonymousClass() {
  foreachCalls = 0;
  JavaRDD<String> words = sc.parallelize(Arrays.asList("Hello", "World"));
  words.foreach(word -> {
    foreachCalls++;
  });
  Assert.assertEquals(2, foreachCalls);
}
/**
 * Applies {@code mapPartitions} with a summing callback to [1, 2, 3, 4]
 * parallelized with a slice count of 2, and asserts the collected output prints
 * as "[3, 7]".
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> sums = numbers.mapPartitions(partition -> {
    int partitionTotal = 0;
    for (; partition.hasNext(); ) {
      int current = partition.next();
      partitionTotal = partitionTotal + current;
    }
    // Wrap the sum so the callback returns an iterator with a single element.
    return Collections.singletonList(partitionTotal).iterator();
  });
  String printed = sums.collect().toString();
  Assert.assertEquals("[3, 7]", printed);
}
/**
 * Verifies per-partition aggregation with {@code mapPartitions}: the callback
 * adds up all integers in a partition, and for [1, 2, 3, 4] parallelized with a
 * slice count of 2 the collected list must print as "[3, 7]".
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> data = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> aggregated = data.mapPartitions(chunk -> {
    int subtotal = 0;
    for (; chunk.hasNext(); ) {
      int n = chunk.next();
      subtotal = subtotal + n;
    }
    // The result for a partition is an iterator over its single subtotal.
    return Collections.singletonList(subtotal).iterator();
  });
  String asText = aggregated.collect().toString();
  Assert.assertEquals("[3, 7]", asText);
}