/**
 * Squares the numbers 1 through 4 with an RDD {@code map} and prints the results
 * joined by commas.
 *
 * @param args optional; {@code args[0]} is used as the Spark master, otherwise
 *             {@code "local"} is used
 * @throws Exception if the Spark job fails
 */
public static void main(String[] args) throws Exception {
  String master = args.length > 0 ? args[0] : "local";
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaRDD<Integer> result = rdd.map(
        new Function<Integer, Integer>() {
          @Override
          public Integer call(Integer x) {
            return x * x;
          }
        });
    System.out.println(StringUtils.join(result.collect(), ","));
  } finally {
    // Always release the context, even when the job throws, so the driver
    // does not keep Spark resources alive after main returns.
    sc.stop();
  }
}
}
/**
 * Sums the elements of each partition with {@code mapPartitions} and checks the
 * collected per-partition totals.
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> partitionSums = rdd.mapPartitions(values -> {
    // Fold the whole partition into a single running total.
    int total = 0;
    for (; values.hasNext(); ) {
      total += values.next();
    }
    return Collections.singletonList(total).iterator();
  });
  String actual = partitionSums.collect().toString();
  assertEquals("[3, 7]", actual);
}
@SuppressWarnings("unchecked") @Test public void objectFilesOfComplexTypes() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.saveAsObjectFile(outputDir); // Try reading the output back as an object file JavaRDD<Tuple2<Integer, String>> readRDD = sc.objectFile(outputDir); assertEquals(pairs, readRDD.collect()); }
@Test public void binaryRecords() throws Exception { // Reusing the wholeText files example byte[] content1 = "spark isn't always easy to use.\n".getBytes(StandardCharsets.UTF_8); int numOfCopies = 10; String tempDirName = tempDir.getAbsolutePath(); File file1 = new File(tempDirName + "/part-00000"); FileOutputStream fos1 = new FileOutputStream(file1); FileChannel channel1 = fos1.getChannel(); for (int i = 0; i < numOfCopies; i++) { ByteBuffer bbuf = ByteBuffer.wrap(content1); channel1.write(bbuf); } channel1.close(); JavaRDD<byte[]> readRDD = sc.binaryRecords(tempDirName, content1.length); assertEquals(numOfCopies,readRDD.count()); List<byte[]> result = readRDD.collect(); for (byte[] res : result) { assertArrayEquals(content1, res); } }
/**
 * Exercises {@code mapToDouble}, {@code mapToPair} and {@code map} on the same
 * source RDD, caching and collecting each result. There are no assertions; the
 * test passes if every job runs.
 */
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

  JavaDoubleRDD asDoubles = rdd.mapToDouble(Integer::doubleValue).cache();
  asDoubles.collect();

  JavaPairRDD<Integer, Integer> asPairs =
    rdd.mapToPair(value -> new Tuple2<>(value, value)).cache();
  asPairs.collect();

  JavaRDD<String> asStrings = rdd.map(Object::toString).cache();
  asStrings.collect();
}
@Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair(x -> Collections.singletonList(x.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.map(Tuple2::swap).collect(); }
/**
 * Sums the elements of each partition with {@code mapPartitionsWithIndex}
 * (the partition index itself is not used) and checks the collected totals.
 */
@Test
public void mapPartitionsWithIndex() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  // NOTE(review): the trailing `false` is presumably the preservesPartitioning
  // flag - confirm against the JavaRDD API.
  JavaRDD<Integer> partitionSums = rdd.mapPartitionsWithIndex((partitionIndex, values) -> {
    int total = 0;
    for (; values.hasNext(); ) {
      total += values.next();
    }
    return Collections.singletonList(total).iterator();
  }, false);
  String actual = partitionSums.collect().toString();
  assertEquals("[3, 7]", actual);
}
@SuppressWarnings("unchecked") @Test public void objectFilesOfComplexTypes() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.saveAsObjectFile(outputDir); // Try reading the output back as an object file JavaRDD<Tuple2<Integer, String>> readRDD = sc.objectFile(outputDir); assertEquals(pairs, readRDD.collect()); }
@Test public void binaryRecords() throws Exception { // Reusing the wholeText files example byte[] content1 = "spark isn't always easy to use.\n".getBytes(StandardCharsets.UTF_8); int numOfCopies = 10; String tempDirName = tempDir.getAbsolutePath(); File file1 = new File(tempDirName + "/part-00000"); FileOutputStream fos1 = new FileOutputStream(file1); FileChannel channel1 = fos1.getChannel(); for (int i = 0; i < numOfCopies; i++) { ByteBuffer bbuf = ByteBuffer.wrap(content1); channel1.write(bbuf); } channel1.close(); JavaRDD<byte[]> readRDD = sc.binaryRecords(tempDirName, content1.length); assertEquals(numOfCopies,readRDD.count()); List<byte[]> result = readRDD.collect(); for (byte[] res : result) { assertArrayEquals(content1, res); } }
/**
 * Runs {@code mapToDouble}, {@code mapToPair} and {@code map} over one source
 * RDD, caching and collecting each derived RDD. The test has no assertions.
 */
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

  // Integer -> double
  JavaDoubleRDD asDoubles = rdd.mapToDouble(Integer::doubleValue).cache();
  asDoubles.collect();

  // Integer -> (Integer, Integer)
  JavaPairRDD<Integer, Integer> asPairs =
    rdd.mapToPair(value -> new Tuple2<>(value, value)).cache();
  asPairs.collect();

  // Integer -> String
  JavaRDD<String> asStrings = rdd.map(Object::toString).cache();
  asStrings.collect();
}
@Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair(x -> Collections.singletonList(x.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.map(Tuple2::swap).collect(); }
/**
 * Squares the numbers 1 through 4, filters out results equal to 1, and prints
 * the remaining values joined by commas.
 *
 * @param args optional; {@code args[0]} is used as the Spark master, otherwise
 *             {@code "local"} is used
 * @throws Exception if the Spark job fails
 */
public static void main(String[] args) throws Exception {
  String master = args.length > 0 ? args[0] : "local";
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaRDD<Integer> squared = rdd.map(
        new Function<Integer, Integer>() {
          @Override
          public Integer call(Integer x) {
            return x * x;
          }
        });
    JavaRDD<Integer> result = squared.filter(
        new Function<Integer, Boolean>() {
          @Override
          public Boolean call(Integer x) {
            return x != 1;
          }
        });
    System.out.println(StringUtils.join(result.collect(), ","));
  } finally {
    // Always release the context, even when the job throws, so the driver
    // does not keep Spark resources alive after main returns.
    sc.stop();
  }
}
}
/**
 * Reduces every partition to the sum of its elements via {@code mapPartitions}
 * and asserts on the string form of the collected sums.
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> partitionSums = rdd.mapPartitions(elements -> {
    int runningSum = 0;
    while (elements.hasNext()) {
      int next = elements.next();
      runningSum += next;
    }
    return Collections.singletonList(runningSum).iterator();
  });
  String collected = partitionSums.collect().toString();
  Assert.assertEquals("[3, 7]", collected);
}
@SuppressWarnings("unchecked") @Test public void objectFilesOfComplexTypes() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.saveAsObjectFile(outputDir); // Try reading the output back as an object file JavaRDD<Tuple2<Integer, String>> readRDD = sc.objectFile(outputDir); assertEquals(pairs, readRDD.collect()); }
@Test public void binaryRecords() throws Exception { // Reusing the wholeText files example byte[] content1 = "spark isn't always easy to use.\n".getBytes(StandardCharsets.UTF_8); int numOfCopies = 10; String tempDirName = tempDir.getAbsolutePath(); File file1 = new File(tempDirName + "/part-00000"); FileOutputStream fos1 = new FileOutputStream(file1); FileChannel channel1 = fos1.getChannel(); for (int i = 0; i < numOfCopies; i++) { ByteBuffer bbuf = ByteBuffer.wrap(content1); channel1.write(bbuf); } channel1.close(); JavaRDD<byte[]> readRDD = sc.binaryRecords(tempDirName, content1.length); assertEquals(numOfCopies,readRDD.count()); List<byte[]> result = readRDD.collect(); for (byte[] res : result) { assertArrayEquals(content1, res); } }
/**
 * Applies three kinds of map ({@code mapToDouble}, {@code mapToPair},
 * {@code map}) to the same RDD and collects each cached result. Nothing is
 * asserted.
 */
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

  JavaDoubleRDD doubleView = rdd.mapToDouble(Integer::doubleValue).cache();
  doubleView.collect();

  JavaPairRDD<Integer, Integer> pairView =
    rdd.mapToPair(n -> new Tuple2<>(n, n)).cache();
  pairView.collect();

  JavaRDD<String> stringView = rdd.map(Object::toString).cache();
  stringView.collect();
}
@Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair(x -> Collections.singletonList(x.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.map(Tuple2::swap).collect(); }
// Builds an RDD of four strings and prints a comma-joined collect() of `result`.
// NOTE(review): this fragment looks truncated - the call to rdd.mapPartitions(
// has no function argument and is never closed before the println statement.
// TODO: restore the missing partition function before relying on this snippet.
JavaRDD<String> rdd = sc.parallelize( Arrays.asList("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB")); JavaRDD<String> result = rdd.mapPartitions( System.out.println(StringUtils.join(result.collect(), ","));
/**
 * Collapses each partition to a single sum with {@code mapPartitions} and
 * compares the string form of the collected sums with the expected value.
 */
@Test
public void mapPartitions() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
  JavaRDD<Integer> partitionSums = rdd.mapPartitions(partition -> {
    int acc = 0;
    while (partition.hasNext()) {
      acc = acc + partition.next();
    }
    return Collections.singletonList(acc).iterator();
  });
  String summed = partitionSums.collect().toString();
  assertEquals("[3, 7]", summed);
}
/**
 * Saves tuples as a Hadoop sequence file of ({@code IntWritable}, {@code Text})
 * records, reads the file back with {@code hadoopFile}, and compares the string
 * form of the result with the string form of the input.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
    new Tuple2<>(1, "a"),
    new Tuple2<>(2, "aa"),
    new Tuple2<>(3, "aaa")
  );
  String outputDir = new File(tempDir, "output").getAbsolutePath();
  JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);

  // Convert to Writable key/value types before saving.
  JavaPairRDD<IntWritable, Text> writables = rdd.mapToPair(
    entry -> new Tuple2<>(new IntWritable(entry._1()), new Text(entry._2())));
  writables.saveAsHadoopFile(
    outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

  JavaPairRDD<IntWritable, Text> output = sc.hadoopFile(
    outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);

  String expected = pairs.toString();
  String actual = output.map(Tuple2::toString).collect().toString();
  assertEquals(expected, actual);
}