/**
 * Builds a three-element pair RDD, converts it with {@code ConvertToWritableTypes} and
 * saves the result through {@code SequenceFileOutputFormat}.
 *
 * @param args exactly two values: the Spark master, then the output path
 * @throws Exception if {@code args} does not hold exactly two values, or if anything
 *     called from here fails
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
  }
  String master = args[0];
  String fileName = args[1];

  // NOTE(review): the app name says "load" while the usage text says "Save" --
  // presumably a copy-paste leftover; left unchanged here, confirm before renaming.
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
    // Diamond forms instead of raw ArrayList/Tuple2, so the element types are checked.
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("coffee", 1));
    input.add(new Tuple2<>("coffee", 2));
    input.add(new Tuple2<>("pandas", 3));

    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
    result.saveAsHadoopFile(
        fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
  } finally {
    // Release the context even when the job fails.
    sc.stop();
  }
}
}
/**
 * Checks that the map produced by {@code collectAsMap()} can be written with Java
 * serialization and read back with its single entry intact.
 */
@SuppressWarnings("unchecked")
@Test
public void collectAsMapAndSerialize() throws Exception {
  JavaPairRDD<String,Integer> pairRdd =
      sc.parallelizePairs(Arrays.asList(new Tuple2<>("foo", 1)));
  Map<String,Integer> collected = pairRdd.collectAsMap();

  // Serialize into an in-memory buffer.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  ObjectOutputStream objectOut = new ObjectOutputStream(buffer);
  objectOut.writeObject(collected);

  // Read the same bytes back and inspect the restored entry.
  ObjectInputStream objectIn =
      new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray()));
  Map<String,Integer> restored = (Map<String,Integer>) objectIn.readObject();
  assertEquals(1, restored.get("foo").intValue());
}
/**
 * Round-trips the result of {@code collectAsMap()} through an object stream and asserts
 * that key {@code "foo"} still maps to 1 afterwards.
 */
@SuppressWarnings("unchecked")
@Test
public void collectAsMapAndSerialize() throws Exception {
  JavaPairRDD<String,Integer> source =
      sc.parallelizePairs(Arrays.asList(new Tuple2<>("foo", 1)));
  Map<String,Integer> asMap = source.collectAsMap();

  ByteArrayOutputStream sink = new ByteArrayOutputStream();
  new ObjectOutputStream(sink).writeObject(asMap);
  byte[] serialized = sink.toByteArray();

  ByteArrayInputStream rawIn = new ByteArrayInputStream(serialized);
  Object readBack = new ObjectInputStream(rawIn).readObject();
  // The cast is unchecked; the annotation on the method covers it.
  Map<String,Integer> roundTripped = (Map<String,Integer>) readBack;

  assertEquals(1, roundTripped.get("foo").intValue());
}
/**
 * Serializes the map returned by {@code collectAsMap()} to a byte array, deserializes it,
 * and verifies the value stored under {@code "foo"}.
 */
@SuppressWarnings("unchecked")
@Test
public void collectAsMapAndSerialize() throws Exception {
  JavaPairRDD<String,Integer> singlePair =
      sc.parallelizePairs(Arrays.asList(new Tuple2<>("foo", 1)));
  Map<String,Integer> original = singlePair.collectAsMap();

  // Write side.
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  ObjectOutputStream writer = new ObjectOutputStream(out);
  writer.writeObject(original);

  // Read side, fed from the bytes produced above.
  ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
  ObjectInputStream reader = new ObjectInputStream(in);
  Map<String,Integer> copy = (Map<String,Integer>) reader.readObject();

  Integer fooValue = copy.get("foo");
  assertEquals(1, fooValue.intValue());
}
/**
 * Exercises {@code lookup} on a pair RDD with a repeated key, both directly and after
 * {@code groupByKey()}.
 */
@SuppressWarnings("unchecked")
@Test
public void lookup() {
  JavaPairRDD<String, String> categories = sc.parallelizePairs(Arrays.asList(
      new Tuple2<>("Apples", "Fruit"),
      new Tuple2<>("Oranges", "Fruit"),
      new Tuple2<>("Oranges", "Citrus")));

  String key = "Oranges";

  // Two tuples carry the key, so a plain lookup is expected to return two values.
  int directMatches = categories.lookup(key).size();
  assertEquals(2, directMatches);

  // After grouping, the first lookup result is expected to iterate over both values.
  JavaPairRDD<String, Iterable<String>> grouped = categories.groupByKey();
  int groupedValues = Iterables.size(grouped.lookup(key).get(0));
  assertEquals(2, groupedValues);
}
/**
 * Verifies the number of values {@code lookup("Oranges")} yields on the raw pair RDD and
 * on its {@code groupByKey()} counterpart.
 */
@SuppressWarnings("unchecked")
@Test
public void lookup() {
  JavaPairRDD<String, String> fruitTags = sc.parallelizePairs(
      Arrays.asList(
          new Tuple2<>("Apples", "Fruit"),
          new Tuple2<>("Oranges", "Fruit"),
          new Tuple2<>("Oranges", "Citrus")
      ));

  // Ungrouped: one list element per matching tuple.
  assertEquals(2, fruitTags.lookup("Oranges").size());

  // Grouped: take element 0 of the lookup result and count what it iterates over.
  Iterable<String> orangeGroup = fruitTags.groupByKey().lookup("Oranges").get(0);
  assertEquals(2, Iterables.size(orangeGroup));
}
/**
 * Looks up a key that occurs twice, first on the plain pair RDD and then on the grouped
 * RDD, asserting a count of two in both cases.
 */
@SuppressWarnings("unchecked")
@Test
public void lookup() {
  Tuple2<String, String> apples = new Tuple2<>("Apples", "Fruit");
  Tuple2<String, String> orangesAsFruit = new Tuple2<>("Oranges", "Fruit");
  Tuple2<String, String> orangesAsCitrus = new Tuple2<>("Oranges", "Citrus");
  JavaPairRDD<String, String> pairs =
      sc.parallelizePairs(Arrays.asList(apples, orangesAsFruit, orangesAsCitrus));

  int plainCount = pairs.lookup("Oranges").size();
  assertEquals(2, plainCount);

  // get(0) picks the first lookup result of the grouped RDD; its size is then counted.
  int groupedCount = Iterables.size(pairs.groupByKey().lookup("Oranges").get(0));
  assertEquals(2, groupedCount);
}
@SuppressWarnings("unchecked") @Test public void objectFilesOfComplexTypes() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.saveAsObjectFile(outputDir); // Try reading the output back as an object file JavaRDD<Tuple2<Integer, String>> readRDD = sc.objectFile(outputDir); assertEquals(pairs, readRDD.collect()); }
@SuppressWarnings("unchecked") @Test public void objectFilesOfComplexTypes() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.saveAsObjectFile(outputDir); // Try reading the output back as an object file JavaRDD<Tuple2<Integer, String>> readRDD = sc.objectFile(outputDir); assertEquals(pairs, readRDD.collect()); }
/**
 * Folds the values of each key with addition starting from 0 and checks the per-key
 * totals for keys 1, 2 and 3.
 */
@SuppressWarnings("unchecked")
@Test
public void foldByKey() {
  List<Tuple2<Integer, Integer>> input = Arrays.asList(
      new Tuple2<>(2, 1),
      new Tuple2<>(2, 1),
      new Tuple2<>(1, 1),
      new Tuple2<>(3, 2),
      new Tuple2<>(3, 1));
  JavaPairRDD<Integer, Integer> pairRdd = sc.parallelizePairs(input);
  JavaPairRDD<Integer, Integer> totals = pairRdd.foldByKey(0, Integer::sum);

  // keys[i] is expected to fold to expectedTotals[i].
  int[] keys = {1, 2, 3};
  int[] expectedTotals = {1, 2, 3};
  for (int i = 0; i < keys.length; i++) {
    assertEquals(expectedTotals[i], totals.lookup(keys[i]).get(0).intValue());
  }
}
/**
 * Runs {@code foldByKey} with zero value 0 and an adding function, then asserts the
 * folded value of each of the three keys.
 */
@SuppressWarnings("unchecked")
@Test
public void foldByKey() {
  List<Tuple2<Integer, Integer>> keyedOnes = Arrays.asList(
      new Tuple2<>(2, 1), new Tuple2<>(2, 1),
      new Tuple2<>(1, 1),
      new Tuple2<>(3, 2), new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> source = sc.parallelizePairs(keyedOnes);
  JavaPairRDD<Integer, Integer> folded = source.foldByKey(0, (left, right) -> left + right);

  // Key 1 has a single value of 1.
  int sumForOne = folded.lookup(1).get(0).intValue();
  assertEquals(1, sumForOne);

  // Key 2 has 1 + 1.
  int sumForTwo = folded.lookup(2).get(0).intValue();
  assertEquals(2, sumForTwo);

  // Key 3 has 2 + 1.
  int sumForThree = folded.lookup(3).get(0).intValue();
  assertEquals(3, sumForThree);
}
/**
 * Sums values per key via {@code foldByKey(0, ...)} and verifies the first looked-up
 * result for keys 1, 2 and 3.
 */
@SuppressWarnings("unchecked")
@Test
public void foldByKey() {
  Tuple2<Integer, Integer> twoA = new Tuple2<>(2, 1);
  Tuple2<Integer, Integer> twoB = new Tuple2<>(2, 1);
  Tuple2<Integer, Integer> one = new Tuple2<>(1, 1);
  Tuple2<Integer, Integer> threeA = new Tuple2<>(3, 2);
  Tuple2<Integer, Integer> threeB = new Tuple2<>(3, 1);
  List<Tuple2<Integer, Integer>> entries = Arrays.asList(twoA, twoB, one, threeA, threeB);

  JavaPairRDD<Integer, Integer> perKeySums =
      sc.parallelizePairs(entries).foldByKey(0, (acc, next) -> acc + next);

  assertEquals(1, perKeySums.lookup(1).get(0).intValue());
  assertEquals(2, perKeySums.lookup(2).get(0).intValue());
  assertEquals(3, perKeySums.lookup(3).get(0).intValue());
}
/**
 * Folds the values of each key with addition starting from 0 and checks the resulting
 * totals through {@code Assert.assertEquals}.
 */
@Test
public void foldByKey() {
  List<Tuple2<Integer, Integer>> input = Arrays.asList(
      new Tuple2<>(2, 1),
      new Tuple2<>(2, 1),
      new Tuple2<>(1, 1),
      new Tuple2<>(3, 2),
      new Tuple2<>(3, 1));
  JavaPairRDD<Integer, Integer> pairRdd = sc.parallelizePairs(input);
  JavaPairRDD<Integer, Integer> totals = pairRdd.foldByKey(0, Integer::sum);

  // keys[i] is expected to fold to expectedTotals[i].
  int[] keys = {1, 2, 3};
  int[] expectedTotals = {1, 2, 3};
  for (int i = 0; i < keys.length; i++) {
    Assert.assertEquals(expectedTotals[i], totals.lookup(keys[i]).get(0).intValue());
  }
}
@SuppressWarnings("unchecked") @Test public void objectFilesOfComplexTypes() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.saveAsObjectFile(outputDir); // Try reading the output back as an object file JavaRDD<Tuple2<Integer, String>> readRDD = sc.objectFile(outputDir); assertEquals(pairs, readRDD.collect()); }
/**
 * Runs {@code foldByKey} with zero value 0 and an adding function, then asserts the
 * folded value of keys 1, 2 and 3 with {@code Assert.assertEquals}.
 */
@Test
public void foldByKey() {
  List<Tuple2<Integer, Integer>> keyedValues = Arrays.asList(
      new Tuple2<>(2, 1), new Tuple2<>(2, 1),
      new Tuple2<>(1, 1),
      new Tuple2<>(3, 2), new Tuple2<>(3, 1)
  );
  JavaPairRDD<Integer, Integer> source = sc.parallelizePairs(keyedValues);
  JavaPairRDD<Integer, Integer> folded = source.foldByKey(0, (left, right) -> left + right);

  // Key 1 has a single value of 1.
  int sumForOne = folded.lookup(1).get(0).intValue();
  Assert.assertEquals(1, sumForOne);

  // Key 2 has 1 + 1.
  int sumForTwo = folded.lookup(2).get(0).intValue();
  Assert.assertEquals(2, sumForTwo);

  // Key 3 has 2 + 1.
  int sumForThree = folded.lookup(3).get(0).intValue();
  Assert.assertEquals(3, sumForThree);
}
/**
 * Writes a pair RDD as {@code IntWritable}/{@code Text} records through
 * {@code SequenceFileOutputFormat}, reads it back with {@code SequenceFileInputFormat},
 * and compares the string renderings of input and output.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
  File target = new File(tempDir, "output");
  String outputDir = target.getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);

  // Wrap keys and values in writable types before saving.
  JavaPairRDD<IntWritable, Text> writables =
      rdd.mapToPair(kv -> new Tuple2<>(new IntWritable(kv._1()), new Text(kv._2())));
  writables.saveAsHadoopFile(
      outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

  JavaPairRDD<IntWritable, Text> readBack = sc.hadoopFile(
      outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);

  // The two sides hold different element types, so the comparison is on toString() output.
  String expected = pairs.toString();
  String actual = readBack.map(Tuple2::toString).collect().toString();
  assertEquals(expected, actual);
}
/**
 * Same round trip as a plain sequence-file save, but passes {@code DefaultCodec.class} to
 * {@code saveAsHadoopFile} and writes under {@code tempDir/output_compressed}; the
 * read-back tuples are compared to the input by their string renderings.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
  File target = new File(tempDir, "output_compressed");
  String outputDir = target.getAbsolutePath();
  List<Tuple2<Integer, String>> pairs = Arrays.asList(
      new Tuple2<>(1, "a"),
      new Tuple2<>(2, "aa"),
      new Tuple2<>(3, "aaa"));
  JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);

  // Wrap keys and values in writable types, then save with the codec argument.
  JavaPairRDD<IntWritable, Text> writables =
      rdd.mapToPair(kv -> new Tuple2<>(new IntWritable(kv._1()), new Text(kv._2())));
  writables.saveAsHadoopFile(
      outputDir,
      IntWritable.class,
      Text.class,
      SequenceFileOutputFormat.class,
      DefaultCodec.class);

  JavaPairRDD<IntWritable, Text> readBack = sc.hadoopFile(
      outputDir, SequenceFileInputFormat.class, IntWritable.class, Text.class);

  // The two sides hold different element types, so the comparison is on toString() output.
  String expected = pairs.toString();
  String actual = readBack.map(Tuple2::toString).collect().toString();
  assertEquals(expected, actual);
}
@Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair(x -> Collections.singletonList(x.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.map(Tuple2::swap).collect(); }
@Test public void mapsFromPairsToPairs() { List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); // Regression test for SPARK-668: JavaPairRDD<String, Integer> swapped = pairRDD.flatMapToPair(x -> Collections.singletonList(x.swap()).iterator()); swapped.collect(); // There was never a bug here, but it's worth testing: pairRDD.map(Tuple2::swap).collect(); }
/**
 * Creates a plain, a double and a pair RDD with an explicit numeric second argument and
 * asserts that {@code getNumPartitions()} reports that same number for each.
 */
@Test
public void getNumPartitions(){
  // Plain RDD: eight integers, 3 requested.
  JavaRDD<Integer> intRdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8), 3);

  // Double RDD: four doubles, 2 requested.
  JavaDoubleRDD doubleRdd = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0), 2);

  // Pair RDD: three tuples, 2 requested.
  JavaPairRDD<String, Integer> pairRdd = sc.parallelizePairs(
      Arrays.asList(
          new Tuple2<>("a", 1),
          new Tuple2<>("aa", 2),
          new Tuple2<>("aaa", 3)
      ),
      2);

  int intPartitions = intRdd.getNumPartitions();
  int doublePartitions = doubleRdd.getNumPartitions();
  int pairPartitions = pairRdd.getNumPartitions();

  assertEquals(3, intPartitions);
  assertEquals(2, doublePartitions);
  assertEquals(2, pairPartitions);
}