org.apache.spark.api.java.JavaRDD.collect java code examples

Refine search

 /**
  * Squares the integers 1 through 4 with an RDD {@code map} and prints the collected
  * values as a single comma-separated line.
  *
  * @param args optional; when present, {@code args[0]} is used as the Spark master,
  *             otherwise {@code "local"} is used
  * @throws Exception kept from the original signature so existing callers still compile
  */
 public static void main(String[] args) throws Exception {
  String master = args.length > 0 ? args[0] : "local";
  JavaSparkContext sc = new JavaSparkContext(
   master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
   JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
   JavaRDD<Integer> result = rdd.map(
    new Function<Integer, Integer>() {
     @Override
     public Integer call(Integer x) {
      return x * x;
     }
    });
   System.out.println(StringUtils.join(result.collect(), ","));
  } finally {
   // Stop the context even when the job throws, so it is not left running.
   sc.stop();
  }
 }
}

/**
 * Sums each partition with {@code mapPartitions}. The values 1..4 are parallelized
 * into two slices and the collected sums are expected to print as {@code [3, 7]}.
 */
@Test
public void mapPartitions() {
 JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
 JavaRDD<Integer> sums = numbers.mapPartitions(values -> {
  // Fold the whole partition into one running total.
  int total = 0;
  for (; values.hasNext(); ) {
   total += values.next();
  }
  return Collections.singletonList(total).iterator();
 });
 String rendered = sums.collect().toString();
 assertEquals("[3, 7]", rendered);
}

/**
 * Saves an RDD of (Integer, String) tuples as an object file under
 * {@code tempDir/output}, reads it back, and checks the collected tuples
 * equal the original list.
 */
@Test
@SuppressWarnings("unchecked")
public void objectFilesOfComplexTypes() {
 List<Tuple2<Integer, String>> expected = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 File target = new File(tempDir, "output");
 String targetPath = target.getAbsolutePath();
 // Write the tuples out ...
 JavaPairRDD<Integer, String> source = sc.parallelizePairs(expected);
 source.saveAsObjectFile(targetPath);
 // ... then load the same path again as an object file.
 JavaRDD<Tuple2<Integer, String>> restored = sc.objectFile(targetPath);
 assertEquals(expected, restored.collect());
}

/**
 * Writes ten copies of a fixed-length record to {@code part-00000} in the temp
 * directory, reads the directory back with {@code binaryRecords} using the record
 * length, and checks both the record count and the bytes of every record.
 *
 * @throws Exception if the fixture file cannot be created or written
 */
@Test
public void binaryRecords() throws Exception {
 // Reusing the wholeText files example
 byte[] content1 = "spark isn't always easy to use.\n".getBytes(StandardCharsets.UTF_8);
 int numOfCopies = 10;
 String tempDirName = tempDir.getAbsolutePath();
 File file1 = new File(tempDirName + "/part-00000");
 // try-with-resources closes the channel and the stream even if a write fails.
 try (FileOutputStream fos1 = new FileOutputStream(file1);
      FileChannel channel1 = fos1.getChannel()) {
  for (int i = 0; i < numOfCopies; i++) {
   ByteBuffer bbuf = ByteBuffer.wrap(content1);
   // A single write() is allowed to consume only part of the buffer, so drain it.
   while (bbuf.hasRemaining()) {
    channel1.write(bbuf);
   }
  }
 }
 JavaRDD<byte[]> readRDD = sc.binaryRecords(tempDirName, content1.length);
 assertEquals(numOfCopies, readRDD.count());
 List<byte[]> result = readRDD.collect();
 for (byte[] res : result) {
  assertArrayEquals(content1, res);
 }
}

/**
 * Exercises {@code mapToDouble}, {@code mapToPair} and {@code map} on the same
 * five-element RDD; each mapped RDD is cached and then collected. The test makes
 * no assertions on the collected values.
 */
@Test
public void map() {
 JavaRDD<Integer> source = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

 JavaDoubleRDD asDoubles = source.mapToDouble(value -> 1.0 * value).cache();
 asDoubles.collect();

 JavaPairRDD<Integer, Integer> selfPairs =
  source.mapToPair(value -> new Tuple2<>(value, value)).cache();
 selfPairs.collect();

 JavaRDD<String> asStrings = source.map(value -> value.toString()).cache();
 asStrings.collect();
}

/**
 * Swaps key and value of a pair RDD in two ways, via {@code flatMapToPair} and via
 * {@code map}, and collects both results. No assertions are made on the output.
 */
@Test
public void mapsFromPairsToPairs() {
 List<Tuple2<Integer, String>> input = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> keyed = sc.parallelizePairs(input);
 // Regression test for SPARK-668:
 JavaPairRDD<String, Integer> flipped = keyed.flatMapToPair(
  entry -> Collections.singletonList(entry.swap()).iterator());
 flipped.collect();
 // There was never a bug here, but it's worth testing:
 keyed.map(entry -> entry.swap()).collect();
}

/**
 * Sums each partition with {@code mapPartitionsWithIndex}. The partition index is
 * received but not used, and {@code false} is passed as the second argument. With
 * 1..4 in two slices the collected sums are expected to print as {@code [3, 7]}.
 */
@Test
public void mapPartitionsWithIndex() {
 JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
 JavaRDD<Integer> sums = numbers.mapPartitionsWithIndex((partitionId, values) -> {
  int total = 0;
  for (; values.hasNext(); ) {
   total += values.next();
  }
  return Collections.singletonList(total).iterator();
 }, false);
 String rendered = sums.collect().toString();
 assertEquals("[3, 7]", rendered);
}

/**
 * Round-trips (Integer, String) tuples through an object file located at
 * {@code tempDir/output} and verifies the tuples read back match the input list.
 */
@Test
@SuppressWarnings("unchecked")
public void objectFilesOfComplexTypes() {
 List<Tuple2<Integer, String>> samples = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 String location = new File(tempDir, "output").getAbsolutePath();
 JavaPairRDD<Integer, String> written = sc.parallelizePairs(samples);
 written.saveAsObjectFile(location);
 // Load what was just written and compare it with the in-memory list.
 JavaRDD<Tuple2<Integer, String>> loaded = sc.objectFile(location);
 assertEquals(samples, loaded.collect());
}

/**
 * Writes ten copies of a fixed-length record to {@code part-00000} in the temp
 * directory, reads the directory back with {@code binaryRecords} using the record
 * length, and checks both the record count and the bytes of every record.
 *
 * @throws Exception if the fixture file cannot be created or written
 */
@Test
public void binaryRecords() throws Exception {
 // Reusing the wholeText files example
 byte[] content1 = "spark isn't always easy to use.\n".getBytes(StandardCharsets.UTF_8);
 int numOfCopies = 10;
 String tempDirName = tempDir.getAbsolutePath();
 File file1 = new File(tempDirName + "/part-00000");
 // try-with-resources closes the channel and the stream even if a write fails.
 try (FileOutputStream fos1 = new FileOutputStream(file1);
      FileChannel channel1 = fos1.getChannel()) {
  for (int i = 0; i < numOfCopies; i++) {
   ByteBuffer bbuf = ByteBuffer.wrap(content1);
   // A single write() is allowed to consume only part of the buffer, so drain it.
   while (bbuf.hasRemaining()) {
    channel1.write(bbuf);
   }
  }
 }
 JavaRDD<byte[]> readRDD = sc.binaryRecords(tempDirName, content1.length);
 assertEquals(numOfCopies, readRDD.count());
 List<byte[]> result = readRDD.collect();
 for (byte[] res : result) {
  assertArrayEquals(content1, res);
 }
}

/**
 * Runs {@code mapToDouble}, {@code mapToPair} and {@code map} over one RDD of the
 * integers 1..5, caching and collecting each result. Nothing is asserted about
 * the collected data.
 */
@Test
public void map() {
 JavaRDD<Integer> source = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

 JavaDoubleRDD asDoubles = source.mapToDouble(value -> value.doubleValue()).cache();
 asDoubles.collect();

 JavaPairRDD<Integer, Integer> selfPairs =
  source.mapToPair(value -> new Tuple2<>(value, value)).cache();
 selfPairs.collect();

 JavaRDD<String> asStrings = source.map(value -> value.toString()).cache();
 asStrings.collect();
}

/**
 * Builds a pair RDD from three (Integer, String) tuples and swaps each tuple once
 * through {@code flatMapToPair} and once through {@code map}; both results are
 * collected without further assertions.
 */
@Test
public void mapsFromPairsToPairs() {
 List<Tuple2<Integer, String>> tuples = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> byNumber = sc.parallelizePairs(tuples);
 // Regression test for SPARK-668:
 JavaPairRDD<String, Integer> byText = byNumber.flatMapToPair(
  tuple -> Collections.singletonList(tuple.swap()).iterator());
 byText.collect();
 // There was never a bug here, but it's worth testing:
 byNumber.map(tuple -> tuple.swap()).collect();
}

 /**
  * Squares the integers 1 through 4, drops every squared value equal to 1, and prints
  * the remaining values as a single comma-separated line.
  *
  * @param args optional; when present, {@code args[0]} is used as the Spark master,
  *             otherwise {@code "local"} is used
  * @throws Exception kept from the original signature so existing callers still compile
  */
 public static void main(String[] args) throws Exception {
  String master = args.length > 0 ? args[0] : "local";
  JavaSparkContext sc = new JavaSparkContext(
   master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  try {
   JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
   JavaRDD<Integer> squared = rdd.map(
    new Function<Integer, Integer>() {
     @Override
     public Integer call(Integer x) {
      return x * x;
     }
    });
   JavaRDD<Integer> result = squared.filter(
    new Function<Integer, Boolean>() {
     @Override
     public Boolean call(Integer x) {
      return x != 1;
     }
    });
   System.out.println(StringUtils.join(result.collect(), ","));
  } finally {
   // Stop the context even when the job throws, so it is not left running.
   sc.stop();
  }
 }
}

/**
 * Reduces each of the two partitions of the RDD 1..4 to its sum using
 * {@code mapPartitions}; the collected list is expected to print as {@code [3, 7]}.
 */
@Test
public void mapPartitions() {
 JavaRDD<Integer> numbers = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
 JavaRDD<Integer> sums = numbers.mapPartitions(values -> {
  // Accumulate every element of this partition.
  int total = 0;
  for (; values.hasNext(); ) {
   total += values.next();
  }
  return Collections.singletonList(total).iterator();
 });
 String rendered = sums.collect().toString();
 Assert.assertEquals("[3, 7]", rendered);
}

/**
 * Persists three (Integer, String) tuples with {@code saveAsObjectFile} to
 * {@code tempDir/output}, reloads them with {@code objectFile}, and asserts the
 * reloaded tuples equal the originals.
 */
@Test
@SuppressWarnings("unchecked")
public void objectFilesOfComplexTypes() {
 List<Tuple2<Integer, String>> original = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 File outputFolder = new File(tempDir, "output");
 String outputPath = outputFolder.getAbsolutePath();
 JavaPairRDD<Integer, String> pairRdd = sc.parallelizePairs(original);
 pairRdd.saveAsObjectFile(outputPath);
 // Read the saved output back as an object file.
 JavaRDD<Tuple2<Integer, String>> roundTripped = sc.objectFile(outputPath);
 assertEquals(original, roundTripped.collect());
}

/**
 * Writes ten copies of a fixed-length record to {@code part-00000} in the temp
 * directory, reads the directory back with {@code binaryRecords} using the record
 * length, and checks both the record count and the bytes of every record.
 *
 * @throws Exception if the fixture file cannot be created or written
 */
@Test
public void binaryRecords() throws Exception {
 // Reusing the wholeText files example
 byte[] content1 = "spark isn't always easy to use.\n".getBytes(StandardCharsets.UTF_8);
 int numOfCopies = 10;
 String tempDirName = tempDir.getAbsolutePath();
 File file1 = new File(tempDirName + "/part-00000");
 // try-with-resources closes the channel and the stream even if a write fails.
 try (FileOutputStream fos1 = new FileOutputStream(file1);
      FileChannel channel1 = fos1.getChannel()) {
  for (int i = 0; i < numOfCopies; i++) {
   ByteBuffer bbuf = ByteBuffer.wrap(content1);
   // A single write() is allowed to consume only part of the buffer, so drain it.
   while (bbuf.hasRemaining()) {
    channel1.write(bbuf);
   }
  }
 }
 JavaRDD<byte[]> readRDD = sc.binaryRecords(tempDirName, content1.length);
 assertEquals(numOfCopies, readRDD.count());
 List<byte[]> result = readRDD.collect();
 for (byte[] res : result) {
  assertArrayEquals(content1, res);
 }
}

/**
 * Applies {@code mapToDouble}, {@code mapToPair} and {@code map} to an RDD holding
 * 1..5. Each mapped RDD is cached and collected; the test asserts nothing about
 * the collected contents.
 */
@Test
public void map() {
 JavaRDD<Integer> ints = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));

 JavaDoubleRDD doubleView = ints.mapToDouble(n -> n.doubleValue()).cache();
 doubleView.collect();

 JavaPairRDD<Integer, Integer> mirrored =
  ints.mapToPair(n -> new Tuple2<>(n, n)).cache();
 mirrored.collect();

 JavaRDD<String> textView = ints.map(n -> n.toString()).cache();
 textView.collect();
}

/**
 * Swaps keys and values of a three-element pair RDD, first with
 * {@code flatMapToPair} and then with {@code map}, collecting each result.
 * The collected values are not asserted on.
 */
@Test
public void mapsFromPairsToPairs() {
 List<Tuple2<Integer, String>> seed = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> original = sc.parallelizePairs(seed);
 // Regression test for SPARK-668:
 JavaPairRDD<String, Integer> reversed = original.flatMapToPair(
  kv -> Collections.singletonList(kv.swap()).iterator());
 reversed.collect();
 // There was never a bug here, but it's worth testing:
 original.map(kv -> kv.swap()).collect();
}

// NOTE(review): this excerpt appears truncated. The argument list of
// mapPartitions(...) is missing, so the snippet is incomplete as shown. It builds an
// RDD of four strings and prints the collected result joined with commas.
JavaRDD<String> rdd = sc.parallelize(
 Arrays.asList("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"));
JavaRDD<String> result = rdd.mapPartitions(
System.out.println(StringUtils.join(result.collect(), ","));

/**
 * Verifies per-partition summation through {@code mapPartitions}: 1..4 split into
 * two slices should collect to a list printing as {@code [3, 7]}.
 */
@Test
public void mapPartitions() {
 JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2);
 JavaRDD<Integer> perPartition = input.mapPartitions(elements -> {
  int running = 0;
  for (; elements.hasNext(); ) {
   running += elements.next();
  }
  return Collections.singletonList(running).iterator();
 });
 String printed = perPartition.collect().toString();
 assertEquals("[3, 7]", printed);
}

/**
 * Converts (Integer, String) tuples to (IntWritable, Text), saves them with
 * {@code saveAsHadoopFile} using {@code SequenceFileOutputFormat}, reads the same
 * path back with {@code hadoopFile}, and compares the string form of the records
 * with the string form of the input list.
 */
@Test
@SuppressWarnings("unchecked")
public void hadoopFile() {
 List<Tuple2<Integer, String>> expected = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 String outputPath = new File(tempDir, "output").getAbsolutePath();
 JavaPairRDD<Integer, String> source = sc.parallelizePairs(expected);
 // Wrap keys and values in Writable types before saving.
 JavaPairRDD<IntWritable, Text> writables = source.mapToPair(
  kv -> new Tuple2<>(new IntWritable(kv._1()), new Text(kv._2())));
 writables.saveAsHadoopFile(
  outputPath, IntWritable.class, Text.class, SequenceFileOutputFormat.class);
 JavaPairRDD<IntWritable, Text> reloaded = sc.hadoopFile(
  outputPath, SequenceFileInputFormat.class, IntWritable.class, Text.class);
 String actual = reloaded.map(record -> record.toString()).collect().toString();
 assertEquals(expected.toString(), actual);
}

Popular methods of JavaRDD

Popular in Java

Making http requests using okhttp
setRequestProperty (URLConnection)
setScale (BigDecimal)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
Permission (java.security)
Legacy security code; do not use.
SecureRandom (java.security)
This class generates cryptographically secure pseudo-random numbers. It is best to invoke SecureRand
TreeSet (java.util)
TreeSet is an implementation of SortedSet. All optional operations (adding and removing) are support
ConcurrentHashMap (java.util.concurrent)
A plug-in replacement for JDK1.5 java.util.concurrent.ConcurrentHashMap. This version is based on or
JPanel (javax.swing)
Runner (org.openjdk.jmh.runner)
Top plugins for WebStorm

How to use the collect method in org.apache.spark.api.java.JavaRDD

Best Java code snippets using org.apache.spark.api.java.JavaRDD.collect (Showing top 20 results out of 567)

Refine search

How to use
collect
method
in
org.apache.spark.api.java.JavaRDD