org.apache.spark.api.java.JavaPairRDD.map java code examples

/**
 * Writes every (index, vector) entry of {@code features} as one JSON-joined text line,
 * gzip-compressed, under {@code path}. The integer index of each entry is replaced by
 * the ID found for it in the broadcast index-to-ID map.
 *
 * @param features pairs of integer index and feature vector
 * @param path output location passed to {@code saveAsTextFile}
 * @param bIndexToID broadcast lookup from integer index to ID string
 */
private static void saveFeaturesRDD(JavaPairRDD<Integer,float[]> features,
                  Path path,
                  Broadcast<? extends Map<Integer,String>> bIndexToID) {
 log.info("Saving features RDD to {}", path);
 String destination = path.toString();
 features
   .map(indexAndFeatures -> TextUtils.joinJSON(Arrays.asList(
     // Look up the ID for this index in the broadcast map
     bIndexToID.value().get(indexAndFeatures._1()),
     indexAndFeatures._2())))
   .saveAsTextFile(destination, GzipCodec.class);
}

/**
 * Saves three (Integer, String) pairs as IntWritable/Text records using
 * {@code saveAsHadoopFile} with {@code SequenceFileOutputFormat}, reads them back
 * through {@code sc.hadoopFile}, and checks that the string form of the reloaded
 * records equals the string form of the input list.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
 List<Tuple2<Integer, String>> expected =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String target = new File(tempDir, "output").getAbsolutePath();

 // Wrap keys and values in Hadoop writables before saving.
 JavaPairRDD<IntWritable, Text> writables = sc.parallelizePairs(expected)
  .mapToPair(kv -> new Tuple2<>(new IntWritable(kv._1()), new Text(kv._2())));
 writables.saveAsHadoopFile(target, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> reloaded =
  sc.hadoopFile(target, SequenceFileInputFormat.class, IntWritable.class, Text.class);
 List<String> rendered = reloaded.map(Tuple2::toString).collect();
 assertEquals(expected.toString(), rendered.toString());
}

/**
 * Collapses all {@link Rating}s that share a (user, product) key into a single
 * {@link Rating}. When {@code implicit} is set, the values of a key are grouped and
 * combined by {@code MLFunctions.SUM_WITH_NAN}; otherwise a fold keeps, for each key,
 * whichever value is folded last. Keys whose combined value is NaN are dropped. When
 * {@code logStrength} is set, each surviving value {@code v} is replaced by
 * {@code log1p(v / epsilon)}.
 *
 * @param original ratings to combine
 * @param epsilon divisor applied before {@code log1p}; only read when
 *  {@code logStrength} is set
 * @return one {@link Rating} per remaining (user, product) key
 */
private JavaRDD<Rating> aggregateScores(JavaRDD<? extends Rating> original, double epsilon) {
 JavaPairRDD<Tuple2<Integer,Integer>,Double> keyed = original.mapToPair(r -> {
  Tuple2<Integer,Integer> userAndProduct = new Tuple2<>(r.user(), r.product());
  return new Tuple2<>(userAndProduct, r.rating());
 });

 JavaPairRDD<Tuple2<Integer,Integer>,Double> combined;
 if (!implicit) {
  // For non-implicit, last wins.
  combined = keyed.foldByKey(Double.NaN, (earlier, later) -> later);
 } else {
  // TODO can we avoid groupByKey? reduce, combine, fold don't seem viable since
  // they don't guarantee the delete elements are properly handled
  combined = keyed.groupByKey().mapValues(MLFunctions.SUM_WITH_NAN);
 }

 JavaPairRDD<Tuple2<Integer,Integer>,Double> finite =
   combined.filter(entry -> !Double.isNaN(entry._2()));

 if (!logStrength) {
  return finite.map(entry ->
    new Rating(entry._1()._1(), entry._1()._2(), entry._2()));
 }
 // The branch is taken outside the lambda so the lambda captures only epsilon.
 return finite.map(entry ->
   new Rating(entry._1()._1(), entry._1()._2(), Math.log1p(entry._2() / epsilon)));
}

/**
 * Same round trip as a plain {@code saveAsHadoopFile}/{@code hadoopFile} test, but the
 * save call is additionally given {@code DefaultCodec.class}. The reloaded records'
 * string form must equal the string form of the input list.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
 List<Tuple2<Integer, String>> expected =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String target = new File(tempDir, "output_compressed").getAbsolutePath();

 // Wrap keys and values in Hadoop writables, then save with the codec.
 JavaPairRDD<IntWritable, Text> writables = sc.parallelizePairs(expected)
  .mapToPair(kv -> new Tuple2<>(new IntWritable(kv._1()), new Text(kv._2())));
 writables.saveAsHadoopFile(
  target, IntWritable.class, Text.class, SequenceFileOutputFormat.class, DefaultCodec.class);

 JavaPairRDD<IntWritable, Text> reloaded =
  sc.hadoopFile(target, SequenceFileInputFormat.class, IntWritable.class, Text.class);
 List<String> rendered = reloaded.map(Tuple2::toString).collect();
 assertEquals(expected.toString(), rendered.toString());
}

/**
 * Runs {@code flatMapToPair} and {@code map} with a key/value swap over a small pair
 * RDD and collects both results. There are no assertions; the test passes when
 * neither job throws.
 */
@Test
public void mapsFromPairsToPairs() {
 List<Tuple2<Integer, String>> input =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 JavaPairRDD<Integer, String> source = sc.parallelizePairs(input);

 // Regression test for SPARK-668:
 JavaPairRDD<String, Integer> flipped = source.flatMapToPair(
  entry -> Collections.singletonList(entry.swap()).iterator());
 flipped.collect();

 // There was never a bug here, but it's worth testing:
 source.map(Tuple2::swap).collect();
}

// Build inputRDD by mapping each entry of noNaN to a UserItemStrength. The first two
// constructor arguments are the two components of the entry's nested key; the third
// is derived from the entry's value.
// NOTE(review): this snippet is an excerpt; the closing brace of the else branch is
// not part of the visible text.
if (model.isLogStrength()) {
 // Log-strength mode: use log1p(value / epsilon), narrowed to float.
 double epsilon = model.getEpsilon();
 inputRDD = noNaN.map(tuple -> new UserItemStrength(tuple._1()._1(), tuple._1()._2(),
                           (float) Math.log1p(tuple._2() / epsilon)));
} else {
 // Otherwise pass the value through unchanged, converted to float.
 inputRDD = noNaN.map(tuple -> new UserItemStrength(tuple._1()._1(), tuple._1()._2(),
                           tuple._2().floatValue()));

/**
 * Saves three (Integer, String) pairs as IntWritable/Text records with
 * {@code saveAsHadoopFile}, then reads them back through {@code sc.newAPIHadoopFile}
 * using the {@code org.apache.hadoop.mapreduce} SequenceFileInputFormat. The reloaded
 * records' string form must equal the string form of the input list.
 *
 * @throws IOException declared because {@code Job.getInstance()} is called
 */
@SuppressWarnings("unchecked")
@Test
public void readWithNewAPIHadoopFile() throws IOException {
 List<Tuple2<Integer, String>> expected =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String target = new File(tempDir, "output").getAbsolutePath();

 // Wrap keys and values in Hadoop writables before saving.
 JavaPairRDD<IntWritable, Text> writables = sc.parallelizePairs(expected)
  .mapToPair(kv -> new Tuple2<>(new IntWritable(kv._1()), new Text(kv._2())));
 writables.saveAsHadoopFile(target, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> reloaded = sc.newAPIHadoopFile(
  target,
  org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class,
  IntWritable.class,
  Text.class,
  Job.getInstance().getConfiguration());
 List<String> rendered = reloaded.map(Tuple2::toString).collect();
 assertEquals(expected.toString(), rendered.toString());
}

/**
 * Swaps keys and values of a small pair RDD twice, once via {@code flatMapToPair} and
 * once via {@code map}, collecting each result. Nothing is asserted; the test only
 * requires that both jobs complete without throwing.
 */
@Test
public void mapsFromPairsToPairs() {
 List<Tuple2<Integer, String>> seed =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 JavaPairRDD<Integer, String> original = sc.parallelizePairs(seed);

 // Regression test for SPARK-668:
 JavaPairRDD<String, Integer> reversed = original.flatMapToPair(
  kv -> Collections.singletonList(kv.swap()).iterator());
 reversed.collect();

 // There was never a bug here, but it's worth testing:
 original.map(Tuple2::swap).collect();
}

/**
 * Writes three IntWritable/Text records with {@code saveAsHadoopFile} and reads them
 * back with {@code sc.newAPIHadoopFile} and the {@code org.apache.hadoop.mapreduce}
 * SequenceFileInputFormat, then compares the string form of the result with the
 * string form of the input list.
 *
 * @throws IOException declared because {@code Job.getInstance()} is called
 */
@SuppressWarnings("unchecked")
@Test
public void readWithNewAPIHadoopFile() throws IOException {
 List<Tuple2<Integer, String>> source =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String location = new File(tempDir, "output").getAbsolutePath();

 // Convert to Hadoop writables, then save.
 JavaPairRDD<IntWritable, Text> asWritables = sc.parallelizePairs(source)
  .mapToPair(p -> new Tuple2<>(new IntWritable(p._1()), new Text(p._2())));
 asWritables.saveAsHadoopFile(location, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> readBack = sc.newAPIHadoopFile(
  location,
  org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class,
  IntWritable.class,
  Text.class,
  Job.getInstance().getConfiguration());
 List<String> asStrings = readBack.map(Tuple2::toString).collect();
 assertEquals(source.toString(), asStrings.toString());
}

/**
 * Exercises pair-to-pair transformations: a {@code flatMapToPair} and a {@code map},
 * each swapping key and value, are run over a three-element pair RDD and collected.
 * The test has no assertions and passes when neither job throws.
 */
@Test
public void mapsFromPairsToPairs() {
 List<Tuple2<Integer, String>> elements =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 JavaPairRDD<Integer, String> base = sc.parallelizePairs(elements);

 // Regression test for SPARK-668:
 JavaPairRDD<String, Integer> inverted = base.flatMapToPair(
  t -> Collections.singletonList(t.swap()).iterator());
 inverted.collect();

 // There was never a bug here, but it's worth testing:
 base.map(Tuple2::swap).collect();
}

/**
 * Round trip: records saved with {@code saveAsHadoopFile} are loaded again with
 * {@code sc.newAPIHadoopFile} (using the {@code org.apache.hadoop.mapreduce}
 * SequenceFileInputFormat), and the string form of what was loaded is compared with
 * the string form of the input list.
 *
 * @throws IOException declared because {@code Job.getInstance()} is called
 */
@SuppressWarnings("unchecked")
@Test
public void readWithNewAPIHadoopFile() throws IOException {
 List<Tuple2<Integer, String>> input =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String dir = new File(tempDir, "output").getAbsolutePath();

 // Keys become IntWritable and values become Text before the save.
 JavaPairRDD<IntWritable, Text> toSave = sc.parallelizePairs(input)
  .mapToPair(e -> new Tuple2<>(new IntWritable(e._1()), new Text(e._2())));
 toSave.saveAsHadoopFile(dir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> loaded = sc.newAPIHadoopFile(
  dir,
  org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat.class,
  IntWritable.class,
  Text.class,
  Job.getInstance().getConfiguration());
 List<String> loadedStrings = loaded.map(Tuple2::toString).collect();
 assertEquals(input.toString(), loadedStrings.toString());
}

// Reads every entry matching testDir + "/*" via binaryFiles and converts each one with
// LoadDataFunction. NOTE(review): presumably each file is a serialized DataSet; confirm
// against LoadDataFunction.
JavaRDD<DataSet> data = sc.binaryFiles(testDir + "/*").map(new LoadDataFunction());

/**
 * Saves three (Integer, String) pairs as IntWritable/Text records with
 * {@code saveAsNewAPIHadoopFile} and the {@code org.apache.hadoop.mapreduce}
 * SequenceFileOutputFormat, reads them back with {@code sc.sequenceFile}, and checks
 * that the string form of the reloaded records equals that of the input list.
 */
@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
 List<Tuple2<Integer, String>> expected =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String target = new File(tempDir, "output").getAbsolutePath();

 // Wrap keys and values in Hadoop writables before saving.
 JavaPairRDD<IntWritable, Text> writables = sc.parallelizePairs(expected)
  .mapToPair(kv -> new Tuple2<>(new IntWritable(kv._1()), new Text(kv._2())));
 writables.saveAsNewAPIHadoopFile(
  target,
  IntWritable.class,
  Text.class,
  org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> reloaded = sc.sequenceFile(target, IntWritable.class, Text.class);
 List<String> rendered = reloaded.map(Tuple2::toString).collect();
 assertEquals(expected.toString(), rendered.toString());
}

/**
 * Writes three IntWritable/Text records with {@code saveAsHadoopFile}, passing
 * {@code DefaultCodec.class} as the extra codec argument, and reads them back with
 * {@code sc.hadoopFile}. The string form of the result must equal the string form of
 * the input list.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
 List<Tuple2<Integer, String>> source =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String location = new File(tempDir, "output_compressed").getAbsolutePath();

 // Convert to Hadoop writables, then save with the codec.
 JavaPairRDD<IntWritable, Text> asWritables = sc.parallelizePairs(source)
  .mapToPair(p -> new Tuple2<>(new IntWritable(p._1()), new Text(p._2())));
 asWritables.saveAsHadoopFile(
  location, IntWritable.class, Text.class, SequenceFileOutputFormat.class, DefaultCodec.class);

 JavaPairRDD<IntWritable, Text> readBack =
  sc.hadoopFile(location, SequenceFileInputFormat.class, IntWritable.class, Text.class);
 List<String> asStrings = readBack.map(Tuple2::toString).collect();
 assertEquals(source.toString(), asStrings.toString());
}

/**
 * Writes three IntWritable/Text records with {@code saveAsHadoopFile} and
 * {@code SequenceFileOutputFormat}, loads them again with {@code sc.hadoopFile}, and
 * compares the string form of the loaded records with that of the input list.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
 List<Tuple2<Integer, String>> source =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String location = new File(tempDir, "output").getAbsolutePath();

 // Convert to Hadoop writables, then save.
 JavaPairRDD<IntWritable, Text> asWritables = sc.parallelizePairs(source)
  .mapToPair(p -> new Tuple2<>(new IntWritable(p._1()), new Text(p._2())));
 asWritables.saveAsHadoopFile(location, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> readBack =
  sc.hadoopFile(location, SequenceFileInputFormat.class, IntWritable.class, Text.class);
 List<String> asStrings = readBack.map(Tuple2::toString).collect();
 assertEquals(source.toString(), asStrings.toString());
}

/**
 * Round trip through {@code saveAsHadoopFile} (with {@code SequenceFileOutputFormat})
 * and {@code sc.hadoopFile}: the string form of the records read back must equal the
 * string form of the three input pairs.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFile() {
 List<Tuple2<Integer, String>> input =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String dir = new File(tempDir, "output").getAbsolutePath();

 // Keys become IntWritable and values become Text before the save.
 JavaPairRDD<IntWritable, Text> toSave = sc.parallelizePairs(input)
  .mapToPair(e -> new Tuple2<>(new IntWritable(e._1()), new Text(e._2())));
 toSave.saveAsHadoopFile(dir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> loaded =
  sc.hadoopFile(dir, SequenceFileInputFormat.class, IntWritable.class, Text.class);
 List<String> loadedStrings = loaded.map(Tuple2::toString).collect();
 assertEquals(input.toString(), loadedStrings.toString());
}

/**
 * Writes three IntWritable/Text records with {@code saveAsNewAPIHadoopFile} (using the
 * {@code org.apache.hadoop.mapreduce} SequenceFileOutputFormat) and loads them again
 * with {@code sc.sequenceFile}. The string form of the loaded records must equal that
 * of the input list.
 */
@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
 List<Tuple2<Integer, String>> source =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String location = new File(tempDir, "output").getAbsolutePath();

 // Convert to Hadoop writables, then save.
 JavaPairRDD<IntWritable, Text> asWritables = sc.parallelizePairs(source)
  .mapToPair(p -> new Tuple2<>(new IntWritable(p._1()), new Text(p._2())));
 asWritables.saveAsNewAPIHadoopFile(
  location,
  IntWritable.class,
  Text.class,
  org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> readBack = sc.sequenceFile(location, IntWritable.class, Text.class);
 List<String> asStrings = readBack.map(Tuple2::toString).collect();
 assertEquals(source.toString(), asStrings.toString());
}

/**
 * Round trip through {@code saveAsNewAPIHadoopFile} (with the
 * {@code org.apache.hadoop.mapreduce} SequenceFileOutputFormat) and
 * {@code sc.sequenceFile}: the string form of the records read back must equal the
 * string form of the three input pairs.
 */
@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
 List<Tuple2<Integer, String>> input =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String dir = new File(tempDir, "output").getAbsolutePath();

 // Keys become IntWritable and values become Text before the save.
 JavaPairRDD<IntWritable, Text> toSave = sc.parallelizePairs(input)
  .mapToPair(e -> new Tuple2<>(new IntWritable(e._1()), new Text(e._2())));
 toSave.saveAsNewAPIHadoopFile(
  dir,
  IntWritable.class,
  Text.class,
  org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);

 JavaPairRDD<IntWritable, Text> loaded = sc.sequenceFile(dir, IntWritable.class, Text.class);
 List<String> loadedStrings = loaded.map(Tuple2::toString).collect();
 assertEquals(input.toString(), loadedStrings.toString());
}

/**
 * Round trip through {@code saveAsHadoopFile} with {@code DefaultCodec.class} supplied
 * as the codec argument, followed by {@code sc.hadoopFile}: the string form of the
 * records read back must equal the string form of the three input pairs.
 */
@SuppressWarnings("unchecked")
@Test
public void hadoopFileCompressed() {
 List<Tuple2<Integer, String>> input =
  Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa"));
 String dir = new File(tempDir, "output_compressed").getAbsolutePath();

 // Keys become IntWritable and values become Text before the save.
 JavaPairRDD<IntWritable, Text> toSave = sc.parallelizePairs(input)
  .mapToPair(e -> new Tuple2<>(new IntWritable(e._1()), new Text(e._2())));
 toSave.saveAsHadoopFile(
  dir, IntWritable.class, Text.class, SequenceFileOutputFormat.class, DefaultCodec.class);

 JavaPairRDD<IntWritable, Text> loaded =
  sc.hadoopFile(dir, SequenceFileInputFormat.class, IntWritable.class, Text.class);
 List<String> loadedStrings = loaded.map(Tuple2::toString).collect();
 assertEquals(input.toString(), loadedStrings.toString());
}

// Copy userIdLookupMap into a fresh HashMap and broadcast the copy.
// NOTE(review): presumably the copy guarantees the broadcast value is a plain,
// serializable HashMap regardless of the source map's implementation; confirm.
Map<String, Integer> userIdLookupMap_wrapped = new HashMap<String, Integer>(userIdLookupMap);
final Broadcast<Map<String, Integer>> broadcastVar = jsc.broadcast(userIdLookupMap_wrapped);
JavaRDD<String> json_only_with_zeros = filtered_by_client.map(new Function<Tuple2<String, ActionData>, String>() {

Popular methods of JavaPairRDD

Popular in Java

Start an intent from android
onCreateOptionsMenu (Activity)
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
setContentView (Activity)
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
URI (java.net)
A Uniform Resource Identifier that identifies an abstract or physical resource, as specified by RFC
BitSet (java.util)
The BitSet class implements a bit array [http://en.wikipedia.org/wiki/Bit_array]. Each element is eit
Iterator (java.util)
An iterator over a sequence of objects, such as a collection. If a collection has been changed since
Stream (java.util.stream)
A sequence of elements supporting sequential and parallel aggregate operations. The following exampl
Cipher (javax.crypto)
This class provides access to implementations of cryptographic ciphers for encryption and decryption
Top plugins for WebStorm

How to use the map method in org.apache.spark.api.java.JavaPairRDD

Best Java code snippets using org.apache.spark.api.java.JavaPairRDD.map (Showing top 20 results out of 450)

How to use
map
method
in
org.apache.spark.api.java.JavaPairRDD