org.apache.spark.api.java.JavaPairRDD.persist java code examples

/**
 * Applies {@code doTransform} to the input, reusing a previously computed result
 * when caching is enabled.
 *
 * <p>With caching on, the first call stores the transformed RDD in
 * {@code cachedRDD} and requests MEMORY_AND_DISK persistence for it; later calls
 * return that stored RDD without looking at {@code input} again.
 *
 * @param input the RDD to transform
 * @return the transformed (and possibly cached) RDD
 */
@Override
public JavaPairRDD<KO, VO> transform(
 JavaPairRDD<KI, VI> input) {
 // Without caching every call recomputes the transformation.
 if (!caching) {
  return doTransform(input);
 }
 // Populate the cache lazily on the first request.
 if (cachedRDD == null) {
  cachedRDD = doTransform(input);
  cachedRDD.persist(StorageLevel.MEMORY_AND_DISK());
 }
 return cachedRDD;
}

/**
 * Shuffles the input into {@code numOfPartitions} partitions through the
 * configured shuffler.
 *
 * <p>When {@code toCache} is set, the shuffled RDD's id is registered with
 * {@code sparkPlan} and the RDD is persisted at MEMORY_AND_DISK.
 *
 * @param input the RDD to shuffle
 * @return the shuffled RDD, persisted if caching was requested
 */
@Override
public JavaPairRDD<HiveKey, BytesWritable> transform(JavaPairRDD<HiveKey, BytesWritable> input) {
 final JavaPairRDD<HiveKey, BytesWritable> shuffled = shuffler.shuffle(input, numOfPartitions);
 if (!toCache) {
  return shuffled;
 }
 // Register the id before persisting, as the plan tracks cached RDDs by id.
 sparkPlan.addCachedRDDId(shuffled.id());
 return shuffled.persist(StorageLevel.MEMORY_AND_DISK());
}

/**
 * Produces the map-side input RDD and labels it with this transformation's name.
 *
 * <p>This transformation takes no upstream input: the argument must be
 * {@code null}. When {@code toCache} is set, the Hadoop RDD is first passed
 * through {@code CopyFunction}, registered with {@code sparkPlan}, and persisted
 * at MEMORY_AND_DISK; otherwise the Hadoop RDD itself is returned.
 *
 * @param input must be {@code null}
 * @return the (possibly copied and persisted) Hadoop RDD, named {@code this.name}
 * @throws IllegalArgumentException if {@code input} is not {@code null}
 */
@Override
public JavaPairRDD<WritableComparable, Writable> transform(
  JavaPairRDD<WritableComparable, Writable> input) {
 Preconditions.checkArgument(input == null,
   "AssertionError: MapInput doesn't take any input");
 JavaPairRDD<WritableComparable, Writable> output = hadoopRDD;
 if (toCache) {
  // NOTE(review): the copy presumably guards against reused Writable instances
  // being cached - confirm against CopyFunction.
  JavaPairRDD<WritableComparable, Writable> copied = hadoopRDD.mapToPair(new CopyFunction());
  sparkPlan.addCachedRDDId(copied.id());
  output = copied.persist(StorageLevel.MEMORY_AND_DISK());
 }
 output.setName(this.name);
 return output;
}

/**
 * Shuffles the input into {@code numOfPartitions} partitions and gives the
 * result a descriptive name.
 *
 * <p>The name has the form {@code "<name> (<shuffleType>, <partitions>)"}, with
 * {@code ", cached"} added before the closing parenthesis when {@code toCache}
 * is set. In that case the RDD id is also registered with {@code sparkPlan} and
 * the RDD is persisted at MEMORY_AND_DISK.
 *
 * @param input the RDD to shuffle
 * @return the shuffled, named RDD
 */
@Override
public JavaPairRDD<HiveKey, BytesWritable> transform(JavaPairRDD<HiveKey, BytesWritable> input) {
 JavaPairRDD<HiveKey, BytesWritable> shuffled = shuffler.shuffle(input, numOfPartitions);
 if (toCache) {
  sparkPlan.addCachedRDDId(shuffled.id());
  shuffled = shuffled.persist(StorageLevel.MEMORY_AND_DISK());
 }
 final String label = this.name + " (" + edge.getShuffleType() + ", " + numOfPartitions;
 final String closing = toCache ? ", cached)" : ")";
 return shuffled.setName(label + closing);
}

}).persist(StorageLevel.MEMORY_AND_DISK());

/**
 * Produces the map-side input RDD.
 *
 * <p>No upstream input is accepted: the argument must be {@code null}. With
 * {@code toCache} set, the Hadoop RDD is mapped through {@code CopyFunction},
 * its id is registered with {@code sparkPlan}, and it is persisted at
 * MEMORY_AND_DISK. Otherwise the Hadoop RDD is returned unchanged.
 *
 * @param input must be {@code null}
 * @return the Hadoop RDD, or its persisted copy when caching is requested
 * @throws IllegalArgumentException if {@code input} is not {@code null}
 */
@Override
public JavaPairRDD<WritableComparable, Writable> transform(
  JavaPairRDD<WritableComparable, Writable> input) {
 Preconditions.checkArgument(input == null,
   "AssertionError: MapInput doesn't take any input");
 if (!toCache) {
  return hadoopRDD;
 }
 final JavaPairRDD<WritableComparable, Writable> copied = hadoopRDD.mapToPair(new CopyFunction());
 sparkPlan.addCachedRDDId(copied.id());
 return copied.persist(StorageLevel.MEMORY_AND_DISK());
}

/**
 * Applies {@code doTransform} to the input and names the result after this
 * transformation and its partition count.
 *
 * <p>With caching enabled, the transformed RDD is computed once, persisted at
 * MEMORY_AND_DISK, kept in {@code cachedRDD}, and returned on every call under
 * the name {@code "<name> (<partitions>, cached)"}. Without caching each call
 * recomputes the transformation and names it {@code "<name> (<partitions>)"}.
 *
 * @param input the RDD to transform
 * @return the transformed, named RDD
 */
@Override
public JavaPairRDD<KO, VO> transform(
 JavaPairRDD<KI, VI> input) {
 if (!caching) {
  final JavaPairRDD<KO, VO> fresh = doTransform(input);
  final String freshLabel = this.name + " (" + fresh.getNumPartitions() + ")";
  return fresh.setName(freshLabel);
 }
 // Fill the cache on the first call only.
 if (cachedRDD == null) {
  cachedRDD = doTransform(input);
  cachedRDD.persist(StorageLevel.MEMORY_AND_DISK());
 }
 final String cachedLabel = this.name + " (" + cachedRDD.getNumPartitions() + ", cached)";
 return cachedRDD.setName(cachedLabel);
}

/**
 * Shuffles and sorts the input by key.
 *
 * <p>With {@code totalOrder} set, the input is globally sorted with
 * {@code sortByKey}: into {@code numPartitions} partitions when that value is
 * positive, otherwise with the default partitioning. Without total order, the
 * input is hash-partitioned into {@code numPartitions} partitions and sorted
 * within each partition.
 *
 * @param input the RDD to shuffle
 * @param numPartitions requested number of output partitions; non-positive
 *     values are only handled on the total-order path
 * @return the shuffled RDD
 */
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
  JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
 JavaPairRDD<HiveKey, BytesWritable> rdd;
 if (totalOrder) {
  if (numPartitions > 0) {
   // Compare storage levels by value, not identity: StorageLevel defines equals(),
   // so this does not depend on both sides being the same cached instance.
   // NOTE(review): the input is presumably persisted because a multi-partition
   // sortByKey evaluates it more than once - confirm.
   if (numPartitions > 1 && StorageLevel.NONE().equals(input.getStorageLevel())) {
    input.persist(StorageLevel.DISK_ONLY());
    sparkPlan.addCachedRDDId(input.id());
   }
   rdd = input.sortByKey(true, numPartitions);
  } else {
   rdd = input.sortByKey(true);
  }
 } else {
  Partitioner partitioner = new HashPartitioner(numPartitions);
  rdd = input.repartitionAndSortWithinPartitions(partitioner);
 }
 return rdd;
}

/**
 * Checks that double, pair and plain RDDs remain usable after being persisted
 * at DISK_ONLY: each is persisted and then queried with a simple action.
 */
@SuppressWarnings("unchecked")
@Test
public void persist() {
 // Double RDD: the six values sum to 20.
 JavaDoubleRDD persistedDoubles = sc
  .parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0))
  .persist(StorageLevel.DISK_ONLY());
 assertEquals(20, persistedDoubles.sum(), 0.1);

 // Pair RDD: the first element's value is "a".
 List<Tuple2<Integer, String>> keyedStrings = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> persistedPairs = sc
  .parallelizePairs(keyedStrings)
  .persist(StorageLevel.DISK_ONLY());
 assertEquals("a", persistedPairs.first()._2());

 // Plain RDD: the first element is 1.
 JavaRDD<Integer> persistedInts = sc
  .parallelize(Arrays.asList(1, 2, 3, 4, 5))
  .persist(StorageLevel.DISK_ONLY());
 assertEquals(1, persistedInts.first().intValue());
}

/**
 * Persists one RDD of each Java flavour (double, pair, plain) at DISK_ONLY and
 * verifies that an action on the persisted RDD still yields the expected value.
 */
@SuppressWarnings("unchecked")
@Test
public void persist() {
 JavaDoubleRDD doubles = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0));
 JavaDoubleRDD storedDoubles = doubles.persist(StorageLevel.DISK_ONLY());
 assertEquals(20, storedDoubles.sum(), 0.1);

 List<Tuple2<Integer, String>> entries = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> keyed = sc.parallelizePairs(entries);
 JavaPairRDD<Integer, String> storedKeyed = keyed.persist(StorageLevel.DISK_ONLY());
 assertEquals("a", storedKeyed.first()._2());

 JavaRDD<Integer> ints = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
 JavaRDD<Integer> storedInts = ints.persist(StorageLevel.DISK_ONLY());
 assertEquals(1, storedInts.first().intValue());
}

/**
 * Exercises {@code persist(DISK_ONLY)} on a double RDD, a pair RDD and a plain
 * RDD, asserting on a simple action result after each persist call.
 */
@SuppressWarnings("unchecked")
@Test
public void persist() {
 // sum of 1 + 1 + 2 + 3 + 5 + 8
 JavaDoubleRDD fibonacci = sc
  .parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0))
  .persist(StorageLevel.DISK_ONLY());
 assertEquals(20, fibonacci.sum(), 0.1);

 List<Tuple2<Integer, String>> lengthToText = Arrays.asList(
  new Tuple2<>(1, "a"),
  new Tuple2<>(2, "aa"),
  new Tuple2<>(3, "aaa")
 );
 JavaPairRDD<Integer, String> textByLength = sc
  .parallelizePairs(lengthToText)
  .persist(StorageLevel.DISK_ONLY());
 assertEquals("a", textByLength.first()._2());

 JavaRDD<Integer> numbers = sc
  .parallelize(Arrays.asList(1, 2, 3, 4, 5))
  .persist(StorageLevel.DISK_ONLY());
 assertEquals(1, numbers.first().intValue());
}

/**
 * Shuffles and sorts the input by key, optionally installing a custom shuffle
 * serializer on the result.
 *
 * <p>With {@code totalOrder} set, the input is globally sorted with
 * {@code sortByKey}: into {@code numPartitions} partitions when that value is
 * positive, otherwise with the default partitioning. Without total order, the
 * input is hash-partitioned into {@code numPartitions} partitions and sorted
 * within each partition. If {@code shuffleSerializer} is set and the resulting
 * RDD is backed by a {@code ShuffledRDD}, the serializer is applied to it.
 *
 * @param input the RDD to shuffle
 * @param numPartitions requested number of output partitions; non-positive
 *     values are only handled on the total-order path
 * @return the shuffled RDD
 */
@Override
public JavaPairRDD<HiveKey, BytesWritable> shuffle(
  JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
 JavaPairRDD<HiveKey, BytesWritable> rdd;
 if (totalOrder) {
  if (numPartitions > 0) {
   // Compare storage levels by value, not identity: StorageLevel defines equals(),
   // so this does not depend on both sides being the same cached instance.
   // NOTE(review): the input is presumably persisted because a multi-partition
   // sortByKey evaluates it more than once - confirm.
   if (numPartitions > 1 && StorageLevel.NONE().equals(input.getStorageLevel())) {
    input.persist(StorageLevel.DISK_ONLY());
    sparkPlan.addCachedRDDId(input.id());
   }
   rdd = input.sortByKey(true, numPartitions);
  } else {
   rdd = input.sortByKey(true);
  }
 } else {
  Partitioner partitioner = new HashPartitioner(numPartitions);
  rdd = input.repartitionAndSortWithinPartitions(partitioner);
 }
 // Only a ShuffledRDD exposes setSerializer; other RDD types are left untouched.
 if (shuffleSerializer != null && rdd.rdd() instanceof ShuffledRDD) {
  ((ShuffledRDD) rdd.rdd()).setSerializer(shuffleSerializer);
 }
 return rdd;
}

allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
      .persist(storageLevel);
  allRDDs[level - 1].unpersist();
  if (envConfig.isSparkSanityCheckEnabled() == true) {

/**
 * Persists a memory RDD under a name derived from the configured output
 * location and the memory key, then returns its contents as key/value pairs.
 *
 * <p>Any RDD already registered under the same name is removed first. The RDD
 * is persisted at the storage level named by
 * {@code GREMLIN_SPARK_PERSIST_STORAGE_LEVEL} (default {@code MEMORY_ONLY}) and
 * materialized immediately with a {@code count()}.
 *
 * @param configuration job configuration; must contain the output location
 * @param memoryKey key identifying this memory entry
 * @param memoryRDD the RDD to persist
 * @return an iterator over the collected RDD contents
 * @throws IllegalArgumentException if no output location is configured
 */
@Override
public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration, final String memoryKey, final JavaPairRDD<K, V> memoryRDD) {
  final boolean contextPersisted = configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false);
  if (!contextPersisted) {
    LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
  }
  final boolean hasOutputLocation = configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
  if (!hasOutputLocation) {
    throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
  }
  final String outputLocation = configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
  final String memoryRDDName = Constants.getMemoryLocation(outputLocation, memoryKey);
  Spark.removeRDD(memoryRDDName);
  final JavaPairRDD<K, V> namedRDD = memoryRDD.setName(memoryRDDName);
  final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
  // call action to eager store rdd
  namedRDD.persist(storageLevel).count();
  Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
  return IteratorUtils.map(memoryRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2()));
}

/**
 * Persists the graph RDD under a name derived from the configured output
 * location.
 *
 * <p>Whatever is currently stored at the output location is removed first. When
 * {@code GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES} is {@code false}, edges in both
 * directions are dropped from every vertex before persisting. The RDD is
 * persisted at the storage level named by
 * {@code GREMLIN_SPARK_PERSIST_STORAGE_LEVEL} (default {@code MEMORY_ONLY}) and
 * materialized immediately with a {@code count()}.
 *
 * @param configuration job configuration; must contain the output location
 * @param graphRDD the graph RDD to persist
 * @throws IllegalArgumentException if no output location is configured
 */
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
  final boolean contextPersisted = configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false);
  if (!contextPersisted) {
    LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
  }
  final boolean hasOutputLocation = configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
  if (!hasOutputLocation) {
    throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
  }
  SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));  // this might be bad cause it unpersists the job RDD
  // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
  final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
  // Pick the RDD to store: the graph as given, or an edge-less view of it.
  final JavaPairRDD<Object, VertexWritable> rddToStore;
  if (configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true)) {
    rddToStore = graphRDD;
  } else {
    rddToStore = graphRDD.mapValues(vertex -> {
      vertex.get().dropEdges(Direction.BOTH);
      return vertex;
    });
  }
  final String graphLocation = Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));
  // call action to eager store rdd
  rddToStore.setName(graphLocation).persist(storageLevel).count();
  Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
}

loadedGraphRDD = loadedGraphRDD.persist(StorageLevel.fromString(hadoopConfiguration.get(GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));
    mapReduceRDD = mapReduceRDD.persist(StorageLevel.fromString(hadoopConfiguration.get(GREMLIN_SPARK_GRAPH_STORAGE_LEVEL, "MEMORY_ONLY")));

 /**
  * Shuffles the input into {@code numOfPartitions} partitions, yielding the
  * values grouped per key.
  *
  * <p>When {@code toCache} is set, the resulting RDD's id is registered with
  * {@code sparkPlan} and the RDD is persisted at MEMORY_AND_DISK.
  *
  * @param input the RDD to shuffle
  * @return the shuffled RDD, persisted if caching was requested
  */
 @Override
 public JavaPairRDD<HiveKey, Iterable<BytesWritable>> transform(JavaPairRDD<HiveKey, BytesWritable> input) {
  final JavaPairRDD<HiveKey, Iterable<BytesWritable>> grouped = shuffler.shuffle(input, numOfPartitions);
  if (!toCache) {
   return grouped;
  }
  sparkPlan.addCachedRDDId(grouped.id());
  return grouped.persist(StorageLevel.MEMORY_AND_DISK());
 }
}

/**
 * Names, persists and eagerly materializes a memory RDD, then returns its
 * contents as key/value pairs.
 *
 * <p>The RDD name is built from the configured output location and
 * {@code memoryKey}; an RDD previously registered under that name is removed
 * before the new one is stored. The storage level is read from
 * {@code GREMLIN_SPARK_PERSIST_STORAGE_LEVEL}, defaulting to {@code MEMORY_ONLY}.
 *
 * @param configuration job configuration; must contain the output location
 * @param memoryKey key identifying this memory entry
 * @param memoryRDD the RDD to persist
 * @return an iterator over the collected RDD contents
 * @throws IllegalArgumentException if no output location is configured
 */
@Override
public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration, final String memoryKey, final JavaPairRDD<K, V> memoryRDD) {
  if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false)) {
    LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
  }
  if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)) {
    throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
  }
  final String baseLocation = configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
  final String memoryRDDName = Constants.getMemoryLocation(baseLocation, memoryKey);
  Spark.removeRDD(memoryRDDName);
  final JavaPairRDD<K, V> renamed = memoryRDD.setName(memoryRDDName);
  final String levelName = configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY");
  final JavaPairRDD<K, V> persisted = renamed.persist(StorageLevel.fromString(levelName));
  // call action to eager store rdd
  persisted.count();
  Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
  return IteratorUtils.map(memoryRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2()));
}

/**
 * Names, persists and eagerly materializes the graph RDD.
 *
 * <p>The existing content at the configured output location is removed first.
 * If {@code GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES} is {@code false}, every
 * vertex has its edges dropped in both directions before the RDD is stored.
 * The storage level is read from {@code GREMLIN_SPARK_PERSIST_STORAGE_LEVEL},
 * defaulting to {@code MEMORY_ONLY}.
 *
 * @param configuration job configuration; must contain the output location
 * @param graphRDD the graph RDD to persist
 * @throws IllegalArgumentException if no output location is configured
 */
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
  if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false)) {
    LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true");
  }
  if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)) {
    throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to");
  }
  SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION));  // this might be bad cause it unpersists the job RDD
  // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache()
  final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"));
  final boolean keepEdges = configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true);
  // Strip edges only when the writer was configured not to keep them.
  final JavaPairRDD<Object, VertexWritable> source = keepEdges
      ? graphRDD
      : graphRDD.mapValues(vertex -> {
        vertex.get().dropEdges(Direction.BOTH);
        return vertex;
      });
  final JavaPairRDD<Object, VertexWritable> named =
      source.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
  // call action to eager store rdd
  named.persist(storageLevel).count();
  Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD
}

/**
 * Returns the map-side input RDD; this transformation has no upstream input.
 *
 * <p>The argument must be {@code null}. If {@code toCache} is set, the Hadoop
 * RDD is mapped through {@code CopyFunction}, the mapped RDD's id is registered
 * with {@code sparkPlan}, and it is persisted at MEMORY_AND_DISK; otherwise the
 * Hadoop RDD is handed back as is.
 *
 * @param input must be {@code null}
 * @return the Hadoop RDD, or its persisted copy when caching is requested
 * @throws IllegalArgumentException if {@code input} is not {@code null}
 */
@Override
public JavaPairRDD<WritableComparable, Writable> transform(
  JavaPairRDD<WritableComparable, Writable> input) {
 Preconditions.checkArgument(input == null,
   "AssertionError: MapInput doesn't take any input");
 JavaPairRDD<WritableComparable, Writable> output = hadoopRDD;
 if (toCache) {
  final JavaPairRDD<WritableComparable, Writable> copy = hadoopRDD.mapToPair(new CopyFunction());
  sparkPlan.addCachedRDDId(copy.id());
  output = copy.persist(StorageLevel.MEMORY_AND_DISK());
 }
 return output;
}

Popular methods of JavaPairRDD

Popular in Java

Running tasks concurrently on multiple threads
onCreateOptionsMenu (Activity)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
getExternalFilesDir (Context)
Socket (java.net)
Provides a client-side TCP socket.
SQLException (java.sql)
An exception that indicates a failed JDBC operation. It provides the following information about pro
Enumeration (java.util)
A legacy iteration interface. New code should use Iterator instead. Iterator replaces the enumeration
Random (java.util)
This class provides methods that return pseudo-random values. It is dangerous to seed Random with the
TimeZone (java.util)
TimeZone represents a time zone offset, and also figures out daylight savings. Typically, you get a
Option (scala)
Best plugins for Eclipse

How to use the persist method in org.apache.spark.api.java.JavaPairRDD

Best Java code snippets using org.apache.spark.api.java.JavaPairRDD.persist (Showing top 20 results out of 315)

How to use
persist
method
in
org.apache.spark.api.java.JavaPairRDD