@Override public JavaPairRDD<KO, VO> transform( JavaPairRDD<KI, VI> input) { if (caching) { if (cachedRDD == null) { cachedRDD = doTransform(input); cachedRDD.persist(StorageLevel.MEMORY_AND_DISK()); } return cachedRDD; } else { return doTransform(input); } }
@Override public JavaPairRDD<HiveKey, BytesWritable> transform(JavaPairRDD<HiveKey, BytesWritable> input) { JavaPairRDD<HiveKey, BytesWritable> result = shuffler.shuffle(input, numOfPartitions); if (toCache) { sparkPlan.addCachedRDDId(result.id()); result = result.persist(StorageLevel.MEMORY_AND_DISK()); } return result; }
@Override public JavaPairRDD<WritableComparable, Writable> transform( JavaPairRDD<WritableComparable, Writable> input) { Preconditions.checkArgument(input == null, "AssertionError: MapInput doesn't take any input"); JavaPairRDD<WritableComparable, Writable> result; if (toCache) { result = hadoopRDD.mapToPair(new CopyFunction()); sparkPlan.addCachedRDDId(result.id()); result = result.persist(StorageLevel.MEMORY_AND_DISK()); } else { result = hadoopRDD; } result.setName(this.name); return result; }
@Override public JavaPairRDD<HiveKey, BytesWritable> transform(JavaPairRDD<HiveKey, BytesWritable> input) { JavaPairRDD<HiveKey, BytesWritable> result = shuffler.shuffle(input, numOfPartitions); if (toCache) { sparkPlan.addCachedRDDId(result.id()); result = result.persist(StorageLevel.MEMORY_AND_DISK()); } return result.setName(this.name + " (" + edge.getShuffleType() + ", " + numOfPartitions + (toCache ? ", cached)" : ")")); }
}).persist(StorageLevel.MEMORY_AND_DISK());
@Override public JavaPairRDD<WritableComparable, Writable> transform( JavaPairRDD<WritableComparable, Writable> input) { Preconditions.checkArgument(input == null, "AssertionError: MapInput doesn't take any input"); JavaPairRDD<WritableComparable, Writable> result; if (toCache) { result = hadoopRDD.mapToPair(new CopyFunction()); sparkPlan.addCachedRDDId(result.id()); result = result.persist(StorageLevel.MEMORY_AND_DISK()); } else { result = hadoopRDD; } return result; }
@Override public JavaPairRDD<KO, VO> transform( JavaPairRDD<KI, VI> input) { if (caching) { if (cachedRDD == null) { cachedRDD = doTransform(input); cachedRDD.persist(StorageLevel.MEMORY_AND_DISK()); } return cachedRDD.setName(this.name + " (" + cachedRDD.getNumPartitions() + ", cached)"); } else { JavaPairRDD<KO, VO> rdd = doTransform(input); return rdd.setName(this.name + " (" + rdd.getNumPartitions() + ")"); } }
@Override public JavaPairRDD<HiveKey, BytesWritable> shuffle( JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) { JavaPairRDD<HiveKey, BytesWritable> rdd; if (totalOrder) { if (numPartitions > 0) { if (numPartitions > 1 && input.getStorageLevel() == StorageLevel.NONE()) { input.persist(StorageLevel.DISK_ONLY()); sparkPlan.addCachedRDDId(input.id()); } rdd = input.sortByKey(true, numPartitions); } else { rdd = input.sortByKey(true); } } else { Partitioner partitioner = new HashPartitioner(numPartitions); rdd = input.repartitionAndSortWithinPartitions(partitioner); } return rdd; }
@SuppressWarnings("unchecked") @Test public void persist() { JavaDoubleRDD doubleRDD = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0)); doubleRDD = doubleRDD.persist(StorageLevel.DISK_ONLY()); assertEquals(20, doubleRDD.sum(), 0.1); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); pairRDD = pairRDD.persist(StorageLevel.DISK_ONLY()); assertEquals("a", pairRDD.first()._2()); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); rdd = rdd.persist(StorageLevel.DISK_ONLY()); assertEquals(1, rdd.first().intValue()); }
@SuppressWarnings("unchecked") @Test public void persist() { JavaDoubleRDD doubleRDD = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0)); doubleRDD = doubleRDD.persist(StorageLevel.DISK_ONLY()); assertEquals(20, doubleRDD.sum(), 0.1); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); pairRDD = pairRDD.persist(StorageLevel.DISK_ONLY()); assertEquals("a", pairRDD.first()._2()); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); rdd = rdd.persist(StorageLevel.DISK_ONLY()); assertEquals(1, rdd.first().intValue()); }
@SuppressWarnings("unchecked") @Test public void persist() { JavaDoubleRDD doubleRDD = sc.parallelizeDoubles(Arrays.asList(1.0, 1.0, 2.0, 3.0, 5.0, 8.0)); doubleRDD = doubleRDD.persist(StorageLevel.DISK_ONLY()); assertEquals(20, doubleRDD.sum(), 0.1); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> pairRDD = sc.parallelizePairs(pairs); pairRDD = pairRDD.persist(StorageLevel.DISK_ONLY()); assertEquals("a", pairRDD.first()._2()); JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); rdd = rdd.persist(StorageLevel.DISK_ONLY()); assertEquals(1, rdd.first().intValue()); }
@Override public JavaPairRDD<HiveKey, BytesWritable> shuffle( JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) { JavaPairRDD<HiveKey, BytesWritable> rdd; if (totalOrder) { if (numPartitions > 0) { if (numPartitions > 1 && input.getStorageLevel() == StorageLevel.NONE()) { input.persist(StorageLevel.DISK_ONLY()); sparkPlan.addCachedRDDId(input.id()); } rdd = input.sortByKey(true, numPartitions); } else { rdd = input.sortByKey(true); } } else { Partitioner partitioner = new HashPartitioner(numPartitions); rdd = input.repartitionAndSortWithinPartitions(partitioner); } if (shuffleSerializer != null) { if (rdd.rdd() instanceof ShuffledRDD) { ((ShuffledRDD) rdd.rdd()).setSerializer(shuffleSerializer); } } return rdd; }
@Override public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration, final String memoryKey, final JavaPairRDD<K, V> memoryRDD) { if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false)) LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true"); if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)) throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to"); final String memoryRDDName = Constants.getMemoryLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), memoryKey); Spark.removeRDD(memoryRDDName); memoryRDD.setName(memoryRDDName).persist(StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"))) // call action to eager store rdd .count(); Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD return IteratorUtils.map(memoryRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2())); }
@Override public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) { if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false)) LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true"); if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)) throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to"); SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)); // this might be bad cause it unpersists the job RDD // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache() final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY")); if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true)) graphRDD.mapValues(vertex -> { vertex.get().dropEdges(Direction.BOTH); return vertex; }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel) // call action to eager store rdd .count(); else graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel) // call action to eager store rdd .count(); Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD }
@Override public JavaPairRDD<HiveKey, Iterable<BytesWritable>> transform(JavaPairRDD<HiveKey, BytesWritable> input) { JavaPairRDD<HiveKey, Iterable<BytesWritable>> result = shuffler.shuffle(input, numOfPartitions); if (toCache) { sparkPlan.addCachedRDDId(result.id()); result = result.persist(StorageLevel.MEMORY_AND_DISK()); } return result; } }
@Override public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration, final String memoryKey, final JavaPairRDD<K, V> memoryRDD) { if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false)) LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true"); if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)) throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to"); final String memoryRDDName = Constants.getMemoryLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), memoryKey); Spark.removeRDD(memoryRDDName); memoryRDD.setName(memoryRDDName).persist(StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY"))) // call action to eager store rdd .count(); Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD return IteratorUtils.map(memoryRDD.collect().iterator(), tuple -> new KeyValue<>(tuple._1(), tuple._2())); }
@Override public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) { if (!configuration.getBoolean(Constants.GREMLIN_SPARK_PERSIST_CONTEXT, false)) LOGGER.warn("The SparkContext should be persisted in order for the RDD to persist across jobs. To do so, set " + Constants.GREMLIN_SPARK_PERSIST_CONTEXT + " to true"); if (!configuration.containsKey(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)) throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_OUTPUT_LOCATION + " to write the persisted RDD to"); SparkContextStorage.open(configuration).rm(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)); // this might be bad cause it unpersists the job RDD // determine which storage level to persist the RDD as with MEMORY_ONLY being the default cache() final StorageLevel storageLevel = StorageLevel.fromString(configuration.getString(Constants.GREMLIN_SPARK_PERSIST_STORAGE_LEVEL, "MEMORY_ONLY")); if (!configuration.getBoolean(Constants.GREMLIN_HADOOP_GRAPH_WRITER_HAS_EDGES, true)) graphRDD.mapValues(vertex -> { vertex.get().dropEdges(Direction.BOTH); return vertex; }).setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel) // call action to eager store rdd .count(); else graphRDD.setName(Constants.getGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION))).persist(storageLevel) // call action to eager store rdd .count(); Spark.refresh(); // necessary to do really fast so the Spark GC doesn't clear out the RDD }
@Override public JavaPairRDD<WritableComparable, Writable> transform( JavaPairRDD<WritableComparable, Writable> input) { Preconditions.checkArgument(input == null, "AssertionError: MapInput doesn't take any input"); JavaPairRDD<WritableComparable, Writable> result; if (toCache) { result = hadoopRDD.mapToPair(new CopyFunction()); sparkPlan.addCachedRDDId(result.id()); result = result.persist(StorageLevel.MEMORY_AND_DISK()); } else { result = hadoopRDD; } return result; }