@Test
public void binaryFilesCaching() throws Exception {
  // Reusing the wholeTextFiles example
  byte[] content1 = "spark is easy to use.\n".getBytes(StandardCharsets.UTF_8);
  String tempDirName = tempDir.getAbsolutePath();
  File file1 = new File(tempDirName + "/part-00000");
  FileOutputStream fos1 = new FileOutputStream(file1);
  FileChannel channel1 = fos1.getChannel();
  ByteBuffer bbuf = ByteBuffer.wrap(content1);
  channel1.write(bbuf);
  channel1.close();

  JavaPairRDD<String, PortableDataStream> readRDD = sc.binaryFiles(tempDirName).cache();
  readRDD.foreach(pair -> pair._2().toArray()); // force the file to be read
  List<Tuple2<String, PortableDataStream>> result = readRDD.collect();
  for (Tuple2<String, PortableDataStream> res : result) {
    assertArrayEquals(content1, res._2().toArray());
  }
}
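PortableDataStream defers reading until toArray() (or open()) is called, so the foreach above is what actually pulls each file through once before collect() returns entries from the cached RDD. A minimal standalone sketch of the same read-then-reuse pattern, assuming a local context; the input path is hypothetical:

JavaSparkContext jsc = new JavaSparkContext("local[2]", "binaryFilesSketch");
JavaPairRDD<String, PortableDataStream> files =
    jsc.binaryFiles("/tmp/binary-input").cache(); // hypothetical directory
files.foreach(pair -> pair._2().toArray());       // read every stream once
long fileCount = files.count();                   // reuses the cached pairs
jsc.stop();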
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(Integer::doubleValue).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
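Each collect() here is the action that actually computes and stores the cached RDD. A hedged sketch showing that effect directly, counting map invocations with a LongAccumulator (assumes local mode, the cached data fitting in memory, and an org.apache.spark.util.LongAccumulator import):

JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
LongAccumulator calls = sc.sc().longAccumulator("mapCalls");
JavaRDD<Integer> squares = nums.map(x -> { calls.add(1); return x * x; }).cache();
squares.collect(); // first action computes and fills the cache: calls.value() == 5
squares.collect(); // second action reads the cache: calls.value() is still 5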
/**
 * Cache the result RDD, since K-Means is an iterative machine learning
 * algorithm and the result will be used many times.
 *
 * @param wikiData path to the featurized data
 * @param context a Java Spark context object
 * @return JavaPairRDD<String, Vector>, where K is <project_code> + " " + <page_title>
 *         and V is a Vector of features
 */
static JavaPairRDD<String, Vector> getFeatureizedData(String wikiData, JavaSparkContext context) {
  JavaPairRDD<String, Vector> data = context.textFile(wikiData).mapToPair(
      new PairFunction<String, String, Vector>() {
        @Override
        public Tuple2<String, Vector> call(String in) throws Exception {
          // in: <key><#><feature_1><,><feature_2><,>...<,><feature_24>
          String[] parts = StringUtils.split(in, "#");
          return new Tuple2<String, Vector>(parts[0], Util.buildVector(parts[1], ","));
        }
      }).cache();
  return data;
}
/**
 * Cache the underlying RDD.
 *
 * @return the cached {@link StructureDataRDD}
 */
public StructureDataRDD cache() {
  javaPairRdd.cache();
  return this;
}
/**
 * Cache the data. Useful when the user wants to run multiple analyses over the
 * same data.
 *
 * @return the cached data object.
 */
public SegmentDataRDD cache() {
  return new SegmentDataRDD(segmentRDD.cache());
}
@Override
public MPairStream<T, U> cache() {
  return new SparkPairStream<>(rdd.cache());
}
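The three wrapper snippets above share one pattern: cache() delegates to the wrapped RDD and hands back a wrapper so calls keep chaining. A minimal sketch of that pattern with a hypothetical TextRDD class:

// Hypothetical fluent wrapper around a JavaRDD<String>.
public final class TextRDD {
  private final JavaRDD<String> rdd;

  public TextRDD(JavaRDD<String> rdd) {
    this.rdd = rdd;
  }

  // Delegate to the wrapped RDD, then return the wrapper so callers can
  // keep chaining: new TextRDD(lines).cache().count()
  public TextRDD cache() {
    rdd.cache();
    return this;
  }

  public long count() {
    return rdd.count();
  }
}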
/**
 * Cache the result RDD, since K-Means is an iterative machine learning
 * algorithm and the result will be used many times.
 *
 * @param wikiData path to the featurized data
 * @param context a Java Spark context object
 * @return JavaPairRDD<String, Vector>, where K is <project_code> + " " + <page_title>
 *         and V is a Vector of features
 */
static JavaPairRDD<String, Vector> getFeatureizedData(String wikiData, JavaSparkContext context) {
  return context.textFile(wikiData).mapToPair(
      (PairFunction<String, String, Vector>) in -> {
        // in: <key><#><feature_1><,><feature_2><,>...<,><feature_24>
        String[] parts = StringUtils.split(in, "#");
        return new Tuple2<>(parts[0], Util.buildVector(parts[1], ","));
      }).cache();
}
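A hedged usage sketch for the method above: the cached vectors feed an iterative MLlib K-Means fit, so every pass over the data reads memory instead of re-parsing the text file (assumes org.apache.spark.mllib.clustering.KMeans and MLlib's Vector; k and the iteration count are illustrative):

JavaPairRDD<String, Vector> data = getFeatureizedData(wikiData, context);
// K-Means sweeps the same vectors once per iteration; the cache() inside
// getFeatureizedData keeps them from being recomputed on each sweep.
KMeansModel model = KMeans.train(data.values().rdd(), 10, 20);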
instances.cache();
// Fragment restored to a full statement; `lines` is the assumed input RDD of
// "<source> <target>" pairs, as in Spark's JavaPageRank example.
JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
  String[] parts = SPACES.split(s);
  return new Tuple2<>(parts[0], parts[1]);
}).distinct().groupByKey().cache();
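That links table is the textbook candidate for caching: a PageRank-style job joins it against fresh ranks on every iteration. A sketch of such a loop, under the assumption that links was built as above (iteration count and damping factor are the usual illustrative values; assumes java.util.ArrayList, java.util.List, and scala.Tuple2 imports):

JavaPairRDD<String, Double> ranks = links.mapValues(v -> 1.0);
for (int i = 0; i < 10; i++) {
  // Each pass re-reads the cached `links` instead of re-parsing the input.
  ranks = links.join(ranks).values()
      .flatMapToPair(pair -> {
        int fanOut = 0;
        for (String ignored : pair._1()) {
          fanOut++;
        }
        List<Tuple2<String, Double>> contribs = new ArrayList<>();
        for (String dest : pair._1()) {
          contribs.add(new Tuple2<>(dest, pair._2() / fanOut));
        }
        return contribs.iterator();
      })
      .reduceByKey(Double::sum)
      .mapValues(sum -> 0.15 + 0.85 * sum);
}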
long numVariables = oneFullAssignment.cache().count();
config.endLog("[gaussianFitApp] oneFullAssignment");
constraints.A().cache().count();
constraints.B().cache().count();
long totalNumConstraints = constraints.C().cache().count();
config.endLog("[gaussianFitApp] constraints generation");
System.out.println("[numconstraints: " + totalNumConstraints + "]");
long numVariables2 = oneFullAssignment.cache().count();
config.endLog("[matmulApp] oneFullAssignment");
constraints.A().cache().count();
constraints.B().cache().count();
long totalNumConstraints = constraints.C().cache().count();
config.endLog("[matmulApp] constraints generation");
System.out.println("[numconstraints: " + totalNumConstraints + "]");
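Both blocks above lean on the same materialization trick: cache() is lazy, so a cheap count() action is issued purely to force the computation inside the timed region. A minimal sketch of the idiom (input, slowTransform, and the timing style are illustrative):

JavaRDD<Double> expensive = input.map(x -> slowTransform(x)).cache();
long start = System.nanoTime();
expensive.count(); // forces evaluation and populates the cache
System.out.println("materialized in " + (System.nanoTime() - start) / 1e9 + " s");
// Later actions on `expensive` read the cached partitions instead of recomputing.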
rdd.getRDD().cache();