@SuppressWarnings("unchecked") @Test public void sparkContextUnion() { // Union of non-specialized JavaRDDs List<String> strings = Arrays.asList("Hello", "World"); JavaRDD<String> s1 = sc.parallelize(strings); JavaRDD<String> s2 = sc.parallelize(strings); // Varargs JavaRDD<String> sUnion = sc.union(s1, s2); assertEquals(4, sUnion.count()); // List List<JavaRDD<String>> list = new ArrayList<>(); list.add(s2); sUnion = sc.union(s1, list); assertEquals(4, sUnion.count()); // Union of JavaDoubleRDDs List<Double> doubles = Arrays.asList(1.0, 2.0); JavaDoubleRDD d1 = sc.parallelizeDoubles(doubles); JavaDoubleRDD d2 = sc.parallelizeDoubles(doubles); JavaDoubleRDD dUnion = sc.union(d1, d2); assertEquals(4, dUnion.count()); // Union of JavaPairRDDs List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(1, 2)); pairs.add(new Tuple2<>(3, 4)); JavaPairRDD<Integer, Integer> p1 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> p2 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> pUnion = sc.union(p1, p2); assertEquals(4, pUnion.count()); }
@SuppressWarnings("unchecked") @Test public void sparkContextUnion() { // Union of non-specialized JavaRDDs List<String> strings = Arrays.asList("Hello", "World"); JavaRDD<String> s1 = sc.parallelize(strings); JavaRDD<String> s2 = sc.parallelize(strings); // Varargs JavaRDD<String> sUnion = sc.union(s1, s2); assertEquals(4, sUnion.count()); // List List<JavaRDD<String>> list = new ArrayList<>(); list.add(s2); sUnion = sc.union(s1, list); assertEquals(4, sUnion.count()); // Union of JavaDoubleRDDs List<Double> doubles = Arrays.asList(1.0, 2.0); JavaDoubleRDD d1 = sc.parallelizeDoubles(doubles); JavaDoubleRDD d2 = sc.parallelizeDoubles(doubles); JavaDoubleRDD dUnion = sc.union(d1, d2); assertEquals(4, dUnion.count()); // Union of JavaPairRDDs List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(1, 2)); pairs.add(new Tuple2<>(3, 4)); JavaPairRDD<Integer, Integer> p1 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> p2 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> pUnion = sc.union(p1, p2); assertEquals(4, pUnion.count()); }
@SuppressWarnings("unchecked") @Test public void sparkContextUnion() { // Union of non-specialized JavaRDDs List<String> strings = Arrays.asList("Hello", "World"); JavaRDD<String> s1 = sc.parallelize(strings); JavaRDD<String> s2 = sc.parallelize(strings); // Varargs JavaRDD<String> sUnion = sc.union(s1, s2); assertEquals(4, sUnion.count()); // List List<JavaRDD<String>> list = new ArrayList<>(); list.add(s2); sUnion = sc.union(s1, list); assertEquals(4, sUnion.count()); // Union of JavaDoubleRDDs List<Double> doubles = Arrays.asList(1.0, 2.0); JavaDoubleRDD d1 = sc.parallelizeDoubles(doubles); JavaDoubleRDD d2 = sc.parallelizeDoubles(doubles); JavaDoubleRDD dUnion = sc.union(d1, d2); assertEquals(4, dUnion.count()); // Union of JavaPairRDDs List<Tuple2<Integer, Integer>> pairs = new ArrayList<>(); pairs.add(new Tuple2<>(1, 2)); pairs.add(new Tuple2<>(3, 4)); JavaPairRDD<Integer, Integer> p1 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> p2 = sc.parallelizePairs(pairs); JavaPairRDD<Integer, Integer> pUnion = sc.union(p1, p2); assertEquals(4, pUnion.count()); }
sc.union(mergingSegs.toArray(new JavaPairRDD[mergingSegs.size()]))
        .reduceByKey(reduceFunction,
                SparkUtil.estimateTotalPartitionNum(cubeStatsReader, envConfig))
        .mapToPair(convertTextFunction)
        .saveAsNewAPIHadoopDataset(job.getConfiguration());

FileOutputFormat.setOutputPath(job, new Path(cuboidOutputPath));

sc.union(mergingSegs.toArray(new JavaPairRDD[mergingSegs.size()]))
        .reduceByKey(reduceFunction,
                SparkUtil.estimateLayerPartitionNum(level, cubeStatsReader, envConfig))
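A compact, hedged sketch of the merge pattern used above: per-segment pair RDDs collected in a list, converted to an array, unioned through the context, and reduced with an explicit partition count. The key/value types, sample data, and fixed partition count are placeholders (the original derives the count from cube statistics via SparkUtil):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class MergeSegmentsSketch {
    @SuppressWarnings("unchecked")
    static JavaPairRDD<String, Long> mergeSegments(JavaSparkContext sc) {
        // Stand-ins for the per-segment RDDs being merged.
        List<JavaPairRDD<String, Long>> mergingSegs = new ArrayList<>();
        mergingSegs.add(sc.parallelizePairs(Arrays.asList(new Tuple2<>("cuboid-row", 1L))));
        mergingSegs.add(sc.parallelizePairs(Arrays.asList(new Tuple2<>("cuboid-row", 2L))));

        // Union all segments, then reduce by key with an explicit partition count.
        JavaPairRDD<String, Long> unioned =
                sc.union(mergingSegs.toArray(new JavaPairRDD[mergingSegs.size()]));
        int partitions = 10; // placeholder; the original estimates this from cube statistics
        return unioned.reduceByKey((Long a, Long b) -> a + b, partitions);
    }
}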
static JavaRDD<String> readInputFiles(JavaSparkContext ctx, String filename) throws Exception {
    List<String> biosetFiles = toList(filename);
    int counter = 0;
    JavaRDD[] rdds = new JavaRDD[biosetFiles.size()];
    for (String biosetFileName : biosetFiles) {
        System.out.println("readInputFiles(): biosetFileName=" + biosetFileName);
        JavaRDD<String> record = ctx.textFile(biosetFileName);
        rdds[counter] = record;
        counter++;
    }
    JavaRDD<String> allBiosets = ctx.union(rdds);
    return allBiosets;
    //return allBiosets.coalesce(9, false);
} // readInputFiles
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
        JavaSparkContext context,
        JavaPairRDD<String, Long>[] ranks) {
    JavaPairRDD<String, Long> unionRDD = context.union(ranks);

    // next find unique keys, with their associated copa scores
    JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

    // next calculate ranked products and the number of elements
    JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts = groupedByGeneRDD.mapValues(
        // input: copa scores for all studies
        // output: (rankedProduct, N)
        (Iterable<Long> values) -> {
            int N = 0;
            long products = 1;
            for (Long v : values) {
                products *= v;
                N++;
            }
            double rankedProduct = Math.pow((double) products, 1.0 / ((double) N));
            return new Tuple2<Double, Integer>(rankedProduct, N);
        }
    );
    return rankedProducts;
}
private static JavaRDD<String> readInputFiles(JavaSparkContext ctx, String filename) throws Exception {
    List<String> biosetFiles = toList(filename);
    int counter = 0;
    JavaRDD[] rdds = new JavaRDD[biosetFiles.size()];
    for (String biosetFileName : biosetFiles) {
        System.out.println("debug1 biosetFileName=" + biosetFileName);
        JavaRDD<String> record = ctx.textFile(biosetFileName);
        rdds[counter] = record;
        counter++;
    }
    JavaRDD<String> allBiosets = ctx.union(rdds);
    return allBiosets.coalesce(9, false);
}
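A hedged driver sketch showing how readInputFiles might be invoked; the master URL, app name, and the list-of-paths file are placeholders, and toList is assumed to return one input path per line of that file:

// Belongs in the same class that defines readInputFiles; names below are placeholders.
public static void main(String[] args) throws Exception {
    JavaSparkContext ctx = new JavaSparkContext("local[*]", "ReadBiosets");
    // Hypothetical control file listing one bioset input path per line.
    JavaRDD<String> allBiosets = readInputFiles(ctx, "/tmp/bioset_files.txt");
    System.out.println("total lines across all biosets = " + allBiosets.count());
    ctx.stop();
}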
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProducts(
        JavaSparkContext context,
        JavaPairRDD<String, Long>[] ranks) {
    JavaPairRDD<String, Long> unionRDD = context.union(ranks);

    // next find unique keys, with their associated copa scores
    JavaPairRDD<String, Iterable<Long>> groupedByGeneRDD = unionRDD.groupByKey();

    // next calculate ranked products and the number of elements
    JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts = groupedByGeneRDD.mapValues(
        new Function<
            Iterable<Long>,          // input: copa scores for all studies
            Tuple2<Double, Integer>  // output: (rankedProduct, N)
        >() {
            @Override
            public Tuple2<Double, Integer> call(Iterable<Long> values) {
                int N = 0;
                long products = 1;
                for (Long v : values) {
                    products *= v;
                    N++;
                }
                double rankedProduct = Math.pow((double) products, 1.0 / ((double) N));
                return new Tuple2<Double, Integer>(rankedProduct, N);
            }
        });
    return rankedProducts;
}
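A small, hedged usage sketch for computeRankedProducts(); the sample study data, gene names, and the helper method wrapping the call are illustrative, since the original callers are not shown:

// Assumed to live in the same class as computeRankedProducts().
static void rankedProductsExample(JavaSparkContext context) {
    @SuppressWarnings("unchecked")
    JavaPairRDD<String, Long>[] ranks = new JavaPairRDD[2];
    // One RDD of (gene, rank) per study; the genes and ranks are made up.
    ranks[0] = context.parallelizePairs(Arrays.asList(
            new Tuple2<>("geneA", 1L), new Tuple2<>("geneB", 3L)));
    ranks[1] = context.parallelizePairs(Arrays.asList(
            new Tuple2<>("geneA", 2L), new Tuple2<>("geneB", 4L)));

    JavaPairRDD<String, Tuple2<Double, Integer>> rankedProducts =
            computeRankedProducts(context, ranks);
    // Expected: geneA -> (sqrt(1*2), 2), geneB -> (sqrt(3*4), 2)
    rankedProducts.collect().forEach(t -> System.out.println(t));
}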
static JavaPairRDD<String, Tuple2<Double, Integer>> computeRankedProductsUsingCombineByKey(
        JavaSparkContext context,
        JavaPairRDD<String, Long>[] ranks) {
    JavaPairRDD<String, Long> unionRDD = context.union(ranks);
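The combineByKey variant above is cut off right after the union; the original body is not shown. A hedged standalone sketch of one way such a method could work, aggregating (product, count) per key without materializing groups and then taking the N-th root. This is an assumption, not the original implementation, and the method name is hypothetical:

static JavaPairRDD<String, Tuple2<Double, Integer>> rankedProductsViaCombineByKeySketch(
        JavaSparkContext context,
        JavaPairRDD<String, Long>[] ranks) {
    JavaPairRDD<String, Long> unionRDD = context.union(ranks);

    // Aggregate (product of ranks, count of ranks) per key.
    JavaPairRDD<String, Tuple2<Long, Integer>> productAndCount = unionRDD.combineByKey(
            // createCombiner: start (product, count) from a single rank
            (Long v) -> new Tuple2<>(v, 1),
            // mergeValue: fold one more rank into the running (product, count)
            (Tuple2<Long, Integer> acc, Long v) -> new Tuple2<>(acc._1() * v, acc._2() + 1),
            // mergeCombiners: combine partial (product, count) pairs across partitions
            (Tuple2<Long, Integer> a, Tuple2<Long, Integer> b) ->
                    new Tuple2<>(a._1() * b._1(), a._2() + b._2()));

    // rankedProduct = product^(1/N), paired with N as in the groupByKey version.
    return productAndCount.mapValues((Tuple2<Long, Integer> pc) ->
            new Tuple2<>(Math.pow(pc._1().doubleValue(), 1.0 / pc._2().doubleValue()), pc._2()));
}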
private JavaRDDLike<?, ?> getJavaRDDLikeInternal(SparkRuntime runtime) {
    List<PCollectionImpl<?>> parents = getParents();
    JavaRDD[] rdds = new JavaRDD[parents.size()];
    for (int i = 0; i < rdds.length; i++) {
        if (parents.get(i) instanceof PTableBase) {
            JavaPairRDD prdd = (JavaPairRDD) ((SparkCollection) parents.get(i)).getJavaRDDLike(runtime);
            rdds[i] = prdd.mapPartitions(new FlatMapPairDoFn(IdentityFn.getInstance(), runtime.getRuntimeContext()));
        } else {
            rdds[i] = (JavaRDD) ((SparkCollection) parents.get(i)).getJavaRDDLike(runtime);
        }
    }
    return runtime.getSparkContext().union(rdds);
}
}
private JavaRDDLike<?, ?> getJavaRDDLikeInternal(SparkRuntime runtime) {
    List<PCollectionImpl<?>> parents = getParents();
    JavaPairRDD[] rdds = new JavaPairRDD[parents.size()];
    for (int i = 0; i < rdds.length; i++) {
        if (parents.get(i) instanceof PTableBase) {
            rdds[i] = (JavaPairRDD) ((SparkCollection) parents.get(i)).getJavaRDDLike(runtime);
        } else {
            JavaRDD rdd = (JavaRDD) ((SparkCollection) parents.get(i)).getJavaRDDLike(runtime);
            rdds[i] = rdd.mapPartitionsToPair(new PairFlatMapDoFn(IdentityFn.getInstance(), runtime.getRuntimeContext()));
        }
    }
    return runtime.getSparkContext().union(rdds);
}
}
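Both Crunch variants above normalize every parent collection to a single RDD type before calling union. A small, hedged standalone sketch of that idea with plain Spark types, using JavaPairRDD.fromJavaRDD to adapt a tuple-bearing JavaRDD so it can be unioned with an existing pair RDD; the data and names are illustrative:

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class UnionMixedSketch {
    static JavaPairRDD<String, Integer> unionMixed(JavaSparkContext sc) {
        // A pair RDD and a plain RDD that happens to hold tuples.
        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(
                Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)));
        JavaRDD<Tuple2<String, Integer>> plain = sc.parallelize(
                Arrays.asList(new Tuple2<>("c", 3)));
        // Adapt the plain RDD to the pair type, then union them as one type.
        JavaPairRDD<String, Integer> adapted = JavaPairRDD.fromJavaRDD(plain);
        return pairs.union(adapted);
    }
}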
combinedResults = javaSC.union(combinedResults, tierMatches);
combinedResults = combinedResults.reduceByKey((id1, id2) -> id1);
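The fragment above unions a new batch of tier matches into the accumulated results and then keeps a single value per key. A hedged, self-contained sketch of that union-then-dedup-by-key pattern; the data, value type, and class name are illustrative, and JavaPairRDD.union is used here instead of the context overload purely to keep the sketch version-agnostic:

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class UnionDedupSketch {
    public static void main(String[] args) {
        JavaSparkContext javaSC = new JavaSparkContext("local[*]", "UnionDedupSketch");

        JavaPairRDD<String, Long> combinedResults = javaSC.parallelizePairs(
                Arrays.asList(new Tuple2<>("rec1", 100L), new Tuple2<>("rec2", 200L)));
        JavaPairRDD<String, Long> tierMatches = javaSC.parallelizePairs(
                Arrays.asList(new Tuple2<>("rec2", 999L), new Tuple2<>("rec3", 300L)));

        // Union the two pair RDDs, then keep one id per key
        // (which duplicate wins is arbitrary, as in the original fragment).
        JavaPairRDD<String, Long> deduped = combinedResults
                .union(tierMatches)
                .reduceByKey((id1, id2) -> id1);

        deduped.collect().forEach(t -> System.out.println(t));
        javaSC.stop();
    }
}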