private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
            .map(new Function<Text, String[]>() {
                @Override
                public String[] call(Text text) throws Exception {
                    String s = Bytes.toString(text.getBytes(), 0, text.getLength());
                    return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
                }
            });
}
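// Hedged usage sketch (not from the source): wiring getSequenceFormatHiveInput into a
// small driver, assuming the sketch lives in the same class as the helper. The master,
// app name, and input path are illustrative assumptions.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("sequence-input-demo").setMaster("local[2]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        // Each element is one Hive row, already split into column values.
        JavaRDD<String[]> rows = getSequenceFormatHiveInput(sc, "hdfs:///tmp/hive_table_seq");
        System.out.println("rows: " + rows.count());
    }
}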
private SparkStageInfo getStageInfo(int stageId) {
    return sparkContext.statusTracker().getStageInfo(stageId);
}
private SparkJobInfo getJobInfo() {
    return sparkContext.statusTracker().getJobInfo(jobId);
}
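// Hedged sketch (an assumption, not from the source): polling the status tracker, in the
// style of the two helpers above, to report progress of an asynchronously submitted job.
// The countAsync job and the one-second poll interval are illustrative only.
import java.util.Arrays;
import java.util.concurrent.TimeUnit;
import org.apache.spark.SparkJobInfo;
import org.apache.spark.SparkStageInfo;
import org.apache.spark.api.java.JavaFutureAction;
import org.apache.spark.api.java.JavaSparkContext;

static void reportProgress(JavaSparkContext sc) throws InterruptedException {
    JavaFutureAction<Long> job = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 5).countAsync();
    while (!job.isDone()) {
        TimeUnit.SECONDS.sleep(1);
        for (int jobId : job.jobIds()) {
            SparkJobInfo jobInfo = sc.statusTracker().getJobInfo(jobId);
            if (jobInfo == null) {
                continue; // the job may not be registered with the tracker yet
            }
            for (int stageId : jobInfo.stageIds()) {
                SparkStageInfo stageInfo = sc.statusTracker().getStageInfo(stageId);
                if (stageInfo != null) {
                    System.out.println("stage " + stageId + " [" + stageInfo.name() + "]: "
                        + stageInfo.numCompletedTasks() + "/" + stageInfo.numTasks() + " tasks complete");
                }
            }
        }
    }
}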
public void run(String master, String csv1, String csv2) throws Exception {
    JavaSparkContext sc = new JavaSparkContext(
        master, "basicjoincsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> csvFile1 = sc.textFile(csv1);
    JavaRDD<String> csvFile2 = sc.textFile(csv2);
    JavaPairRDD<Integer, String[]> keyedRDD1 = csvFile1.mapToPair(new ParseLine());
    // Parse the second file, not csvFile1 again, so the join really is between the two inputs.
    JavaPairRDD<Integer, String[]> keyedRDD2 = csvFile2.mapToPair(new ParseLine());
    JavaPairRDD<Integer, Tuple2<String[], String[]>> result = keyedRDD1.join(keyedRDD2);
    List<Tuple2<Integer, Tuple2<String[], String[]>>> resultCollection = result.collect();
}
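// ParseLine is referenced above but not shown. A minimal sketch, assuming each line is
// comma-separated and the join key is an integer in the first column (both assumptions;
// the real parser may use a CSV library and key on a different field).
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

public static class ParseLine implements PairFunction<String, Integer, String[]> {
    @Override
    public Tuple2<Integer, String[]> call(String line) {
        String[] fields = line.split(",", -1);
        return new Tuple2<>(Integer.parseInt(fields[0]), fields);
    }
}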
private static Map<String, Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD, boolean user) {
    int offset = user ? 0 : 1;
    Map<String, Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
        .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
        .zipWithIndex().mapValues(Long::intValue)
        .collectAsMap();
    // Clone, due to some serialization problems with the result of collectAsMap?
    return new HashMap<>(reverseIDLookup);
}
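// Hedged usage sketch (assumption): building both lookup tables from the parsed rows
// before converting string IDs to the integer indices ALS-style models expect. Per the
// offsets above, column 0 holds user IDs and column 1 holds item IDs.
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;

static void buildLookups(JavaRDD<String[]> parsedRDD) {
    Map<String, Integer> userIndex = buildIDIndexMapping(parsedRDD, true);
    Map<String, Integer> itemIndex = buildIDIndexMapping(parsedRDD, false);
    System.out.println(userIndex.size() + " distinct users, " + itemIndex.size() + " distinct items");
}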
/**
 * @param trainPointData data to cluster
 * @param model trained KMeans model
 * @return map from cluster ID to the count of points assigned to that cluster
 */
private static Map<Integer, Long> fetchClusterCountsFromModel(JavaRDD<? extends Vector> trainPointData,
                                                              KMeansModel model) {
    return trainPointData.map(model::predict).countByValue();
}
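// Hedged sketch (assumption, not from the source): training a model with KMeans.train and
// printing per-cluster point counts via the helper above. k = 10 clusters and 20 iterations
// are illustrative hyperparameters only.
import java.util.Map;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;

static void printClusterSizes(JavaRDD<Vector> trainPointData) {
    KMeansModel model = KMeans.train(trainPointData.rdd(), 10, 20);
    Map<Integer, Long> counts = fetchClusterCountsFromModel(trainPointData, model);
    counts.forEach((clusterId, count) ->
        System.out.println("cluster " + clusterId + ": " + count + " points"));
}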
@Override
double evaluate(JavaRDD<Vector> evalData) {
    return fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getSumSquaredDist).sum();
}
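// Hedged aside: the same objective (total within-cluster sum of squared distances) is
// exposed directly by MLlib as KMeansModel.computeCost; a sketch of the equivalent call,
// assuming a trained model is in scope.
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;

static double evaluateWithComputeCost(KMeansModel model, JavaRDD<Vector> evalData) {
    return model.computeCost(evalData.rdd());
}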
private static JavaPairRDD<Integer, Iterable<Rating>> predictAll(
        MatrixFactorizationModel mfModel,
        JavaRDD<Rating> data,
        JavaPairRDD<Integer, Integer> userProducts) {
    @SuppressWarnings("unchecked")
    RDD<Tuple2<Object, Object>> userProductsRDD = (RDD<Tuple2<Object, Object>>) (RDD<?>) userProducts.rdd();
    return data.wrapRDD(mfModel.predict(userProductsRDD)).groupBy(Rating::user);
}
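// Hedged sketch (assumption): deriving the (user, product) pairs fed to predictAll from the
// known ratings, then grouping predictions per user via the helper above, assuming this
// method sits in the same class.
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;
import scala.Tuple2;

static JavaPairRDD<Integer, Iterable<Rating>> predictKnownPairs(MatrixFactorizationModel mfModel,
                                                                JavaRDD<Rating> data) {
    JavaPairRDD<Integer, Integer> userProducts =
        data.mapToPair(rating -> new Tuple2<>(rating.user(), rating.product()));
    return predictAll(mfModel, data, userProducts);
}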
@Override
public JavaPairRDD<HiveKey, Iterable<BytesWritable>> shuffle(
        JavaPairRDD<HiveKey, BytesWritable> input, int numPartitions) {
    if (numPartitions > 0) {
        return input.groupByKey(numPartitions);
    }
    return input.groupByKey();
}
public static <K, V> JavaPairRDD<K, V> intersectByKey(JavaPairRDD<K, V> rdd1, JavaPairRDD<K, V> rdd2) {
    JavaPairRDD<K, Tuple2<Iterable<V>, Iterable<V>>> grouped = rdd1.cogroup(rdd2);
    return grouped.flatMapValues(new Function<Tuple2<Iterable<V>, Iterable<V>>, Iterable<V>>() {
        @Override
        public Iterable<V> call(Tuple2<Iterable<V>, Iterable<V>> input) {
            ArrayList<V> al = new ArrayList<V>();
            // Keep values only for keys that appear in both RDDs.
            if (!Iterables.isEmpty(input._1()) && !Iterables.isEmpty(input._2())) {
                Iterables.addAll(al, input._1());
                Iterables.addAll(al, input._2());
            }
            return al;
        }
    });
}

public static void main(String[] args) throws Exception {
    // The body was truncated in the source; a minimal hedged completion exercising
    // intersectByKey on two small in-memory pair RDDs (the data values are illustrative only).
    try (JavaSparkContext sc = new JavaSparkContext("local[2]", "intersectbykey")) {
        JavaPairRDD<String, Integer> rdd1 = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>("coffee", 1), new Tuple2<>("coffee", 2), new Tuple2<>("pandas", 3)));
        JavaPairRDD<String, Integer> rdd2 = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>("coffee", 9)));
        System.out.println(intersectByKey(rdd1, rdd2).collect());
    }
}
public Void call(JavaRDD<String> rdd) {
    // Body of a foreachRDD callback: grab up to the first 100 IP addresses from this batch.
    List<String> currentIPAddresses = rdd.take(100);
    return null;
}});
@Override
public JavaPairRDD<HiveKey, BytesWritable> doTransform(JavaPairRDD<HiveKey, V> input) {
    return input.mapPartitionsToPair(reduceFunc);
}
/**
 * @return an empty {@code Optional}
 */
public static <T> Optional<T> absent() {
    return empty();
}
public Void call(JavaPairRDD<Integer, Long> rdd) {
    // Body of a foreachRDD callback: snapshot up to the first 100 response-code counts.
    currentResponseCodeCounts = rdd.take(100);
    return null;
}});
/**
 * @param value non-null value to wrap
 * @return {@code Optional} wrapping this value
 * @throws NullPointerException if value is null
 */
public static <T> Optional<T> of(T value) {
    // Enforce the documented non-null contract before wrapping.
    if (value == null) {
        throw new NullPointerException("value is null");
    }
    return new Optional<>(value);
}
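// Hedged usage sketch mirroring the javadoc contracts above: of(...) wraps a non-null
// value, absent() yields an empty Optional, and or(...) supplies a fallback.
static void optionalDemo() {
    Optional<String> present = Optional.of("value");
    Optional<String> missing = Optional.absent();
    System.out.println(present.isPresent()); // true
    System.out.println(missing.isPresent()); // false
    System.out.println(missing.or("fallback")); // prints "fallback"
}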
@Override
public JavaPairRDD<HiveKey, BytesWritable> doTransform(JavaPairRDD<BytesWritable, BytesWritable> input) {
    return input.mapPartitionsToPair(mapFunc);
}