/**
 * Counts, for every token, how many distinct other tokens share at least one line with it.
 * Keys of {@code data} are ignored; each value is treated as one line and split on a single
 * space character.
 *
 * @param data pairs whose values are the lines to analyze
 * @return map from token to the number of distinct other tokens it co-occurs with; a token that
 *  never appears alongside a different token has no entry
 */
public static Map<String,Integer> countDistinctOtherWords(JavaPairRDD<String,String> data) {
  // Emit every ordered (word, other) pair of different tokens found on the same line
  JavaPairRDD<String,String> coOccurrences = data.values().flatMapToPair(line -> {
    Set<String> lineWords = new HashSet<>(Arrays.asList(line.split(" ")));
    return lineWords.stream()
        .flatMap(word -> lineWords.stream()
            .filter(other -> !word.equals(other))
            .map(other -> new Tuple2<>(word, other)))
        .iterator();
  });

  // De-duplicate pairs across lines so each (word, other) combination counts exactly once
  JavaPairRDD<String,Integer> onePerPair = coOccurrences.distinct().mapValues(other -> 1);

  return onePerPair.reduceByKey(Integer::sum).collectAsMap();
}
/**
 * Reads a sequence file of {@code BytesWritable} keys and {@code Text} values, discards the
 * keys, and splits every value into its fields.
 *
 * @param sc Spark context used to open the sequence file
 * @param inputPath location of the sequence file
 * @return one {@code String[]} of fields per value in the file
 */
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    JavaRDD<Text> rows = sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values();
    return rows.map(new Function<Text, String[]>() {
        @Override
        public String[] call(Text text) throws Exception {
            // Decode only the first getLength() bytes of the buffer
            // NOTE(review): presumably the backing array can be longer than the content - confirm
            byte[] raw = text.getBytes();
            int validLength = text.getLength();
            String row = Bytes.toString(raw, 0, validLength);
            // Limit of -1 keeps trailing empty fields instead of dropping them
            return row.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
        }
    });
}
/**
 * Collects the values of the new data, ignoring keys, as the updates to publish.
 *
 * @param newData key/value pairs of newly arrived data
 * @return all values of {@code newData}, collected from the RDD
 */
@Override
public Iterable<String> buildUpdates(JavaPairRDD<String,String> newData) {
  Iterable<String> updates = newData.values().collect();
  return updates;
}
/**
 * @param evalData data for evaluation
 * @return the total, over all cluster metrics computed for {@code evalData}, of each metric's
 *  sum of squared distances
 */
@Override
double evaluate(JavaRDD<Vector> evalData) {
  double totalSumSquaredDist = fetchClusterMetrics(evalData)
      .values()
      .mapToDouble(ClusterMetric::getSumSquaredDist)
      .sum();
  return totalSumSquaredDist;
}
/** * @param evalData data for evaluation * @return the Dunn Index of a given clustering * (https://en.wikipedia.org/wiki/Cluster_analysis#Internal_evaluation); higher is better */ @Override double evaluate(JavaRDD<Vector> evalData) { // Intra-cluster distance is mean distance to centroid double maxIntraClusterDistance = fetchClusterMetrics(evalData).values().mapToDouble(ClusterMetric::getMeanDist).max(); // Inter-cluster distance is distance between centroids double minInterClusterDistance = Double.POSITIVE_INFINITY; List<ClusterInfo> clusters = new ArrayList<>(getClustersByID().values()); DistanceFn<double[]> distanceFn = getDistanceFn(); for (int i = 0; i < clusters.size(); i++) { double[] centerI = clusters.get(i).getCenter(); // Distances are symmetric, hence d(i,j) == d(j,i) for (int j = i + 1; j < clusters.size(); j++) { double[] centerJ = clusters.get(j).getCenter(); minInterClusterDistance = Math.min(minInterClusterDistance, distanceFn.applyAsDouble(centerI, centerJ)); } } return minInterClusterDistance / maxIntraClusterDistance; }
// Sort the pairs by key, then drop the keys and return only the values.
return timestampRatingRDD.sortByKey().values();
/**
 * Computes the root mean squared error between each {@link Rating#rating()} in the test data
 * and the value the model predicts for the same (user, product) pair.
 *
 * @param mfModel model used to predict a rating for every (user, product) in the test data
 * @param testData ratings to compare the model's predictions against
 * @return square root of the mean squared difference between predicted and test ratings
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  // Test ratings keyed by (user, product)
  JavaPairRDD<Tuple2<Integer,Integer>,Double> actualByUserProduct = testData.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  // The model's predict() takes a Scala RDD of (Object,Object), hence the double cast
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> userProductPairs =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) actualByUserProduct.keys().rdd();

  JavaRDD<Rating> predicted = testData.wrapRDD(mfModel.predict(userProductPairs));

  // Predicted ratings keyed the same way, so they can be joined to the test ratings
  JavaPairRDD<Tuple2<Integer,Integer>,Double> predictedByUserProduct = predicted.mapToPair(
      r -> new Tuple2<>(new Tuple2<>(r.user(), r.product()), r.rating()));

  double meanSquaredError = predictedByUserProduct
      .join(actualByUserProduct)
      .values()
      .mapToDouble(predictedAndActual -> {
        double error = predictedAndActual._1() - predictedAndActual._2();
        return error * error;
      })
      .mean();

  return Math.sqrt(meanSquaredError);
}
/** Checks that {@code zipWithUniqueId()} assigns a different id to each of four elements. */
@Test
public void zipWithUniqueId() {
  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Long> zipped = input.zipWithUniqueId();
  // Collapsing the ids into a set reveals any duplicates
  HashSet<Long> distinctIds = new HashSet<>(zipped.values().collect());
  assertEquals(4, distinctIds.size());
}
/** Checks that {@code zipWithUniqueId()} assigns a different id to each of four elements. */
@Test
public void zipWithUniqueId() {
  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Long> zipped = input.zipWithUniqueId();
  // Collapsing the ids into a set reveals any duplicates
  HashSet<Long> distinctIds = new HashSet<>(zipped.values().collect());
  assertEquals(4, distinctIds.size());
}
/** Checks that {@code zipWithUniqueId()} assigns a different id to each of four elements. */
@Test
public void zipWithUniqueId() {
  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Long> zipped = input.zipWithUniqueId();
  // Collapsing the ids into a set reveals any duplicates
  HashSet<Long> distinctIds = new HashSet<>(zipped.values().collect());
  assertEquals(4, distinctIds.size());
}
return newData.values().map(MLFunctions.PARSE_FN).mapToPair(data -> { try { double[] featureVector = KMeansUtils.featuresFromTokens(data, inputSchema);
// Drop the keys, parse each value with MLFunctions.PARSE_FN, then turn the parsed data into an
// Example using the input schema and value encodings.
JavaRDD<Example> examplesRDD = newData.values().map(MLFunctions.PARSE_FN). map(data -> ExampleUtils.dataToExample(data, inputSchema, valueEncodings));
/** Checks that {@code zipWithIndex()} pairs four elements with the indexes 0 through 3, in order. */
@Test
public void zipWithIndex() {
  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Long> indexed = input.zipWithIndex();
  List<Long> actualIndexes = indexed.values().collect();
  List<Long> expectedIndexes = Arrays.asList(0L, 1L, 2L, 3L);
  assertEquals(expectedIndexes, actualIndexes);
}
/** Checks that {@code zipWithIndex()} pairs four elements with the indexes 0 through 3, in order. */
@Test
public void zipWithIndex() {
  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Long> indexed = input.zipWithIndex();
  List<Long> actualIndexes = indexed.values().collect();
  List<Long> expectedIndexes = Arrays.asList(0L, 1L, 2L, 3L);
  assertEquals(expectedIndexes, actualIndexes);
}
/** Checks that {@code zipWithIndex()} pairs four elements with the indexes 0 through 3, in order. */
@Test
public void zipWithIndex() {
  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaPairRDD<Integer, Long> indexed = input.zipWithIndex();
  List<Long> actualIndexes = indexed.values().collect();
  List<Long> expectedIndexes = Arrays.asList(0L, 1L, 2L, 3L);
  assertEquals(expectedIndexes, actualIndexes);
}
newData.values().sortBy(MLFunctions.TO_TIMESTAMP_FN, true, newData.partitions().size()); JavaPairRDD<Tuple2<String,String>,Double> tuples = sortedValues.mapToPair(line -> { try {
String distScriptName = "finddistance.R"; sc.addFile(distScript); JavaRDD<String> pipeInputs = contactsContactLists.values().map(new VerifyCallLogs()).flatMap( new FlatMapFunction<CallLog[], String>() { public Iterable<String> call(CallLog[] calls) { ArrayList<String> latLons = new ArrayList<String>();
final JavaRDD<Traverser.Admin<Object>> nextRDD = inputRDD.values()