/**
 * @param trainPointData data to cluster
 * @param model trained KMeans model
 * @return map of cluster ID to count of points assigned to that cluster
 */
private static Map<Integer,Long> fetchClusterCountsFromModel(
    JavaRDD<? extends Vector> trainPointData, KMeansModel model) {
  return trainPointData.map(model::predict).countByValue();
}
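// A minimal usage sketch for the helper above (not from the original source):
// `points`, the method name, and the k/iteration values are illustrative
// assumptions. KMeans.train and KMeansModel are the standard MLlib entry points
// (org.apache.spark.mllib.clustering.KMeans / KMeansModel).
private static void printClusterCounts(JavaRDD<Vector> points) {
  KMeansModel model = KMeans.train(points.rdd(), 2, 20); // k = 2, 20 iterations
  Map<Integer,Long> counts = fetchClusterCountsFromModel(points, model);
  counts.forEach((clusterId, n) ->
      System.out.println("cluster " + clusterId + ": " + n + " points"));
}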
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(Integer::doubleValue).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
  return parsedRDD.map(data -> {
    try {
      return Vectors.dense(KMeansUtils.featuresFromTokens(data, inputSchema));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
public static void main(String[] args) throws Exception {
  String master;
  if (args.length > 0) {
    master = args[0];
  } else {
    master = "local";
  }
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaRDD<Integer> result = rdd.map(
      new Function<Integer, Integer>() {
        public Integer call(Integer x) { return x * x; }
      });
  System.out.println(StringUtils.join(result.collect(), ","));
}
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
  return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
      .map(new Function<Text, String[]>() {
        @Override
        public String[] call(Text text) throws Exception {
          String s = Bytes.toString(text.getBytes(), 0, text.getLength());
          return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
        }
      });
}
public static void main(String[] args) throws Exception {
  String master;
  if (args.length > 0) {
    master = args[0];
  } else {
    master = "local";
  }
  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
  JavaRDD<Integer> squared = rdd.map(
      new Function<Integer, Integer>() {
        public Integer call(Integer x) { return x * x; }
      });
  JavaRDD<Integer> result = squared.filter(
      new Function<Integer, Boolean>() {
        public Boolean call(Integer x) { return x != 1; }
      });
  System.out.println(StringUtils.join(result.collect(), ","));
}
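// The same map/filter pipeline rewritten with Java 8 lambdas; a sketch of the
// modern equivalent of the anonymous Function classes above (same behavior,
// assuming the same `rdd` as in the method above):
JavaRDD<Integer> result = rdd.map(x -> x * x).filter(x -> x != 1);
System.out.println(StringUtils.join(result.collect(), ","));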
private static Map<String,Integer> buildIDIndexOneWayMap(
    PMML model, JavaRDD<String[]> parsedTestRDD, boolean user) {
  // Add to mapping everything from the model
  List<String> ids = AppPMMLUtils.getExtensionContent(model, user ? "XIDs" : "YIDs");
  Map<String,Integer> idIndex = new HashMap<>(ids.size());
  int index = 0;
  for (String id : ids) {
    idIndex.put(id, index++);
  }
  // And from the test set, which may have a few more IDs
  int offset = user ? 0 : 1;
  for (String id : parsedTestRDD.map(tokens -> tokens[offset]).distinct().collect()) {
    if (!idIndex.containsKey(id)) {
      idIndex.put(id, index++);
    }
  }
  return idIndex;
}
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage: LoadHive sparkMaster tbl");
  }
  String master = args[0];
  String tbl = args[1];
  JavaSparkContext sc = new JavaSparkContext(
      master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  SQLContext sqlCtx = new SQLContext(sc);
  DataFrame rdd = sqlCtx.sql("SELECT key, value FROM " + tbl);
  JavaRDD<Integer> squaredKeys = rdd.toJavaRDD().map(new SquareKey());
  List<Integer> result = squaredKeys.collect();
  for (Integer elem : result) {
    System.out.println(elem);
  }
}
/**
 * Get the training data: a JavaRDD<DataSet>.
 * Note that this approach to getting training data is a special case for this example
 * (modelling characters) and should not be taken as best practice for loading data
 * (CSV etc.) in general.
 */
public static JavaRDD<DataSet> getTrainingData(JavaSparkContext sc) throws IOException {
  // Get data. For the sake of this example, we do the following operations:
  // File -> String -> List<String> (split into length "exampleLength" substrings)
  //   -> JavaRDD<String> -> JavaRDD<DataSet>
  List<String> list = getShakespeareAsList(exampleLength);
  JavaRDD<String> rawStrings = sc.parallelize(list);
  Broadcast<Map<Character, Integer>> bcCharToInt = sc.broadcast(CHAR_TO_INT);
  return rawStrings.map(new StringToDataSetFn(bcCharToInt));
}
private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD, boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap
  return new HashMap<>(reverseIDLookup);
}
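// Hedged usage sketch (not in the original source): build user- and item-side
// index maps from the same parsed RDD. The boolean selects token offset 0
// (user ID) or 1 (item ID), matching the method above; `parsedRDD` is assumed
// to hold String[] records laid out as [userID, itemID, ...].
Map<String,Integer> userIDToIndex = buildIDIndexMapping(parsedRDD, true);
Map<String,Integer> itemIDToIndex = buildIDIndexMapping(parsedRDD, false);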
private static JavaRDD<Vector> getRddOfVectors() {
  List<double[]> points = Arrays.asList(new double[][] {
      {1.0, 0.0}, {2.0, -2.0}, {2.0, 0.0}, {-2.0, 0.0}, {-0.5, -1.0}, {-0.5, 1.0}
  });
  return getJavaSparkContext().parallelize(points).map(Vectors::dense);
}
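// Hedged follow-on sketch: cluster the six points above and report the
// within-set sum of squared errors (WSSSE) via KMeansModel.computeCost as a
// sanity check. k = 2 and 20 iterations are illustrative choices, not from
// the original source.
JavaRDD<Vector> points = getRddOfVectors();
KMeansModel model = KMeans.train(points.rdd(), 2, 20);
System.out.println("WSSSE: " + model.computeCost(points.rdd()));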
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
  SparkSession sparkSession = SparkSession.builder().config(sc.getConf())
      .enableHiveSupport().getOrCreate();
  final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
  return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
    @Override
    public String[] call(Row row) throws Exception {
      String[] result = new String[row.size()];
      for (int i = 0; i < row.size(); i++) {
        final Object o = row.get(i);
        result[i] = (o != null) ? o.toString() : null;
      }
      return result;
    }
  });
}
@Test
public void testAsyncActionErrorWrapping() throws Exception {
  List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
  JavaRDD<Integer> rdd = sc.parallelize(data, 1);
  JavaFutureAction<Long> future = rdd.map(new BuggyMapFunction<>()).countAsync();
  try {
    future.get(2, TimeUnit.SECONDS);
    fail("Expected future.get() for failed job to throw ExecutionException");
  } catch (ExecutionException ee) {
    assertTrue(Throwables.getStackTraceAsString(ee).contains("Custom exception!"));
  }
  assertTrue(future.isDone());
}