@Test
public void map() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
  JavaDoubleRDD doubles = rdd.mapToDouble(x -> 1.0 * x).cache();
  doubles.collect();
  JavaPairRDD<Integer, Integer> pairs = rdd.mapToPair(x -> new Tuple2<>(x, x)).cache();
  pairs.collect();
  JavaRDD<String> strings = rdd.map(Object::toString).cache();
  strings.collect();
}
System.out.println("Properties: " + System.getProperties()); SparkConf sparkConf = new SparkConf().setAppName("GroupActionsJob"); sparkConf.setMaster("local"); JavaSparkContext jsc = new JavaSparkContext(sparkConf); JavaRDD<String> dataSet = jsc.textFile(JobUtils.getSourceDirFromDate(cmdLineArgs.input_path_pattern, cmdLineArgs.input_date_string)).repartition(4); dataSet = dataSet.filter(clientFilter); JavaPairRDD<String, ActionData> pairs = dataSet.mapToPair(new PairFunction<String, String, ActionData>() { }).persist(StorageLevel.MEMORY_AND_DISK()); List<String> clientList = pairs.keys().distinct().collect(); Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30, new Comparator<ClientDetail>() { JavaRDD<String> json_only = json_only_with_zeros.filter(new Function<String, Boolean>() { json_only.saveAsTextFile(outputPath, org.apache.hadoop.io.compress.GzipCodec.class); } else { json_only.saveAsTextFile(outputPath); long json_only_count = json_only.count(); clientDetailZeroQueue.add(new ClientDetail(currentClient, json_only_with_zeros.count() - json_only_count)); clientDetailQueue.add(new ClientDetail(currentClient, json_only_count));
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(
          new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
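// Hedged usage sketch (not from the source): `trainData`, `testData`, and the
// ALS hyperparameters below are illustrative assumptions. It trains an MLlib
// ALS model and evaluates it with the rmse() helper above.
MatrixFactorizationModel mfModel =
    ALS.train(trainData.rdd(), 10 /* rank */, 10 /* iterations */, 0.01 /* lambda */);
System.out.println("RMSE = " + rmse(mfModel, testData));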
/**
 * @param trainPointData data to cluster
 * @param model trained KMeans Model
 * @return map of ClusterId, count of points associated with the clusterId
 */
private static Map<Integer,Long> fetchClusterCountsFromModel(
    JavaRDD<? extends Vector> trainPointData, KMeansModel model) {
  return trainPointData.map(model::predict).countByValue();
}
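// Hedged usage sketch (names assumed, not from the source): cluster a
// JavaRDD<Vector> `points` with MLlib KMeans, then count membership per cluster.
KMeansModel model = KMeans.train(points.rdd(), 5 /* k */, 20 /* maxIterations */);
for (Map.Entry<Integer, Long> e : fetchClusterCountsFromModel(points, model).entrySet()) {
  System.out.println("cluster " + e.getKey() + ": " + e.getValue() + " points");
}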
private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD,
                                                       boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap?
  return new HashMap<>(reverseIDLookup);
}
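// Hedged usage sketch (input shape assumed, not from the source): each record
// is "userID,itemID,...", so token 0 is the user ID and token 1 the item ID.
JavaRDD<String[]> parsedRDD = lines.map(line -> line.split(","));
Map<String,Integer> userIDToIndex = buildIDIndexMapping(parsedRDD, true);
Map<String,Integer> itemIDToIndex = buildIDIndexMapping(parsedRDD, false);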
THE_LOGGER.info("queryInputPath=" + queryInputPath); THE_LOGGER.info("savedModelPath=" + savedModelPath); JavaRDD<String> query = context.textFile(queryInputPath); final LogisticRegressionModel model = LogisticRegressionModel.load(context.sc(), savedModelPath); JavaPairRDD<String, Double> classifications = query.mapToPair( new PairFunction<String, String, Double>() { @Override THE_LOGGER.info("classification="+classification); return new Tuple2<String, Double>(patientID, classification); Iterable<Tuple2<String, Double>> predictions = classifications.collect(); for (Tuple2<String, Double> pair : predictions) { THE_LOGGER.info("query: patientID="+pair._1); THE_LOGGER.info("prediction="+pair._2); context.stop();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- queryDataPath=" + queryDataPath); JavaRDD<String> queryRDD = context.textFile(queryDataPath); JavaRDD<Vector> query = Util.createFeatureVector(queryRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); = query.mapToPair((Vector v) -> { return new Tuple2<Vector, Double>(v, prediction); }); Iterable<Tuple2<Vector, Double>> predictions = predictionAndLabel.collect(); for (Tuple2<Vector, Double> p : predictions) { THE_LOGGER.info("input: " + p._1); THE_LOGGER.info("prediction: " + p._2); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair((LabeledPoint p) -> new Tuple2<Double, Double>(model.predict(p.features()), p.label())); double accuracy = predictionAndLabel.filter((Tuple2<Double, Double> pl) -> pl._1().equals(pl._2())) .count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
THE_LOGGER.info("--- trainingPath=" + trainingPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> trainingRDD = context.textFile(trainingPath); JavaRDD<LabeledPoint> training = Util.createLabeledPointRDD(trainingRDD); final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0); model.save(context.sc(), savedModelPath); context.close();
THE_LOGGER.info("--- savedModelPath=" + savedModelPath); THE_LOGGER.info("--- testDataPath=" + testDataPath); SparkConf sparkConf = new SparkConf().setAppName("TestAccuracyOfModel"); JavaSparkContext context = new JavaSparkContext(sparkConf); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy);
THE_LOGGER.info("--- testDataPath=" + testDataPath); THE_LOGGER.info("--- savedModelPath=" + savedModelPath); JavaRDD<String> testRDD = context.textFile(testDataPath); JavaRDD<LabeledPoint> test = Util.createLabeledPointRDD(testRDD); final NaiveBayesModel model = NaiveBayesModel.load(context.sc(), savedModelPath); test.mapToPair(new PairFunction<LabeledPoint, Double, Double>() { @Override public Tuple2<Double, Double> call(LabeledPoint p) { double accuracy = predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() { @Override public Boolean call(Tuple2<Double, Double> pl) { return pl._1().equals(pl._2()); }).count() / (double) test.count(); THE_LOGGER.info("accuracy="+accuracy); context.close();
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("basic log query"); JavaSparkContext sc = new JavaSparkContext(sparkConf); logs = sc.textFile(args[0]); logs = sc.parallelize(EXAMPLE_LOGS); logs.mapToPair(new PairFunction<String, Tuple3<String, String, String>, LogStatistics>() { @Override public Tuple2<Tuple3<String, String, String>, LogStatistics> call(String logRecord) { extracted.filter(new Function< Tuple2<Tuple3<String, String, String>, LogStatistics>, Boolean filtered.reduceByKey(new Function2<LogStatistics, LogStatistics, LogStatistics>() { @Override public LogStatistics call(LogStatistics stats, LogStatistics stats2) { List<Tuple2<Tuple3<String, String, String>, LogStatistics>> output = counts.collect(); for (Tuple2<?,?> t : output) { System.out.println(t._1() + "\t" + t._2());
@Test
public void flatMap() {
  JavaRDD<String> rdd = sc.parallelize(Arrays.asList("Hello World!",
      "The quick brown fox jumps over the lazy dog."));
  JavaRDD<String> words = rdd.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
  Assert.assertEquals("Hello", words.first());
  Assert.assertEquals(11, words.count());

  JavaPairRDD<String, String> pairs = rdd.flatMapToPair(s -> {
    List<Tuple2<String, String>> pairs2 = new LinkedList<>();
    for (String word : s.split(" ")) {
      pairs2.add(new Tuple2<>(word, word));
    }
    return pairs2.iterator();
  });
  Assert.assertEquals(new Tuple2<>("Hello", "Hello"), pairs.first());
  Assert.assertEquals(11, pairs.count());

  JavaDoubleRDD doubles = rdd.flatMapToDouble(s -> {
    List<Double> lengths = new LinkedList<>();
    for (String word : s.split(" ")) {
      lengths.add((double) word.length());
    }
    return lengths.iterator();
  });
  Assert.assertEquals(5.0, doubles.first(), 0.01);
  Assert.assertEquals(11, doubles.count());
}
final Accumulator<Integer> count = sc.accumulator(0);
rdd.foreach(new VoidFunction<String>() {
  public void call(String line) {
    if (line.contains("KK6JKQ")) {
      count.add(1);
    }
  }
});

// blankLines is an Accumulator<Integer> whose declaration is elided in the source.
JavaRDD<String> callSigns = rdd.flatMap(new FlatMapFunction<String, String>() {
  public Iterable<String> call(String line) {
    if (line.equals("")) {
      blankLines.add(1);
    }
    return Arrays.asList(line.split(" "));
  }
});
callSigns.saveAsTextFile(outputDir + "/callsigns");
System.out.println("Blank lines: " + blankLines.value());

JavaRDD<String> validCallSigns = callSigns.filter(new Function<String, Boolean>() {
  public Boolean call(String callSign) {
    Pattern p = Pattern.compile("\\A\\d?\\p{Alpha}{1,2}\\d{1,4}\\p{Alpha}{1,3}\\Z");
    return p.matcher(callSign).matches();
  }
});

JavaPairRDD<String, Integer> contactCounts = validCallSigns.mapToPair(
    new PairFunction<String, String, Integer>() {
      public Tuple2<String, Integer> call(String callSign) {
        return new Tuple2<>(callSign, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      public Integer call(Integer x, Integer y) {
        return x + y;
      }
    });

JavaPairRDD<String, CallLog[]> contactsContactLists = validCallSigns.mapPartitionsToPair(
    new PairFlatMapFunction<Iterator<String>, String, CallLog[]>() {
      public Iterable<Tuple2<String, CallLog[]>> call(Iterator<String> input) {
        // Body elided in the source: performs one external lookup per partition
        // and emits a (callSign, CallLog[]) pair for each call sign.
      }
    });
SparkConf conf = new SparkConf().setAppName("Cubing for:" + cubeName + " segment " + segmentId); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator"); conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray); JavaSparkContext sc = new JavaSparkContext(conf); sc.sc().addSparkListener(jobListener); HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath)); SparkUtil.modifySparkHadoopConfiguration(sc.sc()); // set dfs.replication=2 and enable compress final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration()); .mapToPair(new EncodeBaseCuboid(cubeName, segmentId, metaUrl, sConf)); totalCount = encodedBaseRDD.count(); allRDDs[0] = encodedBaseRDD.reduceByKey(baseCuboidReducerFunction, partition).persist(storageLevel);
SparkConf conf = new SparkConf();
conf.setMaster("local[*]");
conf.setAppName("DataVec Example");
JavaSparkContext sc = new JavaSparkContext(conf);

JavaRDD<String> stringData = sc.textFile(filename);
JavaRDD<List<Writable>> parsedInputData =
    stringData.filter((x) -> !x.isEmpty()).map(new StringToWritablesFunction(rr));

List<String> inputDataCollected = stringData.collect();
System.out.println("\n\n---- Original Data ----");
for (String s : inputDataCollected) System.out.println("'" + s + "'");

JavaRDD<String> processedAsString = parsedInputData.map(new WritablesToStringFunction(","));
List<String> inputDataParsed = processedAsString.collect();
System.out.println("\n\n---- Parsed Data ----");
for (String s : inputDataParsed) System.out.println("'" + s + "'");

// The transform step that produces processedData from parsedInputData is
// elided in the source.
processedAsString = processedData.map(new WritablesToStringFunction(","));
inputDataParsed = processedAsString.collect();
System.out.println("\n\n---- Parsed and filtered data ----");
for (String s : inputDataParsed) System.out.println(s);
Class[] kryoClassArray = new Class[] {
    Class.forName("scala.collection.mutable.WrappedArray$ofRef")
};
SparkConf conf = new SparkConf()
    .setAppName("Merge dictionary for cube:" + cubeName + ", segment " + segmentId);
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

try (JavaSparkContext sc = new JavaSparkContext(conf)) {
  KylinSparkJobListener jobListener = new KylinSparkJobListener();
  sc.sc().addSparkListener(jobListener);
  HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(dictOutputPath));
  final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());

  JavaRDD<Integer> indexRDD = sc.parallelize(indexs, columnLength + 1);
  JavaPairRDD<Text, Text> colToDictPathRDD = indexRDD.mapToPair(
      new MergeDictAndStatsFunction(cubeName, metaUrl, segmentId,
          StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));
  colToDictPathRDD.coalesce(1, false).saveAsNewAPIHadoopFile(
      dictOutputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample"); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaRDD<String> spam = sc.textFile("files/spam.txt"); JavaRDD<String> ham = sc.textFile("files/ham.txt"); JavaRDD<LabeledPoint> positiveExamples = spam.map(new Function<String, LabeledPoint>() { @Override public LabeledPoint call(String email) { return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" ")))); JavaRDD<LabeledPoint> negativeExamples = ham.map(new Function<String, LabeledPoint>() { @Override public LabeledPoint call(String email) { return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" ")))); JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples); trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm. LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
@Test
public void collectAsMapWithIntArrayValues() {
  // Regression test for SPARK-1040
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1));
  JavaPairRDD<Integer, int[]> pairRDD = rdd.mapToPair(x -> new Tuple2<>(x, new int[]{x}));
  pairRDD.collect();       // Works fine
  pairRDD.collectAsMap();  // Used to crash with ClassCastException
}