private static void addIDsExtension(PMML pmml,
                                    String key,
                                    JavaPairRDD<Integer,?> features,
                                    Map<Integer,String> indexToID) {
  List<String> ids = features.keys().collect().stream()
      .map(indexToID::get)
      .collect(Collectors.toList());
  AppPMMLUtils.addExtensionContent(pmml, key, ids);
}
public static final JavaRDD<String> filterIPAddress(
    JavaPairRDD<String, Long> ipAddressCount) {
  return ipAddressCount
      .filter(new IpCountGreaterThan10())
      .keys();
}
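The IpCountGreaterThan10 predicate referenced above is not shown in this snippet; a minimal sketch of what such a filter could look like, assuming it simply keeps addresses whose count exceeds ten, is:

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Hypothetical sketch only, not the original implementation: a pair filter
// that keeps (ipAddress, count) entries whose count is greater than 10.
public class IpCountGreaterThan10 implements Function<Tuple2<String, Long>, Boolean> {
  @Override
  public Boolean call(Tuple2<String, Long> ipCount) {
    return ipCount._2() > 10;
  }
}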
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(
          new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
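As a usage sketch (illustrative only), the rmse helper above could be driven by an MLlib ALS model trained on a separate split; the rank, iteration count, and lambda below are placeholder values, and evaluateALS is a hypothetical name:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;

// Illustrative only: train an ALS factorization on trainingData and score it
// against held-out testData with the rmse(...) helper defined above.
static double evaluateALS(JavaRDD<Rating> trainingData, JavaRDD<Rating> testData) {
  MatrixFactorizationModel model =
      ALS.train(JavaRDD.toRDD(trainingData), 10, 10, 0.01);
  return rmse(model, testData);
}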
List<String> clientList = pairs.keys().distinct().collect();
Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30,
    new Comparator<ClientDetail>() {
@Override
public MStream<T> keys() {
  return new SparkStream<>(rdd.keys());
}
@Override
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
    Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
    // find the source commit to pull
    Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);
    if (!commitToPull.isPresent()) {
      return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse(""));
    }
    // read the files out.
    List<FileStatus> commitDeltaFiles = Arrays.asList(
        fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
    String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString())
        .collect(Collectors.joining(","));
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sparkContext.hadoopConfiguration());
    return new ImmutablePair<>(
        Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
        String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
    throw new HoodieIOException(
        "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
@Override
protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
      AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
      sparkContext.hadoopConfiguration());
  return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
}
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());

  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class), SparkUtil.getManifest(Object.class));

  // first sort the tuples by secondary key if useSecondaryKey sorting is enabled
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POReduceBySpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());

  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class), SparkUtil.getManifest(Object.class));

  // first sort the tuples by secondary key if useSecondaryKey sorting is enabled
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}