private static void addIDsExtension(PMML pmml,
                                    String key,
                                    JavaPairRDD<Integer,?> features,
                                    Map<Integer,String> indexToID) {
  List<String> ids = features.keys().collect().stream()
      .map(indexToID::get)
      .collect(Collectors.toList());
  AppPMMLUtils.addExtensionContent(pmml, key, ids);
}
public static final JavaRDD<String> filterIPAddress(
    JavaPairRDD<String, Long> ipAddressCount) {
  return ipAddressCount
      .filter(new IpCountGreaterThan10())
      .keys();
}
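The IpCountGreaterThan10 predicate referenced above is not shown in this snippet; a minimal sketch of what such a filter could look like, assuming it simply keeps addresses whose count exceeds ten, is:

import org.apache.spark.api.java.function.Function;
import scala.Tuple2;

// Hypothetical sketch only, not the original implementation: a pair filter
// that keeps (ipAddress, count) entries whose count is greater than 10.
public class IpCountGreaterThan10 implements Function<Tuple2<String, Long>, Boolean> {
  @Override
  public Boolean call(Tuple2<String, Long> ipCount) {
    return ipCount._2() > 10;
  }
}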
/**
 * Computes root mean squared error of {@link Rating#rating()} versus predicted value.
 */
static double rmse(MatrixFactorizationModel mfModel, JavaRDD<Rating> testData) {
  JavaPairRDD<Tuple2<Integer,Integer>,Double> testUserProductValues =
      testData.mapToPair(rating -> new Tuple2<>(
          new Tuple2<>(rating.user(), rating.product()), rating.rating()));
  @SuppressWarnings("unchecked")
  RDD<Tuple2<Object,Object>> testUserProducts =
      (RDD<Tuple2<Object,Object>>) (RDD<?>) testUserProductValues.keys().rdd();
  JavaRDD<Rating> predictions = testData.wrapRDD(mfModel.predict(testUserProducts));
  double mse = predictions.mapToPair(
      rating -> new Tuple2<>(new Tuple2<>(rating.user(), rating.product()), rating.rating())
  ).join(testUserProductValues).values().mapToDouble(valuePrediction -> {
    double diff = valuePrediction._1() - valuePrediction._2();
    return diff * diff;
  }).mean();
  return Math.sqrt(mse);
}
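As a usage sketch (illustrative only), the rmse helper above could be driven by an MLlib ALS model trained on a separate split; the rank, iteration count, and lambda below are placeholder values, and evaluateALS is a hypothetical name:

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;

// Illustrative only: train an ALS factorization on trainingData and score it
// against held-out testData with the rmse(...) helper defined above.
static double evaluateALS(JavaRDD<Rating> trainingData, JavaRDD<Rating> testData) {
  MatrixFactorizationModel model =
      ALS.train(JavaRDD.toRDD(trainingData), 10, 10, 0.01);
  return rmse(model, testData);
}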
List<String> clientList = pairs.keys().distinct().collect();
Queue<ClientDetail> clientDetailQueue = new PriorityQueue<ClientDetail>(30,
    new Comparator<ClientDetail>() {
@Override
public MStream<T> keys() {
  return new SparkStream<>(rdd.keys());
}
@Override
public Pair<Optional<JavaRDD<GenericRecord>>, String> fetchNewData(
    Optional<String> lastCheckpointStr, long sourceLimit) {
  try {
    // find the source commit to pull
    Optional<String> commitToPull = findCommitToPull(lastCheckpointStr);
    if (!commitToPull.isPresent()) {
      return new ImmutablePair<>(Optional.empty(), lastCheckpointStr.orElse(""));
    }
    // read the files out.
    List<FileStatus> commitDeltaFiles = Arrays.asList(
        fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
    String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString())
        .collect(Collectors.joining(","));
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
        AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
        sparkContext.hadoopConfiguration());
    return new ImmutablePair<>(
        Optional.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
        String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
    throw new HoodieIOException(
        "Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
@Override
protected JavaRDD<GenericRecord> fromFiles(AvroConvertor convertor, String pathStr) {
  JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr,
      AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
      sparkContext.hadoopConfiguration());
  return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
}
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POGlobalRearrangeSpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());

  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class), SparkUtil.getManifest(Object.class));

  // first sort the tuples by secondary key if useSecondaryKey sorting is enabled
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}
private JavaRDD<Tuple2<IndexedKey, Tuple>> handleSecondarySort(
    RDD<Tuple> rdd, POReduceBySpark op, int parallelism) {
  RDD<Tuple2<Tuple, Object>> rddPair = rdd.map(new ToKeyNullValueFunction(),
      SparkUtil.<Tuple, Object>getTuple2Manifest());

  JavaPairRDD<Tuple, Object> pairRDD = new JavaPairRDD<Tuple, Object>(rddPair,
      SparkUtil.getManifest(Tuple.class), SparkUtil.getManifest(Object.class));

  // first sort the tuples by secondary key if useSecondaryKey sorting is enabled
  JavaPairRDD<Tuple, Object> sorted = pairRDD.repartitionAndSortWithinPartitions(
      new HashPartitioner(parallelism),
      new PigSecondaryKeyComparatorSpark(op.getSecondarySortOrder()));
  JavaRDD<Tuple> jrdd = sorted.keys();
  JavaRDD<Tuple2<IndexedKey, Tuple>> jrddPair = jrdd.map(new ToKeyValueFunction(op));
  return jrddPair;
}