private Pair<JavaRDD<M>,JavaRDD<M>> splitTrainTest(JavaRDD<M> newData, JavaRDD<M> pastData) {
  Objects.requireNonNull(newData);
  if (testFraction <= 0.0) {
    // No test split requested: all data, new and past, is training data
    return new Pair<>(pastData == null ? newData : newData.union(pastData), null);
  }
  if (testFraction >= 1.0) {
    // All new data is test data; only past data (if any) trains
    return new Pair<>(pastData, newData);
  }
  if (empty(newData)) {
    return new Pair<>(pastData, null);
  }
  Pair<JavaRDD<M>,JavaRDD<M>> newTrainTest = splitNewDataToTrainTest(newData);
  JavaRDD<M> newTrainData = newTrainTest.getFirst();
  return new Pair<>(pastData == null ? newTrainData : newTrainData.union(pastData),
                    newTrainTest.getSecond());
}
KMeansPMMLUtils.validatePMMLVsSchema(model, inputSchema);
JavaRDD<Vector> evalData =
    parsedToVectorRDD(trainData.union(testData).map(MLFunctions.PARSE_FN));
List<ClusterInfo> clusterInfoList = KMeansPMMLUtils.read(model);
JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples);
@Override
public void publishAdditionalModelData(JavaSparkContext sparkContext,
                                       PMML pmml,
                                       JavaRDD<String> newData,
                                       JavaRDD<String> pastData,
                                       Path modelParentPath,
                                       TopicProducer<String,String> modelUpdateTopic) {
  // Send item updates first, before users. That way, user-based endpoints like
  // /recommend may return 404 for longer, but once they do respond, the results
  // will be more complete.
  log.info("Sending item / Y data as model updates");
  String yPathString = AppPMMLUtils.getExtensionValue(pmml, "Y");
  JavaPairRDD<String,float[]> productRDD =
      readFeaturesRDD(sparkContext, new Path(modelParentPath, yPathString));
  String updateBroker = modelUpdateTopic.getUpdateBroker();
  String topic = modelUpdateTopic.getTopic();
  // For now, there is no use in sending known users for each item
  productRDD.foreachPartition(new EnqueueFeatureVecsFn("Y", updateBroker, topic));

  log.info("Sending user / X data as model updates");
  String xPathString = AppPMMLUtils.getExtensionValue(pmml, "X");
  JavaPairRDD<String,float[]> userRDD =
      readFeaturesRDD(sparkContext, new Path(modelParentPath, xPathString));
  if (noKnownItems) {
    userRDD.foreachPartition(new EnqueueFeatureVecsFn("X", updateBroker, topic));
  } else {
    log.info("Sending known item data with model updates");
    JavaRDD<String[]> allData =
        (pastData == null ? newData : newData.union(pastData)).map(MLFunctions.PARSE_FN);
    JavaPairRDD<String,Collection<String>> knownItems = knownsRDD(allData, true);
    userRDD.join(knownItems).foreachPartition(
        new EnqueueFeatureVecsAndKnownItemsFn("X", updateBroker, topic));
  }
}
JavaRDD<LabeledPoint> trainingData = spamTrainingData.union(nonSpamTrainingData);
@Override @SuppressWarnings("unchecked") public JavaRDD<?> translate(Union operator, SparkExecutorContext context) { final List<JavaRDD<?>> inputs = context.getInputs(operator); if (inputs.size() < 2) { throw new IllegalStateException("Union operator needs at least 2 inputs"); } return inputs .stream() .reduce( (l, r) -> ((JavaRDD<Object>) l) .union((JavaRDD<Object>) r) .setName(operator.getName())) .orElseThrow(() -> new IllegalArgumentException("Unable to reduce inputs.")); } }
@SuppressWarnings("unchecked") @Override public SparkCollection<T> union(SparkCollection<T> other) { return wrap(rdd.union((JavaRDD<T>) other.getUnderlying())); }
@Override
public JavaRDD<T> toRDD(JavaSparkContext sc) {
  return this.one.toRDD(sc).union(this.two.toRDD(sc));
}
@Override
public SparkStream<T> union(@NonNull MStream<T> other) {
  if (other.isReusable() && other.isEmpty()) {
    return this;
  } else if (isReusable() && this.isEmpty()) {
    return new SparkStream<>(other);
  } else if (other instanceof SparkStream) {
    return new SparkStream<>(rdd.union(Cast.<SparkStream<T>>as(other).rdd));
  }
  SparkStream<T> stream = new SparkStream<>(other);
  return new SparkStream<>(rdd.union(stream.rdd));
}
PairFunction<Integer, Integer, Integer> mapToTuple =
    (PairFunction<Integer, Integer, Integer>) i -> new Tuple2<>(i, i);
return rdd1.union(rdd2).mapToPair(mapToTuple).join(prdd3);
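A self-contained version of the union/mapToPair/join fragment above; the inputs are made up here to show the join semantics, and sc is again an assumed JavaSparkContext:

JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2));
JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(3, 4));
JavaPairRDD<Integer, String> prdd3 = sc.parallelizePairs(
    Arrays.asList(new Tuple2<>(1, "a"), new Tuple2<>(3, "b")));
JavaPairRDD<Integer, Tuple2<Integer, String>> joined = rdd1.union(rdd2)
    .mapToPair((PairFunction<Integer, Integer, Integer>) i -> new Tuple2<>(i, i))
    .join(prdd3);
// joined -> (1,(1,"a")) and (3,(3,"b")); keys missing from either side drop out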
PairFunction<Integer, Integer, Integer> mapToTuple =
    (Integer i) -> new Tuple2<>(i, i);
  return rdd1.union(rdd2).mapToPair(mapToTuple).join(prdd3);
});
JavaTestUtils.attachTestOutputStream(transformed2);
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
    ChannelInstance[] inputs,
    ChannelInstance[] outputs,
    SparkExecutor sparkExecutor,
    OptimizationContext.OperatorContext operatorContext) {
  assert inputs.length == this.getNumInputs();
  assert outputs.length == this.getNumOutputs();

  RddChannel.Instance input0 = (RddChannel.Instance) inputs[0];
  RddChannel.Instance input1 = (RddChannel.Instance) inputs[1];
  RddChannel.Instance output = (RddChannel.Instance) outputs[0];

  final JavaRDD<Type> inputRdd0 = input0.provideRdd();
  final JavaRDD<Type> inputRdd1 = input1.provideRdd();
  // union is lazy: this only extends the RDD lineage, nothing executes yet
  final JavaRDD<Type> outputRdd = inputRdd0.union(inputRdd1);
  this.name(outputRdd);
  output.accept(outputRdd, sparkExecutor);

  return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
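A side note on the behavior the operator above relies on, shown with made-up data (sc is an assumed JavaSparkContext): union is a narrow transformation in Spark, so the result simply carries the partitions of both inputs, and no work runs until an action:

JavaRDD<String> a = sc.parallelize(Arrays.asList("x", "y"), 2);
JavaRDD<String> b = sc.parallelize(Arrays.asList("z"), 3);
JavaRDD<String> u = a.union(b);  // no shuffle and no job yet
// u.getNumPartitions() == 5; a job runs only on an action such as u.count()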