// Anonymous Spark Function mapping one raw e-mail string to a training example.
// NOTE(review): the enclosing anonymous-class header lies outside this fragment.
@Override
public LabeledPoint call(String email) {
  // Label 1 — presumably the positive (spam) class; verify against the caller.
  // Tokenize on single spaces and hash the terms into a term-frequency vector via HashingTF.
  return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" "))));
}
});
// NOTE(review): fragment — the stream pipeline continues from, and the loop body
// continues into, code outside this view; do not assume anything beyond these lines.
mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
data.forEachRemaining(datum -> {
  // Dense copy of this example's feature vector, walked once per tree below.
  double[] featureVector = datum.features().toArray();
  for (int i = 0; i < trees.length; i++) {
    DecisionTreeModel tree = trees[i];
// NOTE(review): fragment — the result of train(...) is presumably bound to `model`
// on a line outside this view (the bare call here discards its return); confirm.
LinearRegressionWithSGD.train(JavaRDD.toRDD(trainingData), numberOfIterations, stepSize);
THE_LOGGER.info("LinearRegressionModel weights: " + model.weights());
THE_LOGGER.info("LinearRegressionModel intercept: " + model.intercept());
// Persist the trained model so later jobs can reload it with LinearRegressionModel.load().
model.save(context.sc(), builtModelPath);
THE_LOGGER.info("model saved at: builtModelPath=" + builtModelPath);
// Anonymous Spark Function pairing each input record with its predicted car price.
@Override
public Tuple2<String, Double> call(String record) {
  // each record has this format:
  // <Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight>
  String[] tokens = StringUtils.split(record, ",");
  // All tokens are numeric features here (the price/label is not part of this record).
  double[] features = new double[tokens.length];
  for (int i = 0; i < features.length; i++) {
    features[i] = Double.parseDouble(tokens[i]);
  }
  //
  double carPricePrediction = model.predict(Vectors.dense(features));
  // Keep the original record alongside the prediction for downstream output.
  return new Tuple2<String, Double>(record, carPricePrediction);
}
});
// Reload a previously persisted LinearRegressionModel from savedModelPath.
final LinearRegressionModel model = LinearRegressionModel.load(context.sc(), savedModelPath);
private static int validatePrediction( List<LabeledPoint> validationData, LinearRegressionModel model) { int numAccurate = 0; for (LabeledPoint point : validationData) { Double prediction = model.predict(point.features()); // A prediction is off if the prediction is more than 0.5 away from expected value. if (Math.abs(prediction - point.label()) <= 0.5) { numAccurate++; } } return numAccurate; }
// Anonymous Spark Function mapping one raw e-mail string to a training example.
// NOTE(review): the enclosing anonymous-class header lies outside this fragment.
@Override
public LabeledPoint call(String email) {
  // Label 0 — presumably the negative (non-spam/ham) class; verify against the caller.
  return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" "))));
}
});
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
    // One counter per partition; partial counts are combined by reduce() below.
    IntLongHashMap featureIndexCount = new IntLongHashMap();
    data.forEachRemaining(datum -> {
      double[] featureVector = datum.features().toArray();
      for (DecisionTreeModel tree : model.trees()) {
        org.apache.spark.mllib.tree.model.Node node = tree.topNode();
        // This logic cloned from Node.predict:
        // walk from the root to a leaf, bumping the count of every split
        // feature encountered along this example's path.
        while (!node.isLeaf()) {
          Split split = node.split().get();
          int featureIndex = split.feature();
          // Count feature
          featureIndexCount.addToValue(featureIndex, 1);
          node = nextNode(featureVector, node, split, featureIndex);
        }
      }
    });
    // Wrap the per-partition map in a single-element iterator as mapPartitions requires.
    return Collections.singleton(featureIndexCount).iterator();
  }).reduce(RDFUpdate::merge);
}
// Reload a previously persisted LinearRegressionModel from savedModelPath.
final LinearRegressionModel model = LinearRegressionModel.load(context.sc(), savedModelPath);
// NOTE(review): fragment — the enclosing try block (where `target`, `features`
// and `data` are built) and the rest of the catch body are outside this view.
return new LabeledPoint(target, Vectors.dense(features));
} catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
  // Malformed rows are logged here; what happens after the warn (skip vs. rethrow)
  // is not visible in this fragment — confirm before relying on either.
  log.warn("Bad input: {}", Arrays.toString(data));
// Anonymous Spark Function building a LabeledPoint with a caller-supplied class label.
@Override
public LabeledPoint call(String email) {
  // `label` is captured from the enclosing scope (not visible in this fragment).
  return new LabeledPoint(label, tf.transform(Arrays.asList(email.split(" "))));
}
});
@Override public LabeledPoint call(String record) { // record: <Price><,><Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight> // tokens[0] = <Price> String[] tokens = StringUtils.split(record, ","); double[] features = new double[tokens.length - 1]; for (int i = 0; i < features.length; i++) { features[i] = Double.parseDouble(tokens[i+1]); } // double price = Double.parseDouble(tokens[0]); return new LabeledPoint(price, Vectors.dense(features)); } });
@Override public LabeledPoint call(String record) { String[] tokens = StringUtils.split(record, ","); // 32 tokens double[] features = new double[30]; for (int i = 2; i < features.length; i++) { features[i - 2] = Double.parseDouble(tokens[i]); } // String patientID = tokens[0]; // ignore, not used String outcomeClass = tokens[1]; // B=benign, M=malignant Vector v = new DenseVector(features); if (outcomeClass.equals("B")) { return new LabeledPoint(1, v); // benign } else { return new LabeledPoint(0, v); // malignant } } });
@Override public LabeledPoint call(String record) { // 9 tokens, the last token is the classification String[] tokens = StringUtils.split(record, ","); double[] features = new double[8]; for (int i=0; i < 8; i++) { features[i] = Double.parseDouble(tokens[i]); } // // tokens[8] => classification: // class value 1 is interpreted as "tested positive for diabetes" // double classification = Double.parseDouble(tokens[8]); Vector v = new DenseVector(features); // debug(record, v); // add a classification for the training data set return new LabeledPoint(classification, v); } });
// NOTE(review): fragment — the method building `classification` and `vector`
// is outside this view.
THE_LOGGER.info("training data: classification=" + classification);
return new LabeledPoint(classification, vector);
// Anonymous Spark Function turning one "play tennis" record (space-separated)
// into a LabeledPoint; the getXxx helpers (defined elsewhere in this file)
// encode each categorical token as a numeric value.
@Override
public LabeledPoint call(String record) {
  String[] tokens = StringUtils.split(record, " "); // 5 tokens
  double[] features = new double[4];
  features[0] = getOutlook(tokens[0]); // outlook
  features[1] = getTemperature(tokens[1]); // temperature
  features[2] = getHumidity(tokens[2]); // humidity
  features[3] = getWind(tokens[3]); // windy
  // tokens[4] => classification: play=0 or not-play=1
  double classification = getPlay(tokens[4]);
  Vector v = new DenseVector(features);
  debug(record, v);
  // add a classification for the training data set
  return new LabeledPoint(classification, v);
}
});