@Override
public LabeledPoint call(String email) {
  // Maps one raw email to a positive training example (label 1).
  // Features are the term frequencies of the whitespace-separated tokens.
  String[] words = email.split(" ");
  return new LabeledPoint(1, tf.transform(Arrays.asList(words)));
}
});
return new LabeledPoint(label, features); }); Double prediction = model.predict(p.features()); return new Tuple2<>(prediction, p.label()); });
mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList()); data.forEachRemaining(datum -> { double[] featureVector = datum.features().toArray(); for (int i = 0; i < trees.length; i++) { DecisionTreeModel tree = trees[i];
for (LabeledPoint point : trainingRDD.collect()){ for (String author : labels.keySet()){ if (labels.get(author).equals(point.label())){ actualAuthors.add(author); break;
/** * @param trainPointData data to run down trees * @param model random decision forest model to count on * @return map of predictor index to the number of training examples that reached a * node whose decision is based on that feature. The index is among predictors, not all * features, since there are fewer predictors than features. That is, the index will * match the one used in the {@link RandomForestModel}. */ private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData, RandomForestModel model) { return trainPointData.mapPartitions(data -> { IntLongHashMap featureIndexCount = new IntLongHashMap(); data.forEachRemaining(datum -> { double[] featureVector = datum.features().toArray(); for (DecisionTreeModel tree : model.trees()) { org.apache.spark.mllib.tree.model.Node node = tree.topNode(); // This logic cloned from Node.predict: while (!node.isLeaf()) { Split split = node.split().get(); int featureIndex = split.feature(); // Count feature featureIndexCount.addToValue(featureIndex, 1); node = nextNode(featureVector, node, split, featureIndex); } } }); return Collections.singleton(featureIndexCount).iterator(); }).reduce(RDFUpdate::merge); }
@Override
public LabeledPoint call(String email) {
  // Maps one raw email to a negative training example (label 0).
  // Features are the term frequencies of the whitespace-separated tokens.
  String[] words = email.split(" ");
  return new LabeledPoint(0, tf.transform(Arrays.asList(words)));
}
});
return new LabeledPoint(target, Vectors.dense(features)); } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) { log.warn("Bad input: {}", Arrays.toString(data));
@Override
public LabeledPoint call(String email) {
  // Maps one raw email to a training example carrying the captured label.
  // Features are the term frequencies of the whitespace-separated tokens.
  String[] words = email.split(" ");
  return new LabeledPoint(label, tf.transform(Arrays.asList(words)));
}
});
@Override public LabeledPoint call(String record) { String[] tokens = StringUtils.split(record, ","); // 32 tokens double[] features = new double[30]; for (int i = 2; i < features.length; i++) { features[i - 2] = Double.parseDouble(tokens[i]); } // String patientID = tokens[0]; // ignore, not used String outcomeClass = tokens[1]; // B=benign, M=malignant Vector v = new DenseVector(features); if (outcomeClass.equals("B")) { return new LabeledPoint(1, v); // benign } else { return new LabeledPoint(0, v); // malignant } } });
@Override public LabeledPoint call(String record) { // record: <Price><,><Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight> // tokens[0] = <Price> String[] tokens = StringUtils.split(record, ","); double[] features = new double[tokens.length - 1]; for (int i = 0; i < features.length; i++) { features[i] = Double.parseDouble(tokens[i+1]); } // double price = Double.parseDouble(tokens[0]); return new LabeledPoint(price, Vectors.dense(features)); } });
@Override public LabeledPoint call(String record) { // 9 tokens, the last token is the classification String[] tokens = StringUtils.split(record, ","); double[] features = new double[8]; for (int i=0; i < 8; i++) { features[i] = Double.parseDouble(tokens[i]); } // // tokens[8] => classification: // class value 1 is interpreted as "tested positive for diabetes" // double classification = Double.parseDouble(tokens[8]); Vector v = new DenseVector(features); // debug(record, v); // add a classification for the training data set return new LabeledPoint(classification, v); } });
/**
 * Projects each LabeledPoint down to the subset of features named in
 * {@code filterString}. Each comma-separated name resolves, via
 * {@link VectorizationProperties}, to a feature index in the original vector.
 *
 * @param data         input points
 * @param filterString comma-separated feature names to keep
 * @return RDD of points carrying only the selected features; labels unchanged
 */
public static JavaRDD<LabeledPoint> filterData(JavaRDD<LabeledPoint> data, String filterString) {
  // PERF FIX: resolve the requested feature names to vector indices once on
  // the driver, instead of re-splitting filterString and re-parsing the
  // property values for every single record inside the map function.
  String[] featuresInUse = filterString.split(",");
  int[] indices = new int[featuresInUse.length];
  for (int i = 0; i < featuresInUse.length; i++) {
    indices[i] = Integer.parseInt(VectorizationProperties.getProperty(featuresInUse[i]));
  }
  return data.map(new Function<LabeledPoint, LabeledPoint>() {
    @Override
    public LabeledPoint call(LabeledPoint point) throws Exception {
      double[] features = point.features().toArray();
      double[] filteredFeatures = new double[indices.length];
      for (int i = 0; i < indices.length; i++) {
        filteredFeatures[i] = features[indices[i]];
      }
      LabeledPoint newPoint = new LabeledPoint(point.label(), Vectors.dense(filteredFeatures));
      // NOTE(review): prints every record to stdout — debug leftover, kept to
      // preserve observable behavior; consider removing or demoting to a logger.
      System.out.println(newPoint);
      return newPoint;
    }
  });
}
THE_LOGGER.info("training data: classification=" + classification); return new LabeledPoint(classification, vector);