private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
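// DistanceFn is referenced above but not defined in this excerpt. A minimal sketch,
// assuming it is simply a serializable two-argument distance function over double[]
// points; the interface shape and the lambda below are illustrative assumptions.
interface DistanceFn<T> extends java.util.function.ToDoubleBiFunction<T,T>, java.io.Serializable {}

// Squared Euclidean distance is a natural choice for k-means-style assignment:
// it is monotone in true Euclidean distance, so it selects the same nearest cluster.
DistanceFn<double[]> squaredEuclidean = (a, b) -> {
  double total = 0.0;
  for (int i = 0; i < a.length; i++) {
    double diff = a[i] - b[i];
    total += diff * diff;
  }
  return total;
};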
/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and metrics for each cluster like the count, sum of
 *  distances to centroid, and sum of squared distances
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    // 'clusters' and 'distanceFn' are not declared locally here (unlike the method
    // above), so they are presumably instance fields of the enclosing class.
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
  }).reduceByKey(ClusterMetric::add);
}
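// ClusterMetric is not shown in this excerpt. A minimal sketch consistent with its use
// above: each point contributes (1, distance, distance^2), and per-cluster metrics
// combine by addition in reduceByKey. The field and method names are assumptions.
final class ClusterMetric implements java.io.Serializable {
  private final long count;
  private final double sumDist;
  private final double sumSquaredDist;

  ClusterMetric(long count, double sumDist, double sumSquaredDist) {
    this.count = count;
    this.sumDist = sumDist;
    this.sumSquaredDist = sumSquaredDist;
  }

  ClusterMetric add(ClusterMetric other) {
    return new ClusterMetric(count + other.count,
                             sumDist + other.sumDist,
                             sumSquaredDist + other.sumSquaredDist);
  }
}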
// Truncated fragment of a per-tree variant: one IntLongHashMap is allocated per tree,
// then each example is run down every tree. The stream source and variable name below
// are reconstructed, since the original snippet begins mid-expression; the node-walking
// loop presumably continues as in predictorExampleCounts below.
List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length)
    .mapToObj(i -> new IntLongHashMap())
    .collect(Collectors.toList());
data.forEachRemaining(datum -> {
  double[] featureVector = datum.features().toArray();
  for (int i = 0; i < trees.length; i++) {
    DecisionTreeModel tree = trees[i];
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
    IntLongHashMap featureIndexCount = new IntLongHashMap();
    data.forEachRemaining(datum -> {
      double[] featureVector = datum.features().toArray();
      for (DecisionTreeModel tree : model.trees()) {
        org.apache.spark.mllib.tree.model.Node node = tree.topNode();
        // This logic cloned from Node.predict:
        while (!node.isLeaf()) {
          Split split = node.split().get();
          int featureIndex = split.feature();
          // Count feature
          featureIndexCount.addToValue(featureIndex, 1);
          node = nextNode(featureVector, node, split, featureIndex);
        }
      }
    });
    return Collections.singleton(featureIndexCount).iterator();
  }).reduce(RDFUpdate::merge);
}
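// The nextNode and merge helpers are referenced above but not shown. A plausible
// sketch, assuming nextNode mirrors the split handling in Spark's Node.predict and
// merge folds one Eclipse Collections IntLongHashMap into another; both are
// illustrative, not the original implementations.
import org.apache.spark.mllib.tree.configuration.FeatureType;
import org.apache.spark.mllib.tree.model.Node;
import org.apache.spark.mllib.tree.model.Split;
import org.eclipse.collections.impl.map.mutable.primitive.IntLongHashMap;

private static Node nextNode(double[] featureVector, Node node, Split split, int featureIndex) {
  double featureValue = featureVector[featureIndex];
  if (split.featureType().equals(FeatureType.Continuous())) {
    // Continuous split: go left iff the value is at or below the threshold.
    return featureValue <= split.threshold() ? node.leftNode().get() : node.rightNode().get();
  }
  // Categorical split: go left iff the value is among the left-side categories.
  return split.categories().contains(featureValue) ? node.leftNode().get() : node.rightNode().get();
}

private static IntLongHashMap merge(IntLongHashMap a, IntLongHashMap b) {
  // Fold the smaller map into the larger one to minimize work.
  if (b.size() > a.size()) {
    return merge(b, a);
  }
  b.forEachKeyValue(a::addToValue);
  return a;
}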
private ClusteringModel pmmlClusteringModel(KMeansModel model, Map<Integer,Long> clusterSizesMap) {
  Vector[] clusterCenters = model.clusterCenters();
  List<ClusteringField> clusteringFields = new ArrayList<>();
  for (int i = 0; i < inputSchema.getNumFeatures(); i++) {
    if (inputSchema.isActive(i)) {
      FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i));
      ClusteringField clusteringField =
          new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE);
      clusteringFields.add(clusteringField);
    }
  }
  List<Cluster> clusters = new ArrayList<>(clusterCenters.length);
  for (int i = 0; i < clusterCenters.length; i++) {
    clusters.add(new Cluster().setId(Integer.toString(i))
                     .setSize(clusterSizesMap.get(i).intValue())
                     .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray())));
  }
  return new ClusteringModel(
      MiningFunction.CLUSTERING,
      ClusteringModel.ModelClass.CENTER_BASED,
      clusters.size(),
      AppPMMLUtils.buildMiningSchema(inputSchema),
      new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE).setMeasure(new SquaredEuclidean()),
      clusteringFields,
      clusters);
}
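// One plausible way for a caller to produce the clusterSizesMap argument (illustrative;
// the actual caller is not part of this excerpt, and 'trainData' is an assumed
// JavaRDD<Vector>): assign each training point to its nearest centroid with
// KMeansModel.predict, then tally the resulting cluster IDs.
Map<Integer,Long> clusterSizesMap = trainData.map(model::predict).countByValue();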
static void debug(String record, Vector v) {
  THE_LOGGER.info("DEBUG started:");
  double[] d = v.toArray();
  StringBuilder builder = new StringBuilder();
  builder.append("DEBUG[record=");
  builder.append(record);
  builder.append("]:");
  for (int i = 0; i < d.length; i++) {
    builder.append("\t");
    builder.append(d[i]);
  }
  THE_LOGGER.info(builder.toString());
}
@Override
public StandardScalerModelInfo getModelInfo(final StandardScalerModel from, final DataFrame df) {
  final StandardScalerModelInfo modelInfo = new StandardScalerModelInfo();
  modelInfo.setMean(from.mean().toArray());
  modelInfo.setStd(from.std().toArray());
  modelInfo.setWithMean(from.getWithMean());
  modelInfo.setWithStd(from.getWithStd());
  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add(from.getInputCol());
  modelInfo.setInputKeys(inputKeys);
  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add(from.getOutputCol());
  modelInfo.setOutputKeys(outputKeys);
  return modelInfo;
}
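// The exported mean/std arrays let a serving layer replay the scaling without Spark.
// A minimal sketch (this helper is illustrative, not part of the exporter): it applies
// StandardScaler's definition, optionally centering by the mean and dividing by the std.
static double[] standardize(double[] features, double[] mean, double[] std,
                            boolean withMean, boolean withStd) {
  double[] out = new double[features.length];
  for (int i = 0; i < features.length; i++) {
    double v = withMean ? features[i] - mean[i] : features[i];
    out[i] = (withStd && std[i] != 0.0) ? v / std[i] : v;
  }
  return out;
}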
@Override
public MinMaxScalerModelInfo getModelInfo(final MinMaxScalerModel from, final DataFrame df) {
  final MinMaxScalerModelInfo modelInfo = new MinMaxScalerModelInfo();
  modelInfo.setOriginalMax(from.originalMax().toArray());
  modelInfo.setOriginalMin(from.originalMin().toArray());
  modelInfo.setMax(from.getMax());
  modelInfo.setMin(from.getMin());
  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add(from.getInputCol());
  modelInfo.setInputKeys(inputKeys);
  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add(from.getOutputCol());
  modelInfo.setOutputKeys(outputKeys);
  return modelInfo;
}
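// Likewise for MinMaxScaler: the exported bounds allow serving-time rescaling via
// ((x - originalMin) / (originalMax - originalMin)) * (max - min) + min. This helper
// is an illustrative sketch; per Spark's definition, a constant feature is mapped to
// the midpoint of [min, max].
static double[] rescale(double[] features, double[] originalMin, double[] originalMax,
                        double min, double max) {
  double[] out = new double[features.length];
  for (int i = 0; i < features.length; i++) {
    double range = originalMax[i] - originalMin[i];
    double ratio = range == 0.0 ? 0.5 : (features[i] - originalMin[i]) / range;
    out[i] = ratio * (max - min) + min;
  }
  return out;
}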
@Test
public void sparseArrayConstruction() {
  @SuppressWarnings("unchecked")
  Vector v = Vectors.sparse(3, Arrays.asList(
      new Tuple2<>(0, 2.0),
      new Tuple2<>(2, 3.0)));
  assertArrayEquals(new double[]{2.0, 0.0, 3.0}, v.toArray(), 0.0);
}
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel, DataFrame df) {
  final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
  logisticRegressionModelInfo.setWeights(sparkLRModel.weights().toArray());
  logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
  logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
  logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
  logisticRegressionModelInfo.setThreshold((double) sparkLRModel.getThreshold().get());
  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add("features");
  logisticRegressionModelInfo.setInputKeys(inputKeys);
  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add("prediction");
  outputKeys.add("probability");
  logisticRegressionModelInfo.setOutputKeys(outputKeys);
  return logisticRegressionModelInfo;
}
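// The exported weights, intercept, and threshold are enough to replay binary scoring
// outside Spark. A minimal sketch (illustrative helper, not part of the exporter):
// the positive-class probability is sigmoid(w . x + intercept), and comparing it
// against the threshold yields the 0/1 prediction.
static double probability(double[] weights, double intercept, double[] features) {
  double margin = intercept;
  for (int i = 0; i < weights.length; i++) {
    margin += weights[i] * features[i];
  }
  return 1.0 / (1.0 + Math.exp(-margin));
}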
// Variant of the exporter above without the DataFrame parameter; the body is identical.
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel) {
  final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
  logisticRegressionModelInfo.setWeights(sparkLRModel.weights().toArray());
  logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
  logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
  logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
  logisticRegressionModelInfo.setThreshold((double) sparkLRModel.getThreshold().get());
  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add("features");
  logisticRegressionModelInfo.setInputKeys(inputKeys);
  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add("prediction");
  outputKeys.add("probability");
  logisticRegressionModelInfo.setOutputKeys(outputKeys);
  return logisticRegressionModelInfo;
}
@Test
public void denseArrayConstruction() {
  Vector v = Vectors.dense(1.0, 2.0, 3.0);
  assertArrayEquals(new double[]{1.0, 2.0, 3.0}, v.toArray(), 0.0);
}
// spark.ml variant of the exporter: coefficients() replaces the mllib weights(), and
// the input/output column names come from the model's params instead of hard-coded strings.
@Override
public LogisticRegressionModelInfo getModelInfo(final LogisticRegressionModel sparkLRModel, DataFrame df) {
  final LogisticRegressionModelInfo logisticRegressionModelInfo = new LogisticRegressionModelInfo();
  logisticRegressionModelInfo.setWeights(sparkLRModel.coefficients().toArray());
  logisticRegressionModelInfo.setIntercept(sparkLRModel.intercept());
  logisticRegressionModelInfo.setNumClasses(sparkLRModel.numClasses());
  logisticRegressionModelInfo.setNumFeatures(sparkLRModel.numFeatures());
  logisticRegressionModelInfo.setThreshold(sparkLRModel.getThreshold());
  logisticRegressionModelInfo.setProbabilityKey(sparkLRModel.getProbabilityCol());
  Set<String> inputKeys = new LinkedHashSet<String>();
  inputKeys.add(sparkLRModel.getFeaturesCol());
  logisticRegressionModelInfo.setInputKeys(inputKeys);
  Set<String> outputKeys = new LinkedHashSet<String>();
  outputKeys.add(sparkLRModel.getPredictionCol());
  outputKeys.add(sparkLRModel.getProbabilityCol());
  logisticRegressionModelInfo.setOutputKeys(outputKeys);
  return logisticRegressionModelInfo;
}
@Test
public void twoDimensionalData() {
  JavaRDD<Vector> points = jsc.parallelize(Arrays.asList(
      Vectors.dense(4, -1),
      Vectors.dense(4, 1),
      Vectors.sparse(2, new int[]{0}, new double[]{1.0})
  ), 2);
  BisectingKMeans bkm = new BisectingKMeans()
      .setK(4)
      .setMaxIterations(2)
      .setSeed(1L);
  BisectingKMeansModel model = bkm.run(points);
  Assert.assertEquals(3, model.k());
  Assert.assertArrayEquals(new double[]{3.0, 0.0}, model.root().center().toArray(), 1e-12);
  for (ClusteringTreeNode child : model.root().children()) {
    double[] center = child.center().toArray();
    if (center[0] > 2) {
      Assert.assertEquals(2, child.size());
      Assert.assertArrayEquals(new double[]{4.0, 0.0}, center, 1e-12);
    } else {
      Assert.assertEquals(1, child.size());
      Assert.assertArrayEquals(new double[]{1.0, 0.0}, center, 1e-12);
    }
  }
}
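// Usage sketch: beyond inspecting the tree, the fitted model can assign new points to
// one of the learned leaf clusters (BisectingKMeansModel.predict returns the leaf index).
int clusterIndex = model.predict(Vectors.dense(4.0, 0.5));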