/**
 * Reports how many clusters have been computed from the points so far.
 *
 * @return the current size of the {@code centroids} collection.
 */
public int getNumClusters() {
  return this.centroids.size();
}
/**
 * Reports how many clusters have been computed from the points so far.
 *
 * @return the current size of the {@code centroids} collection.
 */
public int getNumClusters() {
  return this.centroids.size();
}
/**
 * Reports how many clusters have been computed from the points so far.
 *
 * @return the current size of the {@code centroids} collection.
 */
public int getNumClusters() {
  return this.centroids.size();
}
/**
 * Creates a clusterer that keeps its centroids in the supplied searcher.
 *
 * <p>Every argument is checked with {@code Preconditions.checkArgument} before any field is
 * assigned; the checks run in the order the parameters are listed below.
 *
 * @param searcher           becomes the centroid store; must be empty ({@code size() == 0}).
 * @param numClusters        the requested number of clusters; must be positive.
 * @param maxNumIterations   the maximum number of iterations; must be positive.
 * @param trimFraction       the trim fraction; must be positive.
 * @param kMeansPlusPlusInit stored as given (presumably selects k-means++ style seeding --
 *                           confirm against the code that reads it).
 * @param correctWeights     stored as given; its effect is not visible from this constructor.
 * @param testProbability    must lie in [0, 1); any value above zero switches on
 *                           {@code splitTrainTest}.
 * @param numRuns            number of runs; must be at least one.
 */
public BallKMeans(UpdatableSearcher searcher,
                  int numClusters,
                  int maxNumIterations,
                  double trimFraction,
                  boolean kMeansPlusPlusInit,
                  boolean correctWeights,
                  double testProbability,
                  int numRuns) {
  // Argument validation: order matters because the first failing check decides the message.
  Preconditions.checkArgument(0 == searcher.size(),
      "Searcher must be empty initially to populate with centroids");
  Preconditions.checkArgument(0 < numClusters,
      "The requested number of clusters must be positive");
  Preconditions.checkArgument(0 < maxNumIterations,
      "The maximum number of iterations must be positive");
  Preconditions.checkArgument(0 < trimFraction,
      "The trim fraction must be positive");
  Preconditions.checkArgument(0 <= testProbability && testProbability < 1,
      "The testProbability must be in [0, 1)");
  Preconditions.checkArgument(0 < numRuns,
      "There has to be at least one run");

  // Centroid storage and target size.
  this.centroids = searcher;
  this.numClusters = numClusters;

  // Iteration controls.
  this.maxNumIterations = maxNumIterations;
  this.numRuns = numRuns;
  this.trimFraction = trimFraction;

  // Behaviour switches.
  this.kMeansPlusPlusInit = kMeansPlusPlusInit;
  this.correctWeights = correctWeights;

  // A strictly positive test probability enables the train/test split flag.
  this.testProbability = testProbability;
  this.splitTrainTest = (testProbability > 0);

  this.random = RandomUtils.getRandom();
}
/**
 * Creates a clusterer that keeps its centroids in the supplied searcher.
 *
 * <p>Every argument is checked with {@code Preconditions.checkArgument} before any field is
 * assigned; the checks run in the order the parameters are listed below.
 *
 * @param searcher           becomes the centroid store; must be empty ({@code size() == 0}).
 * @param numClusters        the requested number of clusters; must be positive.
 * @param maxNumIterations   the maximum number of iterations; must be positive.
 * @param trimFraction       the trim fraction; must be positive.
 * @param kMeansPlusPlusInit stored as given (presumably selects k-means++ style seeding --
 *                           confirm against the code that reads it).
 * @param correctWeights     stored as given; its effect is not visible from this constructor.
 * @param testProbability    must lie in [0, 1); any value above zero switches on
 *                           {@code splitTrainTest}.
 * @param numRuns            number of runs; must be at least one.
 */
public BallKMeans(UpdatableSearcher searcher,
                  int numClusters,
                  int maxNumIterations,
                  double trimFraction,
                  boolean kMeansPlusPlusInit,
                  boolean correctWeights,
                  double testProbability,
                  int numRuns) {
  // Argument validation: order matters because the first failing check decides the message.
  Preconditions.checkArgument(0 == searcher.size(),
      "Searcher must be empty initially to populate with centroids");
  Preconditions.checkArgument(0 < numClusters,
      "The requested number of clusters must be positive");
  Preconditions.checkArgument(0 < maxNumIterations,
      "The maximum number of iterations must be positive");
  Preconditions.checkArgument(0 < trimFraction,
      "The trim fraction must be positive");
  Preconditions.checkArgument(0 <= testProbability && testProbability < 1,
      "The testProbability must be in [0, 1)");
  Preconditions.checkArgument(0 < numRuns,
      "There has to be at least one run");

  // Centroid storage and target size.
  this.centroids = searcher;
  this.numClusters = numClusters;

  // Iteration controls.
  this.maxNumIterations = maxNumIterations;
  this.numRuns = numRuns;
  this.trimFraction = trimFraction;

  // Behaviour switches.
  this.kMeansPlusPlusInit = kMeansPlusPlusInit;
  this.correctWeights = correctWeights;

  // A strictly positive test probability enables the train/test split flag.
  this.testProbability = testProbability;
  this.splitTrainTest = (testProbability > 0);

  this.random = RandomUtils.getRandom();
}
/**
 * Creates a clusterer that keeps its centroids in the supplied searcher.
 *
 * <p>Every argument is checked with {@code Preconditions.checkArgument} before any field is
 * assigned; the checks run in the order the parameters are listed below.
 *
 * @param searcher           becomes the centroid store; must be empty ({@code size() == 0}).
 * @param numClusters        the requested number of clusters; must be positive.
 * @param maxNumIterations   the maximum number of iterations; must be positive.
 * @param trimFraction       the trim fraction; must be positive.
 * @param kMeansPlusPlusInit stored as given (presumably selects k-means++ style seeding --
 *                           confirm against the code that reads it).
 * @param correctWeights     stored as given; its effect is not visible from this constructor.
 * @param testProbability    must lie in [0, 1); any value above zero switches on
 *                           {@code splitTrainTest}.
 * @param numRuns            number of runs; must be at least one.
 */
public BallKMeans(UpdatableSearcher searcher,
                  int numClusters,
                  int maxNumIterations,
                  double trimFraction,
                  boolean kMeansPlusPlusInit,
                  boolean correctWeights,
                  double testProbability,
                  int numRuns) {
  // Argument validation: order matters because the first failing check decides the message.
  Preconditions.checkArgument(0 == searcher.size(),
      "Searcher must be empty initially to populate with centroids");
  Preconditions.checkArgument(0 < numClusters,
      "The requested number of clusters must be positive");
  Preconditions.checkArgument(0 < maxNumIterations,
      "The maximum number of iterations must be positive");
  Preconditions.checkArgument(0 < trimFraction,
      "The trim fraction must be positive");
  Preconditions.checkArgument(0 <= testProbability && testProbability < 1,
      "The testProbability must be in [0, 1)");
  Preconditions.checkArgument(0 < numRuns,
      "There has to be at least one run");

  // Centroid storage and target size.
  this.centroids = searcher;
  this.numClusters = numClusters;

  // Iteration controls.
  this.maxNumIterations = maxNumIterations;
  this.numRuns = numRuns;
  this.trimFraction = trimFraction;

  // Behaviour switches.
  this.kMeansPlusPlusInit = kMeansPlusPlusInit;
  this.correctWeights = correctWeights;

  // A strictly positive test probability enables the train/test split flag.
  this.testProbability = testProbability;
  this.splitTrainTest = (testProbability > 0);

  this.random = RandomUtils.getRandom();
}
/**
 * Builds one distance summary per cluster.
 *
 * <p>The centroids are loaded into a {@code ProjectionSearch}. Each datapoint is then looked up
 * with a single-result search, the hit is cast to {@code Centroid}, and the distance between the
 * datapoint and that centroid is added to the summarizer stored at the centroid's index.
 *
 * @param datapoints      iterable of datapoints.
 * @param centroids       iterable of Centroids (search results are cast to {@code Centroid}).
 * @param distanceMeasure measure handed to the searcher and used for the recorded distances.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to
 *         the cluster whose index is i; the list is empty when no centroids were supplied.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                               Iterable<? extends Vector> centroids,
                                                               DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);

  List<OnlineSummarizer> perCluster = Lists.newArrayList();
  final int clusterCount = centroidIndex.size();
  if (clusterCount == 0) {
    // Nothing to search against, so the datapoints are never touched.
    return perCluster;
  }

  // One summarizer slot per centroid held by the searcher.
  while (perCluster.size() < clusterCount) {
    perCluster.add(new OnlineSummarizer());
  }

  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    // The centroid's own index selects the summarizer slot.
    OnlineSummarizer target = perCluster.get(nearest.getIndex());
    target.add(distanceMeasure.distance(point, nearest));
  }
  return perCluster;
}
/**
 * Builds one distance summary per cluster.
 *
 * <p>The centroids are loaded into a {@code ProjectionSearch}. Each datapoint is then looked up
 * with a single-result search, the hit is cast to {@code Centroid}, and the distance between the
 * datapoint and that centroid is added to the summarizer stored at the centroid's index.
 *
 * @param datapoints      iterable of datapoints.
 * @param centroids       iterable of Centroids (search results are cast to {@code Centroid}).
 * @param distanceMeasure measure handed to the searcher and used for the recorded distances.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to
 *         the cluster whose index is i; the list is empty when no centroids were supplied.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                               Iterable<? extends Vector> centroids,
                                                               DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);

  List<OnlineSummarizer> perCluster = Lists.newArrayList();
  final int clusterCount = centroidIndex.size();
  if (clusterCount == 0) {
    // Nothing to search against, so the datapoints are never touched.
    return perCluster;
  }

  // One summarizer slot per centroid held by the searcher.
  while (perCluster.size() < clusterCount) {
    perCluster.add(new OnlineSummarizer());
  }

  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    // The centroid's own index selects the summarizer slot.
    OnlineSummarizer target = perCluster.get(nearest.getIndex());
    target.add(distanceMeasure.distance(point, nearest));
  }
  return perCluster;
}
/**
 * Builds one distance summary per cluster.
 *
 * <p>The centroids are loaded into a {@code ProjectionSearch}. Each datapoint is then looked up
 * with a single-result search, the hit is cast to {@code Centroid}, and the distance between the
 * datapoint and that centroid is added to the summarizer stored at the centroid's index.
 *
 * @param datapoints      iterable of datapoints.
 * @param centroids       iterable of Centroids (search results are cast to {@code Centroid}).
 * @param distanceMeasure measure handed to the searcher and used for the recorded distances.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to
 *         the cluster whose index is i; the list is empty when no centroids were supplied.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                               Iterable<? extends Vector> centroids,
                                                               DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);

  List<OnlineSummarizer> perCluster = new ArrayList<>();
  final int clusterCount = centroidIndex.size();
  if (clusterCount == 0) {
    // Nothing to search against, so the datapoints are never touched.
    return perCluster;
  }

  // One summarizer slot per centroid held by the searcher.
  while (perCluster.size() < clusterCount) {
    perCluster.add(new OnlineSummarizer());
  }

  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    // The centroid's own index selects the summarizer slot.
    OnlineSummarizer target = perCluster.get(nearest.getIndex());
    target.add(distanceMeasure.distance(point, nearest));
  }
  return perCluster;
}
while (centroids.size() < numClusters) {
while (centroids.size() < numClusters) {
while (centroids.size() < numClusters) {
@Test public void testExactMatch() { searcher.clear(); Iterable<MatrixSlice> data = dataPoints; final Iterable<MatrixSlice> batch1 = Iterables.limit(data, 300); List<MatrixSlice> queries = Lists.newArrayList(Iterables.limit(batch1, 100)); // adding the data in multiple batches triggers special code in some searchers searcher.addAllMatrixSlices(batch1); assertEquals(300, searcher.size()); Vector q = Iterables.get(data, 0).vector(); List<WeightedThing<Vector>> r = searcher.search(q, 2); assertEquals(0, r.get(0).getValue().minus(q).norm(1), 1.0e-8); final Iterable<MatrixSlice> batch2 = Iterables.limit(Iterables.skip(data, 300), 10); searcher.addAllMatrixSlices(batch2); assertEquals(310, searcher.size()); q = Iterables.get(data, 302).vector(); r = searcher.search(q, 2); assertEquals(0, r.get(0).getValue().minus(q).norm(1), 1.0e-8); searcher.addAllMatrixSlices(Iterables.skip(data, 310)); assertEquals(dataPoints.numRows(), searcher.size()); for (MatrixSlice query : queries) { r = searcher.search(query.vector(), 2); assertEquals("Distance has to be about zero", 0, r.get(0).getWeight(), 1.0e-6); assertEquals("Answer must be substantially the same as query", 0, r.get(0).getValue().minus(query.vector()).norm(1), 1.0e-8); assertTrue("Wrong answer must have non-zero distance", r.get(1).getWeight() > r.get(0).getWeight()); } }