/**
 * @return the distance measure used by the underlying centroid searcher
 *     (delegates to {@code centroids}).
 */
public DistanceMeasure getDistanceMeasure() {
  return centroids.getDistanceMeasure();
}
} // NOTE(review): closes the enclosing class; its declaration is outside this excerpt.
/**
 * @return the number of clusters computed from the points until now.
 */
public int getNumClusters() {
  return centroids.size();
}
/**
 * @return an Iterator over the Centroids contained in this clusterer; each element of
 *     the underlying searcher is cast to Centroid on the fly.
 */
@Override
public Iterator<Centroid> iterator() {
  // Named adapter instead of an inline anonymous argument; behavior is identical.
  Function<Vector, Centroid> castToCentroid = new Function<Vector, Centroid>() {
    @Override
    public Centroid apply(Vector input) {
      return (Centroid) input;
    }
  };
  return Iterators.transform(centroids.iterator(), castToCentroid);
}
// NOTE(review): collapsed excerpt from the middle of the streaming k-means update loop;
// the enclosing method and several closing braces lie outside this view, so the
// indentation below is a best-effort reconstruction — confirm against the original file.
centroids.clear();
numProcessedDatapoints = 0;
if (centroids.size() == 0) {
  // First datapoint becomes the first centroid verbatim.
  centroids.add(datapointsIterator.next().clone());
  ++numProcessedDatapoints;
  // Nearest existing centroid to the incoming row (weight of the pair = distance).
  WeightedThing<Vector> closestPair = centroids.searchFirst(row, false);
  // Facility-location style test: rows far from every centroid (relative to
  // distanceCutoff) spawn a new cluster with probability proportional to distance.
  if (sample < row.getWeight() * closestPair.getWeight() / distanceCutoff) {
    centroids.add(row.clone());
  } else {
    if (!centroids.remove(centroid, Constants.EPSILON)) {
      throw new RuntimeException("Unable to remove centroid");
      // NOTE(review): everything below is unreachable after the throw above — a closing
      // brace was almost certainly lost when this excerpt was flattened; in the original
      // the re-add happens after the if. Confirm against the source file.
      centroids.add(centroid);
      // Sketch overgrown: raise the target cluster count (log of points seen) and, if
      // still over, relax the distance cutoff so future points merge more readily.
      if (!collapseClusters && centroids.size() > clusterOvershoot * numClusters) {
        numClusters = (int) Math.max(numClusters, clusterLogFactor * Math.log(numProcessedDatapoints));
        if (centroids.size() > numClusters) {
          distanceCutoff *= beta;
// NOTE(review): collapsed excerpt that mixes several phases of a ball/k-means pass
// (closest-cluster distances, point assignment, centroid replacement); method
// boundaries and closing braces are missing from this view — indentation is a guess,
// and 'j', 'closestClusterDistances', and 'newCentroids' are declared outside it.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
for (Vector center : centroids) {
  // Second argument true: skip the query point itself, yielding the nearest OTHER
  // cluster — presumably for trimming/ball radii. TODO confirm searchFirst semantics.
  Vector closestOtherCluster = centroids.searchFirst(center, true).getValue();
  closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster));
  WeightedVector datapoint = datapoints.get(j);
  // Nearest centroid to this datapoint; pair weight is the distance.
  WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false);
  int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex();
  double closestDistance = closestPair.getWeight();
  // Replace the working set of centroids with the freshly computed ones.
  centroids.clear();
  centroids.addAll(newCentroids);
  // Fold this datapoint's weight into its (new) nearest centroid.
  Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue();
  closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight());
@Test public void testRemove() { searcher.clear(); for (int i = 0; i < dataPoints.rowSize(); ++i) { Vector datapoint = dataPoints.viewRow(i); searcher.add(datapoint); // As long as points are not searched for right after being added, in FastProjectionSearch, points are not // merged with the main list right away, so if a search for a point occurs before it's merged the pendingAdditions // list also needs to be looked at. // This used to not be the case for searchFirst(), thereby causing removal failures. if (i % 2 == 0) { assertTrue("Failed to find self [search]", searcher.search(datapoint, 1).get(0).getWeight() < Constants.EPSILON); assertTrue("Failed to find self [searchFirst]", searcher.searchFirst(datapoint, false).getWeight() < Constants.EPSILON); assertTrue("Failed to remove self", searcher.remove(datapoint, Constants.EPSILON)); } } } }
// Verifies removal semantics: removed vectors no longer match exactly, sizes shrink,
// and read-only searchers refuse removal.
// NOTE(review): this excerpt is truncated — the catch block's body and the closing
// braces of the try/method lie outside this view, so the code is left as-is.
@Test
public void testRemoval() {
  searcher.clear();
  searcher.addAllMatrixSlices(dataPoints);
  int size0 = searcher.size();
  List<WeightedThing<Vector>> r0 = searcher.search(x.get(0), 2);
  searcher.remove(x.get(0), 1.0e-7);
  assertEquals(size0 - 1, searcher.size());
  List<WeightedThing<Vector>> r = searcher.search(x.get(0), 1);
  // Weight is the distance; strictly positive means the exact vector is gone.
  assertTrue("Vector should be gone", r.get(0).getWeight() > 0);
  // After removal, the old second-nearest neighbor becomes the nearest.
  assertEquals("Previous second neighbor should be first", 0,
      r.get(0).getValue().minus(r0.get(1).getValue()).norm(1), 1.0e-8);
  searcher.remove(x.get(1), 1.0e-7);
  assertEquals(size0 - 2, searcher.size());
  r = searcher.search(x.get(1), 1);
  assertTrue("Vector should be gone", r.get(0).getWeight() > 0);
  try {
    // NOTE(review): this local 'x' shadows the field 'x' used above — intentional?
    List<Vector> x = Lists.newArrayList(Iterables.limit(searcher, 2));
    searcher.remove(x.get(0), 1.0e-7);
    fail("Shouldn't be able to delete from " + searcher.getClass().getName());
  } catch (UnsupportedOperationException e) {
@Test public void testSearchFirst() { searcher.clear(); searcher.addAll(dataPoints); for (Vector datapoint : dataPoints) { WeightedThing<Vector> first = searcher.searchFirst(datapoint, false); WeightedThing<Vector> second = searcher.searchFirst(datapoint, true); List<WeightedThing<Vector>> firstTwo = searcher.search(datapoint, 2); assertEquals("First isn't self", 0, first.getWeight(), 0); assertEquals("First isn't self", datapoint, first.getValue()); assertEquals("First doesn't match", first, firstTwo.get(0)); assertEquals("Second doesn't match", second, firstTwo.get(1)); } }
/**
 * Computes the summaries for the distances in each cluster.
 *
 * Builds a ProjectionSearch over the centroids, then folds each datapoint's distance
 * to its nearest centroid into the summarizer for that centroid's index.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of Centroids.
 * @param distanceMeasure metric used both for the nearest-centroid search and the
 *     recorded distances.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer
 *     corresponding to the cluster whose index is i; empty when there are no centroids.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                               Iterable<? extends Vector> centroids,
                                                               DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);
  List<OnlineSummarizer> clusterSummaries = Lists.newArrayList();
  if (centroidIndex.size() == 0) {
    return clusterSummaries;
  }
  int numCentroids = centroidIndex.size();
  for (int i = 0; i < numCentroids; ++i) {
    clusterSummaries.add(new OnlineSummarizer());
  }
  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    clusterSummaries.get(nearest.getIndex()).add(distanceMeasure.distance(point, nearest));
  }
  return clusterSummaries;
}
// NOTE(review): collapsed excerpt of a multi-run k-means driver; the loop bodies and
// closing braces are outside this view, and the last four statements appear to come
// from a different part of the method (restoring the best centroids and folding a test
// point's weight into its nearest center) — indentation is a best-effort guess.
double bestCost = Double.POSITIVE_INFINITY;
for (int i = 0; i < numRuns; ++i) {
  centroids.clear();
  if (kMeansPlusPlusInit) {
centroids.clear();
centroids.addAll(bestCentroids);
// Nearest retained centroid absorbs the test datapoint's weight.
WeightedVector closest = (WeightedVector) centroids.searchFirst(testDatapoint, false).getValue();
closest.setWeight(closest.getWeight() + testDatapoint.getWeight());
// NOTE(review): collapsed excerpt mixing a distance-accumulation loop with incremental
// cluster seeding; enclosing methods and braces are outside this view. 'center',
// 'deltaX', 'c_1', and 'nextSeed' are all declared elsewhere — indentation is a guess.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
for (WeightedVector row : datapoints) {
  // Accumulate total point-to-center distance (presumably for a cost/termination check).
  deltaX += distanceMeasure.distance(row, center);
  centroids.add(c_1); // NOTE(review): 'c_1' looks like the first seed — confirm upstream.
  int clusterIndex = 1;
  // Keep adding seeds until the target cluster count is reached.
  while (centroids.size() < numClusters) {
    centroids.add(nextSeed);
// Smoke test: streams the synthetic data through StreamingKMeans with an estimated
// distance cutoff and reports the resulting cluster count.
// NOTE(review): this excerpt is truncated — 'mean' and 'maxWeight' below are not
// declared in the visible code, so part of the method body is missing from this view.
@Test
public void testClustering() {
  searcher.clear();
  // NOTE(review): the cast binds to Math.log only, so this is (int)log(n) * 2^d;
  // if (int)(log(n) * 2^d) was intended, parentheses are missing — confirm.
  int numStreamingClusters = (int)Math.log(syntheticData.getFirst().size()) * (1 << NUM_DIMENSIONS);
  System.out.printf("k log n = %d\n", numStreamingClusters);
  double estimatedCutoff = ClusteringUtils.estimateDistanceCutoff(syntheticData.getFirst(),
      searcher.getDistanceMeasure(), 100);
  StreamingKMeans clusterer = new StreamingKMeans(searcher, numStreamingClusters, estimatedCutoff);
  System.out.printf("%s %s\n", searcher.getClass().getName(), searcher.getDistanceMeasure()
      .getClass().getName());
  System.out.printf("Total number of clusters %d\n", clusterer.getNumClusters());
  WeightedThing<Vector> v = searcher.search(mean, 1).get(0);
  maxWeight = Math.max(v.getWeight(), maxWeight);
@Test public void testOrdering() { searcher.clear(); Matrix queries = new DenseMatrix(100, 20); MultiNormal gen = new MultiNormal(20); for (int i = 0; i < 100; i++) { queries.viewRow(i).assign(gen.sample()); } searcher.addAllMatrixSlices(dataPoints); for (MatrixSlice query : queries) { List<WeightedThing<Vector>> r = searcher.search(query.vector(), 200); double x = 0; for (WeightedThing<Vector> thing : r) { assertTrue("Scores must be monotonic increasing", thing.getWeight() >= x); x = thing.getWeight(); } } }
/**
 * Verifies that a slightly perturbed copy of an indexed point still finds the original
 * point as its nearest neighbor, at roughly the perturbation's distance.
 *
 * BUG FIX: the search used to run on the UNPERTURBED query, with the noise added only
 * afterwards — so the "near match" lookup was trivially an exact-member search and the
 * distance assertions only passed because the noise norm fell inside the tolerance.
 * The query is now perturbed before searching, which is what the assertions assume.
 */
@Test
public void testNearMatch() {
  searcher.clear();
  List<MatrixSlice> queries = Lists.newArrayList(Iterables.limit(dataPoints, 100));
  searcher.addAllMatrixSlicesAsWeightedVectors(dataPoints);
  // Small isotropic noise in 20 dimensions; its expected norm is well under the 0.1
  // tolerances used below.
  MultiNormal noise = new MultiNormal(0.01, new DenseVector(20));
  for (MatrixSlice slice : queries) {
    Vector query = slice.vector();
    final Vector epsilon = noise.sample();
    // Perturb first, then search: the nearest neighbor of the noisy query should be
    // the original (unperturbed) point, at distance ~ ||epsilon||.
    query = query.plus(epsilon);
    List<WeightedThing<Vector>> r = searcher.search(query, 2);
    assertEquals("Distance has to be small", epsilon.norm(2), r.get(0).getWeight(), 1.0e-1);
    assertEquals("Answer must be substantially the same as query", epsilon.norm(2),
        r.get(0).getValue().minus(query).norm(2), 1.0e-1);
    assertTrue("Wrong answer must be further away", r.get(1).getWeight() > r.get(0).getWeight());
  }
}
/**
 * Runs StreamingKMeans over the synthetic data numTests times and reports the average
 * final distance cutoff and cluster count it settles on.
 */
@Test
public void testAverageDistanceCutoff() {
  final int numTests = 1;
  double cutoffSum = 0;
  double clusterCountSum = 0;
  System.out.printf("Distance cutoff for %s\n", searcher.getClass().getName());
  for (int run = 0; run < numTests; ++run) {
    searcher.clear();
    int numStreamingClusters = (int)Math.log(syntheticData.getFirst().size()) * (1 << NUM_DIMENSIONS);
    double distanceCutoff = 1.0e-6;
    double estimatedCutoff = ClusteringUtils.estimateDistanceCutoff(syntheticData.getFirst(),
        searcher.getDistanceMeasure(), 100);
    System.out.printf("[%d] Generated synthetic data [magic] %f [estimate] %f\n", run, distanceCutoff,
        estimatedCutoff);
    StreamingKMeans clusterer = new StreamingKMeans(searcher, numStreamingClusters, estimatedCutoff);
    clusterer.cluster(syntheticData.getFirst());
    cutoffSum += clusterer.getDistanceCutoff();
    clusterCountSum += clusterer.getNumClusters();
    System.out.printf("[%d] %f\n", run, clusterer.getDistanceCutoff());
  }
  System.out.printf("Final: distanceCutoff: %f estNumClusters: %f\n", cutoffSum / numTests,
      clusterCountSum / numTests);
}
// NOTE(review): excerpt — 'row', 'scale', 'clustering', and 'current' are declared
// outside this view. Looks like vector quantization: scale the row down, find its
// nearest cluster center, and write the rescaled center into 'current' — confirm.
// NOTE(review): if 'scale' is an integer type, 1 / scale truncates to 0 — verify it
// is declared as a double.
row.assign(Functions.mult(1 / scale));
WeightedThing<Vector> cluster = clustering.search(row, 1).get(0);
current.assign(cluster.getValue());
current.assign(Functions.mult(scale));
/**
 * Adds up the distances from each point to its closest cluster and returns the sum.
 *
 * Indexes the centroids in a 3-projection ProjectionSearch under the Euclidean metric
 * and delegates the actual summation to the searcher-based overload.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of Centroids.
 * @return the total cost described above.
 */
public static double totalClusterCost(Iterable<? extends Vector> datapoints, Iterable<? extends Vector> centroids) {
  UpdatableSearcher centroidSearcher = new ProjectionSearch(new EuclideanDistanceMeasure(), 3, 1);
  centroidSearcher.addAll(centroids);
  return totalClusterCost(datapoints, centroidSearcher);
}
/**
 * Selects numClusters of the original points at random, without replacement, with
 * probability proportional to their weights. This is much less sophisticated than the
 * k-means++ seeding, but it is faster.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from. These datapoints should be
 *     WeightedVectors of some kind.
 */
private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) {
  // Normalize the weights so they can serve as sampling probabilities.
  double totalWeight = 0;
  for (WeightedVector point : datapoints) {
    totalWeight += point.getWeight();
  }
  Multinomial<Integer> seedSelector = new Multinomial<Integer>();
  int numDatapoints = datapoints.size();
  for (int i = 0; i < numDatapoints; ++i) {
    seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight);
  }
  // Draw without replacement: delete each sampled index before the next draw.
  for (int i = 0; i < numClusters; ++i) {
    int sampledIndex = seedSelector.sample();
    seedSelector.delete(sampledIndex);
    Centroid seed = new Centroid(datapoints.get(sampledIndex));
    seed.setIndex(i);
    centroids.add(seed);
  }
}
// NOTE(review): this excerpt duplicates the collapsed streaming k-means update loop
// seen earlier in this file; the enclosing method and several closing braces lie
// outside this view, so the indentation below is a best-effort reconstruction.
centroids.clear();
numProcessedDatapoints = 0;
if (centroids.size() == 0) {
  // First datapoint becomes the first centroid verbatim.
  centroids.add(datapointsIterator.next().clone());
  ++numProcessedDatapoints;
  // Nearest existing centroid to the incoming row (weight of the pair = distance).
  WeightedThing<Vector> closestPair = centroids.searchFirst(row, false);
  // Points far from every centroid (relative to distanceCutoff) spawn a new cluster
  // with probability proportional to distance.
  if (sample < row.getWeight() * closestPair.getWeight() / distanceCutoff) {
    centroids.add(row.clone());
  } else {
    if (!centroids.remove(centroid, Constants.EPSILON)) {
      throw new RuntimeException("Unable to remove centroid");
      // NOTE(review): unreachable after the throw above — a closing brace was likely
      // lost when this excerpt was flattened; confirm against the source file.
      centroids.add(centroid);
      // Sketch overgrown: raise the target cluster count and, if still over, relax
      // the distance cutoff so future points merge more readily.
      if (!collapseClusters && centroids.size() > clusterOvershoot * numClusters) {
        numClusters = (int) Math.max(numClusters, clusterLogFactor * Math.log(numProcessedDatapoints));
        if (centroids.size() > numClusters) {
          distanceCutoff *= beta;