/**
 * Returns the distance measure reported by {@code centroids}.
 *
 * @return the result of {@code centroids.getDistanceMeasure()}
 */
public DistanceMeasure getDistanceMeasure() {
  final DistanceMeasure measure = centroids.getDistanceMeasure();
  return measure;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Returns the distance measure reported by {@code centroids}.
 *
 * @return the result of {@code centroids.getDistanceMeasure()}
 */
public DistanceMeasure getDistanceMeasure() {
  final DistanceMeasure measure = centroids.getDistanceMeasure();
  return measure;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Returns the distance measure reported by {@code centroids}.
 *
 * @return the result of {@code centroids.getDistanceMeasure()}
 */
public DistanceMeasure getDistanceMeasure() {
  final DistanceMeasure measure = centroids.getDistanceMeasure();
  return measure;
}
} // closes the enclosing type, whose header lies outside this excerpt
@Override public Iterable<Centroid> call() { UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf); int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1); double estimateDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF, StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF); Iterator<Centroid> dataPointsIterator = dataPoints.iterator(); if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) { List<Centroid> estimatePoints = Lists.newArrayListWithExpectedSize(NUM_ESTIMATE_POINTS); while (dataPointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) { Centroid centroid = dataPointsIterator.next(); estimatePoints.add(centroid); } if (log.isInfoEnabled()) { log.info("Estimated Points: {}", estimatePoints.size()); } estimateDistanceCutoff = ClusteringUtils.estimateDistanceCutoff(estimatePoints, searcher.getDistanceMeasure()); } StreamingKMeans streamingKMeans = new StreamingKMeans(searcher, numClusters, estimateDistanceCutoff); // datapointsIterator could be empty if no estimate distance was initially provided // hence creating the iterator again here for the clustering if (!dataPointsIterator.hasNext()) { dataPointsIterator = dataPoints.iterator(); } while (dataPointsIterator.hasNext()) { streamingKMeans.cluster(dataPointsIterator.next()); } streamingKMeans.reindexCentroids(); return streamingKMeans; }
@Override public Iterable<Centroid> call() { UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf); int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1); double estimateDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF, StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF); Iterator<Centroid> dataPointsIterator = dataPoints.iterator(); if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) { List<Centroid> estimatePoints = new ArrayList<>(NUM_ESTIMATE_POINTS); while (dataPointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) { Centroid centroid = dataPointsIterator.next(); estimatePoints.add(centroid); } if (log.isInfoEnabled()) { log.info("Estimated Points: {}", estimatePoints.size()); } estimateDistanceCutoff = ClusteringUtils.estimateDistanceCutoff(estimatePoints, searcher.getDistanceMeasure()); } StreamingKMeans streamingKMeans = new StreamingKMeans(searcher, numClusters, estimateDistanceCutoff); // datapointsIterator could be empty if no estimate distance was initially provided // hence creating the iterator again here for the clustering if (!dataPointsIterator.hasNext()) { dataPointsIterator = dataPoints.iterator(); } while (dataPointsIterator.hasNext()) { streamingKMeans.cluster(dataPointsIterator.next()); } streamingKMeans.reindexCentroids(); return streamingKMeans; }
@Override public Iterable<Centroid> call() { UpdatableSearcher searcher = StreamingKMeansUtilsMR.searcherFromConfiguration(conf); int numClusters = conf.getInt(StreamingKMeansDriver.ESTIMATED_NUM_MAP_CLUSTERS, 1); double estimateDistanceCutoff = conf.getFloat(StreamingKMeansDriver.ESTIMATED_DISTANCE_CUTOFF, StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF); Iterator<Centroid> dataPointsIterator = dataPoints.iterator(); if (estimateDistanceCutoff == StreamingKMeansDriver.INVALID_DISTANCE_CUTOFF) { List<Centroid> estimatePoints = Lists.newArrayListWithExpectedSize(NUM_ESTIMATE_POINTS); while (dataPointsIterator.hasNext() && estimatePoints.size() < NUM_ESTIMATE_POINTS) { Centroid centroid = dataPointsIterator.next(); estimatePoints.add(centroid); } if (log.isInfoEnabled()) { log.info("Estimated Points: {}", estimatePoints.size()); } estimateDistanceCutoff = ClusteringUtils.estimateDistanceCutoff(estimatePoints, searcher.getDistanceMeasure()); } StreamingKMeans streamingKMeans = new StreamingKMeans(searcher, numClusters, estimateDistanceCutoff); // datapointsIterator could be empty if no estimate distance was initially provided // hence creating the iterator again here for the clustering if (!dataPointsIterator.hasNext()) { dataPointsIterator = dataPoints.iterator(); } while (dataPointsIterator.hasNext()) { streamingKMeans.cluster(dataPointsIterator.next()); } streamingKMeans.reindexCentroids(); return streamingKMeans; }
// Hold the distance measure reported by centroids in a local for the code that follows.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
// Hold the distance measure reported by centroids in a local for the code that follows.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
// Hold the distance measure reported by centroids in a local for the code that follows.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
// Fetch the distance measure from centroids once, before the loop, rather than per row.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
// Add the distance between each row of datapoints and center to the running total deltaX.
for (WeightedVector row : datapoints) {
  deltaX += distanceMeasure.distance(row, center);
// Fetch the distance measure from centroids once, before the loop, rather than per row.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
// Add the distance between each row of datapoints and center to the running total deltaX.
for (WeightedVector row : datapoints) {
  deltaX += distanceMeasure.distance(row, center);
// Fetch the distance measure from centroids once, before the loop, rather than per row.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
// Add the distance between each row of datapoints and center to the running total deltaX.
for (WeightedVector row : datapoints) {
  deltaX += distanceMeasure.distance(row, center);
/**
 * Runs streaming k-means over the first component of {@code syntheticData} using an estimated
 * distance cutoff, and prints the resulting distance cutoff and cluster count averaged over
 * {@code numTests} runs. The method only prints; it makes no assertions.
 */
@Test
public void testAverageDistanceCutoff() {
  final int numTests = 1;
  // Fixed reference value that is only printed next to the estimate.
  final double magicCutoff = 1.0e-6;

  double cutoffTotal = 0;
  double clusterCountTotal = 0;

  System.out.printf("Distance cutoff for %s\n", searcher.getClass().getName());
  for (int run = 0; run < numTests; ++run) {
    searcher.clear();

    // The cast applies to the logarithm alone, so it is truncated before the multiplication.
    int truncatedLogSize = (int) Math.log(syntheticData.getFirst().size());
    int numStreamingClusters = truncatedLogSize * (1 << NUM_DIMENSIONS);

    double estimatedCutoff = ClusteringUtils.estimateDistanceCutoff(
        syntheticData.getFirst(), searcher.getDistanceMeasure(), 100);
    System.out.printf("[%d] Generated synthetic data [magic] %f [estimate] %f\n",
        run, magicCutoff, estimatedCutoff);

    StreamingKMeans clusterer = new StreamingKMeans(searcher, numStreamingClusters, estimatedCutoff);
    clusterer.cluster(syntheticData.getFirst());

    cutoffTotal += clusterer.getDistanceCutoff();
    clusterCountTotal += clusterer.getNumClusters();
    System.out.printf("[%d] %f\n", run, clusterer.getDistanceCutoff());
  }

  double meanCutoff = cutoffTotal / numTests;
  double meanClusterCount = clusterCountTotal / numTests;
  System.out.printf("Final: distanceCutoff: %f estNumClusters: %f\n", meanCutoff, meanClusterCount);
}
// Print numStreamingClusters, which is defined outside this excerpt.
System.out.printf("k log n = %d\n", numStreamingClusters);
// Estimate the distance cutoff from the first component of syntheticData using the searcher's
// distance measure. NOTE(review): the meaning of the literal 100 is not visible here — confirm
// against ClusteringUtils.estimateDistanceCutoff.
double estimatedCutoff = ClusteringUtils.estimateDistanceCutoff(syntheticData.getFirst(), searcher.getDistanceMeasure(), 100);
// Build the clusterer from the searcher, the cluster count and the estimated cutoff.
StreamingKMeans clusterer = new StreamingKMeans(searcher, numStreamingClusters, estimatedCutoff);
// Print the concrete class names of the searcher and of its distance measure.
System.out.printf("%s %s\n", searcher.getClass().getName(), searcher.getDistanceMeasure() .getClass().getName());
// Print the cluster count the clusterer reports.
System.out.printf("Total number of clusters %d\n", clusterer.getNumClusters());