/**
 * @return the distance measure used by the underlying centroid searcher
 *     (delegates to {@code centroids}).
 */
public DistanceMeasure getDistanceMeasure() {
  return centroids.getDistanceMeasure();
}
} // NOTE(review): closes the enclosing class; its declaration is outside this excerpt.
/**
 * @return the number of clusters computed from the points until now.
 */
public int getNumClusters() {
  return centroids.size();
}
/**
 * @return an Iterator over the Centroids contained in this clusterer; each element of
 *     the underlying searcher is cast to Centroid on the fly.
 */
@Override
public Iterator<Centroid> iterator() {
  // Named adapter instead of an inline anonymous argument; behavior is identical.
  Function<Vector, Centroid> castToCentroid = new Function<Vector, Centroid>() {
    @Override
    public Centroid apply(Vector input) {
      return (Centroid) input;
    }
  };
  return Iterators.transform(centroids.iterator(), castToCentroid);
}
// NOTE(review): collapsed excerpt from the middle of the streaming k-means update loop;
// the enclosing method and several closing braces lie outside this view, so the
// indentation below is a best-effort reconstruction — confirm against the original file.
centroids.clear();
numProcessedDatapoints = 0;
if (centroids.size() == 0) {
  // First datapoint becomes the first centroid verbatim.
  centroids.add(datapointsIterator.next().clone());
  ++numProcessedDatapoints;
  // Nearest existing centroid to the incoming row (weight of the pair = distance).
  WeightedThing<Vector> closestPair = centroids.searchFirst(row, false);
  // Facility-location style test: rows far from every centroid (relative to
  // distanceCutoff) spawn a new cluster with probability proportional to distance.
  if (sample < row.getWeight() * closestPair.getWeight() / distanceCutoff) {
    centroids.add(row.clone());
  } else {
    if (!centroids.remove(centroid, Constants.EPSILON)) {
      throw new RuntimeException("Unable to remove centroid");
      // NOTE(review): everything below is unreachable after the throw above — a closing
      // brace was almost certainly lost when this excerpt was flattened; in the original
      // the re-add happens after the if. Confirm against the source file.
      centroids.add(centroid);
      // Sketch overgrown: raise the target cluster count (log of points seen) and, if
      // still over, relax the distance cutoff so future points merge more readily.
      if (!collapseClusters && centroids.size() > clusterOvershoot * numClusters) {
        numClusters = (int) Math.max(numClusters, clusterLogFactor * Math.log(numProcessedDatapoints));
        if (centroids.size() > numClusters) {
          distanceCutoff *= beta;
// NOTE(review): collapsed excerpt that mixes several phases of a ball/k-means pass
// (closest-cluster distances, point assignment, centroid replacement); method
// boundaries and closing braces are missing from this view — indentation is a guess,
// and 'j', 'closestClusterDistances', and 'newCentroids' are declared outside it.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
for (Vector center : centroids) {
  // Second argument true: skip the query point itself, yielding the nearest OTHER
  // cluster — presumably for trimming/ball radii. TODO confirm searchFirst semantics.
  Vector closestOtherCluster = centroids.searchFirst(center, true).getValue();
  closestClusterDistances.add(distanceMeasure.distance(center, closestOtherCluster));
  WeightedVector datapoint = datapoints.get(j);
  // Nearest centroid to this datapoint; pair weight is the distance.
  WeightedThing<Vector> closestPair = centroids.searchFirst(datapoint, false);
  int closestIndex = ((WeightedVector) closestPair.getValue()).getIndex();
  double closestDistance = closestPair.getWeight();
  // Replace the working set of centroids with the freshly computed ones.
  centroids.clear();
  centroids.addAll(newCentroids);
  // Fold this datapoint's weight into its (new) nearest centroid.
  Centroid closestCentroid = (Centroid) centroids.searchFirst(datapoint, false).getValue();
  closestCentroid.setWeight(closestCentroid.getWeight() + datapoint.getWeight());
@Test public void testRemove() { searcher.clear(); for (int i = 0; i < dataPoints.rowSize(); ++i) { Vector datapoint = dataPoints.viewRow(i); searcher.add(datapoint); // As long as points are not searched for right after being added, in FastProjectionSearch, points are not // merged with the main list right away, so if a search for a point occurs before it's merged the pendingAdditions // list also needs to be looked at. // This used to not be the case for searchFirst(), thereby causing removal failures. if (i % 2 == 0) { assertTrue("Failed to find self [search]", searcher.search(datapoint, 1).get(0).getWeight() < Constants.EPSILON); assertTrue("Failed to find self [searchFirst]", searcher.searchFirst(datapoint, false).getWeight() < Constants.EPSILON); assertTrue("Failed to remove self", searcher.remove(datapoint, Constants.EPSILON)); } } } }
// Verifies removal semantics: removed vectors no longer match exactly, sizes shrink,
// and read-only searchers refuse removal.
// NOTE(review): this excerpt is truncated — the catch block's body and the closing
// braces of the try/method lie outside this view, so the code is left as-is.
@Test
public void testRemoval() {
  searcher.clear();
  searcher.addAllMatrixSlices(dataPoints);
  int size0 = searcher.size();
  List<WeightedThing<Vector>> r0 = searcher.search(x.get(0), 2);
  searcher.remove(x.get(0), 1.0e-7);
  assertEquals(size0 - 1, searcher.size());
  List<WeightedThing<Vector>> r = searcher.search(x.get(0), 1);
  // Weight is the distance; strictly positive means the exact vector is gone.
  assertTrue("Vector should be gone", r.get(0).getWeight() > 0);
  // After removal, the old second-nearest neighbor becomes the nearest.
  assertEquals("Previous second neighbor should be first", 0,
      r.get(0).getValue().minus(r0.get(1).getValue()).norm(1), 1.0e-8);
  searcher.remove(x.get(1), 1.0e-7);
  assertEquals(size0 - 2, searcher.size());
  r = searcher.search(x.get(1), 1);
  assertTrue("Vector should be gone", r.get(0).getWeight() > 0);
  try {
    // NOTE(review): this local 'x' shadows the field 'x' used above — intentional?
    List<Vector> x = Lists.newArrayList(Iterables.limit(searcher, 2));
    searcher.remove(x.get(0), 1.0e-7);
    fail("Shouldn't be able to delete from " + searcher.getClass().getName());
  } catch (UnsupportedOperationException e) {
@Test public void testSearchFirst() { searcher.clear(); searcher.addAll(dataPoints); for (Vector datapoint : dataPoints) { WeightedThing<Vector> first = searcher.searchFirst(datapoint, false); WeightedThing<Vector> second = searcher.searchFirst(datapoint, true); List<WeightedThing<Vector>> firstTwo = searcher.search(datapoint, 2); assertEquals("First isn't self", 0, first.getWeight(), 0); assertEquals("First isn't self", datapoint, first.getValue()); assertEquals("First doesn't match", first, firstTwo.get(0)); assertEquals("Second doesn't match", second, firstTwo.get(1)); } }
/**
 * Computes the summaries for the distances in each cluster.
 *
 * Builds a ProjectionSearch over the centroids, then folds each datapoint's distance
 * to its nearest centroid into the summarizer for that centroid's index.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of Centroids.
 * @param distanceMeasure metric used both for the nearest-centroid search and the
 *     recorded distances.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer
 *     corresponding to the cluster whose index is i; empty when there are no centroids.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(Iterable<? extends Vector> datapoints,
                                                               Iterable<? extends Vector> centroids,
                                                               DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);
  List<OnlineSummarizer> clusterSummaries = Lists.newArrayList();
  if (centroidIndex.size() == 0) {
    return clusterSummaries;
  }
  int numCentroids = centroidIndex.size();
  for (int i = 0; i < numCentroids; ++i) {
    clusterSummaries.add(new OnlineSummarizer());
  }
  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    clusterSummaries.get(nearest.getIndex()).add(distanceMeasure.distance(point, nearest));
  }
  return clusterSummaries;
}
// NOTE(review): collapsed excerpt of a multi-run k-means driver; the loop bodies and
// closing braces are outside this view, and the last four statements appear to come
// from a different part of the method (restoring the best centroids and folding a test
// point's weight into its nearest center) — indentation is a best-effort guess.
double bestCost = Double.POSITIVE_INFINITY;
for (int i = 0; i < numRuns; ++i) {
  centroids.clear();
  if (kMeansPlusPlusInit) {
centroids.clear();
centroids.addAll(bestCentroids);
// Nearest retained centroid absorbs the test datapoint's weight.
WeightedVector closest = (WeightedVector) centroids.searchFirst(testDatapoint, false).getValue();
closest.setWeight(closest.getWeight() + testDatapoint.getWeight());
// NOTE(review): collapsed excerpt mixing a distance-accumulation loop with incremental
// cluster seeding; enclosing methods and braces are outside this view. 'center',
// 'deltaX', 'c_1', and 'nextSeed' are all declared elsewhere — indentation is a guess.
DistanceMeasure distanceMeasure = centroids.getDistanceMeasure();
for (WeightedVector row : datapoints) {
  // Accumulate total point-to-center distance (presumably for a cost/termination check).
  deltaX += distanceMeasure.distance(row, center);
  centroids.add(c_1); // NOTE(review): 'c_1' looks like the first seed — confirm upstream.
  int clusterIndex = 1;
  // Keep adding seeds until the target cluster count is reached.
  while (centroids.size() < numClusters) {
    centroids.add(nextSeed);
// Smoke test: streams the synthetic data through StreamingKMeans with an estimated
// distance cutoff and reports the resulting cluster count.
// NOTE(review): this excerpt is truncated — 'mean' and 'maxWeight' below are not
// declared in the visible code, so part of the method body is missing from this view.
@Test
public void testClustering() {
  searcher.clear();
  // NOTE(review): the cast binds to Math.log only, so this is (int)log(n) * 2^d;
  // if (int)(log(n) * 2^d) was intended, parentheses are missing — confirm.
  int numStreamingClusters = (int)Math.log(syntheticData.getFirst().size()) * (1 << NUM_DIMENSIONS);
  System.out.printf("k log n = %d\n", numStreamingClusters);
  double estimatedCutoff = ClusteringUtils.estimateDistanceCutoff(syntheticData.getFirst(),
      searcher.getDistanceMeasure(), 100);
  StreamingKMeans clusterer = new StreamingKMeans(searcher, numStreamingClusters, estimatedCutoff);
  System.out.printf("%s %s\n", searcher.getClass().getName(), searcher.getDistanceMeasure()
      .getClass().getName());
  System.out.printf("Total number of clusters %d\n", clusterer.getNumClusters());
  WeightedThing<Vector> v = searcher.search(mean, 1).get(0);
  maxWeight = Math.max(v.getWeight(), maxWeight);
@Test public void testOrdering() { searcher.clear(); Matrix queries = new DenseMatrix(100, 20); MultiNormal gen = new MultiNormal(20); for (int i = 0; i < 100; i++) { queries.viewRow(i).assign(gen.sample()); } searcher.addAllMatrixSlices(dataPoints); for (MatrixSlice query : queries) { List<WeightedThing<Vector>> r = searcher.search(query.vector(), 200); double x = 0; for (WeightedThing<Vector> thing : r) { assertTrue("Scores must be monotonic increasing", thing.getWeight() >= x); x = thing.getWeight(); } } }
/**
 * Verifies that a slightly perturbed copy of an indexed point still finds the original
 * point as its nearest neighbor, at roughly the perturbation's distance.
 *
 * BUG FIX: the search used to run on the UNPERTURBED query, with the noise added only
 * afterwards — so the "near match" lookup was trivially an exact-member search and the
 * distance assertions only passed because the noise norm fell inside the tolerance.
 * The query is now perturbed before searching, which is what the assertions assume.
 */
@Test
public void testNearMatch() {
  searcher.clear();
  List<MatrixSlice> queries = Lists.newArrayList(Iterables.limit(dataPoints, 100));
  searcher.addAllMatrixSlicesAsWeightedVectors(dataPoints);
  // Small isotropic noise in 20 dimensions; its expected norm is well under the 0.1
  // tolerances used below.
  MultiNormal noise = new MultiNormal(0.01, new DenseVector(20));
  for (MatrixSlice slice : queries) {
    Vector query = slice.vector();
    final Vector epsilon = noise.sample();
    // Perturb first, then search: the nearest neighbor of the noisy query should be
    // the original (unperturbed) point, at distance ~ ||epsilon||.
    query = query.plus(epsilon);
    List<WeightedThing<Vector>> r = searcher.search(query, 2);
    assertEquals("Distance has to be small", epsilon.norm(2), r.get(0).getWeight(), 1.0e-1);
    assertEquals("Answer must be substantially the same as query", epsilon.norm(2),
        r.get(0).getValue().minus(query).norm(2), 1.0e-1);
    assertTrue("Wrong answer must be further away", r.get(1).getWeight() > r.get(0).getWeight());
  }
}
/**
 * Runs StreamingKMeans over the synthetic data numTests times and reports the average
 * final distance cutoff and cluster count it settles on.
 */
@Test
public void testAverageDistanceCutoff() {
  final int numTests = 1;
  double cutoffSum = 0;
  double clusterCountSum = 0;
  System.out.printf("Distance cutoff for %s\n", searcher.getClass().getName());
  for (int run = 0; run < numTests; ++run) {
    searcher.clear();
    int numStreamingClusters = (int)Math.log(syntheticData.getFirst().size()) * (1 << NUM_DIMENSIONS);
    double distanceCutoff = 1.0e-6;
    double estimatedCutoff = ClusteringUtils.estimateDistanceCutoff(syntheticData.getFirst(),
        searcher.getDistanceMeasure(), 100);
    System.out.printf("[%d] Generated synthetic data [magic] %f [estimate] %f\n", run, distanceCutoff,
        estimatedCutoff);
    StreamingKMeans clusterer = new StreamingKMeans(searcher, numStreamingClusters, estimatedCutoff);
    clusterer.cluster(syntheticData.getFirst());
    cutoffSum += clusterer.getDistanceCutoff();
    clusterCountSum += clusterer.getNumClusters();
    System.out.printf("[%d] %f\n", run, clusterer.getDistanceCutoff());
  }
  System.out.printf("Final: distanceCutoff: %f estNumClusters: %f\n", cutoffSum / numTests,
      clusterCountSum / numTests);
}
// NOTE(review): excerpt — 'row', 'scale', 'clustering', and 'current' are declared
// outside this view. Looks like vector quantization: scale the row down, find its
// nearest cluster center, and write the rescaled center into 'current' — confirm.
// NOTE(review): if 'scale' is an integer type, 1 / scale truncates to 0 — verify it
// is declared as a double.
row.assign(Functions.mult(1 / scale));
WeightedThing<Vector> cluster = clustering.search(row, 1).get(0);
current.assign(cluster.getValue());
current.assign(Functions.mult(scale));
/**
 * Adds up the distances from each point to its closest cluster and returns the sum.
 *
 * Indexes the centroids in a 3-projection ProjectionSearch under the Euclidean metric
 * and delegates the actual summation to the searcher-based overload.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of Centroids.
 * @return the total cost described above.
 */
public static double totalClusterCost(Iterable<? extends Vector> datapoints, Iterable<? extends Vector> centroids) {
  UpdatableSearcher centroidSearcher = new ProjectionSearch(new EuclideanDistanceMeasure(), 3, 1);
  centroidSearcher.addAll(centroids);
  return totalClusterCost(datapoints, centroidSearcher);
}
/**
 * Selects numClusters of the original points at random, without replacement, with
 * probability proportional to their weights. This is much less sophisticated than the
 * k-means++ seeding, but it is faster.
 *
 * The side effect of this method is to fill the centroids structure itself.
 *
 * @param datapoints The datapoints to select from. These datapoints should be
 *     WeightedVectors of some kind.
 */
private void initializeSeedsRandomly(List<? extends WeightedVector> datapoints) {
  // Normalize the weights so they can serve as sampling probabilities.
  double totalWeight = 0;
  for (WeightedVector point : datapoints) {
    totalWeight += point.getWeight();
  }
  Multinomial<Integer> seedSelector = new Multinomial<Integer>();
  int numDatapoints = datapoints.size();
  for (int i = 0; i < numDatapoints; ++i) {
    seedSelector.add(i, datapoints.get(i).getWeight() / totalWeight);
  }
  // Draw without replacement: delete each sampled index before the next draw.
  for (int i = 0; i < numClusters; ++i) {
    int sampledIndex = seedSelector.sample();
    seedSelector.delete(sampledIndex);
    Centroid seed = new Centroid(datapoints.get(sampledIndex));
    seed.setIndex(i);
    centroids.add(seed);
  }
}
// NOTE(review): this excerpt duplicates the collapsed streaming k-means update loop
// seen earlier in this file; the enclosing method and several closing braces lie
// outside this view, so the indentation below is a best-effort reconstruction.
centroids.clear();
numProcessedDatapoints = 0;
if (centroids.size() == 0) {
  // First datapoint becomes the first centroid verbatim.
  centroids.add(datapointsIterator.next().clone());
  ++numProcessedDatapoints;
  // Nearest existing centroid to the incoming row (weight of the pair = distance).
  WeightedThing<Vector> closestPair = centroids.searchFirst(row, false);
  // Points far from every centroid (relative to distanceCutoff) spawn a new cluster
  // with probability proportional to distance.
  if (sample < row.getWeight() * closestPair.getWeight() / distanceCutoff) {
    centroids.add(row.clone());
  } else {
    if (!centroids.remove(centroid, Constants.EPSILON)) {
      throw new RuntimeException("Unable to remove centroid");
      // NOTE(review): unreachable after the throw above — a closing brace was likely
      // lost when this excerpt was flattened; confirm against the source file.
      centroids.add(centroid);
      // Sketch overgrown: raise the target cluster count and, if still over, relax
      // the distance cutoff so future points merge more readily.
      if (!collapseClusters && centroids.size() > clusterOvershoot * numClusters) {
        numClusters = (int) Math.max(numClusters, clusterLogFactor * Math.log(numProcessedDatapoints));
        if (centroids.size() > numClusters) {
          distanceCutoff *= beta;