/**
 * Story: User can produce final point clustering using a Hadoop map/reduce
 * job, a EuclideanDistanceMeasure and an outlier removal threshold.
 */
@Test
public void testClusteringEuclideanWithOutlierRemovalMR() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration conf = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points, true,
      getTestTempFilePath("testdata/file1"), fs, conf);
  ClusteringTestUtils.writePointsToFile(points, true,
      getTestTempFilePath("testdata/file2"), fs, conf);
  // now run the Job using the run() command. Others can use runJob().
  Path output = getTestTempDirPath("output");
  String[] args = {
      optKey(DefaultOptionCreator.INPUT_OPTION),
      getTestTempDirPath("testdata").toString(),
      optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
      optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
      EuclideanDistanceMeasure.class.getName(),
      optKey(DefaultOptionCreator.T1_OPTION), "3.1",
      optKey(DefaultOptionCreator.T2_OPTION), "2.1",
      optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.7",
      optKey(DefaultOptionCreator.CLUSTERING_OPTION),
      optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
  ToolRunner.run(getConfiguration(), new CanopyDriver(), args);
  Path path = new Path(output, "clusteredPoints/part-m-00000");
  long count = HadoopUtil.countRecords(path, conf);
  int expectedPointsAfterOutlierRemoval = 8;
  assertEquals("number of points", expectedPointsAfterOutlierRemoval, count);
}
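// Note for readers of this excerpt: optKey() is inherited from the test base
// class and is not shown here. A minimal sketch of its assumed behavior,
// delegating to AbstractJob.keyFor(), which prefixes an option name with
// "--" (e.g. "input" becomes "--input"):
//
//   protected static String optKey(String optionName) {
//     return AbstractJob.keyFor(optionName);
//   }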
/**
 * Story: User can cluster points using a EuclideanDistanceMeasure and a
 * reference implementation.
 */
@Test
public void testReferenceEuclidean() throws Exception {
  // see setUp for cluster creation
  printCanopies(referenceEuclidean);
  assertEquals("number of canopies", 3, referenceEuclidean.size());
  int[] expectedNumPoints = { 5, 5, 3 };
  double[][] expectedCentroids = { { 1.8, 1.8 }, { 4.2, 4.2 },
      { 4.666666666666667, 4.666666666666667 } };
  for (int canopyIx = 0; canopyIx < referenceEuclidean.size(); canopyIx++) {
    Canopy testCanopy = referenceEuclidean.get(canopyIx);
    // expected value first, observed value second
    assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
        testCanopy.getNumObservations());
    double[] refCentroid = expectedCentroids[canopyIx];
    Vector testCentroid = testCanopy.computeCentroid();
    for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
      assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
          refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
    }
  }
}
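// printCanopies() is a debugging helper defined elsewhere in this class. A
// minimal sketch of what it is assumed to do, dumping each canopy to stdout:
//
//   private static void printCanopies(Iterable<Canopy> canopies) {
//     for (Canopy canopy : canopies) {
//       System.out.println(canopy.toString());
//     }
//   }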
/**
 * Story: User can produce initial canopy centers using a Hadoop map/reduce
 * job and a ManhattanDistanceMeasure. (The Javadoc, signature and sequence
 * file reader setup were missing from this excerpt; they are reconstructed
 * here and the method name is inferred from the test's behavior.)
 */
@Test
public void testCanopyGenManhattanMR() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration config = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points,
      getTestTempFilePath("testdata/file1"), fs, config);
  ClusteringTestUtils.writePointsToFile(points,
      getTestTempFilePath("testdata/file2"), fs, config);
  // run the Canopy Driver without the clustering step
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
      manhattanDistanceMeasure, 3.1, 2.1, false, 0.0, false);
  // verify the canopy centers written to the clusters sequence file; the
  // expected (x, y) reference centers were elided in the excerpt, so a
  // hypothetical helper stands in for their construction
  Path path = new Path(output, "clusters-0-final/part-r-00000");
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, config);
  List<Pair<Double,Double>> refCenters = expectedManhattanRefCenters(); // hypothetical helper
  try {
    Writable key = new Text();
    ClusterWritable clusterWritable = new ClusterWritable();
    assertTrue("more to come", reader.next(key, clusterWritable));
    assertEquals("1st key", "C-0", key.toString());
    Pair<Double,Double> c = new Pair<Double,Double>(
        clusterWritable.getValue().getCenter().get(0),
        clusterWritable.getValue().getCenter().get(1));
    assertTrue("center " + c + " not found", findAndRemove(c, refCenters, EPSILON));
    assertTrue("more to come", reader.next(key, clusterWritable));
    assertEquals("2nd key", "C-1", key.toString());
    c = new Pair<Double,Double>(clusterWritable.getValue().getCenter().get(0),
        clusterWritable.getValue().getCenter().get(1));
    assertTrue("center " + c + " not found", findAndRemove(c, refCenters, EPSILON));
    assertFalse("more to come", reader.next(key, clusterWritable));
  } finally {
    Closeables.close(reader, true);
  }
}
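// findAndRemove() is not included in this excerpt. A minimal sketch, assuming
// it scans the reference list for a center whose coordinates both match the
// observed center to within epsilon, removing the first match so each
// reference center can only be claimed once:
private static boolean findAndRemove(Pair<Double,Double> target,
    Collection<Pair<Double,Double>> refCenters, double epsilon) {
  for (Pair<Double,Double> ref : refCenters) {
    if (Math.abs(target.getFirst() - ref.getFirst()) < epsilon
        && Math.abs(target.getSecond() - ref.getSecond()) < epsilon) {
      refCenters.remove(ref); // safe: we return immediately after mutating
      return true;
    }
  }
  return false;
}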
/**
 * Story: User can specify a clustering limit that prevents output of small
 * clusters.
 */
@Test
public void testCanopyMapperClusterFilter() throws Exception {
  CanopyMapper mapper = new CanopyMapper();
  Configuration conf = getConfiguration();
  conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      manhattanDistanceMeasure.getClass().getName());
  conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  conf.set(CanopyConfigKeys.CF_KEY, "3");
  DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
  Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
      DummyRecordWriter.build(mapper, conf, writer);
  mapper.setup(context);
  List<VectorWritable> points = getPointsWritable();
  // map the data
  for (VectorWritable point : points) {
    mapper.map(new Text(), point, context);
  }
  mapper.cleanup(context);
  assertEquals("Number of map results", 1, writer.getData().size());
  // now verify the output
  List<VectorWritable> data = writer.getValue(new Text("centroid"));
  assertEquals("Number of centroids", 2, data.size());
}
@Override
@Before
public void setUp() throws Exception {
  super.setUp();
  fs = FileSystem.get(getConfiguration());
  referenceManhattan = CanopyClusterer.createCanopies(getPoints(),
      manhattanDistanceMeasure, 3.1, 2.1);
  manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan);
  referenceEuclidean = CanopyClusterer.createCanopies(getPoints(),
      euclideanDistanceMeasure, 3.1, 2.1);
  euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean);
}
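// getPoints() and getPointsWritable() are fixture helpers defined elsewhere
// in this class. A minimal sketch, assuming the nine-point 2-D fixture that
// these tests' assertions imply: with T1 = 3.1 and T2 = 2.1 it yields the
// Manhattan centroids {1.5, 1.5}, {4.0, 4.0}, {4.667, 4.667} and the
// Euclidean centroids {1.8, 1.8}, {4.2, 4.2}, {4.667, 4.667} asserted by
// the reference tests:
private static final double[][] RAW = { { 1, 1 }, { 2, 1 }, { 1, 2 },
    { 2, 2 }, { 3, 3 }, { 4, 4 }, { 5, 4 }, { 4, 5 }, { 5, 5 } };

private static List<Vector> getPoints() {
  List<Vector> points = Lists.newArrayList();
  for (double[] fr : RAW) {
    Vector vec = new RandomAccessSparseVector(fr.length);
    vec.assign(fr);
    points.add(vec);
  }
  return points;
}

private static List<VectorWritable> getPointsWritable() {
  List<VectorWritable> points = Lists.newArrayList();
  for (Vector vec : getPoints()) {
    points.add(new VectorWritable(vec));
  }
  return points;
}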
/**
 * Story: User can set T3 and T4 values to be used by the reducer for its T1
 * and T2 thresholds.
 */
@Test
public void testCanopyReducerT3T4Configuration() throws Exception {
  CanopyReducer reducer = new CanopyReducer();
  Configuration conf = getConfiguration();
  conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  conf.set(CanopyConfigKeys.T3_KEY, String.valueOf(1.1));
  conf.set(CanopyConfigKeys.T4_KEY, String.valueOf(0.1));
  conf.set(CanopyConfigKeys.CF_KEY, "0");
  DummyRecordWriter<Text, ClusterWritable> writer = new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context context =
      DummyRecordWriter.build(reducer, conf, writer, Text.class, VectorWritable.class);
  reducer.setup(context);
  assertEquals(1.1, reducer.getCanopyClusterer().getT1(), EPSILON);
  assertEquals(0.1, reducer.getCanopyClusterer().getT2(), EPSILON);
}
/**
 * Story: User can produce initial canopy centers using a Hadoop map/reduce
 * job and a EuclideanDistanceMeasure. (Javadoc, signature and the sequence
 * file reader setup reconstructed as in the Manhattan variant above; the
 * method name is inferred.)
 */
@Test
public void testCanopyGenEuclideanMR() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration config = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points,
      getTestTempFilePath("testdata/file1"), fs, config);
  ClusteringTestUtils.writePointsToFile(points,
      getTestTempFilePath("testdata/file2"), fs, config);
  // run the Canopy Driver without the clustering step
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
      euclideanDistanceMeasure, 3.1, 2.1, false, 0.0, false);
  // verify the canopy centers written to the clusters sequence file; the
  // expected (x, y) reference centers were elided in the excerpt, so a
  // hypothetical helper stands in for their construction
  Path path = new Path(output, "clusters-0-final/part-r-00000");
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, config);
  List<Pair<Double,Double>> refCenters = expectedEuclideanRefCenters(); // hypothetical helper
  try {
    Writable key = new Text();
    ClusterWritable clusterWritable = new ClusterWritable();
    assertTrue("more to come", reader.next(key, clusterWritable));
    assertEquals("1st key", "C-0", key.toString());
    Pair<Double,Double> c = new Pair<Double,Double>(
        clusterWritable.getValue().getCenter().get(0),
        clusterWritable.getValue().getCenter().get(1));
    assertTrue("center " + c + " not found", findAndRemove(c, refCenters, EPSILON));
    assertTrue("more to come", reader.next(key, clusterWritable));
    assertEquals("2nd key", "C-1", key.toString());
    c = new Pair<Double,Double>(clusterWritable.getValue().getCenter().get(0),
        clusterWritable.getValue().getCenter().get(1));
    assertTrue("center " + c + " not found", findAndRemove(c, refCenters, EPSILON));
    assertFalse("more to come", reader.next(key, clusterWritable));
  } finally {
    Closeables.close(reader, true);
  }
}
/**
 * Story: User can specify a cluster filter that limits the minimum size of
 * canopies produced by the reducer.
 */
@Test
public void testCanopyReducerClusterFilter() throws Exception {
  CanopyReducer reducer = new CanopyReducer();
  Configuration conf = getConfiguration();
  conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  conf.set(CanopyConfigKeys.CF_KEY, "3");
  DummyRecordWriter<Text, ClusterWritable> writer = new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context context =
      DummyRecordWriter.build(reducer, conf, writer, Text.class, VectorWritable.class);
  reducer.setup(context);
  List<VectorWritable> points = getPointsWritable();
  reducer.reduce(new Text("centroid"), points, context);
  Set<Text> keys = writer.getKeys();
  assertEquals("Number of centroids", 2, keys.size());
}
/**
 * Story: User can cluster points using a ManhattanDistanceMeasure and a
 * reference implementation.
 */
@Test
public void testReferenceManhattan() throws Exception {
  // see setUp for cluster creation
  printCanopies(referenceManhattan);
  assertEquals("number of canopies", 3, referenceManhattan.size());
  int[] expectedNumPoints = { 4, 4, 3 };
  double[][] expectedCentroids = { { 1.5, 1.5 }, { 4.0, 4.0 },
      { 4.666666666666667, 4.666666666666667 } };
  for (int canopyIx = 0; canopyIx < referenceManhattan.size(); canopyIx++) {
    Canopy testCanopy = referenceManhattan.get(canopyIx);
    // expected value first, observed value second
    assertEquals("canopy points " + canopyIx, expectedNumPoints[canopyIx],
        testCanopy.getNumObservations());
    double[] refCentroid = expectedCentroids[canopyIx];
    Vector testCentroid = testCanopy.computeCentroid();
    for (int pointIx = 0; pointIx < refCentroid.length; pointIx++) {
      assertEquals("canopy centroid " + canopyIx + '[' + pointIx + ']',
          refCentroid[pointIx], testCentroid.get(pointIx), EPSILON);
    }
  }
}
/**
 * Story: User can produce final point clustering using a Hadoop map/reduce
 * job and a EuclideanDistanceMeasure.
 */
@Test
public void testClusteringEuclideanMR() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration conf = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points, true,
      getTestTempFilePath("testdata/file1"), fs, conf);
  ClusteringTestUtils.writePointsToFile(points, true,
      getTestTempFilePath("testdata/file2"), fs, conf);
  // now run the Job using the run() command. Others can use runJob().
  Path output = getTestTempDirPath("output");
  String[] args = {
      optKey(DefaultOptionCreator.INPUT_OPTION),
      getTestTempDirPath("testdata").toString(),
      optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
      optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
      EuclideanDistanceMeasure.class.getName(),
      optKey(DefaultOptionCreator.T1_OPTION), "3.1",
      optKey(DefaultOptionCreator.T2_OPTION), "2.1",
      optKey(DefaultOptionCreator.CLUSTERING_OPTION),
      optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
  ToolRunner.run(getConfiguration(), new CanopyDriver(), args);
  Path path = new Path(output, "clusteredPoints/part-m-00000");
  long count = HadoopUtil.countRecords(path, conf);
  assertEquals("number of points", points.size(), count);
}
/**
 * Story: User can produce initial canopy centers using a Hadoop map/reduce
 * job and a ManhattanDistanceMeasure. (Setup and verification elided in the
 * excerpt have been restored following the cluster-filter mapper test.)
 */
@Test
public void testCanopyMapperManhattan() throws Exception {
  CanopyMapper mapper = new CanopyMapper();
  Configuration conf = getConfiguration();
  conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      manhattanDistanceMeasure.getClass().getName());
  conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  conf.set(CanopyConfigKeys.CF_KEY, "0"); // no minimum cluster size filter
  DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
  Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
      DummyRecordWriter.build(mapper, conf, writer);
  mapper.setup(context);
  List<VectorWritable> points = getPointsWritable();
  // map the data
  for (VectorWritable point : points) {
    mapper.map(new Text(), point, context);
  }
  mapper.cleanup(context);
  assertEquals("Number of map results", 1, writer.getData().size());
  // now verify the output
  List<VectorWritable> data = writer.getValue(new Text("centroid"));
  assertEquals("Number of centroids", 3, data.size());
  for (int i = 0; i < data.size(); i++) {
    assertEquals("Centroid error",
        manhattanCentroids.get(i).asFormatString(),
        data.get(i).get().asFormatString());
  }
}
/**
 * Story: User can produce final point clustering using a Hadoop map/reduce
 * job and a ManhattanDistanceMeasure.
 */
@Test
public void testClusteringManhattanMR() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration conf = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points, true,
      getTestTempFilePath("testdata/file1"), fs, conf);
  ClusteringTestUtils.writePointsToFile(points, true,
      getTestTempFilePath("testdata/file2"), fs, conf);
  // now run the Job
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(conf, getTestTempDirPath("testdata"), output,
      manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, false);
  Path path = new Path(output, "clusteredPoints/part-m-00000");
  long count = HadoopUtil.countRecords(path, conf);
  assertEquals("number of points", points.size(), count);
}
/**
 * Story: User can produce initial canopy centers using a Hadoop map/reduce
 * job and a EuclideanDistanceMeasure. (Setup and verification elided in the
 * excerpt have been restored following the Manhattan mapper test.)
 */
@Test
public void testCanopyMapperEuclidean() throws Exception {
  CanopyMapper mapper = new CanopyMapper();
  Configuration conf = getConfiguration();
  conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      euclideanDistanceMeasure.getClass().getName());
  conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  conf.set(CanopyConfigKeys.CF_KEY, "0"); // no minimum cluster size filter
  DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>();
  Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
      DummyRecordWriter.build(mapper, conf, writer);
  mapper.setup(context);
  List<VectorWritable> points = getPointsWritable();
  // map the data
  for (VectorWritable point : points) {
    mapper.map(new Text(), point, context);
  }
  mapper.cleanup(context);
  assertEquals("Number of map results", 1, writer.getData().size());
  // now verify the output
  List<VectorWritable> data = writer.getValue(new Text("centroid"));
  assertEquals("Number of centroids", 3, data.size());
  for (int i = 0; i < data.size(); i++) {
    assertEquals("Centroid error",
        euclideanCentroids.get(i).asFormatString(),
        data.get(i).get().asFormatString());
  }
}
/**
 * Story: User can produce final point clustering using sequential execution,
 * a EuclideanDistanceMeasure and an outlier removal threshold. (Javadoc,
 * signature and the elided verification paths reconstructed; the method name
 * is inferred.)
 */
@Test
public void testClusteringEuclideanWithOutlierRemovalSeq() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration config = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points,
      getTestTempFilePath("testdata/file1"), fs, config);
  Path output = getTestTempDirPath("output");
  String[] args = {
      optKey(DefaultOptionCreator.INPUT_OPTION),
      getTestTempDirPath("testdata").toString(),
      optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
      optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
      EuclideanDistanceMeasure.class.getName(),
      optKey(DefaultOptionCreator.T1_OPTION), "3.1",
      optKey(DefaultOptionCreator.T2_OPTION), "2.1",
      optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.5",
      optKey(DefaultOptionCreator.CLUSTERING_OPTION),
      optKey(DefaultOptionCreator.OVERWRITE_OPTION),
      optKey(DefaultOptionCreator.METHOD_OPTION),
      DefaultOptionCreator.SEQUENTIAL_METHOD };
  ToolRunner.run(config, new CanopyDriver(), args);
  // verify the canopy centers against the reference implementation
  Path path = new Path(output, "clusters-0-final/part-r-00000");
  int ix = 0;
  for (ClusterWritable clusterWritable :
      new SequenceFileValueIterable<ClusterWritable>(path, true, config)) {
    assertEquals("Center [" + ix + ']', euclideanCentroids.get(ix),
        clusterWritable.getValue().getCenter());
    ix++;
  }
  // sequential execution writes the clustered points to part-m-0
  path = new Path(output, "clusteredPoints/part-m-0");
  long count = HadoopUtil.countRecords(path, config);
  int expectedPointsHavingPDFGreaterThanThreshold = 6;
  assertEquals("number of points",
      expectedPointsHavingPDFGreaterThanThreshold, count);
}
/**
 * Story: User can produce final canopy centers using a Hadoop map/reduce
 * job and a ManhattanDistanceMeasure. (The elided setup has been restored
 * from the reducer tests above.)
 */
@Test
public void testCanopyReducerManhattan() throws Exception {
  CanopyReducer reducer = new CanopyReducer();
  Configuration conf = getConfiguration();
  conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  conf.set(CanopyConfigKeys.CF_KEY, "0"); // no minimum cluster size filter
  DummyRecordWriter<Text, ClusterWritable> writer = new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context context =
      DummyRecordWriter.build(reducer, conf, writer, Text.class, VectorWritable.class);
  reducer.setup(context);
  List<VectorWritable> points = getPointsWritable();
  reducer.reduce(new Text("centroid"), points, context);
  Iterable<Text> keys = writer.getKeysInInsertionOrder();
  assertEquals("Number of centroids", 3, Iterables.size(keys));
  int i = 0;
  for (Text key : keys) {
    List<ClusterWritable> data = writer.getValue(key);
    ClusterWritable clusterWritable = data.get(0);
    Canopy canopy = (Canopy) clusterWritable.getValue();
    assertEquals(manhattanCentroids.get(i).asFormatString() + " is not equal to "
        + canopy.computeCentroid().asFormatString(),
        manhattanCentroids.get(i), canopy.computeCentroid());
    i++;
  }
}
/**
 * Story: User can cluster points using sequential execution and a
 * EuclideanDistanceMeasure. (Javadoc, signature and the elided verification
 * paths reconstructed; the method name is inferred.)
 */
@Test
public void testClusteringEuclideanSeq() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration config = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points,
      getTestTempFilePath("testdata/file1"), fs, config);
  Path output = getTestTempDirPath("output");
  String[] args = {
      optKey(DefaultOptionCreator.INPUT_OPTION),
      getTestTempDirPath("testdata").toString(),
      optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
      optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
      EuclideanDistanceMeasure.class.getName(),
      optKey(DefaultOptionCreator.T1_OPTION), "3.1",
      optKey(DefaultOptionCreator.T2_OPTION), "2.1",
      optKey(DefaultOptionCreator.CLUSTERING_OPTION),
      optKey(DefaultOptionCreator.OVERWRITE_OPTION),
      optKey(DefaultOptionCreator.METHOD_OPTION),
      DefaultOptionCreator.SEQUENTIAL_METHOD };
  ToolRunner.run(config, new CanopyDriver(), args);
  // verify the canopy centers against the reference implementation
  Path path = new Path(output, "clusters-0-final/part-r-00000");
  int ix = 0;
  for (ClusterWritable clusterWritable :
      new SequenceFileValueIterable<ClusterWritable>(path, true, config)) {
    assertEquals("Center [" + ix + ']', euclideanCentroids.get(ix),
        clusterWritable.getValue().getCenter());
    ix++;
  }
  // with no outlier threshold, every input point should be clustered
  path = new Path(output, "clusteredPoints/part-m-0");
  long count = HadoopUtil.countRecords(path, config);
  assertEquals("number of points", points.size(), count);
}
/**
 * Story: User can produce final canopy centers using a Hadoop map/reduce
 * job and a EuclideanDistanceMeasure. (The elided setup has been restored
 * from the reducer tests above.)
 */
@Test
public void testCanopyReducerEuclidean() throws Exception {
  CanopyReducer reducer = new CanopyReducer();
  Configuration conf = getConfiguration();
  conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.EuclideanDistanceMeasure");
  conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  conf.set(CanopyConfigKeys.CF_KEY, "0"); // no minimum cluster size filter
  DummyRecordWriter<Text, ClusterWritable> writer = new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context context =
      DummyRecordWriter.build(reducer, conf, writer, Text.class, VectorWritable.class);
  reducer.setup(context);
  List<VectorWritable> points = getPointsWritable();
  reducer.reduce(new Text("centroid"), points, context);
  Iterable<Text> keys = writer.getKeysInInsertionOrder();
  assertEquals("Number of centroids", 3, Iterables.size(keys));
  int i = 0;
  for (Text key : keys) {
    List<ClusterWritable> data = writer.getValue(key);
    ClusterWritable clusterWritable = data.get(0);
    Canopy canopy = (Canopy) clusterWritable.getValue();
    assertEquals(euclideanCentroids.get(i).asFormatString() + " is not equal to "
        + canopy.computeCentroid().asFormatString(),
        euclideanCentroids.get(i), canopy.computeCentroid());
    i++;
  }
}
/** Story: User can cluster points using sequential execution */
@Test
public void testClusteringManhattanSeq() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration config = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points,
      getTestTempFilePath("testdata/file1"), fs, config);
  // now run the Canopy Driver in sequential mode
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
      manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, true);
  // verify output from sequence file
  Path path = new Path(output, "clusters-0-final/part-r-00000");
  int ix = 0;
  for (ClusterWritable clusterWritable :
      new SequenceFileValueIterable<ClusterWritable>(path, true, config)) {
    assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix),
        clusterWritable.getValue().getCenter());
    ix++;
  }
  path = new Path(output, "clusteredPoints/part-m-0");
  long count = HadoopUtil.countRecords(path, config);
  assertEquals("number of points", points.size(), count);
}