/** * Story: User can specify a clustering limit that prevents output of small * clusters */ @Test public void testCanopyMapperClusterFilter() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure .getClass().getName()); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); conf.set(CanopyConfigKeys.CF_KEY, "3"); DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>(); Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter .build(mapper, conf, writer); mapper.setup(context); List<VectorWritable> points = getPointsWritable(); // map the data for (VectorWritable point : points) { mapper.map(new Text(), point, context); } mapper.cleanup(context); assertEquals("Number of map results", 1, writer.getData().size()); // now verify the output List<VectorWritable> data = writer.getValue(new Text("centroid")); assertEquals("Number of centroids", 2, data.size()); }
/**
 * Story: User can set T3 and T4 values to be used by the reducer for its T1
 * and T2 thresholds
 */
@Test
public void testCanopyReducerT3T4Configuration() throws Exception {
  CanopyReducer underTest = new CanopyReducer();
  Configuration configuration = getConfiguration();
  configuration.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  configuration.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  configuration.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  configuration.set(CanopyConfigKeys.T3_KEY, String.valueOf(1.1));
  configuration.set(CanopyConfigKeys.T4_KEY, String.valueOf(0.1));
  configuration.set(CanopyConfigKeys.CF_KEY, "0");
  DummyRecordWriter<Text, ClusterWritable> recordWriter =
      new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context reduceContext =
      DummyRecordWriter.build(underTest, configuration, recordWriter,
          Text.class, VectorWritable.class);
  underTest.setup(reduceContext);

  // when T3/T4 are configured they override T1/T2 inside the reducer's clusterer
  assertEquals(1.1, underTest.getCanopyClusterer().getT1(), EPSILON);
  assertEquals(0.1, underTest.getCanopyClusterer().getT2(), EPSILON);
}
/**
 * Story: User can specify a cluster filter that limits the minimum size of
 * canopies produced by the reducer
 */
@Test
public void testCanopyReducerClusterFilter() throws Exception {
  CanopyReducer underTest = new CanopyReducer();
  Configuration configuration = getConfiguration();
  configuration.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  configuration.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  configuration.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  configuration.set(CanopyConfigKeys.CF_KEY, "3");
  DummyRecordWriter<Text, ClusterWritable> recordWriter =
      new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context reduceContext =
      DummyRecordWriter.build(underTest, configuration, recordWriter,
          Text.class, VectorWritable.class);
  underTest.setup(reduceContext);

  // reduce all points under a single key; the cluster filter of 3 should
  // suppress any canopy smaller than that
  List<VectorWritable> inputPoints = getPointsWritable();
  underTest.reduce(new Text("centroid"), inputPoints, reduceContext);

  Set<Text> emittedKeys = recordWriter.getKeys();
  assertEquals("Number of centroids", 2, emittedKeys.size());
}
// end of test class
}
/* Configures a CanopyMapper with the Manhattan distance measure.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyMapperManhattan() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure .getClass().getName());
/* Configures a CanopyMapper with the Euclidean distance measure.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyMapperEuclidean() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, euclideanDistanceMeasure .getClass().getName());
/* Configures a CanopyReducer with the Manhattan distance measure (set by class name).
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyReducerManhattan() throws Exception { CanopyReducer reducer = new CanopyReducer(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
/* Configures a CanopyReducer with the Euclidean distance measure and a T1 of 3.1.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyReducerEuclidean() throws Exception { CanopyReducer reducer = new CanopyReducer(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure"); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
/*
 * Per-test fixture: obtains the local FileSystem and precomputes reference
 * canopies (and their centers) sequentially for both distance measures with
 * T1=3.1 / T2=2.1, so each test can compare map/reduce output against a
 * known-good sequential result.
 */
@Override @Before public void setUp() throws Exception { super.setUp(); fs = FileSystem.get(getConfiguration()); referenceManhattan = CanopyClusterer.createCanopies(getPoints(), manhattanDistanceMeasure, 3.1, 2.1); manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan); referenceEuclidean = CanopyClusterer.createCanopies(getPoints(), euclideanDistanceMeasure, 3.1, 2.1); euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean); }
/** Story: User can cluster points using sequential execution */ @Test public void testClusteringManhattanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); // now run the Canopy Driver in sequential mode Path output = getTestTempDirPath("output"); CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, true); // verify output from sequence file Path path = new Path(output, "clusters-0-final/part-r-00000"); int ix = 0; for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(path, true, config)) { assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), clusterWritable.getValue() .getCenter()); ix++; } path = new Path(output, "clusteredPoints/part-m-0"); long count = HadoopUtil.countRecords(path, config); assertEquals("number of points", points.size(), count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a EuclideanDistanceMeasure. */ @Test public void testClusteringEuclideanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Others can use runJob(). Path output = getTestTempDirPath("output"); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(getConfiguration(), new CanopyDriver(), args); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); assertEquals("number of points", points.size(), count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a EuclideanDistanceMeasure and outlier removal threshold. */ @Test public void testClusteringEuclideanWithOutlierRemovalMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Others can use runJob(). Path output = getTestTempDirPath("output"); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.7", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(getConfiguration(), new CanopyDriver(), args); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); int expectedPointsAfterOutlierRemoval = 8; assertEquals("number of points", expectedPointsAfterOutlierRemoval, count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a ManhattanDistanceMeasure. */ @Test public void testClusteringManhattanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job Path output = getTestTempDirPath("output"); CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, false); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); assertEquals("number of points", points.size(), count); }
/* Writes test points to a single input file for sequential Euclidean clustering.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testClusteringEuclideanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
/* Writes test points to a single input file for sequential Euclidean clustering with outlier removal.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testClusteringEuclideanWithOutlierRemovalSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
/* Writes test points to a single input file for Manhattan canopy generation via MR.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyGenManhattanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
/* Writes test points to a single input file for Euclidean canopy generation via MR.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyGenEuclideanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);