/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a EuclideanDistanceMeasure. */ @Test public void testClusteringEuclideanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Others can use runJob(). Path output = getTestTempDirPath("output"); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(getConfiguration(), new CanopyDriver(), args); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); assertEquals("number of points", points.size(), count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a EuclideanDistanceMeasure and outlier removal threshold. */ @Test public void testClusteringEuclideanWithOutlierRemovalMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Others can use runJob(). Path output = getTestTempDirPath("output"); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.7", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(getConfiguration(), new CanopyDriver(), args); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); int expectedPointsAfterOutlierRemoval = 8; assertEquals("number of points", expectedPointsAfterOutlierRemoval, count); }
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.5", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD }; ToolRunner.run(config, new CanopyDriver(), args);
String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION), optKey(DefaultOptionCreator.METHOD_OPTION), DefaultOptionCreator.SEQUENTIAL_METHOD }; ToolRunner.run(config, new CanopyDriver(), args);