/**
 * Verifies the contents of one post-processed top level cluster.
 *
 * <p>Cluster id "0" is checked with {@code assertPointsInFirstTopLevelCluster} and cluster id "1" with
 * {@code assertPointsInSecondTopLevelCluster}. Entries with any other id are not checked.
 *
 * @param cluster entry whose key is the cluster id and whose value is the path of that cluster's
 *                post-processed output
 * @throws AssertionError if reading the cluster fails with an {@link IOException}; the original exception
 *                        is attached as the cause
 */
private void assertTopLevelCluster(Entry<String,Path> cluster) {
  String clusterId = cluster.getKey();
  Path clusterPath = cluster.getValue();
  try {
    if ("0".equals(clusterId)) {
      assertPointsInFirstTopLevelCluster(clusterPath);
    } else if ("1".equals(clusterId)) {
      assertPointsInSecondTopLevelCluster(clusterPath);
    }
  } catch (IOException e) {
    // Fail the test, but keep the underlying I/O error (and which cluster was being read) so the
    // report shows what actually went wrong instead of only a generic message.
    AssertionError failure = new AssertionError(
        "Exception occurred while asserting top level cluster " + clusterId + " at " + clusterPath + ": " + e);
    failure.initCause(e);
    throw failure;
  }
}
/**
 * Story: a user runs canopy clustering, applies the cluster post processor to its output, and then runs
 * clustering again on each of the resulting clusters.
 */
@Test
public void testTopDownClustering() throws Exception {
  List<VectorWritable> referencePoints = getPointsWritable(REFERENCE);
  Path inputDir = getTestTempDirPath("points");
  conf = getConfiguration();

  // The same reference points are written twice, as two separate input files.
  for (String fileName : new String[] {"file1", "file2"}) {
    ClusteringTestUtils.writePointsToFile(referencePoints, new Path(inputDir, fileName), fs, conf);
  }

  outputPath = getTestTempDirPath("output");
  topLevelClustering(inputDir, conf);

  Map<String,Path> clusterDirs = ouputPostProcessing(conf);
  assertPostProcessedOutput(clusterDirs);
  bottomLevelClustering(clusterDirs);
}
/**
 * Runs the top level cluster check on every entry of the post-processed output.
 *
 * @param postProcessedClusterDirectories cluster id mapped to the path of that cluster's post-processed output
 */
private void assertPostProcessedOutput(Map<String,Path> postProcessedClusterDirectories) {
  for (Entry<String,Path> idAndPath : postProcessedClusterDirectories.entrySet()) {
    assertTopLevelCluster(idAndPath);
  }
}
/**
 * Runs the parent set-up and then obtains the {@code FileSystem} used by the tests from the test
 * configuration.
 */
@Override
@Before
public void setUp() throws Exception {
  super.setUp();
  // The configuration is only needed here to look up the file system; it is not stored.
  fs = FileSystem.get(getConfiguration());
}
/**
 * Runs {@code CanopyDriver} on the output of every post-processed top level cluster and checks each result
 * with {@code assertBottomLevelCluster}.
 *
 * <p>NOTE(review): 2.1 and 2.0 are presumably the canopy T1/T2 thresholds; confirm against
 * {@code CanopyDriver.run}.
 *
 * @param postProcessedClusterDirectories cluster id mapped to the path of that cluster's post-processed output
 */
private void bottomLevelClustering(Map<String,Path> postProcessedClusterDirectories)
    throws IOException, InterruptedException, ClassNotFoundException {
  for (Entry<String,Path> entry : postProcessedClusterDirectories.entrySet()) {
    String id = entry.getKey();
    Path inputPath = entry.getValue();
    // Each top level cluster gets its own bottom level output location under the test output path.
    Path resultPath = PathDirectory.getBottomLevelClusterPath(outputPath, id);
    ManhattanDistanceMeasure measure = new ManhattanDistanceMeasure();
    CanopyDriver.run(conf, inputPath, resultPath, measure, 2.1, 2.0, true, 0.0, true);
    assertBottomLevelCluster(resultPath);
  }
}
private void assertBottomLevelCluster(Path bottomLevelCluster) { Path clusteredPointsPath = new Path(bottomLevelCluster, "clusteredPoints"); DummyOutputCollector<IntWritable,WeightedVectorWritable> collector = new DummyOutputCollector<IntWritable,WeightedVectorWritable>(); // The key is the clusterId, the value is the weighted vector for (Pair<IntWritable,WeightedVectorWritable> record : new SequenceFileIterable<IntWritable,WeightedVectorWritable>(new Path(clusteredPointsPath, "part-m-0"), conf)) { collector.collect(record.getFirst(), record.getSecond()); } int clusterSize = collector.getKeys().size(); // First top level cluster produces two more clusters, second top level cluster is not broken again assertTrue(clusterSize == 1 || clusterSize == 2); }