/**
 * Runs the post processing sequentially by delegating to a {@link ClusterOutputPostProcessor}
 * built with a fresh {@link Configuration}. Per the processor's contract, vectors are read one by
 * one and placed into a directory named after their clusterId.
 *
 * @param input  The output path that was given to the clustering algorithm and is now to be post
 *               processed. Hint: the directory containing clusters-*-final and clusteredPoints.
 * @param output The path under which the post processed data is stored.
 * @throws IOException propagated from {@code ClusterOutputPostProcessor.process()}.
 */
private static void postProcessSeq(Path input, Path output) throws IOException {
  Configuration sequentialConf = new Configuration();
  ClusterOutputPostProcessor sequentialProcessor =
      new ClusterOutputPostProcessor(input, output, sequentialConf);
  sequentialProcessor.process();
}
/**
 * Post processes the output of a clustering algorithm and groups the vectors into their
 * respective clusters. Each cluster's vectors are written into a directory named after its
 * clusterId.
 *
 * @param input         The output path that was given to the clustering algorithm and is now to be
 *                      post processed. Hint: the directory containing clusters-*-final and
 *                      clusteredPoints.
 * @param output        The path under which the post processed data is stored.
 * @param runSequential If {@code true}, the post processing runs sequentially; otherwise it runs
 *                      as MapReduce. Hint: use the same mode the clustering itself was run in.
 */
public static void run(Path input, Path output, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (runSequential) {
    postProcessSeq(input, output);
    return;
  }

  // MapReduce path: run the job, then move the produced part files into their directories.
  Configuration conf = new Configuration();
  postProcessMR(conf, input, output);
  movePartFilesToRespectiveDirectories(conf, output);
}
/**
 * Runs a {@link ClusterOutputPostProcessor} that uses {@code outputPath} as both its input and
 * its output location, and returns the cluster directories it reports afterwards.
 *
 * @param conf the configuration handed to the post processor.
 * @return the value of {@code getPostProcessedClusterDirectories()} after processing.
 * @throws IOException propagated from {@code process()}.
 */
private Map<String,Path> ouputPostProcessing(Configuration conf) throws IOException {
  ClusterOutputPostProcessor postProcessor =
      new ClusterOutputPostProcessor(outputPath, outputPath, conf);
  postProcessor.process();
  Map<String,Path> clusterDirectories = postProcessor.getPostProcessedClusterDirectories();
  return clusterDirectories;
}
/**
 * Story: the user wants to use the cluster post processor after canopy clustering and then run
 * clustering on the output clusters.
 *
 * The same reference points are written to two input files, the top level clustering is run, and
 * the test expects exactly two clusters to be reported for the k-means output.
 */
@Test
public void testGetNumberOfClusters() throws Exception {
  List<VectorWritable> referencePoints = getPointsWritable(REFERENCE);
  Path pointsPath = getTestTempDirPath("points");
  Configuration conf = getConfiguration();

  // Write the identical point set into two separate files under the points directory.
  Path firstPointsFile = new Path(pointsPath, "file1");
  ClusteringTestUtils.writePointsToFile(referencePoints, firstPointsFile, fs, conf);
  Path secondPointsFile = new Path(pointsPath, "file2");
  ClusteringTestUtils.writePointsToFile(referencePoints, secondPointsFile, fs, conf);

  outputPathForCanopy = getTestTempDirPath("canopy");
  outputPathForKMeans = getTestTempDirPath("kmeans");

  topLevelClustering(pointsPath, conf);

  int numberOfClusters = ClusterCountReader.getNumberOfClusters(outputPathForKMeans, conf);
  Assert.assertEquals(2, numberOfClusters);

  Path clusteredPointsPath = new Path(outputPathForKMeans, new Path("clusteredPoints"));
  verifyThatNumberOfClustersIsCorrect(conf, clusteredPointsPath);
}
/**
 * Writes a point into the cluster identified by {@code clusterId} and records that cluster's
 * directory in {@code postProcessedClusterDirectories}.
 *
 * @param clusterId the id of the cluster the point belongs to.
 * @param point     the vector to write.
 * @throws IOException propagated from the writer lookup or from the write itself.
 */
private void putVectorInRespectiveCluster(String clusterId, WeightedVectorWritable point)
    throws IOException {
  // Step 1: obtain the writer for this cluster.
  Writer clusterWriter = findWriterForVector(clusterId);

  // Step 2: remember where this cluster's data lives.
  postProcessedClusterDirectories.put(
      clusterId,
      PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId));

  // Step 3: write the point through the cluster's writer.
  writeVectorToCluster(clusterWriter, point);
}
/**
 * Command line entry point: hands the arguments to {@code ToolRunner.run} together with a new
 * {@link Configuration} and a new {@link ClusterOutputPostProcessorDriver}.
 *
 * @param args the command line arguments, passed through unchanged.
 */
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  ClusterOutputPostProcessorDriver driver = new ClusterOutputPostProcessorDriver();
  // The value returned by ToolRunner.run is intentionally not used here.
  ToolRunner.run(conf, driver, args);
}
@Override public void setup(Context context) throws IOException { Configuration conf = context.getConfiguration(); Path clusterOutputPath = new Path(conf.get("clusterOutputPath")); //we want to the key to be the index, the value to be the cluster id reverseClusterMappings = ClusterCountReader.getClusterIDs(clusterOutputPath, conf, false); }
/**
 * Applies {@code assertTopLevelCluster} to every entry of the given map.
 *
 * @param postProcessedClusterDirectories map entries to check, one per cluster.
 */
private void assertPostProcessedOutput(Map<String,Path> postProcessedClusterDirectories) {
  for (Entry<String,Path> clusterEntry : postProcessedClusterDirectories.entrySet()) {
    assertTopLevelCluster(clusterEntry);
  }
}
/**
 * Runs the parent set-up and then initialises {@code fs} with the file system obtained for the
 * test configuration.
 */
@Override
@Before
public void setUp() throws Exception {
  super.setUp();
  fs = FileSystem.get(getConfiguration());
}
/**
 * Runs the parent set-up and then initialises {@code fs} with the file system obtained for the
 * test configuration.
 */
@Override
@Before
public void setUp() throws Exception {
  super.setUp();
  fs = FileSystem.get(getConfiguration());
}
/**
 * Asserts that every vector read from {@code clusterPath} has a format string equal to one of
 * the three expected points.
 *
 * @param clusterPath the cluster location passed to {@code getVectorsInCluster}.
 * @throws IOException propagated from {@code getVectorsInCluster}.
 */
private void assertPointsInFirstTopLevelCluster(Path clusterPath) throws IOException {
  // Format strings of the only points allowed in this cluster.
  String[] expectedFormats = {"{0:1.0,1:1.0}", "{0:2.0,1:1.0}", "{0:1.0,1:2.0}"};
  for (Vector member : getVectorsInCluster(clusterPath)) {
    Assert.assertTrue(ArrayUtils.contains(expectedFormats, member.asFormatString()));
  }
}
/**
 * Post processes the output of a clustering algorithm and groups the vectors into their
 * respective clusters. Each cluster's vectors are written into a directory named after its
 * clusterId.
 *
 * @param input         The output path that was given to the clustering algorithm and is now to be
 *                      post processed. Hint: the directory containing clusters-*-final and
 *                      clusteredPoints.
 * @param output        The path under which the post processed data is stored.
 * @param runSequential If {@code true}, the post processing runs sequentially; otherwise it runs
 *                      as MapReduce. Hint: use the same mode the clustering itself was run in.
 */
public static void run(Path input, Path output, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (runSequential) {
    postProcessSeq(input, output);
    return;
  }

  // MapReduce path: run the job, then move the produced part files into their directories.
  Configuration conf = new Configuration();
  postProcessMR(conf, input, output);
  movePartFilesToRespectiveDirectories(conf, output);
}
/**
 * Runs the post processing sequentially by delegating to a {@link ClusterOutputPostProcessor}
 * built with a fresh {@link Configuration}. Per the processor's contract, vectors are read one by
 * one and placed into a directory named after their clusterId.
 *
 * @param input  The output path that was given to the clustering algorithm and is now to be post
 *               processed. Hint: the directory containing clusters-*-final and clusteredPoints.
 * @param output The path under which the post processed data is stored.
 * @throws IOException propagated from {@code ClusterOutputPostProcessor.process()}.
 */
private static void postProcessSeq(Path input, Path output) throws IOException {
  Configuration sequentialConf = new Configuration();
  ClusterOutputPostProcessor sequentialProcessor =
      new ClusterOutputPostProcessor(input, output, sequentialConf);
  sequentialProcessor.process();
}
/**
 * Writes a point into the cluster identified by {@code clusterId} and records that cluster's
 * directory in {@code postProcessedClusterDirectories}.
 *
 * @param clusterId the id of the cluster the point belongs to.
 * @param point     the vector to write.
 * @throws IOException propagated from the writer lookup or from the write itself.
 */
private void putVectorInRespectiveCluster(String clusterId, WeightedVectorWritable point)
    throws IOException {
  // Step 1: obtain the writer for this cluster.
  Writer clusterWriter = findWriterForVector(clusterId);

  // Step 2: remember where this cluster's data lives.
  postProcessedClusterDirectories.put(
      clusterId,
      PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId));

  // Step 3: write the point through the cluster's writer.
  writeVectorToCluster(clusterWriter, point);
}
/**
 * Command line entry point: hands the arguments to {@code ToolRunner.run} together with a new
 * {@link Configuration} and a new {@link ClusterOutputPostProcessorDriver}.
 *
 * @param args the command line arguments, passed through unchanged.
 */
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  ClusterOutputPostProcessorDriver driver = new ClusterOutputPostProcessorDriver();
  // The value returned by ToolRunner.run is intentionally not used here.
  ToolRunner.run(conf, driver, args);
}
@Override public void setup(Context context) throws IOException { Configuration conf = context.getConfiguration(); Path clusterOutputPath = new Path(conf.get("clusterOutputPath")); //we want to the key to be the index, the value to be the cluster id reverseClusterMappings = ClusterCountReader.getClusterIDs(clusterOutputPath, conf, false); }
/**
 * Post processes the output of a clustering algorithm and groups the vectors into their
 * respective clusters. Each cluster's vectors are written into a directory named after its
 * clusterId.
 *
 * @param input         The output path that was given to the clustering algorithm and is now to be
 *                      post processed. Hint: the directory containing clusters-*-final and
 *                      clusteredPoints.
 * @param output        The path under which the post processed data is stored.
 * @param runSequential If {@code true}, the post processing runs sequentially; otherwise it runs
 *                      as MapReduce. Hint: use the same mode the clustering itself was run in.
 */
public static void run(Path input, Path output, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (runSequential) {
    postProcessSeq(input, output);
    return;
  }

  // MapReduce path: run the job, then move the produced part files into their directories.
  Configuration conf = new Configuration();
  postProcessMR(conf, input, output);
  movePartFilesToRespectiveDirectories(conf, output);
}
/**
 * Runs the post processing sequentially by delegating to a {@link ClusterOutputPostProcessor}
 * built with a fresh {@link Configuration}. Per the processor's contract, vectors are read one by
 * one and placed into a directory named after their clusterId.
 *
 * @param input  The output path that was given to the clustering algorithm and is now to be post
 *               processed. Hint: the directory containing clusters-*-final and clusteredPoints.
 * @param output The path under which the post processed data is stored.
 * @throws IOException propagated from {@code ClusterOutputPostProcessor.process()}.
 */
private static void postProcessSeq(Path input, Path output) throws IOException {
  Configuration sequentialConf = new Configuration();
  ClusterOutputPostProcessor sequentialProcessor =
      new ClusterOutputPostProcessor(input, output, sequentialConf);
  sequentialProcessor.process();
}
/**
 * Writes a point into the cluster identified by {@code clusterId} and records that cluster's
 * directory in {@code postProcessedClusterDirectories}.
 *
 * @param clusterId the id of the cluster the point belongs to.
 * @param point     the vector to write.
 * @throws IOException propagated from the writer lookup or from the write itself.
 */
private void putVectorInRespectiveCluster(String clusterId, WeightedVectorWritable point)
    throws IOException {
  // Step 1: obtain the writer for this cluster.
  Writer clusterWriter = findWriterForVector(clusterId);

  // Step 2: remember where this cluster's data lives.
  postProcessedClusterDirectories.put(
      clusterId,
      PathDirectory.getClusterPathForClusterId(clusterPostProcessorOutput, clusterId));

  // Step 3: write the point through the cluster's writer.
  writeVectorToCluster(clusterWriter, point);
}
/**
 * Command line entry point: hands the arguments to {@code ToolRunner.run} together with a new
 * {@link Configuration} and a new {@link ClusterOutputPostProcessorDriver}.
 *
 * @param args the command line arguments, passed through unchanged.
 */
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  ClusterOutputPostProcessorDriver driver = new ClusterOutputPostProcessorDriver();
  // The value returned by ToolRunner.run is intentionally not used here.
  ToolRunner.run(conf, driver, args);
}