/**
 * Reducer step: folds all partial clusters emitted for one cluster id into a
 * single combined cluster and writes it out under the same key.
 *
 * <p>The first value is used as the accumulator and every subsequent partial
 * cluster is merged into it via {@code observe}. The combined cluster is then
 * wrapped in a single-model {@code ClusterClassifier} which is closed
 * (finalizing its parameters) before the result is written.
 *
 * @param key     the cluster id
 * @param values  partial {@code ClusterWritable}s for this id; the reducer is
 *                only invoked with at least one value, hence the unguarded
 *                first {@code next()}
 * @param context MapReduce context the combined cluster is written to
 * @throws IOException          on write failure
 * @throws InterruptedException if the task is interrupted
 */
@Override protected void reduce(IntWritable key, Iterable<ClusterWritable> values, Context context) throws IOException, InterruptedException { Iterator<ClusterWritable> iter = values.iterator(); Cluster first = iter.next().getValue(); // there must always be at least one while (iter.hasNext()) { Cluster cluster = iter.next().getValue(); first.observe(cluster); } List<Cluster> models = Lists.newArrayList(); models.add(first); classifier = new ClusterClassifier(models, policy); classifier.close(); context.write(key, new ClusterWritable(first)); }
@Override protected void reduce(IntWritable key, Iterable<ClusterWritable> values, Context context) throws IOException, InterruptedException { Iterator<ClusterWritable> iter = values.iterator(); Cluster first = iter.next().getValue(); // there must always be at least one while (iter.hasNext()) { Cluster cluster = iter.next().getValue(); first.observe(cluster); } List<Cluster> models = Lists.newArrayList(); models.add(first); classifier = new ClusterClassifier(models, policy); classifier.close(); context.write(key, new ClusterWritable(first)); }
@Override protected void reduce(IntWritable key, Iterable<ClusterWritable> values, Context context) throws IOException, InterruptedException { Iterator<ClusterWritable> iter = values.iterator(); Cluster first = iter.next().getValue(); // there must always be at least one while (iter.hasNext()) { Cluster cluster = iter.next().getValue(); first.observe(cluster); } List<Cluster> models = new ArrayList<>(); models.add(first); classifier = new ClusterClassifier(models, policy); classifier.close(); context.write(key, new ClusterWritable(first)); }
/**
 * Loads this classifier's cluster models and classification policy from the
 * sequence files under {@code path}.
 *
 * @param conf the Hadoop configuration used to configure each loaded cluster
 * @param path directory containing the cluster sequence files and the policy
 * @throws IOException if no clusters are found under {@code path} or the
 *                     files cannot be read
 */
public void readFromSeqFiles(Configuration conf, Path path) throws IOException {
  // NOTE(review): the sequence files are read with a fresh default
  // Configuration rather than the caller-supplied conf (which is only used to
  // configure the clusters) — presumably deliberate; verify on non-default
  // filesystems.
  Configuration config = new Configuration();
  List<Cluster> clusters = Lists.newArrayList();
  for (ClusterWritable cw : new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST, PathFilters.logsCRCFilter(), config)) {
    Cluster cluster = cw.getValue();
    cluster.configure(conf);
    clusters.add(cluster);
  }
  // Fail with a clear message instead of the IndexOutOfBoundsException the
  // unguarded models.get(0) below used to throw on an empty directory.
  if (clusters.isEmpty()) {
    throw new IOException("No clusters found in " + path);
  }
  this.models = clusters;
  modelClass = models.get(0).getClass().getName();
  this.policy = readPolicy(path);
}
/**
 * Loads this classifier's cluster models and classification policy from the
 * sequence files under {@code path}.
 *
 * <p>Note: the files are read with a fresh default {@code Configuration};
 * the caller-supplied {@code conf} is only used to configure each cluster —
 * presumably deliberate, but verify on non-default filesystems. Also note the
 * unguarded {@code models.get(0)} throws {@code IndexOutOfBoundsException}
 * if the directory contains no clusters.
 *
 * @param conf the Hadoop configuration used to configure each loaded cluster
 * @param path directory containing the cluster sequence files and the policy
 * @throws IOException if the files cannot be read
 */
public void readFromSeqFiles(Configuration conf, Path path) throws IOException { Configuration config = new Configuration(); List<Cluster> clusters = Lists.newArrayList(); for (ClusterWritable cw : new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST, PathFilters.logsCRCFilter(), config)) { Cluster cluster = cw.getValue(); cluster.configure(conf); clusters.add(cluster); } this.models = clusters; modelClass = models.get(0).getClass().getName(); this.policy = readPolicy(path); }
/**
 * Reads the cluster models and the classification policy for this classifier
 * from the sequence files found under {@code path}.
 */
public void readFromSeqFiles(Configuration conf, Path path) throws IOException {
  // The sequence files themselves are read with a default Configuration; the
  // caller-supplied conf is used only to configure each loaded cluster.
  Configuration config = new Configuration();
  List<Cluster> loaded = new ArrayList<>();
  SequenceFileDirValueIterable<ClusterWritable> iterable =
      new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST, PathFilters.logsCRCFilter(), config);
  for (ClusterWritable cw : iterable) {
    Cluster model = cw.getValue();
    model.configure(conf);
    loaded.add(model);
  }
  this.models = loaded;
  this.modelClass = models.get(0).getClass().getName();
  this.policy = readPolicy(path);
}
/**
 * Reads every cluster model from the first "final" clusters directory found
 * under {@code clusterOutputPath}.
 *
 * @param clusterOutputPath the clustering output path to scan
 * @param conf              the Hadoop configuration, also used to configure
 *                          each loaded cluster
 * @return the clusters read from the final-clusters directory
 * @throws IOException if no final-clusters directory exists under
 *                     {@code clusterOutputPath} or the files cannot be read
 */
public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
  List<Cluster> clusters = new ArrayList<>();
  FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
  FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
  // Fail with a clear message instead of the ArrayIndexOutOfBoundsException
  // the unguarded clusterFiles[0] used to throw when no final dir exists.
  if (clusterFiles.length == 0) {
    throw new IOException("No final clusters directory found under " + clusterOutputPath);
  }
  Iterator<?> it = new SequenceFileDirValueIterator<>(
      clusterFiles[0].getPath(), PathType.LIST, PathFilters.partFilter(), null, false, conf);
  while (it.hasNext()) {
    ClusterWritable next = (ClusterWritable) it.next();
    Cluster cluster = next.getValue();
    cluster.configure(conf);
    clusters.add(cluster);
  }
  return clusters;
}
/**
 * Reads every cluster model from the first "final" clusters directory found
 * under {@code clusterOutputPath}.
 *
 * @param clusterOutputPath the clustering output path to scan
 * @param conf              the Hadoop configuration, also used to configure
 *                          each loaded cluster
 * @return the clusters read from the final-clusters directory
 * @throws IOException if no final-clusters directory exists under
 *                     {@code clusterOutputPath} or the files cannot be read
 */
public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException {
  List<Cluster> clusters = Lists.newArrayList();
  FileSystem fileSystem = clusterOutputPath.getFileSystem(conf);
  FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter());
  // Fail with a clear message instead of the ArrayIndexOutOfBoundsException
  // the unguarded clusterFiles[0] used to throw when no final dir exists.
  if (clusterFiles.length == 0) {
    throw new IOException("No final clusters directory found under " + clusterOutputPath);
  }
  Iterator<?> it = new SequenceFileDirValueIterator<Writable>(
      clusterFiles[0].getPath(), PathType.LIST, PathFilters.partFilter(), null, false, conf);
  while (it.hasNext()) {
    ClusterWritable next = (ClusterWritable) it.next();
    Cluster cluster = next.getValue();
    cluster.configure(conf);
    clusters.add(cluster);
  }
  return clusters;
}
/**
 * Reads every cluster model from the first "final" clusters directory found
 * under {@code clusterOutputPath}.
 *
 * <p>Note: the unguarded {@code clusterFiles[0]} throws
 * {@code ArrayIndexOutOfBoundsException} when no final-clusters directory
 * exists under the output path.
 *
 * @param clusterOutputPath the clustering output path to scan
 * @param conf              the Hadoop configuration, also used to configure
 *                          each loaded cluster
 * @return the clusters read from the final-clusters directory
 * @throws IOException if the cluster files cannot be read
 */
public static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException { List<Cluster> clusters = Lists.newArrayList(); FileSystem fileSystem = clusterOutputPath.getFileSystem(conf); FileStatus[] clusterFiles = fileSystem.listStatus(clusterOutputPath, PathFilters.finalPartFilter()); Iterator<?> it = new SequenceFileDirValueIterator<Writable>( clusterFiles[0].getPath(), PathType.LIST, PathFilters.partFilter(), null, false, conf); while (it.hasNext()) { ClusterWritable next = (ClusterWritable) it.next(); Cluster cluster = next.getValue(); cluster.configure(conf); clusters.add(cluster); } return clusters; }
/**
 * Populates a list with the clusters present in the clusters-*-final
 * directory of a clustering run.
 *
 * <p>Resolves the final-clusters path via {@code finalClustersPath}, then
 * drains a {@code SequenceFileDirValueIterator} over its part files,
 * configuring each cluster with {@code conf} before collecting it.
 *
 * @param clusterOutputPath the output path of the clustering
 * @param conf              the Hadoop configuration, also used to configure
 *                          each loaded cluster
 * @return the list of clusters found by the clustering
 * @throws IOException if the cluster files cannot be read
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf) throws IOException { List<Cluster> clusterModels = Lists.newArrayList(); Path finalClustersPath = finalClustersPath(conf, clusterOutputPath); Iterator<?> it = new SequenceFileDirValueIterator<Writable>(finalClustersPath, PathType.LIST, PathFilters.partFilter(), null, false, conf); while (it.hasNext()) { ClusterWritable next = (ClusterWritable) it.next(); Cluster cluster = next.getValue(); cluster.configure(conf); clusterModels.add(cluster); } return clusterModels; }
/**
 * Populates a list with the clusters present in the clusters-*-final
 * directory of a clustering run.
 *
 * @param clusterOutputPath the output path of the clustering
 * @param conf              the Hadoop configuration, also used to configure
 *                          each loaded cluster
 * @return the list of clusters found by the clustering
 * @throws IOException if the cluster files cannot be read
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf)
    throws IOException {
  List<Cluster> result = new ArrayList<>();
  Path finalClusters = finalClustersPath(conf, clusterOutputPath);
  Iterator<?> values = new SequenceFileDirValueIterator<>(finalClusters, PathType.LIST,
      PathFilters.partFilter(), null, false, conf);
  while (values.hasNext()) {
    Cluster cluster = ((ClusterWritable) values.next()).getValue();
    cluster.configure(conf);
    result.add(cluster);
  }
  return result;
}
/**
 * Populates a list with the clusters present in the clusters-*-final
 * directory of a clustering run.
 *
 * @param clusterOutputPath the output path of the clustering
 * @param conf              the Hadoop configuration, also used to configure
 *                          each loaded cluster
 * @return the list of clusters found by the clustering
 * @throws IOException if the cluster files cannot be read
 */
private static List<Cluster> populateClusterModels(Path clusterOutputPath, Configuration conf)
    throws IOException {
  List<Cluster> loaded = Lists.newArrayList();
  Path finalDir = finalClustersPath(conf, clusterOutputPath);
  Iterator<?> it = new SequenceFileDirValueIterator<Writable>(finalDir, PathType.LIST,
      PathFilters.partFilter(), null, false, conf);
  while (it.hasNext()) {
    ClusterWritable cw = (ClusterWritable) it.next();
    Cluster model = cw.getValue();
    model.configure(conf);
    loaded.add(model);
  }
  return loaded;
}
/**
 * Return if all of the Clusters in the parts in the filePath have converged
 * or not.
 *
 * @param filePath the file path to the single file containing the clusters
 * @param conf     the Hadoop configuration
 * @param fs       the filesystem holding the part files
 * @return true if all Clusters are converged
 * @throws IOException if there was an IO error
 */
private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
  for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) {
    SequenceFileValueIterator<ClusterWritable> iterator = new SequenceFileValueIterator<ClusterWritable>(
        part.getPath(), true, conf);
    try {
      while (iterator.hasNext()) {
        ClusterWritable value = iterator.next();
        if (!value.getValue().isConverged()) {
          return false;
        }
      }
    } finally {
      // Always release the underlying reader; the original closed it only on
      // the early non-converged exit, leaking it when iteration threw.
      Closeables.close(iterator, true);
    }
  }
  return true;
} }
/**
 * Return if all of the Clusters in the parts in the filePath have converged
 * or not.
 *
 * @param filePath the file path to the single file containing the clusters
 * @param conf     the Hadoop configuration
 * @param fs       the filesystem holding the part files
 * @return true if all Clusters are converged
 * @throws IOException if there was an IO error
 */
private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
  for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) {
    SequenceFileValueIterator<ClusterWritable> iterator = new SequenceFileValueIterator<ClusterWritable>(
        part.getPath(), true, conf);
    try {
      while (iterator.hasNext()) {
        ClusterWritable value = iterator.next();
        if (!value.getValue().isConverged()) {
          return false;
        }
      }
    } finally {
      // Always release the underlying reader; the original closed it only on
      // the early non-converged exit, leaking it when iteration threw.
      Closeables.close(iterator, true);
    }
  }
  return true;
} }
/**
 * Return if all of the Clusters in the parts in the filePath have converged
 * or not.
 *
 * @param filePath the file path to the single file containing the clusters
 * @param conf     the Hadoop configuration
 * @param fs       the filesystem holding the part files
 * @return true if all Clusters are converged
 * @throws IOException if there was an IO error
 */
private static boolean isConverged(Path filePath, Configuration conf, FileSystem fs) throws IOException {
  for (FileStatus part : fs.listStatus(filePath, PathFilters.partFilter())) {
    SequenceFileValueIterator<ClusterWritable> iterator = new SequenceFileValueIterator<>(
        part.getPath(), true, conf);
    try {
      while (iterator.hasNext()) {
        ClusterWritable value = iterator.next();
        if (!value.getValue().isConverged()) {
          return false;
        }
      }
    } finally {
      // Always release the underlying reader; the original closed it only on
      // the early non-converged exit, leaking it when iteration threw.
      Closeables.close(iterator, true);
    }
  }
  return true;
} }
/**
 * Be sure that the seeded buildRandom works in the same way as
 * RandomSeedGenerator.buildRandom: with a fixed seed (1L) it must still
 * produce exactly 4 clusters with unique ids whose centers match the raw
 * input points.
 */
@Test public void testRandomSeedGeneratorSeeded() throws Exception { List<VectorWritable> points = getPoints(); Job job = new Job(); Configuration conf = job.getConfiguration(); job.setMapOutputValueClass(VectorWritable.class); Path input = getTestTempFilePath("random-input"); Path output = getTestTempDirPath("random-output"); ClusteringTestUtils.writePointsToFile(points, input, fs, conf); RandomSeedGenerator.buildRandom(conf, input, output, 4, new ManhattanDistanceMeasure(), 1L); int clusterCount = 0; Collection<Integer> set = Sets.newHashSet(); for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(new Path(output, "part-randomSeed"), true, conf)) { clusterCount++; Cluster cluster = clusterWritable.getValue(); int id = cluster.getId(); assertTrue(set.add(id)); // validate unique id's Vector v = cluster.getCenter(); assertVectorEquals(RAW[id], v); // validate values match } assertEquals(4, clusterCount); // validate sample count }
/** Story: test random seed generation generates 4 clusters with proper ids and data */ @Test public void testRandomSeedGenerator() throws Exception { List<VectorWritable> points = getPoints(); Job job = new Job(); Configuration conf = job.getConfiguration(); job.setMapOutputValueClass(VectorWritable.class); Path input = getTestTempFilePath("random-input"); Path output = getTestTempDirPath("random-output"); ClusteringTestUtils.writePointsToFile(points, input, fs, conf); RandomSeedGenerator.buildRandom(conf, input, output, 4, new ManhattanDistanceMeasure()); int clusterCount = 0; Collection<Integer> set = Sets.newHashSet(); for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(new Path(output, "part-randomSeed"), true, conf)) { clusterCount++; Cluster cluster = clusterWritable.getValue(); int id = cluster.getId(); assertTrue(set.add(id)); // Validate unique id's Vector v = cluster.getCenter(); assertVectorEquals(RAW[id], v); // Validate values match } assertEquals(4, clusterCount); // Validate sample count }
List<ClusterWritable> data = writer.getValue(key); ClusterWritable clusterWritable = data.get(0); Canopy canopy = (Canopy) clusterWritable.getValue(); assertEquals(euclideanCentroids.get(i).asFormatString() + " is not equal to " + canopy.computeCentroid().asFormatString(),
/**
 * Story: user can cluster points using sequential (non-MapReduce) execution.
 *
 * <p>Writes the test points to a sequence file, runs CanopyDriver in
 * sequential mode with the Manhattan distance measure (t1=3.1, t2=2.1,
 * clustering enabled), then verifies that the final cluster centers match
 * the expected Manhattan centroids and that every input point was clustered.
 */
@Test public void testClusteringManhattanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); // now run the Canopy Driver in sequential mode Path output = getTestTempDirPath("output"); CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, true); // verify output from sequence file Path path = new Path(output, "clusters-0-final/part-r-00000"); int ix = 0; for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(path, true, config)) { assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), clusterWritable.getValue() .getCenter()); ix++; } path = new Path(output, "clusteredPoints/part-m-0"); long count = HadoopUtil.countRecords(path, config); assertEquals("number of points", points.size(), count); }
@Test public void testEigenSeedGenerator() throws Exception { List<VectorWritable> points = getPoints(); Job job = new Job(); Configuration conf = job.getConfiguration(); job.setMapOutputValueClass(VectorWritable.class); Path input = getTestTempFilePath("eigen-input"); Path output = getTestTempDirPath("eigen-output"); ClusteringTestUtils.writePointsToFile(points, input, fs, conf); EigenSeedGenerator.buildFromEigens(conf, input, output, 3, new ManhattanDistanceMeasure()); int clusterCount = 0; Collection<Integer> set = new HashSet<Integer>(); Vector v[] = new Vector[3]; for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>( new Path(output, "part-eigenSeed"), true, conf)) { Cluster cluster = clusterWritable.getValue(); int id = cluster.getId(); assertTrue(set.add(id)); // validate unique id's v[id] = cluster.getCenter(); clusterCount++; } assertEquals(3, clusterCount); // validate sample count // validate pair-wise orthogonality assertEquals(0, v[0].dot(v[1]), 1E-10); assertEquals(0, v[1].dot(v[2]), 1E-10); assertEquals(0, v[0].dot(v[2]), 1E-10); }