// Fragment (review): sequential cluster-iteration snippet — loads the prior classifier,
// classifies one vector, trains on the selected cluster(s), then persists the posterior
// models and policy before advancing the iteration counter.
// NOTE(review): `index`, `vw` and `outPath` are defined outside this view — confirm in
// the enclosing method. As written, `clustersOut` is still null when writeToSeqFiles is
// called; presumably it is assigned earlier in the real loop body — verify in context.
ClusterClassifier classifier = new ClusterClassifier(); classifier.readFromSeqFiles(conf, priorPath); Path clustersOut = null; int iteration = 1; Vector vector = vw.get(); Vector probabilities = classifier.classify(vector); Vector weights = classifier.getPolicy().select(probabilities); classifier.train(index, vector, weights.get(index)); classifier.close(); classifier.getPolicy().update(classifier); classifier.writeToSeqFiles(clustersOut); FileSystem fs = FileSystem.get(outPath.toUri(), conf); iteration++;
/**
 * Finalizes the posterior classifier by folding each model's accumulated
 * observations into its cluster parameters.
 */
@Override
public void close(ClusterClassifier posterior) {
  for (Cluster model : posterior.getModels()) {
    model.computeParameters();
  }
}
/**
 * Classifies the input points against the given canopies.
 * Persists a canopy clustering policy alongside the canopies, then delegates
 * the actual point-to-cluster assignment to the classification driver.
 */
private static void clusterData(Configuration conf, Path points, Path canopies, Path output,
    double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  ClusterClassifier.writePolicy(new CanopyClusteringPolicy(), canopies);
  Path clusteredPoints = new Path(output, PathDirectory.CLUSTERED_POINTS_DIRECTORY);
  ClusterClassificationDriver.run(conf, points, output, clusteredPoints,
      clusterClassificationThreshold, true, runSequential);
}
/**
 * Loads the prior classifier from the path recorded in the job configuration
 * and initializes the clustering policy from it before mapping begins.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  Path priorPath = new Path(conf.get(ClusterIterator.PRIOR_PATH_KEY));
  classifier = new ClusterClassifier();
  classifier.readFromSeqFiles(conf, priorPath);
  policy = classifier.getPolicy();
  policy.update(classifier);
  super.setup(context);
}
/**
 * Sequential (non-MapReduce) classification: rebuilds a classifier from the
 * final clusters plus the persisted policy, then assigns each input point.
 */
private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
    Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
  List<Cluster> models = populateClusterModels(clusters, conf);
  Path policyPath = finalClustersPath(conf, clusters);
  ClusterClassifier classifier = new ClusterClassifier(models, ClusterClassifier.readPolicy(policyPath));
  selectCluster(input, models, classifier, output, clusterClassificationThreshold, emitMostLikely);
}
@Override protected void reduce(IntWritable key, Iterable<ClusterWritable> values, Context context) throws IOException, InterruptedException { Iterator<ClusterWritable> iter = values.iterator(); Cluster first = iter.next().getValue(); // there must always be at least one while (iter.hasNext()) { Cluster cluster = iter.next().getValue(); first.observe(cluster); } List<Cluster> models = Lists.newArrayList(); models.add(first); classifier = new ClusterClassifier(models, policy); classifier.close(); context.write(key, new ClusterWritable(first)); }
// Fragment (review): test snippet that writes a prior classifier plus a K-Means
// policy to a sequence-file path, then reads back and prints a posterior
// classifier per iteration, asserting both hold exactly 3 models.
// NOTE(review): both `for` loops are opened but not closed in this view, and
// `i`, `conf`, `outPath` come from the enclosing (unseen) loop/test — confirm
// the closing braces and loop bounds in the full source.
Path path = new Path(priorPath, "priorClassifier"); ClusterClassifier prior = newKlusterClassifier(); prior.writeToSeqFiles(path); ClusteringPolicy policy = new KMeansClusteringPolicy(); ClusterClassifier.writePolicy(policy, path); assertEquals(3, prior.getModels().size()); System.out.println("Prior"); for (Cluster cluster : prior.getModels()) { System.out.println(cluster.asFormatString(null)); ClusterClassifier posterior = new ClusterClassifier(); String name = i == 4 ? "clusters-4-final" : "clusters-" + i; posterior.readFromSeqFiles(conf, new Path(outPath, name)); assertEquals(3, posterior.getModels().size()); for (Cluster cluster : posterior.getModels()) { System.out.println(cluster.asFormatString(null));
// Fragment (review): builds a prior classifier from pre-existing clusters and a
// policy, then persists it so a subsequent iteration/job can read it back.
// NOTE(review): `clusters`, `policy` and `priorClustersPath` are defined outside
// this view — confirm in the enclosing method.
ClusterClassifier prior = new ClusterClassifier(clusters, policy); prior.writeToSeqFiles(priorClustersPath);
/**
 * Round-trips a classifier through sequence files in a temp directory,
 * exercising serialization and deserialization, and returns the restored copy.
 */
private ClusterClassifier writeAndRead(ClusterClassifier classifier) throws IOException {
  Path output = new Path(getTestTempDirPath(), "output");
  classifier.writeToSeqFiles(output);
  ClusterClassifier restored = new ClusterClassifier();
  restored.readFromSeqFiles(getConfiguration(), output);
  return restored;
}
/** * Mapper which classifies the vectors to respective clusters. */ @Override protected void map(WritableComparable<?> key, VectorWritable vw, Context context) throws IOException, InterruptedException { if (!clusterModels.isEmpty()) { // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point // belongs to which cluster - fix for MAHOUT-1410 Class<? extends Vector> vectorClass = vw.get().getClass(); Vector vector = vw.get(); if (!vectorClass.equals(NamedVector.class)) { if (key.getClass().equals(Text.class)) { vector = new NamedVector(vector, key.toString()); } else if (key.getClass().equals(IntWritable.class)) { vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); } } Vector pdfPerCluster = clusterClassifier.classify(vector); if (shouldClassify(pdfPerCluster)) { if (emitMostLikely) { int maxValueIndex = pdfPerCluster.maxValueIndex(); write(new VectorWritable(vector), context, maxValueIndex, 1.0); } else { writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster); } } } }
/**
 * Classifies one input vector and trains every cluster the policy selects,
 * weighted by the policy's selection value for that cluster.
 */
@Override
protected void map(WritableComparable<?> key, VectorWritable value, Context context)
    throws IOException, InterruptedException {
  Vector input = value.get();
  Vector weights = policy.select(classifier.classify(input));
  for (Element weight : weights.nonZeroes()) {
    classifier.train(weight.index(), input, weight.get());
  }
}
// Fragment (review): iteration bookkeeping around a failure path — reads the
// policy from the prior path and signals a failed iteration.
// NOTE(review): as written, everything after the unconditional `throw` is
// unreachable (and would not compile in a straight-line block); presumably the
// throw sits inside an if/catch in the full method — confirm in context. Also
// `clustersOut` is still null when writePolicy is called here.
ClusteringPolicy policy = ClusterClassifier.readPolicy(priorPath); Path clustersOut = null; int iteration = 1; throw new InterruptedException("Cluster Iteration " + iteration + " failed processing " + priorPath); ClusterClassifier.writePolicy(policy, clustersOut); FileSystem fs = FileSystem.get(outPath.toUri(), conf); iteration++;
/**
 * Builds a fuzzy-k-means classifier over three 2-D soft clusters, centered at
 * (1,1), (0,0) and (-1,-1) under the Manhattan distance.
 */
private static ClusterClassifier newSoftClusterClassifier() {
  DistanceMeasure measure = new ManhattanDistanceMeasure();
  List<Cluster> models = Lists.newArrayList();
  double[] centers = {1, 0, -1};
  for (int id = 0; id < centers.length; id++) {
    models.add(new SoftCluster(new DenseVector(2).assign(centers[id]), id, measure));
  }
  return new ClusterClassifier(models, new FuzzyKMeansClusteringPolicy());
}
/**
 * Populates this classifier's models and policy from the sequence files under
 * {@code path}, configuring each deserialized cluster with {@code conf}.
 *
 * @param conf configuration used to configure each cluster after deserialization
 * @param path directory holding the serialized clusters and policy
 * @throws IOException if the path cannot be read or contains no clusters
 */
public void readFromSeqFiles(Configuration conf, Path path) throws IOException {
  // NOTE(review): a fresh Configuration (not the caller's conf) is used to read
  // the sequence files — preserved as-is; confirm whether this is intentional.
  Configuration config = new Configuration();
  List<Cluster> clusters = Lists.newArrayList();
  for (ClusterWritable cw : new SequenceFileDirValueIterable<ClusterWritable>(path, PathType.LIST,
      PathFilters.logsCRCFilter(), config)) {
    Cluster cluster = cw.getValue();
    cluster.configure(conf);
    clusters.add(cluster);
  }
  // Fail fast with a clear message instead of an IndexOutOfBoundsException
  // from models.get(0) when the directory holds no clusters.
  if (clusters.isEmpty()) {
    throw new IOException("No clusters found in " + path);
  }
  this.models = clusters;
  modelClass = models.get(0).getClass().getName();
  this.policy = readPolicy(path);
}
// Fragment (review): test snippet that persists a prior classifier, then reads
// back and prints a posterior classifier per iteration, asserting both hold
// exactly 3 models. Unlike the variant that also writes a KMeans policy, this
// one writes only the classifier.
// NOTE(review): both `for` loops are opened but not closed in this view, and
// `i`, `conf`, `outPath` come from the enclosing (unseen) loop/test — confirm
// the closing braces and loop bounds in the full source.
Path path = new Path(priorPath, "priorClassifier"); ClusterClassifier prior = newKlusterClassifier(); prior.writeToSeqFiles(path); assertEquals(3, prior.getModels().size()); System.out.println("Prior"); for (Cluster cluster : prior.getModels()) { System.out.println(cluster.asFormatString(null)); ClusterClassifier posterior = new ClusterClassifier(); String name = i == 4 ? "clusters-4-final" : "clusters-" + i; posterior.readFromSeqFiles(conf, new Path(outPath, name)); assertEquals(3, posterior.getModels().size()); for (Cluster cluster : posterior.getModels()) { System.out.println(cluster.asFormatString(null));
/**
 * Reads the prior clusters recorded under {@code ClusterIterator.PRIOR_PATH_KEY}
 * into a fresh classifier and primes the clustering policy from it.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  Configuration configuration = context.getConfiguration();
  String prior = configuration.get(ClusterIterator.PRIOR_PATH_KEY);
  ClusterClassifier loaded = new ClusterClassifier();
  loaded.readFromSeqFiles(configuration, new Path(prior));
  classifier = loaded;
  policy = loaded.getPolicy();
  policy.update(loaded);
  super.setup(context);
}
// Fragment (review): persists a prior classifier (built from existing clusters
// and a policy) so the next stage can read it back.
// NOTE(review): `clusters`, `policy` and `priorClustersPath` are defined outside
// this view — confirm in the enclosing method.
ClusterClassifier prior = new ClusterClassifier(clusters, policy); prior.writeToSeqFiles(priorClustersPath);
/**
 * Sequential classification entry point: loads the cluster models and the
 * policy persisted with the final clusters, then routes every input point to
 * its cluster(s).
 */
private static void classifyClusterSeq(Configuration conf, Path input, Path clusters, Path output,
    Double clusterClassificationThreshold, boolean emitMostLikely) throws IOException {
  List<Cluster> clusterModels = populateClusterModels(clusters, conf);
  ClusteringPolicy clusteringPolicy = ClusterClassifier.readPolicy(finalClustersPath(conf, clusters));
  ClusterClassifier seqClassifier = new ClusterClassifier(clusterModels, clusteringPolicy);
  selectCluster(input, clusterModels, seqClassifier, output,
      clusterClassificationThreshold, emitMostLikely);
}