@Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); String clustersIn = conf.get(ClusterClassificationConfigKeys.CLUSTERS_IN); threshold = conf.getFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD, 0.0f); emitMostLikely = conf.getBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, false); clusterModels = Lists.newArrayList(); if (clustersIn != null && !clustersIn.isEmpty()) { Path clustersInPath = new Path(clustersIn); clusterModels = populateClusterModels(clustersInPath, conf); ClusteringPolicy policy = ClusterClassifier .readPolicy(finalClustersPath(clustersInPath)); clusterClassifier = new ClusterClassifier(clusterModels, policy); } clusterId = new IntWritable(); }
/** * Mapper which classifies the vectors to respective clusters. */ @Override protected void map(WritableComparable<?> key, VectorWritable vw, Context context) throws IOException, InterruptedException { if (!clusterModels.isEmpty()) { // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point // belongs to which cluster - fix for MAHOUT-1410 Class<? extends Vector> vectorClass = vw.get().getClass(); Vector vector = vw.get(); if (!vectorClass.equals(NamedVector.class)) { if (key.getClass().equals(Text.class)) { vector = new NamedVector(vector, key.toString()); } else if (key.getClass().equals(IntWritable.class)) { vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); } } Vector pdfPerCluster = clusterClassifier.classify(vector); if (shouldClassify(pdfPerCluster)) { if (emitMostLikely) { int maxValueIndex = pdfPerCluster.maxValueIndex(); write(new VectorWritable(vector), context, maxValueIndex, 1.0); } else { writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster); } } } }
private void writeAllAboveThreshold(VectorWritable vw, Context context, Vector pdfPerCluster) throws IOException, InterruptedException { for (Element pdf : pdfPerCluster.nonZeroes()) { if (pdf.get() >= threshold) { int clusterIndex = pdf.index(); write(vw, context, clusterIndex, pdf.get()); } } }
/** * Mapper which classifies the vectors to respective clusters. */ @Override protected void map(WritableComparable<?> key, VectorWritable vw, Context context) throws IOException, InterruptedException { if (!clusterModels.isEmpty()) { // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point // belongs to which cluster - fix for MAHOUT-1410 Class<? extends Vector> vectorClass = vw.get().getClass(); Vector vector = vw.get(); if (!vectorClass.equals(NamedVector.class)) { if (key.getClass().equals(Text.class)) { vector = new NamedVector(vector, key.toString()); } else if (key.getClass().equals(IntWritable.class)) { vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); } } Vector pdfPerCluster = clusterClassifier.classify(vector); if (shouldClassify(pdfPerCluster)) { if (emitMostLikely) { int maxValueIndex = pdfPerCluster.maxValueIndex(); write(new VectorWritable(vector), context, maxValueIndex, 1.0); } else { writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster); } } } }
private void writeAllAboveThreshold(VectorWritable vw, Context context, Vector pdfPerCluster) throws IOException, InterruptedException { for (Element pdf : pdfPerCluster.nonZeroes()) { if (pdf.get() >= threshold) { int clusterIndex = pdf.index(); write(vw, context, clusterIndex, pdf.get()); } } }
/** * Mapper which classifies the vectors to respective clusters. */ @Override protected void map(WritableComparable<?> key, VectorWritable vw, Context context) throws IOException, InterruptedException { if (!clusterModels.isEmpty()) { // Converting to NamedVectors to preserve the vectorId else its not obvious as to which point // belongs to which cluster - fix for MAHOUT-1410 Class<? extends Vector> vectorClass = vw.get().getClass(); Vector vector = vw.get(); if (!vectorClass.equals(NamedVector.class)) { if (key.getClass().equals(Text.class)) { vector = new NamedVector(vector, key.toString()); } else if (key.getClass().equals(IntWritable.class)) { vector = new NamedVector(vector, Integer.toString(((IntWritable) key).get())); } } Vector pdfPerCluster = clusterClassifier.classify(vector); if (shouldClassify(pdfPerCluster)) { if (emitMostLikely) { int maxValueIndex = pdfPerCluster.maxValueIndex(); write(new VectorWritable(vector), context, maxValueIndex, 1.0); } else { writeAllAboveThreshold(new VectorWritable(vector), context, pdfPerCluster); } } } }
@Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); String clustersIn = conf.get(ClusterClassificationConfigKeys.CLUSTERS_IN); threshold = conf.getFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD, 0.0f); emitMostLikely = conf.getBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, false); clusterModels = Lists.newArrayList(); if (clustersIn != null && !clustersIn.isEmpty()) { Path clustersInPath = new Path(clustersIn); clusterModels = populateClusterModels(clustersInPath, conf); ClusteringPolicy policy = ClusterClassifier .readPolicy(finalClustersPath(clustersInPath)); clusterClassifier = new ClusterClassifier(clusterModels, policy); } clusterId = new IntWritable(); }
private void writeAllAboveThreshold(VectorWritable vw, Context context, Vector pdfPerCluster) throws IOException, InterruptedException { for (Element pdf : pdfPerCluster.nonZeroes()) { if (pdf.get() >= threshold) { int clusterIndex = pdf.index(); write(vw, context, clusterIndex, pdf.get()); } } }
@Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); String clustersIn = conf.get(ClusterClassificationConfigKeys.CLUSTERS_IN); threshold = conf.getFloat(ClusterClassificationConfigKeys.OUTLIER_REMOVAL_THRESHOLD, 0.0f); emitMostLikely = conf.getBoolean(ClusterClassificationConfigKeys.EMIT_MOST_LIKELY, false); clusterModels = new ArrayList<>(); if (clustersIn != null && !clustersIn.isEmpty()) { Path clustersInPath = new Path(clustersIn); clusterModels = populateClusterModels(clustersInPath, conf); ClusteringPolicy policy = ClusterClassifier .readPolicy(finalClustersPath(clustersInPath)); clusterClassifier = new ClusterClassifier(clusterModels, policy); } clusterId = new IntWritable(); }