/**
 * Build a {@link CanopyClusterer} from the settings stored in a Hadoop configuration.
 *
 * <p>The values under {@code T1_KEY} and {@code T2_KEY} are parsed as doubles, and the class
 * named under {@code DISTANCE_MEASURE_KEY} is instantiated and configured with the same
 * configuration. The values under {@code T3_KEY} and {@code T4_KEY} are applied only when
 * they are present.
 *
 * @param configuration Hadoop configuration holding the canopy settings
 *
 * @return the configured CanopyClusterer
 */
public static CanopyClusterer configureCanopyClusterer(Configuration configuration) {
  double t1Value = Double.parseDouble(configuration.get(T1_KEY));
  double t2Value = Double.parseDouble(configuration.get(T2_KEY));

  String measureClassName = configuration.get(DISTANCE_MEASURE_KEY);
  DistanceMeasure distanceMeasure = ClassUtils.instantiateAs(measureClassName, DistanceMeasure.class);
  distanceMeasure.configure(configuration);

  CanopyClusterer clusterer = new CanopyClusterer(distanceMeasure, t1Value, t2Value);

  // T3 is optional: leave the clusterer untouched when the key is absent.
  String t3Setting = configuration.get(T3_KEY);
  if (t3Setting != null) {
    clusterer.setT3(Double.parseDouble(t3Setting));
  }

  // T4 is optional as well.
  String t4Setting = configuration.get(T4_KEY);
  if (t4Setting != null) {
    clusterer.setT4(Double.parseDouble(t4Setting));
  }

  return clusterer;
}
/**
 * Hands the vector carried by {@code point} to the clusterer so it can be added to this
 * mapper's canopies. The key is not used and nothing is written to the context here.
 */
@Override
protected void map(WritableComparable<?> key, VectorWritable point, Context context)
    throws IOException, InterruptedException {
  this.canopyClusterer.addPointToCanopies(point.get(), this.canopies);
}
/**
 * Prepares this task: creates the clusterer from the job configuration, calls
 * {@code useT3T4()} on it, and parses the cluster filter stored under
 * {@code CanopyConfigKeys.CF_KEY}.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);

  this.canopyClusterer = new CanopyClusterer(context.getConfiguration());
  this.canopyClusterer.useT3T4();

  String filterSetting = context.getConfiguration().get(CanopyConfigKeys.CF_KEY);
  this.clusterFilter = Integer.parseInt(filterSetting);
}
DistanceMeasure measure, double t1, double t2, int clusterFilter) throws IOException { CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2); Collection<Canopy> canopies = Lists.newArrayList(); Configuration conf = new Configuration(); clusterer.addPointToCanopies(vw.get(), canopies);
/**
 * Prepares this task: obtains the clusterer from
 * {@code CanopyConfigKeys.configureCanopyClusterer}, calls {@code useT3T4()} on it, and
 * parses the cluster filter stored under {@code CanopyConfigKeys.CF_KEY}.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);

  this.canopyClusterer = CanopyConfigKeys.configureCanopyClusterer(context.getConfiguration());
  this.canopyClusterer.useT3T4();

  String filterSetting = context.getConfiguration().get(CanopyConfigKeys.CF_KEY);
  this.clusterFilter = Integer.parseInt(filterSetting);
}
/**
 * Prepares this task: creates the clusterer from the job configuration and parses the
 * cluster filter stored under {@code CanopyConfigKeys.CF_KEY}.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);

  this.canopyClusterer = new CanopyClusterer(context.getConfiguration());

  String filterSetting = context.getConfiguration().get(CanopyConfigKeys.CF_KEY);
  this.clusterFilter = Integer.parseInt(filterSetting);
}
/**
 * Reads every vector from {@code reuters/tfidf-vectors/part-r-00000}, prints how many were
 * read, builds canopies over them with a cosine distance measure (thresholds 0.8 and 0.7),
 * prints the number of canopies, and wraps each canopy centre in a {@code Cluster}.
 *
 * @param args command-line arguments (not used)
 * @throws Exception if the file system or the sequence file cannot be read
 */
public static void main(String[] args) throws Exception {
  String inputDir = "reuters";

  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  String vectorsFolder = inputDir + "/tfidf-vectors";

  List<Vector> points = new ArrayList<Vector>();
  SequenceFile.Reader reader = new SequenceFile.Reader(fs,
      new Path(vectorsFolder + "/part-r-00000"), conf);
  try {
    Text key = new Text();
    VectorWritable value = new VectorWritable();
    while (reader.next(key, value)) {
      points.add(value.get());
    }
    System.out.println(points.size());
  } finally {
    // Release the reader even when reading fails part-way through.
    reader.close();
  }

  List<Canopy> canopies = CanopyClusterer.createCanopies(points,
      new CosineDistanceMeasure(), 0.8, 0.7);
  System.out.println(canopies.size());

  // One Cluster per canopy; the list is not used further within this method.
  List<Cluster> clusters = new ArrayList<Cluster>();
  for (Canopy canopy : canopies) {
    clusters.add(new Cluster(canopy.getCenter(), canopy.getId(),
        new CosineDistanceMeasure()));
  }
}
} // closes the enclosing class, whose header lies outside this excerpt
/**
 * Prepares the fixture: obtains the file system for the test configuration and computes
 * reference canopies and their centres for the Manhattan and Euclidean distance measures,
 * both with thresholds 3.1 and 2.1.
 */
@Override
@Before
public void setUp() throws Exception {
  super.setUp();
  this.fs = FileSystem.get(getConfiguration());

  // Manhattan reference; getPoints() is called afresh for each measure.
  this.referenceManhattan =
      CanopyClusterer.createCanopies(getPoints(), this.manhattanDistanceMeasure, 3.1, 2.1);
  this.manhattanCentroids = CanopyClusterer.getCenters(this.referenceManhattan);

  // Euclidean reference.
  this.referenceEuclidean =
      CanopyClusterer.createCanopies(getPoints(), this.euclideanDistanceMeasure, 3.1, 2.1);
  this.euclideanCentroids = CanopyClusterer.getCenters(this.referenceEuclidean);
}
/**
 * Story: a user may supply T3 and T4 values, and the reducer then uses them as its T1 and
 * T2 thresholds.
 */
@Test
public void testCanopyReducerT3T4Configuration() throws Exception {
  CanopyReducer canopyReducer = new CanopyReducer();

  Configuration configuration = getConfiguration();
  configuration.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  configuration.set(CanopyConfigKeys.T1_KEY, Double.toString(3.1));
  configuration.set(CanopyConfigKeys.T2_KEY, Double.toString(2.1));
  configuration.set(CanopyConfigKeys.T3_KEY, Double.toString(1.1));
  configuration.set(CanopyConfigKeys.T4_KEY, Double.toString(0.1));
  configuration.set(CanopyConfigKeys.CF_KEY, "0");

  DummyRecordWriter<Text, ClusterWritable> recordWriter =
      new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context reducerContext =
      DummyRecordWriter.build(canopyReducer, configuration, recordWriter,
          Text.class, VectorWritable.class);

  canopyReducer.setup(reducerContext);

  // After setup the reducer's T1/T2 must equal the configured T3/T4.
  assertEquals(1.1, canopyReducer.getCanopyClusterer().getT1(), EPSILON);
  assertEquals(0.1, canopyReducer.getCanopyClusterer().getT2(), EPSILON);
}
/**
 * Builds a clusterer whose settings come from the given configuration, by delegating to
 * {@code configure(config)}.
 *
 * @param config configuration passed on to {@code configure}
 */
public CanopyClusterer(Configuration config) {
  configure(config);
}
DistanceMeasure measure, double t1, double t2, int clusterFilter) throws IOException { CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2); Collection<Canopy> canopies = Lists.newArrayList(); Configuration conf = new Configuration(); clusterer.addPointToCanopies(vw.get(), canopies);
/**
 * Prepares this task: obtains the clusterer from
 * {@code CanopyConfigKeys.configureCanopyClusterer}, calls {@code useT3T4()} on it, and
 * parses the cluster filter stored under {@code CanopyConfigKeys.CF_KEY}.
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);

  this.canopyClusterer = CanopyConfigKeys.configureCanopyClusterer(context.getConfiguration());
  this.canopyClusterer.useT3T4();

  String filterSetting = context.getConfiguration().get(CanopyConfigKeys.CF_KEY);
  this.clusterFilter = Integer.parseInt(filterSetting);
}
/**
 * Generates three batches of random sample vectors, builds canopies over them with a
 * Euclidean distance measure (thresholds 3.0 and 1.5), and prints the id and centre of
 * every canopy.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
  List<Vector> samples = new ArrayList<Vector>();
  RandomPointsUtil.generateSamples(samples, 400, 1, 1, 2);
  RandomPointsUtil.generateSamples(samples, 300, 1, 0, 0.5);
  RandomPointsUtil.generateSamples(samples, 300, 0, 2, 0.1);

  List<Canopy> found = CanopyClusterer.createCanopies(samples,
      new EuclideanDistanceMeasure(), 3.0, 1.5);

  for (Canopy each : found) {
    String line = "Canopy id: " + each.getId() + " center: "
        + each.getCenter().asFormatString();
    System.out.println(line);
  }
}
} // closes the enclosing class, whose header lies outside this excerpt
DistanceMeasure measure, double t1, double t2, int clusterFilter) throws IOException { CanopyClusterer clusterer = new CanopyClusterer(measure, t1, t2); Collection<Canopy> canopies = Lists.newArrayList(); Configuration conf = new Configuration(); clusterer.addPointToCanopies(vw.get(), canopies);
/**
 * Build a {@link CanopyClusterer} from the settings stored in a Hadoop configuration.
 *
 * <p>The values under {@code T1_KEY} and {@code T2_KEY} are parsed as doubles, and the class
 * named under {@code DISTANCE_MEASURE_KEY} is instantiated and configured with the same
 * configuration. The values under {@code T3_KEY} and {@code T4_KEY} are applied only when
 * they are present.
 *
 * @param configuration Hadoop configuration holding the canopy settings
 *
 * @return the configured CanopyClusterer
 */
public static CanopyClusterer configureCanopyClusterer(Configuration configuration) {
  double t1Value = Double.parseDouble(configuration.get(T1_KEY));
  double t2Value = Double.parseDouble(configuration.get(T2_KEY));

  String measureClassName = configuration.get(DISTANCE_MEASURE_KEY);
  DistanceMeasure distanceMeasure = ClassUtils.instantiateAs(measureClassName, DistanceMeasure.class);
  distanceMeasure.configure(configuration);

  CanopyClusterer clusterer = new CanopyClusterer(distanceMeasure, t1Value, t2Value);

  // T3 is optional: leave the clusterer untouched when the key is absent.
  String t3Setting = configuration.get(T3_KEY);
  if (t3Setting != null) {
    clusterer.setT3(Double.parseDouble(t3Setting));
  }

  // T4 is optional as well.
  String t4Setting = configuration.get(T4_KEY);
  if (t4Setting != null) {
    clusterer.setT4(Double.parseDouble(t4Setting));
  }

  return clusterer;
}
/**
 * Hands the vector carried by {@code point} to the clusterer so it can be added to this
 * mapper's canopies. The key is not used and nothing is written to the context here.
 */
@Override
protected void map(WritableComparable<?> key, VectorWritable point, Context context)
    throws IOException, InterruptedException {
  this.canopyClusterer.addPointToCanopies(point.get(), this.canopies);
}
reader.close(); List<Canopy> canopies = CanopyClusterer.createCanopies(points, new CosineDistanceMeasure(), 0.7, 0.5); List<Cluster> clusters = new ArrayList<Cluster>(); System.out.println(canopies.size());
/**
 * Hands the vector carried by {@code point} to the clusterer so it can be added to this
 * mapper's canopies. The key is not used and nothing is written to the context here.
 */
@Override
protected void map(WritableComparable<?> key, VectorWritable point, Context context)
    throws IOException, InterruptedException {
  this.canopyClusterer.addPointToCanopies(point.get(), this.canopies);
}
/**
 * Adds every incoming vector to this reducer's canopies, then recomputes each canopy's
 * parameters and writes those whose observation count exceeds {@code clusterFilter}.
 * Each written canopy is wrapped in a {@code ClusterWritable} and keyed by its identifier.
 * The incoming key is not used.
 */
@Override
protected void reduce(Text arg0, Iterable<VectorWritable> values, Context context)
    throws IOException, InterruptedException {
  for (VectorWritable incoming : values) {
    canopyClusterer.addPointToCanopies(incoming.get(), canopies);
  }

  for (Canopy candidate : canopies) {
    candidate.computeParameters();

    boolean passesFilter = candidate.getNumObservations() > clusterFilter;
    if (!passesFilter) {
      continue;
    }

    ClusterWritable wrapped = new ClusterWritable();
    wrapped.setValue(candidate);
    context.write(new Text(candidate.getIdentifier()), wrapped);
  }
}
/**
 * Adds every incoming vector to this reducer's canopies, then recomputes each canopy's
 * parameters and writes those whose observation count exceeds {@code clusterFilter}.
 * Each written canopy is wrapped in a {@code ClusterWritable} and keyed by its identifier.
 * The incoming key is not used.
 */
@Override
protected void reduce(Text arg0, Iterable<VectorWritable> values, Context context)
    throws IOException, InterruptedException {
  for (VectorWritable incoming : values) {
    canopyClusterer.addPointToCanopies(incoming.get(), canopies);
  }

  for (Canopy candidate : canopies) {
    candidate.computeParameters();

    boolean passesFilter = candidate.getNumObservations() > clusterFilter;
    if (!passesFilter) {
      continue;
    }

    ClusterWritable wrapped = new ClusterWritable();
    wrapped.setValue(candidate);
    context.write(new Text(candidate.getIdentifier()), wrapped);
  }
}