  /**
   * Story: User can specify a clustering limit that prevents output of small
   * clusters
   */
  @Test
  public void testCanopyMapperClusterFilter() throws Exception {
    CanopyMapper mapper = new CanopyMapper();
    Configuration conf = getConfiguration();
    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
        manhattanDistanceMeasure.getClass().getName());
    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
    conf.set(CanopyConfigKeys.CF_KEY, "3");
    DummyRecordWriter<Text, VectorWritable> writer =
        new DummyRecordWriter<Text, VectorWritable>();
    Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context =
        DummyRecordWriter.build(mapper, conf, writer);
    mapper.setup(context);

    List<VectorWritable> points = getPointsWritable();
    // map the data
    for (VectorWritable point : points) {
      mapper.map(new Text(), point, context);
    }
    mapper.cleanup(context);
    assertEquals("Number of map results", 1, writer.getData().size());

    // now verify the output
    List<VectorWritable> data = writer.getValue(new Text("centroid"));
    assertEquals("Number of centroids", 2, data.size());
  }
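  // Note (best-effort reading, not from the original source comments): CF_KEY is
  // the cluster-filter setting. With it set to "3" above, the mapper is expected
  // to drop intermediate canopies that bind fewer than three points, which is
  // why only two centroid vectors survive in the single map result.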
  /**
   * Story: User can specify a cluster filter that limits the minimum size of
   * canopies produced by the reducer
   */
  @Test
  public void testCanopyReducerClusterFilter() throws Exception {
    CanopyReducer reducer = new CanopyReducer();
    Configuration conf = getConfiguration();
    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
        "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
    conf.set(CanopyConfigKeys.CF_KEY, "3");
    DummyRecordWriter<Text, ClusterWritable> writer =
        new DummyRecordWriter<Text, ClusterWritable>();
    Reducer<Text, VectorWritable, Text, ClusterWritable>.Context context =
        DummyRecordWriter.build(reducer, conf, writer, Text.class, VectorWritable.class);
    reducer.setup(context);

    List<VectorWritable> points = getPointsWritable();
    reducer.reduce(new Text("centroid"), points, context);
    Set<Text> keys = writer.getKeys();
    assertEquals("Number of centroids", 2, keys.size());
  }
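  // The two filter tests above set the same canopy configuration keys by hand.
  // A helper along the lines sketched below could centralize that setup; the
  // method name and signature are assumptions made here for illustration, not
  // part of the original test class, and it uses only the CanopyConfigKeys
  // constants the tests already rely on.
  private static void configureCanopyFilter(Configuration conf,
      String measureClassName, double t1, double t2, int clusterFilter) {
    // distance measure plus the T1/T2 canopy thresholds
    conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, measureClassName);
    conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(t1));
    conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(t2));
    // minimum number of bound points a canopy must have to be emitted
    conf.set(CanopyConfigKeys.CF_KEY, String.valueOf(clusterFilter));
  }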
  /** Story: User can cluster points using sequential execution */
  @Test
  public void testClusteringManhattanSeq() throws Exception {
    List<VectorWritable> points = getPointsWritable();
    Configuration config = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points,
        getTestTempFilePath("testdata/file1"), fs, config);
    // now run the Canopy Driver in sequential mode
    Path output = getTestTempDirPath("output");
    CanopyDriver.run(config, getTestTempDirPath("testdata"), output,
        manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, true);

    // verify output from sequence file
    Path path = new Path(output, "clusters-0-final/part-r-00000");
    int ix = 0;
    for (ClusterWritable clusterWritable :
        new SequenceFileValueIterable<ClusterWritable>(path, true, config)) {
      assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix),
          clusterWritable.getValue().getCenter());
      ix++;
    }

    path = new Path(output, "clusteredPoints/part-m-0");
    long count = HadoopUtil.countRecords(path, config);
    assertEquals("number of points", points.size(), count);
  }
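  // Reading note (a best-effort interpretation of the CanopyDriver.run(...) call
  // above, not taken from the original comments): after the configuration, the
  // input and output paths, and the distance measure, the remaining arguments are
  // t1 = 3.1, t2 = 2.1, runClustering = true, a cluster-classification (outlier)
  // threshold of 0.0, and runSequential = true; the final flag is what makes this
  // test exercise the sequential code path, while testClusteringManhattanMR below
  // passes false to run the same driver as a map/reduce job.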
  /**
   * Story: User can produce final point clustering using a Hadoop map/reduce
   * job and a ManhattanDistanceMeasure.
   */
  @Test
  public void testClusteringManhattanMR() throws Exception {
    List<VectorWritable> points = getPointsWritable();
    Configuration conf = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points, true,
        getTestTempFilePath("testdata/file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, true,
        getTestTempFilePath("testdata/file2"), fs, conf);
    // now run the Job
    Path output = getTestTempDirPath("output");
    CanopyDriver.run(conf, getTestTempDirPath("testdata"), output,
        manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, false);
    Path path = new Path(output, "clusteredPoints/part-m-00000");
    long count = HadoopUtil.countRecords(path, conf);
    assertEquals("number of points", points.size(), count);
  }
  /**
   * Story: User can produce final point clustering using a Hadoop map/reduce
   * job and an EuclideanDistanceMeasure.
   */
  @Test
  public void testClusteringEuclideanMR() throws Exception {
    List<VectorWritable> points = getPointsWritable();
    Configuration conf = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points, true,
        getTestTempFilePath("testdata/file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, true,
        getTestTempFilePath("testdata/file2"), fs, conf);
    // now run the Job using the run() command. Others can use runJob().
    Path output = getTestTempDirPath("output");
    String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
        getTestTempDirPath("testdata").toString(),
        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
        optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
        EuclideanDistanceMeasure.class.getName(),
        optKey(DefaultOptionCreator.T1_OPTION), "3.1",
        optKey(DefaultOptionCreator.T2_OPTION), "2.1",
        optKey(DefaultOptionCreator.CLUSTERING_OPTION),
        optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
    ToolRunner.run(getConfiguration(), new CanopyDriver(), args);
    Path path = new Path(output, "clusteredPoints/part-m-00000");
    long count = HadoopUtil.countRecords(path, conf);
    assertEquals("number of points", points.size(), count);
  }
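  // The argument array above mirrors a command-line invocation of the canopy job.
  // An approximately equivalent CLI call (flag names assumed from the
  // DefaultOptionCreator options used above; exact flags may vary by Mahout
  // version) would look like:
  //
  //   mahout canopy -i <testdata> -o <output> \
  //     -dm org.apache.mahout.common.distance.EuclideanDistanceMeasure \
  //     -t1 3.1 -t2 2.1 -cl -ow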
  /**
   * Story: User can produce final point clustering using a Hadoop map/reduce
   * job and an EuclideanDistanceMeasure with an outlier removal threshold.
   */
  @Test
  public void testClusteringEuclideanWithOutlierRemovalMR() throws Exception {
    List<VectorWritable> points = getPointsWritable();
    Configuration conf = getConfiguration();
    ClusteringTestUtils.writePointsToFile(points, true,
        getTestTempFilePath("testdata/file1"), fs, conf);
    ClusteringTestUtils.writePointsToFile(points, true,
        getTestTempFilePath("testdata/file2"), fs, conf);
    // now run the Job using the run() command. Others can use runJob().
    Path output = getTestTempDirPath("output");
    String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION),
        getTestTempDirPath("testdata").toString(),
        optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(),
        optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION),
        EuclideanDistanceMeasure.class.getName(),
        optKey(DefaultOptionCreator.T1_OPTION), "3.1",
        optKey(DefaultOptionCreator.T2_OPTION), "2.1",
        optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.7",
        optKey(DefaultOptionCreator.CLUSTERING_OPTION),
        optKey(DefaultOptionCreator.OVERWRITE_OPTION) };
    ToolRunner.run(getConfiguration(), new CanopyDriver(), args);
    Path path = new Path(output, "clusteredPoints/part-m-00000");
    long count = HadoopUtil.countRecords(path, conf);
    int expectedPointsAfterOutlierRemoval = 8;
    assertEquals("number of points", expectedPointsAfterOutlierRemoval, count);
  }
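  // Note (best-effort reading, not from the original comments): the only change
  // from testClusteringEuclideanMR above is the OUTLIER_THRESHOLD of 0.7. With
  // outlier removal enabled, points whose cluster-classification score falls
  // below that threshold are not written to clusteredPoints, which is why this
  // test expects only 8 points to survive instead of points.size().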