/** * Story: User can specify a clustering limit that prevents output of small * clusters */ @Test public void testCanopyMapperClusterFilter() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure .getClass().getName()); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1)); conf.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1)); conf.set(CanopyConfigKeys.CF_KEY, "3"); DummyRecordWriter<Text, VectorWritable> writer = new DummyRecordWriter<Text, VectorWritable>(); Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable>.Context context = DummyRecordWriter .build(mapper, conf, writer); mapper.setup(context); List<VectorWritable> points = getPointsWritable(); // map the data for (VectorWritable point : points) { mapper.map(new Text(), point, context); } mapper.cleanup(context); assertEquals("Number of map results", 1, writer.getData().size()); // now verify the output List<VectorWritable> data = writer.getValue(new Text("centroid")); assertEquals("Number of centroids", 2, data.size()); }
/**
 * Story: User can set T3 and T4 values to be used by the reducer for its T1
 * and T2 thresholds
 */
@Test
public void testCanopyReducerT3T4Configuration() throws Exception {
  CanopyReducer underTest = new CanopyReducer();
  Configuration configuration = getConfiguration();
  configuration.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  configuration.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  configuration.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  configuration.set(CanopyConfigKeys.T3_KEY, String.valueOf(1.1));
  configuration.set(CanopyConfigKeys.T4_KEY, String.valueOf(0.1));
  configuration.set(CanopyConfigKeys.CF_KEY, "0");
  DummyRecordWriter<Text, ClusterWritable> recordWriter =
      new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context reduceContext =
      DummyRecordWriter.build(underTest, configuration, recordWriter,
          Text.class, VectorWritable.class);
  underTest.setup(reduceContext);

  // when T3/T4 are configured they override T1/T2 inside the reducer's clusterer
  assertEquals(1.1, underTest.getCanopyClusterer().getT1(), EPSILON);
  assertEquals(0.1, underTest.getCanopyClusterer().getT2(), EPSILON);
}
/**
 * Story: User can specify a cluster filter that limits the minimum size of
 * canopies produced by the reducer
 */
@Test
public void testCanopyReducerClusterFilter() throws Exception {
  CanopyReducer underTest = new CanopyReducer();
  Configuration configuration = getConfiguration();
  configuration.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY,
      "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
  configuration.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
  configuration.set(CanopyConfigKeys.T2_KEY, String.valueOf(2.1));
  configuration.set(CanopyConfigKeys.CF_KEY, "3");
  DummyRecordWriter<Text, ClusterWritable> recordWriter =
      new DummyRecordWriter<Text, ClusterWritable>();
  Reducer<Text, VectorWritable, Text, ClusterWritable>.Context reduceContext =
      DummyRecordWriter.build(underTest, configuration, recordWriter,
          Text.class, VectorWritable.class);
  underTest.setup(reduceContext);

  // reduce all points under a single key; the cluster filter of 3 should
  // suppress any canopy smaller than that
  List<VectorWritable> inputPoints = getPointsWritable();
  underTest.reduce(new Text("centroid"), inputPoints, reduceContext);

  Set<Text> emittedKeys = recordWriter.getKeys();
  assertEquals("Number of centroids", 2, emittedKeys.size());
}
// end of test class
}
/* Configures a CanopyMapper with the Manhattan distance measure.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyMapperManhattan() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, manhattanDistanceMeasure .getClass().getName());
/* Configures a CanopyMapper with the Euclidean distance measure.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyMapperEuclidean() throws Exception { CanopyMapper mapper = new CanopyMapper(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, euclideanDistanceMeasure .getClass().getName());
/* Configures a CanopyReducer with the Manhattan distance measure (set by class name).
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyReducerManhattan() throws Exception { CanopyReducer reducer = new CanopyReducer(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.ManhattanDistanceMeasure");
/* Configures a CanopyReducer with the Euclidean distance measure and a T1 of 3.1.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyReducerEuclidean() throws Exception { CanopyReducer reducer = new CanopyReducer(); Configuration conf = getConfiguration(); conf.set(CanopyConfigKeys.DISTANCE_MEASURE_KEY, "org.apache.mahout.common.distance.EuclideanDistanceMeasure"); conf.set(CanopyConfigKeys.T1_KEY, String.valueOf(3.1));
/*
 * Per-test fixture: obtains the local FileSystem and precomputes reference
 * canopies (and their centers) sequentially for both distance measures with
 * T1=3.1 / T2=2.1, so each test can compare map/reduce output against a
 * known-good sequential result.
 */
@Override @Before public void setUp() throws Exception { super.setUp(); fs = FileSystem.get(getConfiguration()); referenceManhattan = CanopyClusterer.createCanopies(getPoints(), manhattanDistanceMeasure, 3.1, 2.1); manhattanCentroids = CanopyClusterer.getCenters(referenceManhattan); referenceEuclidean = CanopyClusterer.createCanopies(getPoints(), euclideanDistanceMeasure, 3.1, 2.1); euclideanCentroids = CanopyClusterer.getCenters(referenceEuclidean); }
/** Story: User can cluster points using sequential execution */ @Test public void testClusteringManhattanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); // now run the Canopy Driver in sequential mode Path output = getTestTempDirPath("output"); CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, true); // verify output from sequence file Path path = new Path(output, "clusters-0-final/part-r-00000"); int ix = 0; for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(path, true, config)) { assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), clusterWritable.getValue() .getCenter()); ix++; } path = new Path(output, "clusteredPoints/part-m-0"); long count = HadoopUtil.countRecords(path, config); assertEquals("number of points", points.size(), count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a EuclideanDistanceMeasure. */ @Test public void testClusteringEuclideanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Others can use runJob(). Path output = getTestTempDirPath("output"); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(getConfiguration(), new CanopyDriver(), args); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); assertEquals("number of points", points.size(), count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a EuclideanDistanceMeasure and outlier removal threshold. */ @Test public void testClusteringEuclideanWithOutlierRemovalMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job using the run() command. Others can use runJob(). Path output = getTestTempDirPath("output"); String[] args = { optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(), optKey(DefaultOptionCreator.OUTPUT_OPTION), output.toString(), optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(), optKey(DefaultOptionCreator.T1_OPTION), "3.1", optKey(DefaultOptionCreator.T2_OPTION), "2.1", optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.7", optKey(DefaultOptionCreator.CLUSTERING_OPTION), optKey(DefaultOptionCreator.OVERWRITE_OPTION) }; ToolRunner.run(getConfiguration(), new CanopyDriver(), args); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); int expectedPointsAfterOutlierRemoval = 8; assertEquals("number of points", expectedPointsAfterOutlierRemoval, count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a ManhattanDistanceMeasure. */ @Test public void testClusteringManhattanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job Path output = getTestTempDirPath("output"); CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, false); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); assertEquals("number of points", points.size(), count); }
/* Writes test points to a single input file for sequential Euclidean clustering.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testClusteringEuclideanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
/* Writes test points to a single input file for sequential Euclidean clustering with outlier removal.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testClusteringEuclideanWithOutlierRemovalSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
/* Writes test points to a single input file for Manhattan canopy generation via MR.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyGenManhattanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
/* Writes test points to a single input file for Euclidean canopy generation via MR.
   NOTE(review): method body continues beyond this excerpt — only the setup is visible here. */
public void testCanopyGenEuclideanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);