org.apache.mahout.common.HadoopUtil.countRecords java code examples

// Count the records stored at the path named by PreparePreferenceMatrixJob.USER_VECTORS
// under prepPath and keep the total as the number of users.
// The (int) cast narrows the value returned by countRecords (other examples here assign it
// to a long), so a count above Integer.MAX_VALUE would not survive the cast.
// NOTE(review): the null third argument is presumably an optional path filter — confirm.
numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
    PathType.LIST, null, getConf());

// Same call as the excerpt above (repeated example).
numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
    PathType.LIST, null, getConf());

// Same call as the excerpt above (repeated example).
numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
    PathType.LIST, null, getConf());

/**
 * Story: a user can cluster points by running the Canopy driver in
 * sequential mode with the Manhattan distance measure.
 *
 * <p>Checks every final cluster center against the expected Manhattan
 * centroids, then checks that the clustered-points file holds one record per
 * input point.
 */
@Test
public void testClusteringManhattanSeq() throws Exception {
  List<VectorWritable> inputPoints = getPointsWritable();
  Configuration configuration = getConfiguration();
  ClusteringTestUtils.writePointsToFile(
      inputPoints, getTestTempFilePath("testdata/file1"), fs, configuration);

  // Run the Canopy driver in sequential mode.
  Path outputDir = getTestTempDirPath("output");
  Path inputDir = getTestTempDirPath("testdata");
  CanopyDriver.run(
      configuration, inputDir, outputDir, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, true);

  // Compare each written cluster center with the expected centroid at the same position.
  Path finalClustersFile = new Path(outputDir, "clusters-0-final/part-r-00000");
  SequenceFileValueIterable<ClusterWritable> writtenClusters =
      new SequenceFileValueIterable<ClusterWritable>(finalClustersFile, true, configuration);
  int position = 0;
  for (ClusterWritable written : writtenClusters) {
    String label = "Center [" + position + ']';
    assertEquals(label, manhattanCentroids.get(position), written.getValue().getCenter());
    position++;
  }

  // The number of clustered-point records must equal the number of input points.
  Path clusteredPointsFile = new Path(outputDir, "clusteredPoints/part-m-0");
  long clusteredCount = HadoopUtil.countRecords(clusteredPointsFile, configuration);
  assertEquals("number of points", inputPoints.size(), clusteredCount);
}

/**
 * Story: a user can produce the final point clustering with a Hadoop
 * map/reduce job and a ManhattanDistanceMeasure.
 *
 * <p>Writes the same points to two input files, runs the Canopy driver, and
 * checks the record count of the first clustered-points part file.
 */
@Test
public void testClusteringManhattanMR() throws Exception {
  List<VectorWritable> inputPoints = getPointsWritable();
  Configuration configuration = getConfiguration();
  for (String inputFile : new String[] {"testdata/file1", "testdata/file2"}) {
    ClusteringTestUtils.writePointsToFile(
        inputPoints, true, getTestTempFilePath(inputFile), fs, configuration);
  }

  // Run the job.
  Path outputDir = getTestTempDirPath("output");
  Path inputDir = getTestTempDirPath("testdata");
  CanopyDriver.run(
      configuration, inputDir, outputDir, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, false);

  Path clusteredPointsFile = new Path(outputDir, "clusteredPoints/part-m-00000");
  long clusteredCount = HadoopUtil.countRecords(clusteredPointsFile, configuration);
  assertEquals("number of points", inputPoints.size(), clusteredCount);
}

long count = HadoopUtil.countRecords(path, config);
assertEquals("number of points", points.size(), count);

};
ToolRunner.run(getConfiguration(), new FuzzyKMeansDriver(), args);
long count = HadoopUtil.countRecords(new Path(output, "clusteredPoints/part-m-00000"), conf);
assertTrue(count > 0);

/**
 * Story: a user can produce the final point clustering with a Hadoop
 * map/reduce job and an EuclideanDistanceMeasure.
 *
 * <p>The driver is launched through {@code ToolRunner} with command-line
 * style arguments rather than through a direct {@code CanopyDriver.run} call.
 */
@Test
public void testClusteringEuclideanMR() throws Exception {
  List<VectorWritable> inputPoints = getPointsWritable();
  Configuration configuration = getConfiguration();
  for (String inputFile : new String[] {"testdata/file1", "testdata/file2"}) {
    ClusteringTestUtils.writePointsToFile(
        inputPoints, true, getTestTempFilePath(inputFile), fs, configuration);
  }

  // Run the job using the run() command. Others can use runJob().
  Path outputDir = getTestTempDirPath("output");
  String[] driverArgs = {
      optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(),
      optKey(DefaultOptionCreator.OUTPUT_OPTION), outputDir.toString(),
      optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
      optKey(DefaultOptionCreator.T1_OPTION), "3.1",
      optKey(DefaultOptionCreator.T2_OPTION), "2.1",
      optKey(DefaultOptionCreator.CLUSTERING_OPTION),
      optKey(DefaultOptionCreator.OVERWRITE_OPTION)
  };
  ToolRunner.run(getConfiguration(), new CanopyDriver(), driverArgs);

  Path clusteredPointsFile = new Path(outputDir, "clusteredPoints/part-m-00000");
  long clusteredCount = HadoopUtil.countRecords(clusteredPointsFile, configuration);
  assertEquals("number of points", inputPoints.size(), clusteredCount);
}

};
FuzzyKMeansDriver.main(args);
long count = HadoopUtil.countRecords(new Path(output, "clusteredPoints/part-m-0"), conf);
assertTrue(count > 0);

long count = HadoopUtil.countRecords(path, config);
int expectedPointsHavingPDFGreaterThanThreshold = 6;
assertEquals("number of points", expectedPointsHavingPDFGreaterThanThreshold, count);

/**
 * Story: a user can produce the final point clustering with a Hadoop
 * map/reduce job, an EuclideanDistanceMeasure and an outlier removal
 * threshold.
 *
 * <p>With the threshold argument set to "0.7" the test expects 8 records in
 * the first clustered-points part file.
 */
@Test
public void testClusteringEuclideanWithOutlierRemovalMR() throws Exception {
  List<VectorWritable> inputPoints = getPointsWritable();
  Configuration configuration = getConfiguration();
  for (String inputFile : new String[] {"testdata/file1", "testdata/file2"}) {
    ClusteringTestUtils.writePointsToFile(
        inputPoints, true, getTestTempFilePath(inputFile), fs, configuration);
  }

  // Run the job using the run() command. Others can use runJob().
  Path outputDir = getTestTempDirPath("output");
  String[] driverArgs = {
      optKey(DefaultOptionCreator.INPUT_OPTION), getTestTempDirPath("testdata").toString(),
      optKey(DefaultOptionCreator.OUTPUT_OPTION), outputDir.toString(),
      optKey(DefaultOptionCreator.DISTANCE_MEASURE_OPTION), EuclideanDistanceMeasure.class.getName(),
      optKey(DefaultOptionCreator.T1_OPTION), "3.1",
      optKey(DefaultOptionCreator.T2_OPTION), "2.1",
      optKey(DefaultOptionCreator.OUTLIER_THRESHOLD), "0.7",
      optKey(DefaultOptionCreator.CLUSTERING_OPTION),
      optKey(DefaultOptionCreator.OVERWRITE_OPTION)
  };
  ToolRunner.run(getConfiguration(), new CanopyDriver(), driverArgs);

  Path clusteredPointsFile = new Path(outputDir, "clusteredPoints/part-m-00000");
  long remainingCount = HadoopUtil.countRecords(clusteredPointsFile, configuration);
  int expectedPointsAfterOutlierRemoval = 8;
  assertEquals("number of points", expectedPointsAfterOutlierRemoval, remainingCount);
}

Javadoc

Count all the records in a directory using an org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator.

Popular methods of HadoopUtil

delete
getFileStatus
listStatus
buildDirList
Builds a comma-separated list of input splits
cacheFiles
findInCacheByPartOfFilename
Finds a file in the DistributedCache
getCachedFiles
Retrieves paths to cached files.
getCustomJobName
getSingleCachedFile
Return the first cached file in the list, else null if there are no cached files.
openStream
prepareJob
Create a map-only Hadoop Job out of the passed in parameters. Does not set the Job name.
readInt

Popular in Java

Updating database using SQL prepared statement
setRequestProperty (URLConnection)
onCreateOptionsMenu (Activity)
notifyDataSetChanged (ArrayAdapter)
ObjectMapper (com.fasterxml.jackson.databind)
ObjectMapper provides functionality for reading and writing JSON, either to and from basic POJOs (Pl
URL (java.net)
A Uniform Resource Locator that identifies the location of an Internet resource as specified by RFC
Date (java.util)
A specific moment in time, with millisecond precision. Values typically come from System#currentTime
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
Base64 (org.apache.commons.codec.binary)
Provides Base64 encoding and decoding as defined by RFC 2045. This class implements section 6.8. Base
JPanel (javax.swing)
CodeWhisperer alternatives

How to use the countRecords method in org.apache.mahout.common.HadoopUtil

Best Java code snippets using org.apache.mahout.common.HadoopUtil.countRecords (Showing top 11 results out of 315)

How to use the countRecords method in org.apache.mahout.common.HadoopUtil