public static void delete(Configuration conf, Path... paths) throws IOException {
  delete(conf, Arrays.asList(paths));
}
/**
 * Constructor that uses either {@link FileSystem#listStatus(Path)} or
 * {@link FileSystem#globStatus(Path)} to obtain the list of files to iterate over
 * (depending on the pathType parameter).
 */
public SequenceFileDirIterator(Path path,
                               PathType pathType,
                               PathFilter filter,
                               Comparator<FileStatus> ordering,
                               boolean reuseKeyValueInstances,
                               Configuration conf) throws IOException {
  FileStatus[] statuses = HadoopUtil.getFileStatus(path, pathType, filter, ordering, conf);
  iterators = Lists.newArrayList();
  init(statuses, reuseKeyValueInstances, conf);
}
public static FileStatus[] getFileStatus(Path path,
                                         PathType pathType,
                                         PathFilter filter,
                                         Comparator<FileStatus> ordering,
                                         Configuration conf) throws IOException {
  FileStatus[] statuses;
  FileSystem fs = path.getFileSystem(conf);
  if (filter == null) {
    statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
  } else {
    statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
  }
  if (ordering != null) {
    Arrays.sort(statuses, ordering);
  }
  return statuses;
}
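A brief usage sketch of getFileStatus (the helper name, the comparator, and the assumption that the directory holds part files are illustrative, not taken from the code above): list a SequenceFile output directory in a stable order, the same way the constructor above consumes the returned statuses.

// Sketch only: prints the part-* files of a directory in name order.
public static void printPartFiles(Configuration conf, Path outputDir) throws IOException {
  FileStatus[] parts = HadoopUtil.getFileStatus(
      outputDir,
      PathType.LIST,                   // list the directory's children, no glob expansion
      PathFilters.partFilter(),        // keep only part-* files
      new Comparator<FileStatus>() {   // order by file name for deterministic iteration
        @Override
        public int compare(FileStatus a, FileStatus b) {
          return a.getPath().getName().compareTo(b.getPath().getName());
        }
      },
      conf);
  for (FileStatus status : parts) {
    System.out.println(status.getPath());
  }
}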
public static int runMapReduce(Configuration conf, Path input, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  // Prepare Job for submission.
  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class,
      StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class,
      SequenceFileOutputFormat.class, conf);
  job.setJobName(HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), job,
      StreamingKMeansMapper.class, StreamingKMeansReducer.class));

  // There is only one reducer so that the intermediate centroids get collected on one
  // machine and are clustered in memory to get the right number of clusters.
  job.setNumReduceTasks(1);

  // Set the JAR (so that the required libraries are available) and run.
  job.setJarByClass(StreamingKMeansDriver.class);

  // Run job!
  long start = System.currentTimeMillis();
  if (!job.waitForCompletion(true)) {
    return -1;
  }
  long end = System.currentTimeMillis();

  log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms",
      output.toString(), end - start);
  return 0;
}
// Clear any previous output and temp data before training.
HadoopUtil.delete(getConf(), getOutputPath());
HadoopUtil.delete(getConf(), getTempPath());
boolean trainComplementary = hasOption(TRAIN_COMPLEMENTARY);

HadoopUtil.setSerializations(getConf());
// Push the label index and the interim weight/theta files into the distributed cache
// so that the subsequent jobs can read them.
HadoopUtil.cacheFiles(labPath, getConf());
HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());
HadoopUtil.cacheFiles(getTempPath(THETAS), getConf());
});
// Two ways of obtaining the number of users: read the count written by
// PreparePreferenceMatrixJob, or count the records in the user-vectors directory.
numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
    PathType.LIST, null, getConf());
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  Path modelPath = HadoopUtil.getSingleCachedFile(conf);
  NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, conf);
  boolean compl = Boolean.parseBoolean(conf.get(TestNaiveBayesDriver.COMPLEMENTARY));
  if (compl) {
    classifier = new ComplementaryNaiveBayesClassifier(model);
  } else {
    classifier = new StandardNaiveBayesClassifier(model);
  }
}
/**
 * Returns the first cached file in the list, or null if there are no cached files.
 * @param conf - MapReduce Configuration
 * @return Path of cached file
 * @throws IOException - IO Exception
 */
public static Path getSingleCachedFile(Configuration conf) throws IOException {
  return getCachedFiles(conf)[0];
}
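Several of the excerpts here follow the same pattern: the driver caches exactly one file with HadoopUtil.cacheFiles, and a task's setup() recovers its path with getSingleCachedFile. A minimal sketch of that round trip follows; the helper names are illustrative and not part of HadoopUtil.

// Sketch only: the method names below are hypothetical.

// Driver side: push a single file (e.g. a serialized model) into the distributed cache.
public static void cacheModel(Configuration conf, Path modelPath) throws IOException {
  HadoopUtil.cacheFiles(modelPath, conf);
}

// Task side: with exactly one cached file, this returns the path cached above.
public static Path cachedModelPath(Configuration conf) throws IOException {
  return HadoopUtil.getSingleCachedFile(conf);
}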
/** Story: User can cluster points using sequential execution */
@Test
public void testClusteringManhattanSeq() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration config = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
  // now run the Canopy Driver in sequential mode
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure,
      3.1, 2.1, true, 0.0, true);
  // verify output from sequence file
  Path path = new Path(output, "clusters-0-final/part-r-00000");
  int ix = 0;
  for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(path, true, config)) {
    assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix),
        clusterWritable.getValue().getCenter());
    ix++;
  }
  path = new Path(output, "clusteredPoints/part-m-0");
  long count = HadoopUtil.countRecords(path, config);
  assertEquals("number of points", points.size(), count);
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();

  vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
  featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
  minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
  maxDf = conf.getLong(TFIDFConverter.MAX_DF, -1);
  sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
  namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);

  URI[] localFiles = DistributedCache.getCacheFiles(conf);
  Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(TFIDFConverter.FREQUENCY_FILE, localFiles);
  // key is feature, value is the document frequency
  for (Pair<IntWritable,LongWritable> record
       : new SequenceFileIterable<IntWritable,LongWritable>(dictionaryFile, true, conf)) {
    dictionary.put(record.getFirst().get(), record.getSecond().get());
  }
}
/**
 * Builds a comma-separated list of the directories under the given path that directly
 * contain files, recursing into subdirectories and applying the supplied path filter.
 * @param fs - File System
 * @param fileStatus - File Status of the root directory
 * @param pathFilter - path filter
 * @return list of directories as a comma-separated String
 * @throws IOException - IO Exception
 */
public static String buildDirList(FileSystem fs, FileStatus fileStatus, PathFilter pathFilter) throws IOException {
  boolean containsFiles = false;
  List<String> directoriesList = Lists.newArrayList();
  for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath(), pathFilter)) {
    if (childFileStatus.isDir()) {
      // Recurse into subdirectories, keeping the same filter.
      String subDirectoryList = buildDirList(fs, childFileStatus, pathFilter);
      directoriesList.add(subDirectoryList);
    } else {
      containsFiles = true;
    }
  }
  if (containsFiles) {
    directoriesList.add(fileStatus.getPath().toUri().getPath());
  }
  return Joiner.on(',').skipNulls().join(directoriesList.iterator());
}
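A usage sketch for buildDirList (the helper name, the root path, the job wiring, and the choice of PathFilters.logsCRCFilter() are assumptions for illustration): collect every directory under a root that directly contains files and hand the comma-separated result to FileInputFormat.

// Sketch only: register all file-bearing directories under 'root' as job inputs.
public static void addNestedInputs(Job job, Configuration conf, Path root) throws IOException {
  FileSystem fs = root.getFileSystem(conf);
  // Skip logs/ and .crc entries; any other PathFilter would work here as well.
  String dirList = HadoopUtil.buildDirList(fs, fs.getFileStatus(root), PathFilters.logsCRCFilter());
  FileInputFormat.setInputPaths(job, dirList);
}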
private boolean runMapReduce() throws IOException, InterruptedException, ClassNotFoundException {
  Path model = new Path(getOption("model"));
  HadoopUtil.cacheFiles(model, getConf());
  // the output key is the expected value, the output values are the scores for all the labels
  Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
      BayesTestMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
  //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
  boolean complementary = hasOption("testComplementary");
  testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
  return testJob.waitForCompletion(true);
}
public static Map<String,Vector> readScoresFromCache(Configuration conf) throws IOException {
  Map<String,Vector> sumVectors = Maps.newHashMap();
  for (Pair<Text,VectorWritable> entry
       : new SequenceFileDirIterable<Text,VectorWritable>(HadoopUtil.getSingleCachedFile(conf),
           PathType.LIST, PathFilters.partFilter(), conf)) {
    sumVectors.put(entry.getFirst().toString(), entry.getSecond().get());
  }
  return sumVectors;
}
/**
 * Story: User can produce final point clustering using a Hadoop map/reduce
 * job and a ManhattanDistanceMeasure.
 */
@Test
public void testClusteringManhattanMR() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration conf = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf);
  ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf);
  // now run the Job
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure,
      3.1, 2.1, true, 0.0, false);
  Path path = new Path(output, "clusteredPoints/part-m-00000");
  long count = HadoopUtil.countRecords(path, conf);
  assertEquals("number of points", points.size(), count);
}