public static void delete(Configuration conf, Path... paths) throws IOException {
  delete(conf, Arrays.asList(paths));
}
/**
 * Constructor that uses either {@link FileSystem#listStatus(Path)} or
 * {@link FileSystem#globStatus(Path)} to obtain the list of files to iterate over
 * (depending on the pathType parameter).
 */
public SequenceFileDirIterator(Path path,
                               PathType pathType,
                               PathFilter filter,
                               Comparator<FileStatus> ordering,
                               boolean reuseKeyValueInstances,
                               Configuration conf) throws IOException {
  FileStatus[] statuses = HadoopUtil.getFileStatus(path, pathType, filter, ordering, conf);
  iterators = Lists.newArrayList();
  init(statuses, reuseKeyValueInstances, conf);
}
public static FileStatus[] getFileStatus(Path path,
                                         PathType pathType,
                                         PathFilter filter,
                                         Comparator<FileStatus> ordering,
                                         Configuration conf) throws IOException {
  FileStatus[] statuses;
  FileSystem fs = path.getFileSystem(conf);
  if (filter == null) {
    statuses = pathType == PathType.GLOB ? fs.globStatus(path) : listStatus(fs, path);
  } else {
    statuses = pathType == PathType.GLOB ? fs.globStatus(path, filter) : listStatus(fs, path, filter);
  }
  if (ordering != null) {
    Arrays.sort(statuses, ordering);
  }
  return statuses;
}
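A brief usage sketch of getFileStatus (the helper name, the comparator, and the assumption that the directory holds part files are illustrative, not taken from the code above): list a SequenceFile output directory in a stable order, the same way the constructor above consumes the returned statuses.

// Sketch only: prints the part-* files of a directory in name order.
public static void printPartFiles(Configuration conf, Path outputDir) throws IOException {
  FileStatus[] parts = HadoopUtil.getFileStatus(
      outputDir,
      PathType.LIST,                   // list the directory's children, no glob expansion
      PathFilters.partFilter(),        // keep only part-* files
      new Comparator<FileStatus>() {   // order by file name for deterministic iteration
        @Override
        public int compare(FileStatus a, FileStatus b) {
          return a.getPath().getName().compareTo(b.getPath().getName());
        }
      },
      conf);
  for (FileStatus status : parts) {
    System.out.println(status.getPath());
  }
}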
public static int runMapReduce(Configuration conf, Path input, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  // Prepare Job for submission.
  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class,
      StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class,
      SequenceFileOutputFormat.class, conf);
  job.setJobName(HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), job,
      StreamingKMeansMapper.class, StreamingKMeansReducer.class));

  // There is only one reducer so that the intermediate centroids get collected on one
  // machine and are clustered in memory to get the right number of clusters.
  job.setNumReduceTasks(1);

  // Set the JAR (so that the required libraries are available) and run.
  job.setJarByClass(StreamingKMeansDriver.class);

  // Run job!
  long start = System.currentTimeMillis();
  if (!job.waitForCompletion(true)) {
    return -1;
  }
  long end = System.currentTimeMillis();

  log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms",
      output.toString(), end - start);
  return 0;
}
// Clear any previous output and temp data before training.
HadoopUtil.delete(getConf(), getOutputPath());
HadoopUtil.delete(getConf(), getTempPath());
boolean trainComplementary = hasOption(TRAIN_COMPLEMENTARY);

HadoopUtil.setSerializations(getConf());
// Push the label index and the interim weight/theta files into the distributed cache
// so that the subsequent jobs can read them.
HadoopUtil.cacheFiles(labPath, getConf());
HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());
HadoopUtil.cacheFiles(getTempPath(THETAS), getConf());
});
// Two ways of obtaining the number of users: read the count written by
// PreparePreferenceMatrixJob, or count the records in the user-vectors directory.
numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
numberOfUsers = (int) HadoopUtil.countRecords(new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS),
    PathType.LIST, null, getConf());
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();
  Path modelPath = HadoopUtil.getSingleCachedFile(conf);
  NaiveBayesModel model = NaiveBayesModel.materialize(modelPath, conf);
  boolean compl = Boolean.parseBoolean(conf.get(TestNaiveBayesDriver.COMPLEMENTARY));
  if (compl) {
    classifier = new ComplementaryNaiveBayesClassifier(model);
  } else {
    classifier = new StandardNaiveBayesClassifier(model);
  }
}
/**
 * Returns the first cached file in the list, or null if there are no cached files.
 * @param conf - MapReduce Configuration
 * @return Path of cached file
 * @throws IOException - IO Exception
 */
public static Path getSingleCachedFile(Configuration conf) throws IOException {
  return getCachedFiles(conf)[0];
}
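Several of the excerpts here follow the same pattern: the driver caches exactly one file with HadoopUtil.cacheFiles, and a task's setup() recovers its path with getSingleCachedFile. A minimal sketch of that round trip follows; the helper names are illustrative and not part of HadoopUtil.

// Sketch only: the method names below are hypothetical.

// Driver side: push a single file (e.g. a serialized model) into the distributed cache.
public static void cacheModel(Configuration conf, Path modelPath) throws IOException {
  HadoopUtil.cacheFiles(modelPath, conf);
}

// Task side: with exactly one cached file, this returns the path cached above.
public static Path cachedModelPath(Configuration conf) throws IOException {
  return HadoopUtil.getSingleCachedFile(conf);
}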
/** Story: User can cluster points using sequential execution */
@Test
public void testClusteringManhattanSeq() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration config = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config);
  // now run the Canopy Driver in sequential mode
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure,
      3.1, 2.1, true, 0.0, true);
  // verify output from sequence file
  Path path = new Path(output, "clusters-0-final/part-r-00000");
  int ix = 0;
  for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(path, true, config)) {
    assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix),
        clusterWritable.getValue().getCenter());
    ix++;
  }
  path = new Path(output, "clusteredPoints/part-m-0");
  long count = HadoopUtil.countRecords(path, config);
  assertEquals("number of points", points.size(), count);
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  super.setup(context);
  Configuration conf = context.getConfiguration();

  vectorCount = conf.getLong(TFIDFConverter.VECTOR_COUNT, 1);
  featureCount = conf.getLong(TFIDFConverter.FEATURE_COUNT, 1);
  minDf = conf.getInt(TFIDFConverter.MIN_DF, 1);
  maxDf = conf.getLong(TFIDFConverter.MAX_DF, -1);
  sequentialAccess = conf.getBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, false);
  namedVector = conf.getBoolean(PartialVectorMerger.NAMED_VECTOR, false);

  URI[] localFiles = DistributedCache.getCacheFiles(conf);
  Path dictionaryFile = HadoopUtil.findInCacheByPartOfFilename(TFIDFConverter.FREQUENCY_FILE, localFiles);
  // key is feature, value is the document frequency
  for (Pair<IntWritable,LongWritable> record
       : new SequenceFileIterable<IntWritable,LongWritable>(dictionaryFile, true, conf)) {
    dictionary.put(record.getFirst().get(), record.getSecond().get());
  }
}
/**
 * Builds a comma-separated list of the directories under the given path that directly
 * contain files, recursing into subdirectories and applying the supplied path filter.
 * @param fs - File System
 * @param fileStatus - File Status of the root directory
 * @param pathFilter - path filter
 * @return list of directories as a comma-separated String
 * @throws IOException - IO Exception
 */
public static String buildDirList(FileSystem fs, FileStatus fileStatus, PathFilter pathFilter) throws IOException {
  boolean containsFiles = false;
  List<String> directoriesList = Lists.newArrayList();
  for (FileStatus childFileStatus : fs.listStatus(fileStatus.getPath(), pathFilter)) {
    if (childFileStatus.isDir()) {
      // Recurse into subdirectories, keeping the same filter.
      String subDirectoryList = buildDirList(fs, childFileStatus, pathFilter);
      directoriesList.add(subDirectoryList);
    } else {
      containsFiles = true;
    }
  }
  if (containsFiles) {
    directoriesList.add(fileStatus.getPath().toUri().getPath());
  }
  return Joiner.on(',').skipNulls().join(directoriesList.iterator());
}
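A usage sketch for buildDirList (the helper name, the root path, the job wiring, and the choice of PathFilters.logsCRCFilter() are assumptions for illustration): collect every directory under a root that directly contains files and hand the comma-separated result to FileInputFormat.

// Sketch only: register all file-bearing directories under 'root' as job inputs.
public static void addNestedInputs(Job job, Configuration conf, Path root) throws IOException {
  FileSystem fs = root.getFileSystem(conf);
  // Skip logs/ and .crc entries; any other PathFilter would work here as well.
  String dirList = HadoopUtil.buildDirList(fs, fs.getFileStatus(root), PathFilters.logsCRCFilter());
  FileInputFormat.setInputPaths(job, dirList);
}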
private boolean runMapReduce() throws IOException, InterruptedException, ClassNotFoundException {
  Path model = new Path(getOption("model"));
  HadoopUtil.cacheFiles(model, getConf());
  // the output key is the expected value, the output values are the scores for all the labels
  Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class,
      BayesTestMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
  //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
  boolean complementary = hasOption("testComplementary");
  testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
  return testJob.waitForCompletion(true);
}
public static Map<String,Vector> readScoresFromCache(Configuration conf) throws IOException {
  Map<String,Vector> sumVectors = Maps.newHashMap();
  for (Pair<Text,VectorWritable> entry
       : new SequenceFileDirIterable<Text,VectorWritable>(HadoopUtil.getSingleCachedFile(conf),
           PathType.LIST, PathFilters.partFilter(), conf)) {
    sumVectors.put(entry.getFirst().toString(), entry.getSecond().get());
  }
  return sumVectors;
}
/**
 * Story: User can produce final point clustering using a Hadoop map/reduce
 * job and a ManhattanDistanceMeasure.
 */
@Test
public void testClusteringManhattanMR() throws Exception {
  List<VectorWritable> points = getPointsWritable();
  Configuration conf = getConfiguration();
  ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf);
  ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf);
  // now run the Job
  Path output = getTestTempDirPath("output");
  CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure,
      3.1, 2.1, true, 0.0, false);
  Path path = new Path(output, "clusteredPoints/part-m-00000");
  long count = HadoopUtil.countRecords(path, conf);
  assertEquals("number of points", points.size(), count);
}