public static Job buildTransposeJob(Configuration initialConf, Path matrixInputPath, Path matrixOutputPath,
    int numInputRows) throws IOException {
  Job job = HadoopUtil.prepareJob(matrixInputPath, matrixOutputPath, SequenceFileInputFormat.class,
      TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
      IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
  job.setCombinerClass(MergeVectorsCombiner.class);
  job.getConfiguration().setInt(TransposeMapper.NEW_NUM_COLS_PARAM, numInputRows);
  job.setJobName("TransposeJob: " + matrixInputPath);
  return job;
}
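// Usage sketch (illustrative, not part of the original source; the paths and
// the row count are assumed values): the factory above only configures the
// job, so the caller still has to submit it and check the outcome.
public static void runTransposeExample(Configuration conf) throws Exception {
  Job transpose = buildTransposeJob(conf, new Path("/data/matrix"), new Path("/data/matrix-t"), 10000);
  if (!transpose.waitForCompletion(true)) {
    throw new IllegalStateException("Transpose job failed!");
  }
}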
public static int runMapReduce(Configuration conf, Path input, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  // Prepare Job for submission.
  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class,
      StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class,
      SequenceFileOutputFormat.class, conf);
  job.setJobName(HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), job,
      StreamingKMeansMapper.class, StreamingKMeansReducer.class));

  // There is only one reducer so that the intermediate centroids get collected on one
  // machine and are clustered in memory to get the right number of clusters.
  job.setNumReduceTasks(1);

  // Set the JAR (so that the required libraries are available) and run.
  job.setJarByClass(StreamingKMeansDriver.class);

  // Run job!
  long start = System.currentTimeMillis();
  if (!job.waitForCompletion(true)) {
    return -1;
  }
  long end = System.currentTimeMillis();

  log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms",
      output.toString(), end - start);
  return 0;
}
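// Illustrative driver call (assumed paths; not from the original source):
// runMapReduce above returns 0 on success and -1 on failure, so the result
// can be checked directly by a ToolRunner-style caller.
public static void runStreamingKMeansExample(Configuration conf) throws Exception {
  int exitCode = runMapReduce(conf, new Path("/data/points"), new Path("/data/clusters"));
  if (exitCode != 0) {
    throw new IllegalStateException("StreamingKMeans job failed!");
  }
}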
private static void pruneVectorsPartial(Path input, Path output, Path dictionaryFilePath, long maxDF,
    long minDF, Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration(baseConf);
  // This conf parameter needs to be set to enable serialization of conf values.
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(MAX_DF, maxDF);
  conf.setLong(MIN_DF, minDF);
  // An alternate version of this snippet ships the dictionary via
  // DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf) instead.
  DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);

  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      Mapper.class, null, null, WordsPrunerReducer.class, Text.class, VectorWritable.class,
      SequenceFileOutputFormat.class, conf);
  job.setJobName(": Prune Vectors: input-folder: " + input
      + ", dictionary-file: " + dictionaryFilePath.toString());

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
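// Hypothetical invocation (paths and document-frequency bounds are assumed
// values, not from the original source): prune terms whose document frequency
// falls outside [10, 1000], using a dictionary produced by an earlier pass.
public static void pruneExample(Configuration conf) throws Exception {
  pruneVectorsPartial(new Path("/data/tf-vectors"), new Path("/data/tf-vectors-pruned"),
      new Path("/data/dictionary.file-0"), 1000L, 10L, conf);
}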
@Override
public void createVectors(Path input, Path output, VectorizerConfig config)
    throws IOException, ClassNotFoundException, InterruptedException {
  // Do this for convenience of using prepareJob.
  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      EncodingMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class,
      config.getConf());
  Configuration conf = job.getConfiguration();
  conf.set(EncodingMapper.USE_SEQUENTIAL, String.valueOf(config.isSequentialAccess()));
  conf.set(EncodingMapper.USE_NAMED_VECTORS, String.valueOf(config.isNamedVectors()));
  conf.set(EncodingMapper.ANALYZER_NAME, config.getAnalyzerClassName());
  conf.set(EncodingMapper.ENCODER_FIELD_NAME, config.getEncoderName());
  conf.set(EncodingMapper.ENCODER_CLASS, config.getEncoderClass());
  conf.set(EncodingMapper.CARDINALITY, String.valueOf(config.getCardinality()));
  job.setNumReduceTasks(0);
  boolean finished = job.waitForCompletion(true);

  log.info("result of run: {}", finished);
  if (!finished) {
    throw new IllegalStateException("Job failed!");
  }
}
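// Illustrative caller (assumed paths; not from the original source).
// Construction of VectorizerConfig is left to the caller since its API is not
// shown in this snippet; the job is map-only, so the encoded vectors land
// directly under the output path.
public void encodeExample(VectorizerConfig config) throws Exception {
  createVectors(new Path("/data/text-seqfiles"), new Path("/data/encoded-vectors"), config);
}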
public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
    Path matrixInputPath, Path outputVectorPathBase, Class<? extends TimesSquaredMapper> mapClass,
    Class<? extends VectorSummingReducer> redClass) throws IOException {

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

  long now = System.nanoTime();
  Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

  SequenceFile.Writer inputVectorPathWriter = null;
  try {
    inputVectorPathWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath,
        NullWritable.class, VectorWritable.class);
    inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
  } finally {
    Closeables.close(inputVectorPathWriter, false);
  }

  URI ivpURI = inputVectorPath.toUri();
  DistributedCache.setCacheFiles(new URI[] {ivpURI}, initialConf);

  Job job = HadoopUtil.prepareJob(matrixInputPath, new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
      SequenceFileInputFormat.class, mapClass, NullWritable.class, VectorWritable.class, redClass,
      NullWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
  job.setCombinerClass(redClass);
  job.setJobName("TimesSquaredJob: " + matrixInputPath);

  Configuration conf = job.getConfiguration();
  conf.set(INPUT_VECTOR, ivpURI.toString());
  conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
  conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

  return job;
}
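// Hypothetical driver (not from the original source): for a times-squared pass
// the output dimension is assumed to match the input vector's size, so v.size()
// is passed for outputVectorDim here.
public static void timesSquaredExample(Configuration conf, Vector v, Path matrixPath, Path tmpPath)
    throws Exception {
  Job job = createTimesSquaredJob(conf, v, v.size(), matrixPath, tmpPath,
      TimesSquaredMapper.class, VectorSummingReducer.class);
  if (!job.waitForCompletion(true)) {
    throw new IllegalStateException("TimesSquaredJob failed!");
  }
  // The result is written under tmpPath/OUTPUT_VECTOR_FILENAME as a
  // (NullWritable, VectorWritable) sequence file.
}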
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
    Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
    Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat, String jobname)
    throws IOException {
  Job job = HadoopUtil.prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue,
      outputFormat, getConf());
  String name = jobname != null ? jobname
      : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);
  job.setJobName(name);
  return job;
}
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
    Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
    Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
    Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
    Class<? extends OutputFormat> outputFormat) throws IOException {
  Job job = HadoopUtil.prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue,
      reducer, reducerKey, reducerValue, outputFormat, getConf());
  job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class));
  return job;
}
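// Sketch of how a driver might use the two overloads above (the mapper and
// reducer class names and paths are hypothetical, not from the original
// source): the eight-argument form wires a map-only job, the ten-argument
// form a full map-reduce job; both still need waitForCompletion from the caller.
protected void prepareJobExample() throws Exception {
  Job mapOnly = prepareJob(new Path("/in"), new Path("/mid"), SequenceFileInputFormat.class,
      ExampleMapper.class, IntWritable.class, VectorWritable.class,
      SequenceFileOutputFormat.class, "example-map-only");
  Job mapReduce = prepareJob(new Path("/mid"), new Path("/out"), SequenceFileInputFormat.class,
      ExampleMapper.class, IntWritable.class, VectorWritable.class,
      ExampleReducer.class, IntWritable.class, VectorWritable.class,
      SequenceFileOutputFormat.class);
  mapOnly.waitForCompletion(true);
  mapReduce.waitForCompletion(true);
}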
"org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, StandardDeviationCalculatorMapper.class, IntWritable.class, DoubleWritable.class, StandardDeviationCalculatorReducer.class, IntWritable.class, DoubleWritable.class,