public static Job buildTransposeJob(Configuration initialConf, Path matrixInputPath, Path matrixOutputPath,
    int numInputRows) throws IOException {
  Job job = HadoopUtil.prepareJob(matrixInputPath, matrixOutputPath, SequenceFileInputFormat.class,
      TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
      IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
  job.setCombinerClass(MergeVectorsCombiner.class);
  job.getConfiguration().setInt(TransposeMapper.NEW_NUM_COLS_PARAM, numInputRows);
  job.setJobName("TransposeJob: " + matrixInputPath);
  return job;
}
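// Usage sketch (illustrative, not part of the original source; the paths and
// the row count are assumed values): the factory above only configures the
// job, so the caller still has to submit it and check the outcome.
public static void runTransposeExample(Configuration conf) throws Exception {
  Job transpose = buildTransposeJob(conf, new Path("/data/matrix"), new Path("/data/matrix-t"), 10000);
  if (!transpose.waitForCompletion(true)) {
    throw new IllegalStateException("Transpose job failed!");
  }
}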
public static int runMapReduce(Configuration conf, Path input, Path output)
    throws IOException, ClassNotFoundException, InterruptedException {
  // Prepare Job for submission.
  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      StreamingKMeansMapper.class, IntWritable.class, CentroidWritable.class,
      StreamingKMeansReducer.class, IntWritable.class, CentroidWritable.class,
      SequenceFileOutputFormat.class, conf);
  job.setJobName(HadoopUtil.getCustomJobName(StreamingKMeansDriver.class.getSimpleName(), job,
      StreamingKMeansMapper.class, StreamingKMeansReducer.class));

  // There is only one reducer so that the intermediate centroids get collected on one
  // machine and are clustered in memory to get the right number of clusters.
  job.setNumReduceTasks(1);

  // Set the JAR (so that the required libraries are available) and run.
  job.setJarByClass(StreamingKMeansDriver.class);

  // Run job!
  long start = System.currentTimeMillis();
  if (!job.waitForCompletion(true)) {
    return -1;
  }
  long end = System.currentTimeMillis();

  log.info("StreamingKMeans clustering complete. Results are in {}. Took {} ms",
      output.toString(), end - start);
  return 0;
}
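// Illustrative driver call (assumed paths; not from the original source):
// runMapReduce above returns 0 on success and -1 on failure, so the result
// can be checked directly by a ToolRunner-style caller.
public static void runStreamingKMeansExample(Configuration conf) throws Exception {
  int exitCode = runMapReduce(conf, new Path("/data/points"), new Path("/data/clusters"));
  if (exitCode != 0) {
    throw new IllegalStateException("StreamingKMeans job failed!");
  }
}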
private static void pruneVectorsPartial(Path input, Path output, Path dictionaryFilePath, long maxDF,
    long minDF, Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = new Configuration(baseConf);
  // This conf parameter needs to be set to enable serialization of conf values.
  conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
      + "org.apache.hadoop.io.serializer.WritableSerialization");
  conf.setLong(MAX_DF, maxDF);
  conf.setLong(MIN_DF, minDF);
  // An alternate version of this snippet ships the dictionary via
  // DistributedCache.addCacheFile(dictionaryFilePath.toUri(), conf) instead.
  DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);

  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      Mapper.class, null, null, WordsPrunerReducer.class, Text.class, VectorWritable.class,
      SequenceFileOutputFormat.class, conf);
  job.setJobName(": Prune Vectors: input-folder: " + input
      + ", dictionary-file: " + dictionaryFilePath.toString());

  HadoopUtil.delete(conf, output);

  boolean succeeded = job.waitForCompletion(true);
  if (!succeeded) {
    throw new IllegalStateException("Job failed!");
  }
}
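// Hypothetical invocation (paths and document-frequency bounds are assumed
// values, not from the original source): prune terms whose document frequency
// falls outside [10, 1000], using a dictionary produced by an earlier pass.
public static void pruneExample(Configuration conf) throws Exception {
  pruneVectorsPartial(new Path("/data/tf-vectors"), new Path("/data/tf-vectors-pruned"),
      new Path("/data/dictionary.file-0"), 1000L, 10L, conf);
}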
@Override
public void createVectors(Path input, Path output, VectorizerConfig config)
    throws IOException, ClassNotFoundException, InterruptedException {
  // Do this for convenience of using prepareJob.
  Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class,
      EncodingMapper.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class,
      config.getConf());
  Configuration conf = job.getConfiguration();
  conf.set(EncodingMapper.USE_SEQUENTIAL, String.valueOf(config.isSequentialAccess()));
  conf.set(EncodingMapper.USE_NAMED_VECTORS, String.valueOf(config.isNamedVectors()));
  conf.set(EncodingMapper.ANALYZER_NAME, config.getAnalyzerClassName());
  conf.set(EncodingMapper.ENCODER_FIELD_NAME, config.getEncoderName());
  conf.set(EncodingMapper.ENCODER_CLASS, config.getEncoderClass());
  conf.set(EncodingMapper.CARDINALITY, String.valueOf(config.getCardinality()));
  job.setNumReduceTasks(0);
  boolean finished = job.waitForCompletion(true);

  log.info("result of run: {}", finished);
  if (!finished) {
    throw new IllegalStateException("Job failed!");
  }
}
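// Illustrative caller (assumed paths; not from the original source).
// Construction of VectorizerConfig is left to the caller since its API is not
// shown in this snippet; the job is map-only, so the encoded vectors land
// directly under the output path.
public void encodeExample(VectorizerConfig config) throws Exception {
  createVectors(new Path("/data/text-seqfiles"), new Path("/data/encoded-vectors"), config);
}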
public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
    Path matrixInputPath, Path outputVectorPathBase, Class<? extends TimesSquaredMapper> mapClass,
    Class<? extends VectorSummingReducer> redClass) throws IOException {

  FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
  matrixInputPath = fs.makeQualified(matrixInputPath);
  outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

  long now = System.nanoTime();
  Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

  SequenceFile.Writer inputVectorPathWriter = null;
  try {
    inputVectorPathWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath,
        NullWritable.class, VectorWritable.class);
    inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
  } finally {
    Closeables.close(inputVectorPathWriter, false);
  }

  URI ivpURI = inputVectorPath.toUri();
  DistributedCache.setCacheFiles(new URI[] {ivpURI}, initialConf);

  Job job = HadoopUtil.prepareJob(matrixInputPath, new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
      SequenceFileInputFormat.class, mapClass, NullWritable.class, VectorWritable.class, redClass,
      NullWritable.class, VectorWritable.class, SequenceFileOutputFormat.class, initialConf);
  job.setCombinerClass(redClass);
  job.setJobName("TimesSquaredJob: " + matrixInputPath);

  Configuration conf = job.getConfiguration();
  conf.set(INPUT_VECTOR, ivpURI.toString());
  conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
  conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

  return job;
}
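// Hypothetical driver (not from the original source): for a times-squared pass
// the output dimension is assumed to match the input vector's size, so v.size()
// is passed for outputVectorDim here.
public static void timesSquaredExample(Configuration conf, Vector v, Path matrixPath, Path tmpPath)
    throws Exception {
  Job job = createTimesSquaredJob(conf, v, v.size(), matrixPath, tmpPath,
      TimesSquaredMapper.class, VectorSummingReducer.class);
  if (!job.waitForCompletion(true)) {
    throw new IllegalStateException("TimesSquaredJob failed!");
  }
  // The result is written under tmpPath/OUTPUT_VECTOR_FILENAME as a
  // (NullWritable, VectorWritable) sequence file.
}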
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
    Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
    Class<? extends Writable> mapperValue, Class<? extends OutputFormat> outputFormat, String jobname)
    throws IOException {
  Job job = HadoopUtil.prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue,
      outputFormat, getConf());
  String name = jobname != null ? jobname
      : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);
  job.setJobName(name);
  return job;
}
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
    Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
    Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
    Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
    Class<? extends OutputFormat> outputFormat) throws IOException {
  Job job = HadoopUtil.prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue,
      reducer, reducerKey, reducerValue, outputFormat, getConf());
  job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class));
  return job;
}
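// Sketch of how a driver might use the two overloads above (the mapper and
// reducer class names and paths are hypothetical, not from the original
// source): the eight-argument form wires a map-only job, the ten-argument
// form a full map-reduce job; both still need waitForCompletion from the caller.
protected void prepareJobExample() throws Exception {
  Job mapOnly = prepareJob(new Path("/in"), new Path("/mid"), SequenceFileInputFormat.class,
      ExampleMapper.class, IntWritable.class, VectorWritable.class,
      SequenceFileOutputFormat.class, "example-map-only");
  Job mapReduce = prepareJob(new Path("/mid"), new Path("/out"), SequenceFileInputFormat.class,
      ExampleMapper.class, IntWritable.class, VectorWritable.class,
      ExampleReducer.class, IntWritable.class, VectorWritable.class,
      SequenceFileOutputFormat.class);
  mapOnly.waitForCompletion(true);
  mapReduce.waitForCompletion(true);
}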
"org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); Job job = HadoopUtil.prepareJob(input, output, SequenceFileInputFormat.class, StandardDeviationCalculatorMapper.class, IntWritable.class, DoubleWritable.class, StandardDeviationCalculatorReducer.class, IntWritable.class, DoubleWritable.class,