/**
 * Command-line entry point. Delegates argument parsing and job execution to
 * {@link CVB0Driver#run(String[])} via Hadoop's {@code ToolRunner}, which also
 * handles generic Hadoop options (-D, -conf, etc.).
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from ToolRunner / the driver run
 */
public static void main(String[] args) throws Exception {
  Configuration configuration = new Configuration();
  ToolRunner.run(configuration, new CVB0Driver(), args);
}
}
/**
 * Launches a single map/reduce job that scores the model at {@code modelPath}
 * against the (held-out fraction of the) corpus and returns the resulting
 * perplexity for the given iteration.
 *
 * @param conf       Hadoop configuration used for output cleanup
 * @param corpusPath input corpus to score against
 * @param modelPath  model directory for the iteration being evaluated
 * @param iteration  iteration number, used to locate the perplexity output dir
 * @return the perplexity value read back from the job's output
 * @throws InterruptedException if the map/reduce job does not complete successfully
 */
private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
  String name = "Calculating perplexity for " + modelPath;
  log.info("About to run: {}", name);
  Path perplexityOutput = perplexityPath(modelPath.getParent(), iteration);
  Job perplexityJob = prepareJob(corpusPath, perplexityOutput,
      CachingCVB0PerplexityMapper.class, DoubleWritable.class, DoubleWritable.class,
      DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
  perplexityJob.setJobName(name);
  // The reducer doubles as a combiner; a single reduce task produces one total.
  perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
  perplexityJob.setNumReduceTasks(1);
  setModelPaths(perplexityJob, modelPath);
  // Remove any stale output from a previous attempt before launching.
  HadoopUtil.delete(conf, perplexityOutput);
  boolean succeeded = perplexityJob.waitForCompletion(true);
  if (!succeeded) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
// Option setup for the CVB0 LDA driver: input/output paths, topic count,
// vocabulary size, doc/topic and term/topic smoothing, iteration controls,
// per-mapper thread counts, and the held-out fraction for perplexity testing.
// NOTE(review): this span is truncated mid-statement — the final
// addOption(NUM_REDUCE_TASKS, ...) call has no closing arguments here, and the
// remainder of run() is outside this view. Keep byte-identical; do not edit
// without the full original file.
@Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", String.valueOf(DEFAULT_CONVERGENCE_DELTA)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(NUM_TOPICS, "k", "Number of topics to learn", true); addOption(NUM_TERMS, "nt", "Vocabulary size", false); addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING)); addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING)); addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false); addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false); addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false); addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE)); addOption(RANDOM_SEED, "seed", "Random seed", false); addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", String.valueOf(DEFAULT_TEST_SET_FRACTION)); addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", String.valueOf(DEFAULT_NUM_TRAIN_THREADS)); addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", String.valueOf(DEFAULT_NUM_UPDATE_THREADS)); addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning", String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC)); addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
// NOTE(review): garbled fragment of the run(Configuration, ...) training loop —
// several statements are cut mid-expression (two log.info calls are missing
// their closing arguments, declarations dangle inside them), so this span is
// not syntactically valid as shown. Visible intent: determine the current
// iteration number to resume from, optionally backfill missing perplexity
// values (calculatePerplexity when readPerplexity returns NaN), run iterations
// via runIteration until rateOfChange(perplexities) drops below
// convergenceDelta, and record per-iteration perplexity. Keep byte-identical;
// repair only against the full original file.
setConf(conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); Path modelPath = modelPath(topicModelStateTempPath, i); double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { perplexity = calculatePerplexity(conf, inputPath, modelPath, i); double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta); Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null
// Test-harness fragment: allocates a per-topic-count temp dir for model state,
// then invokes the full CVB0Driver.run(...) overload on the sample corpus.
// NOTE(review): the positional arguments (1234 seed?, 0.2f test fraction?,
// thread counts, block size) cannot be confirmed from this view — verify
// against the run(...) signature. Locals declared here (topicModelStateTempPath,
// conf) are likely referenced later in the enclosing test method; do not rename.
Path topicModelStateTempPath = getTestTempDirPath("topicTemp" + numTestTopics); Configuration conf = getConfiguration(); CVB0Driver cvb0Driver = new CVB0Driver(); cvb0Driver.run(conf, sampleCorpusPath, null, numTestTopics, numTerms, ALPHA, ETA, numIterations, 1, 0, null, null, topicModelStateTempPath, 1234, 0.2f, 2, 1, 3, 1, false);
/**
 * Scans consecutive iteration outputs starting at iteration 2 and returns the
 * smallest perplexity recorded. Scanning stops at the first iteration for
 * which {@code CVB0Driver.readPerplexity} yields NaN (i.e. no value written).
 *
 * @param conf           Hadoop configuration used to read perplexity outputs
 * @param topicModelTemp temp directory holding per-iteration model state
 * @return the minimum perplexity seen, or Double.MAX_VALUE if none was found
 */
private static double lowestPerplexity(Configuration conf, Path topicModelTemp) throws IOException {
  double best = Double.MAX_VALUE;
  for (int iteration = 2; ; iteration++) {
    double value = CVB0Driver.readPerplexity(conf, topicModelTemp, iteration);
    if (Double.isNaN(value)) {
      break;
    }
    best = Math.min(best, value);
  }
  return best;
}
// NOTE(review): garbled fragment of the run(Configuration, ...) training loop —
// several statements are cut mid-expression (two log.info calls are missing
// their closing arguments, declarations dangle inside them), so this span is
// not syntactically valid as shown. Visible intent: determine the current
// iteration number to resume from, optionally backfill missing perplexity
// values (calculatePerplexity when readPerplexity returns NaN), run iterations
// via runIteration until rateOfChange(perplexities) drops below
// convergenceDelta, and record per-iteration perplexity. Keep byte-identical;
// repair only against the full original file.
setConf(conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); Path modelPath = modelPath(topicModelStateTempPath, i); double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { perplexity = calculatePerplexity(conf, inputPath, modelPath, i); double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta); Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null
// Option setup for the CVB0 LDA driver: input/output paths, topic count,
// vocabulary size, doc/topic and term/topic smoothing, iteration controls,
// per-mapper thread counts, and the held-out fraction for perplexity testing.
// NOTE(review): this span is truncated mid-statement — the final
// addOption(NUM_REDUCE_TASKS, ...) call has no closing arguments here, and the
// remainder of run() is outside this view. Keep byte-identical; do not edit
// without the full original file.
@Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", String.valueOf(DEFAULT_CONVERGENCE_DELTA)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(NUM_TOPICS, "k", "Number of topics to learn", true); addOption(NUM_TERMS, "nt", "Vocabulary size", false); addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING)); addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING)); addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false); addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false); addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false); addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE)); addOption(RANDOM_SEED, "seed", "Random seed", false); addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", String.valueOf(DEFAULT_TEST_SET_FRACTION)); addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", String.valueOf(DEFAULT_NUM_TRAIN_THREADS)); addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", String.valueOf(DEFAULT_NUM_UPDATE_THREADS)); addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning", String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC)); addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
// NOTE(review): garbled fragment of the run(Configuration, ...) training loop —
// several statements are cut mid-expression (two log.info calls are missing
// their closing arguments, declarations dangle inside them), so this span is
// not syntactically valid as shown. Visible intent: determine the current
// iteration number to resume from, optionally backfill missing perplexity
// values (calculatePerplexity when readPerplexity returns NaN), run iterations
// via runIteration until rateOfChange(perplexities) drops below
// convergenceDelta, and record per-iteration perplexity. Keep byte-identical;
// repair only against the full original file.
setConf(conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); Path modelPath = modelPath(topicModelStateTempPath, i); double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { perplexity = calculatePerplexity(conf, inputPath, modelPath, i); double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta); Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null
/**
 * Launches a single map/reduce job that scores the model at {@code modelPath}
 * against the (held-out fraction of the) corpus and returns the resulting
 * perplexity for the given iteration.
 *
 * @param conf       Hadoop configuration used for output cleanup
 * @param corpusPath input corpus to score against
 * @param modelPath  model directory for the iteration being evaluated
 * @param iteration  iteration number, used to locate the perplexity output dir
 * @return the perplexity value read back from the job's output
 * @throws InterruptedException if the map/reduce job does not complete successfully
 */
private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
  String name = "Calculating perplexity for " + modelPath;
  log.info("About to run: {}", name);
  Path perplexityOutput = perplexityPath(modelPath.getParent(), iteration);
  Job perplexityJob = prepareJob(corpusPath, perplexityOutput,
      CachingCVB0PerplexityMapper.class, DoubleWritable.class, DoubleWritable.class,
      DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
  perplexityJob.setJobName(name);
  // The reducer doubles as a combiner; a single reduce task produces one total.
  perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
  perplexityJob.setNumReduceTasks(1);
  setModelPaths(perplexityJob, modelPath);
  // Remove any stale output from a previous attempt before launching.
  HadoopUtil.delete(conf, perplexityOutput);
  boolean succeeded = perplexityJob.waitForCompletion(true);
  if (!succeeded) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
/**
 * Command-line entry point. Delegates argument parsing and job execution to
 * {@link CVB0Driver#run(String[])} via Hadoop's {@code ToolRunner}, which also
 * handles generic Hadoop options (-D, -conf, etc.).
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from ToolRunner / the driver run
 */
public static void main(String[] args) throws Exception {
  Configuration configuration = new Configuration();
  ToolRunner.run(configuration, new CVB0Driver(), args);
}
}
// Option setup for the CVB0 LDA driver: input/output paths, topic count,
// vocabulary size, doc/topic and term/topic smoothing, iteration controls,
// per-mapper thread counts, and the held-out fraction for perplexity testing.
// NOTE(review): this span is truncated mid-statement — the final
// addOption(NUM_REDUCE_TASKS, ...) call has no closing arguments here, and the
// remainder of run() is outside this view. Keep byte-identical; do not edit
// without the full original file.
@Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", String.valueOf(DEFAULT_CONVERGENCE_DELTA)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(NUM_TOPICS, "k", "Number of topics to learn", true); addOption(NUM_TERMS, "nt", "Vocabulary size", false); addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING)); addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING)); addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false); addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false); addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false); addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE)); addOption(RANDOM_SEED, "seed", "Random seed", false); addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", String.valueOf(DEFAULT_TEST_SET_FRACTION)); addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", String.valueOf(DEFAULT_NUM_TRAIN_THREADS)); addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", String.valueOf(DEFAULT_NUM_UPDATE_THREADS)); addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning", String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC)); addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
/**
 * Launches a single map/reduce job that scores the model at {@code modelPath}
 * against the (held-out fraction of the) corpus and returns the resulting
 * perplexity for the given iteration.
 *
 * @param conf       Hadoop configuration used for output cleanup
 * @param corpusPath input corpus to score against
 * @param modelPath  model directory for the iteration being evaluated
 * @param iteration  iteration number, used to locate the perplexity output dir
 * @return the perplexity value read back from the job's output
 * @throws InterruptedException if the map/reduce job does not complete successfully
 */
private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
  String name = "Calculating perplexity for " + modelPath;
  log.info("About to run: {}", name);
  Path perplexityOutput = perplexityPath(modelPath.getParent(), iteration);
  Job perplexityJob = prepareJob(corpusPath, perplexityOutput,
      CachingCVB0PerplexityMapper.class, DoubleWritable.class, DoubleWritable.class,
      DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
  perplexityJob.setJobName(name);
  // The reducer doubles as a combiner; a single reduce task produces one total.
  perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
  perplexityJob.setNumReduceTasks(1);
  setModelPaths(perplexityJob, modelPath);
  // Remove any stale output from a previous attempt before launching.
  HadoopUtil.delete(conf, perplexityOutput);
  boolean succeeded = perplexityJob.waitForCompletion(true);
  if (!succeeded) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
/**
 * Command-line entry point. Delegates argument parsing and job execution to
 * {@link CVB0Driver#run(String[])} via Hadoop's {@code ToolRunner}, which also
 * handles generic Hadoop options (-D, -conf, etc.).
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from ToolRunner / the driver run
 */
public static void main(String[] args) throws Exception {
  Configuration configuration = new Configuration();
  ToolRunner.run(configuration, new CVB0Driver(), args);
}
}