/**
 * Loads a previously persisted topic model from the given model paths.
 */
public TopicModel(Configuration conf, double eta, double alpha, String[] dictionary,
    int numThreads, double modelWeight, Path... modelpath) throws IOException {
  this(loadModel(conf, modelpath), eta, alpha, dictionary, numThreads, modelWeight);
}
@Override
public Double call() {
  // Train on this document, then score it against the read model.
  run();
  return readModel.perplexity(document, docTopics);
}
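For context, a caller can submit this Callable to an executor and aggregate the returned per-document perplexities. The sketch below is illustrative only: the TrainerRunnable constructor arguments, the documents/docTopicVectors collections, and the pool size are assumptions, not part of the snippet above.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

// Hypothetical driver: train each document on its own thread and sum the
// perplexities returned by call().
ExecutorService pool = Executors.newFixedThreadPool(4);
List<Future<Double>> results = new ArrayList<>();
for (int i = 0; i < documents.size(); i++) {
  results.add(pool.submit(new TrainerRunnable(readModel, documents.get(i), docTopicVectors.get(i))));
}
double totalPerplexity = 0;
for (Future<Double> f : results) {
  totalPerplexity += f.get(); // blocks until that document is trained and scored
}
pool.shutdown();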
/** Writes the current read model to outputPath, overwriting any existing output. */
public void persist(Path outputPath) throws IOException {
  readModel.persist(outputPath, true);
}
private void initializeModel() {
  TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha,
      RandomUtils.getRandom(), terms, numUpdatingThreads,
      initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight);
  topicModel.setConf(getConf());

  // If no initial corpus fraction is given, keep a separate write model (weight 1,
  // no random seeding); otherwise the seeded model is read from and written to in place.
  TopicModel updatedModel = initialModelCorpusFraction == 0
      ? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1)
      : topicModel;
  updatedModel.setConf(getConf());

  // Start every document with a uniform distribution over topics.
  docTopicCounts = new DenseMatrix(numDocuments, numTopics);
  docTopicCounts.assign(1.0 / numTopics);

  modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms);
}
public void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel) {
  // first calculate p(topic|term,document) for all terms in original, and all topics,
  // using p(term|topic) and p(topic|doc)
  pTopicGivenTerm(original, topics, docTopicModel);
  normalizeByTopic(docTopicModel);
  // now multiply, term-by-term, by the document, to get the weighted distribution of
  // term-topic pairs from this document.
  for (Element e : original.nonZeroes()) {
    for (int x = 0; x < numTopics; x++) {
      Vector docTopicModelRow = docTopicModel.viewRow(x);
      docTopicModelRow.setQuick(e.index(), docTopicModelRow.getQuick(e.index()) * e.get());
    }
  }
  // now recalculate \(p(topic|doc)\) by summing contributions from all of pTopicGivenTerm
  topics.assign(0.0);
  for (int x = 0; x < numTopics; x++) {
    topics.set(x, docTopicModel.viewRow(x).norm(1));
  }
  // now renormalize so that \(sum_x(p(x|doc))\) = 1
  topics.assign(Functions.mult(1 / topics.norm(1)));
}
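In use, this method is applied repeatedly to the same document so the p(topic|doc) estimate can converge. A minimal sketch of such a fixed-iteration loop follows; the maxIters bound and the uniform initialization here are assumptions for illustration, not taken from the method above.

// Per-document inference loop (a sketch; model, document and maxIters are assumed names).
Vector topics = new DenseVector(numTopics).assign(1.0 / numTopics); // start uniform
Matrix docTopicModel = new SparseRowMatrix(numTopics, document.size());
for (int i = 0; i < maxIters; i++) {
  model.trainDocTopicModel(document, topics, docTopicModel);
}
// 'topics' now holds the (approximately) converged p(topic|doc) distribution.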
public synchronized void reset() {
  // Clear the per-topic term counts; topic totals reset to 1 rather than 0.
  for (int x = 0; x < numTopics; x++) {
    topicTermCounts.assignRow(x, new SequentialAccessSparseVector(numTerms));
  }
  topicSums.assign(1.0);
  if (threadPool.isTerminated()) {
    initializeThreadPool();
  }
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  log.info("Retrieving configuration");
  Configuration conf = context.getConfiguration();
  float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
  float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
  long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
  numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
  int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
  int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
  int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
  maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
  float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);

  log.info("Initializing read model");
  Path[] modelPaths = CVB0Driver.getModelPaths(conf);
  if (modelPaths != null && modelPaths.length > 0) {
    readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
  } else {
    log.info("No model files found");
    readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
        numTrainThreads, modelWeight);
  }

  log.info("Initializing write model");
  writeModel = modelWeight == 1
      ? new TopicModel(numTopics, numTerms, eta, alpha, null, numUpdateThreads)
      : readModel;

  log.info("Initializing model trainer");
  modelTrainer = new ModelTrainer(readModel, writeModel, numTrainThreads, numTopics, numTerms);
  modelTrainer.start();
}
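The map() side that consumes this setup is not shown. A rough sketch of what it might look like appears below; the uniform doc-topic initialization and the exact train(...) call are assumptions about the unshown code, not a verbatim excerpt.

// Sketch of the corresponding map(): each incoming document is trained against
// the cached read model, updating the write model in the background threads.
@Override
public void map(IntWritable docId, VectorWritable document, Context context)
    throws IOException, InterruptedException {
  Vector topicVector = new DenseVector(numTopics).assign(1.0 / numTopics);
  modelTrainer.train(document.get(), topicVector, true, maxIters);
}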
public void start() {
  log.info("Starting training threadpool with {} threads", numTrainThreads);
  workQueue = new ArrayBlockingQueue<Runnable>(numTrainThreads * 10);
  threadPool = new ThreadPoolExecutor(numTrainThreads, numTrainThreads, 0, TimeUnit.SECONDS, workQueue);
  threadPool.allowCoreThreadTimeOut(false);
  threadPool.prestartAllCoreThreads();
  writeModel.reset();
}
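A typical lifecycle for the trainer bounds a single pass over the corpus with start() and a matching shutdown. The sketch below assumes a corpus of (document, docTopics) pairs and a stop() counterpart that drains the queue; those surrounding details are illustrative assumptions.

// Hypothetical training-pass driver (corpus and maxIters are assumed names):
modelTrainer.start();                      // spin up the pool, reset the write model
for (Pair<Vector, Vector> doc : corpus) {  // (document, docTopics) pairs
  modelTrainer.train(doc.getFirst(), doc.getSecond(), true, maxIters);
}
modelTrainer.stop();                       // drain pending work before the next pass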
/**
 * Creates a fresh model whose topic-term counts are randomly initialized.
 */
public TopicModel(int numTopics, int numTerms, double eta, double alpha, Random random,
    String[] dictionary, int numThreads, double modelWeight) {
  this(randomMatrix(numTopics, numTerms, random), eta, alpha, dictionary, numThreads, modelWeight);
}
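As a usage note, the two constructors above cover the two common cases: seeding a fresh model from a random matrix for a first pass, and reloading a persisted model between iterations. A minimal sketch, where the dictionary, the path, and all parameter values are illustrative assumptions:

// Fresh, randomly seeded model for a first training pass:
TopicModel fresh = new TopicModel(20, 50000, 0.1, 0.1, RandomUtils.getRandom(1234L),
    dictionary, 4, 1.0);

// Model reloaded from a previous iteration's output (assumed path):
TopicModel resumed = new TopicModel(conf, 0.1, 0.1, dictionary, 4, 1.0,
    new Path("/tmp/lda/model-1"));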