/**
 * Loads a previously persisted topic model from the given model paths.
 */
public TopicModel(Configuration conf, double eta, double alpha, String[] dictionary,
    int numThreads, double modelWeight, Path... modelpath) throws IOException {
  this(loadModel(conf, modelpath), eta, alpha, dictionary, numThreads, modelWeight);
}
@Override
public Double call() {
  // Train on this document, then score it against the read model.
  run();
  return readModel.perplexity(document, docTopics);
}
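For context, a caller can submit this Callable to an executor and aggregate the returned per-document perplexities. The sketch below is illustrative only: the TrainerRunnable constructor arguments, the documents/docTopicVectors collections, and the pool size are assumptions, not part of the snippet above.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

// Hypothetical driver: train each document on its own thread and sum the
// perplexities returned by call().
ExecutorService pool = Executors.newFixedThreadPool(4);
List<Future<Double>> results = new ArrayList<>();
for (int i = 0; i < documents.size(); i++) {
  results.add(pool.submit(new TrainerRunnable(readModel, documents.get(i), docTopicVectors.get(i))));
}
double totalPerplexity = 0;
for (Future<Double> f : results) {
  totalPerplexity += f.get(); // blocks until that document is trained and scored
}
pool.shutdown();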
/** Writes the current read model to outputPath, overwriting any existing output. */
public void persist(Path outputPath) throws IOException {
  readModel.persist(outputPath, true);
}
private void initializeModel() {
  TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha,
      RandomUtils.getRandom(), terms, numUpdatingThreads,
      initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight);
  topicModel.setConf(getConf());

  // If no initial corpus fraction is given, keep a separate write model (weight 1,
  // no random seeding); otherwise the seeded model is read from and written to in place.
  TopicModel updatedModel = initialModelCorpusFraction == 0
      ? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1)
      : topicModel;
  updatedModel.setConf(getConf());

  // Start every document with a uniform distribution over topics.
  docTopicCounts = new DenseMatrix(numDocuments, numTopics);
  docTopicCounts.assign(1.0 / numTopics);

  modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms);
}
public void trainDocTopicModel(Vector original, Vector topics, Matrix docTopicModel) {
  // first calculate p(topic|term,document) for all terms in original, and all topics,
  // using p(term|topic) and p(topic|doc)
  pTopicGivenTerm(original, topics, docTopicModel);
  normalizeByTopic(docTopicModel);
  // now multiply, term-by-term, by the document, to get the weighted distribution of
  // term-topic pairs from this document.
  for (Element e : original.nonZeroes()) {
    for (int x = 0; x < numTopics; x++) {
      Vector docTopicModelRow = docTopicModel.viewRow(x);
      docTopicModelRow.setQuick(e.index(), docTopicModelRow.getQuick(e.index()) * e.get());
    }
  }
  // now recalculate \(p(topic|doc)\) by summing contributions from all of pTopicGivenTerm
  topics.assign(0.0);
  for (int x = 0; x < numTopics; x++) {
    topics.set(x, docTopicModel.viewRow(x).norm(1));
  }
  // now renormalize so that \(sum_x(p(x|doc))\) = 1
  topics.assign(Functions.mult(1 / topics.norm(1)));
}
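In use, this method is applied repeatedly to the same document so the p(topic|doc) estimate can converge. A minimal sketch of such a fixed-iteration loop follows; the maxIters bound and the uniform initialization here are assumptions for illustration, not taken from the method above.

// Per-document inference loop (a sketch; model, document and maxIters are assumed names).
Vector topics = new DenseVector(numTopics).assign(1.0 / numTopics); // start uniform
Matrix docTopicModel = new SparseRowMatrix(numTopics, document.size());
for (int i = 0; i < maxIters; i++) {
  model.trainDocTopicModel(document, topics, docTopicModel);
}
// 'topics' now holds the (approximately) converged p(topic|doc) distribution.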
public synchronized void reset() {
  // Clear the per-topic term counts; topic totals reset to 1 rather than 0.
  for (int x = 0; x < numTopics; x++) {
    topicTermCounts.assignRow(x, new SequentialAccessSparseVector(numTerms));
  }
  topicSums.assign(1.0);
  if (threadPool.isTerminated()) {
    initializeThreadPool();
  }
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  log.info("Retrieving configuration");
  Configuration conf = context.getConfiguration();
  float eta = conf.getFloat(CVB0Driver.TERM_TOPIC_SMOOTHING, Float.NaN);
  float alpha = conf.getFloat(CVB0Driver.DOC_TOPIC_SMOOTHING, Float.NaN);
  long seed = conf.getLong(CVB0Driver.RANDOM_SEED, 1234L);
  numTopics = conf.getInt(CVB0Driver.NUM_TOPICS, -1);
  int numTerms = conf.getInt(CVB0Driver.NUM_TERMS, -1);
  int numUpdateThreads = conf.getInt(CVB0Driver.NUM_UPDATE_THREADS, 1);
  int numTrainThreads = conf.getInt(CVB0Driver.NUM_TRAIN_THREADS, 4);
  maxIters = conf.getInt(CVB0Driver.MAX_ITERATIONS_PER_DOC, 10);
  float modelWeight = conf.getFloat(CVB0Driver.MODEL_WEIGHT, 1.0f);

  log.info("Initializing read model");
  Path[] modelPaths = CVB0Driver.getModelPaths(conf);
  if (modelPaths != null && modelPaths.length > 0) {
    readModel = new TopicModel(conf, eta, alpha, null, numUpdateThreads, modelWeight, modelPaths);
  } else {
    log.info("No model files found");
    readModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(seed), null,
        numTrainThreads, modelWeight);
  }

  log.info("Initializing write model");
  writeModel = modelWeight == 1
      ? new TopicModel(numTopics, numTerms, eta, alpha, null, numUpdateThreads)
      : readModel;

  log.info("Initializing model trainer");
  modelTrainer = new ModelTrainer(readModel, writeModel, numTrainThreads, numTopics, numTerms);
  modelTrainer.start();
}
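The map() side that consumes this setup is not shown. A rough sketch of what it might look like appears below; the uniform doc-topic initialization and the exact train(...) call are assumptions about the unshown code, not a verbatim excerpt.

// Sketch of the corresponding map(): each incoming document is trained against
// the cached read model, updating the write model in the background threads.
@Override
public void map(IntWritable docId, VectorWritable document, Context context)
    throws IOException, InterruptedException {
  Vector topicVector = new DenseVector(numTopics).assign(1.0 / numTopics);
  modelTrainer.train(document.get(), topicVector, true, maxIters);
}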
public void start() {
  log.info("Starting training threadpool with {} threads", numTrainThreads);
  workQueue = new ArrayBlockingQueue<Runnable>(numTrainThreads * 10);
  threadPool = new ThreadPoolExecutor(numTrainThreads, numTrainThreads, 0, TimeUnit.SECONDS, workQueue);
  threadPool.allowCoreThreadTimeOut(false);
  threadPool.prestartAllCoreThreads();
  writeModel.reset();
}
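A typical lifecycle for the trainer bounds a single pass over the corpus with start() and a matching shutdown. The sketch below assumes a corpus of (document, docTopics) pairs and a stop() counterpart that drains the queue; those surrounding details are illustrative assumptions.

// Hypothetical training-pass driver (corpus and maxIters are assumed names):
modelTrainer.start();                      // spin up the pool, reset the write model
for (Pair<Vector, Vector> doc : corpus) {  // (document, docTopics) pairs
  modelTrainer.train(doc.getFirst(), doc.getSecond(), true, maxIters);
}
modelTrainer.stop();                       // drain pending work before the next pass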
/**
 * Creates a fresh model whose topic-term counts are randomly initialized.
 */
public TopicModel(int numTopics, int numTerms, double eta, double alpha, Random random,
    String[] dictionary, int numThreads, double modelWeight) {
  this(randomMatrix(numTopics, numTerms, random), eta, alpha, dictionary, numThreads, modelWeight);
}
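As a usage note, the two constructors above cover the two common cases: seeding a fresh model from a random matrix for a first pass, and reloading a persisted model between iterations. A minimal sketch, where the dictionary, the path, and all parameter values are illustrative assumptions:

// Fresh, randomly seeded model for a first training pass:
TopicModel fresh = new TopicModel(20, 50000, 0.1, 0.1, RandomUtils.getRandom(1234L),
    dictionary, 4, 1.0);

// Model reloaded from a previous iteration's output (assumed path):
TopicModel resumed = new TopicModel(conf, 0.1, 0.1, dictionary, 4, 1.0,
    new Path("/tmp/lda/model-1"));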