public static void main(String[] args) throws Exception {
  ToolRunner.run(new InMemoryCollapsedVariationalBayes0(), args);
}
public double iterateUntilConvergence(double minFractionalErrorChange, int maxIterations, int minIter) {
  // Convenience overload: delegate with testFraction = 0, i.e. train on the full corpus.
  return iterateUntilConvergence(minFractionalErrorChange, maxIterations, minIter, 0);
}
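// Hedged usage sketch, not from the original source: assuming a constructed
// instance `cvb`, the three-argument overload above trains on the full corpus
// and returns the final perplexity estimate. The threshold, iteration cap,
// and burn-in values below are illustrative only.
double perplexity = cvb.iterateUntilConvergence(1e-4, 50, 10);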
conf.set("fs.default.name", dfsNameNode); String[] terms = loadDictionary(dictDirString, conf); logTime("dictionary loading", System.nanoTime() - start); start = System.nanoTime(); Matrix corpus = loadVectors(inputDirString, conf); logTime("vector seqfile corpus loading", System.nanoTime() - start); start = System.nanoTime(); InMemoryCollapsedVariationalBayes0 cvb0 = new InMemoryCollapsedVariationalBayes0(corpus, terms, numTopics, alpha, eta, numTrainThreads, numUpdateThreads, modelCorpusFraction); logTime("cvb0 init", System.nanoTime() - start); cvb0.setVerbose(verbose); cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations); logTime("total training time", System.nanoTime() - start); cvb0.writeModel(new Path(topicOutFile)); DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts); logTime("printTopics", System.nanoTime() - start); } catch (OptionException e) { log.error("Error while parsing options", e);
for (int trial = 0; trial < numTrials; trial++) {
  InMemoryCollapsedVariationalBayes0 cvb = new InMemoryCollapsedVariationalBayes0(sampledCorpus,
      terms, numTestTopics, ALPHA, ETA, 2, 1, 0);
  cvb.setVerbose(true);
  // Hold out 20% of documents and record the resulting test perplexity.
  perps[trial] = cvb.iterateUntilConvergence(0, 5, 0, 0.2);
  System.out.println(perps[trial]);
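// Illustrative follow-up, not in the original source: with `perps` holding one
// held-out perplexity per trial, reporting the mean makes runs comparable
// across topic counts and hyperparameter settings.
double sum = 0;
for (double p : perps) {
  sum += p;
}
System.out.println("mean perplexity over " + perps.length + " trials: " + sum / perps.length);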
public InMemoryCollapsedVariationalBayes0(Matrix corpus, String[] terms, int numTopics,
    double alpha, double eta, int numTrainingThreads, int numUpdatingThreads,
    double modelCorpusFraction) {
  //this.seed = seed;
  this.numTopics = numTopics;
  this.alpha = alpha;
  this.eta = eta;
  //this.minDfCt = 0;
  //this.maxDfPct = 1.0f;
  corpusWeights = corpus;
  numDocuments = corpus.numRows();
  this.terms = terms;
  this.initialModelCorpusFraction = modelCorpusFraction;
  // Fall back to the corpus width when no dictionary is supplied.
  numTerms = terms != null ? terms.length : corpus.numCols();
  // Map each term to its column index (built here but not used further in this constructor).
  Map<String, Integer> termIdMap = new HashMap<>();
  if (terms != null) {
    for (int t = 0; t < terms.length; t++) {
      termIdMap.put(terms[t], t);
    }
  }
  this.numTrainingThreads = numTrainingThreads;
  this.numUpdatingThreads = numUpdatingThreads;
  postInitCorpus();
  initializeModel();
}
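// A minimal construction sketch, not from the original source, assuming
// org.apache.mahout.math's in-memory matrix types; the corpus, dictionary,
// and hyperparameter values are toy choices for illustration.
Matrix toyCorpus = new SparseRowMatrix(2, 4);
toyCorpus.set(0, 0, 3);  // doc 0: "apple" x3, "banana" x1
toyCorpus.set(0, 1, 1);
toyCorpus.set(1, 2, 2);  // doc 1: "carrot" x2, "date" x4
toyCorpus.set(1, 3, 4);
String[] dictionary = {"apple", "banana", "carrot", "date"};
InMemoryCollapsedVariationalBayes0 cvb =
    new InMemoryCollapsedVariationalBayes0(toyCorpus, dictionary, 2, 0.1, 0.1, 1, 1, 0);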
public void trainDocuments(double testFraction) {
  long start = System.nanoTime();
  modelTrainer.start();
  for (int docId = 0; docId < corpusWeights.numRows(); docId++) {
    // Skip every (1/testFraction)-th document so it can serve as held-out test data.
    if (testFraction == 0 || docId % (1 / testFraction) != 0) {
      Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics); // docTopicCounts.getRow(docId)
      modelTrainer.trainSync(corpusWeights.viewRow(docId), docTopics, true, 10);
    }
  }
  modelTrainer.stop();
  logTime("train documents", System.nanoTime() - start);
}
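// Illustrative check of the hold-out arithmetic above, not original source:
// with testFraction = 0.2, 1 / testFraction is 5.0, so every fifth document
// (docId % 5.0 == 0, i.e. ids 0, 5, 10, ...) is excluded from training.
double testFraction = 0.2;
for (int docId = 0; docId < 10; docId++) {
  boolean trained = testFraction == 0 || docId % (1 / testFraction) != 0;
  System.out.println(docId + (trained ? " -> trained" : " -> held out"));
}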
private void initializeModel() {
  TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(),
      terms, numUpdatingThreads,
      initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight);
  topicModel.setConf(getConf());
  // With a zero model-corpus fraction, accumulate updates in a separate model
  // (batch-style); otherwise read from and write to the same model in place.
  TopicModel updatedModel = initialModelCorpusFraction == 0
      ? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1)
      : topicModel;
  updatedModel.setConf(getConf());
  docTopicCounts = new DenseMatrix(numDocuments, numTopics);
  docTopicCounts.assign(1.0 / numTopics);  // start every document with a uniform topic mix
  modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms);
}
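// A hedged inspection sketch, not from the original source: docTopicCounts is
// sized one row per document and initialized uniform above. Note that
// trainDocuments passes a fresh uniform vector per document rather than these
// rows (see its commented-out getRow call), so the rows reflect training only
// on code paths that update them.
for (int docId = 0; docId < Math.min(5, docTopicCounts.numRows()); docId++) {
  System.out.println("doc " + docId + " topic mix: " + docTopicCounts.viewRow(docId));
}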
conf.set("fs.default.name", dfsNameNode); String[] terms = loadDictionary(dictDirString, conf); logTime("dictionary loading", System.nanoTime() - start); start = System.nanoTime(); Matrix corpus = loadVectors(inputDirString, conf); logTime("vector seqfile corpus loading", System.nanoTime() - start); start = System.nanoTime(); InMemoryCollapsedVariationalBayes0 cvb0 = new InMemoryCollapsedVariationalBayes0(corpus, terms, numTopics, alpha, eta, numTrainThreads, numUpdateThreads, modelCorpusFraction); logTime("cvb0 init", System.nanoTime() - start); cvb0.setVerbose(verbose); cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations); logTime("total training time", System.nanoTime() - start); cvb0.writeModel(new Path(topicOutFile)); DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts); logTime("printTopics", System.nanoTime() - start); } catch (OptionException e) { log.error("Error while parsing options", e);
public InMemoryCollapsedVariationalBayes0(Matrix corpus, String[] terms, int numTopics, double alpha, double eta, int numTrainingThreads, int numUpdatingThreads, double modelCorpusFraction) { //this.seed = seed; this.numTopics = numTopics; this.alpha = alpha; this.eta = eta; //this.minDfCt = 0; //this.maxDfPct = 1.0f; corpusWeights = corpus; numDocuments = corpus.numRows(); this.terms = terms; this.initialModelCorpusFraction = modelCorpusFraction; numTerms = terms != null ? terms.length : corpus.numCols(); Map<String, Integer> termIdMap = Maps.newHashMap(); if (terms != null) { for (int t = 0; t < terms.length; t++) { termIdMap.put(terms[t], t); } } this.numTrainingThreads = numTrainingThreads; this.numUpdatingThreads = numUpdatingThreads; postInitCorpus(); initializeModel(); }
public void trainDocuments(double testFraction) { long start = System.nanoTime(); modelTrainer.start(); for (int docId = 0; docId < corpusWeights.numRows(); docId++) { if (testFraction == 0 || docId % (1 / testFraction) != 0) { Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics); // docTopicCounts.getRow(docId) modelTrainer.trainSync(corpusWeights.viewRow(docId), docTopics , true, 10); } } modelTrainer.stop(); logTime("train documents", System.nanoTime() - start); }
private void initializeModel() { TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(), terms, numUpdatingThreads, initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight); topicModel.setConf(getConf()); TopicModel updatedModel = initialModelCorpusFraction == 0 ? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1) : topicModel; updatedModel.setConf(getConf()); docTopicCounts = new DenseMatrix(numDocuments, numTopics); docTopicCounts.assign(1.0 / numTopics); modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms); }
conf.set("fs.default.name", dfsNameNode); String[] terms = loadDictionary(dictDirString, conf); logTime("dictionary loading", System.nanoTime() - start); start = System.nanoTime(); Matrix corpus = loadVectors(inputDirString, conf); logTime("vector seqfile corpus loading", System.nanoTime() - start); start = System.nanoTime(); InMemoryCollapsedVariationalBayes0 cvb0 = new InMemoryCollapsedVariationalBayes0(corpus, terms, numTopics, alpha, eta, numTrainThreads, numUpdateThreads, modelCorpusFraction); logTime("cvb0 init", System.nanoTime() - start); cvb0.setVerbose(verbose); cvb0.iterateUntilConvergence(minFractionalErrorChange, maxIterations, burnInIterations); logTime("total training time", System.nanoTime() - start); cvb0.writeModel(new Path(topicOutFile)); DistributedRowMatrixWriter.write(new Path(docOutFile), conf, cvb0.docTopicCounts); logTime("printTopics", System.nanoTime() - start); } catch (OptionException e) { log.error("Error while parsing options", e);
public InMemoryCollapsedVariationalBayes0(Matrix corpus, String[] terms, int numTopics, double alpha, double eta, int numTrainingThreads, int numUpdatingThreads, double modelCorpusFraction) { //this.seed = seed; this.numTopics = numTopics; this.alpha = alpha; this.eta = eta; //this.minDfCt = 0; //this.maxDfPct = 1.0f; corpusWeights = corpus; numDocuments = corpus.numRows(); this.terms = terms; this.initialModelCorpusFraction = modelCorpusFraction; numTerms = terms != null ? terms.length : corpus.numCols(); Map<String, Integer> termIdMap = new HashMap<>(); if (terms != null) { for (int t = 0; t < terms.length; t++) { termIdMap.put(terms[t], t); } } this.numTrainingThreads = numTrainingThreads; this.numUpdatingThreads = numUpdatingThreads; postInitCorpus(); initializeModel(); }
public static void main(String[] args) throws Exception { ToolRunner.run(new InMemoryCollapsedVariationalBayes0(), args); } }
public double iterateUntilConvergence(double minFractionalErrorChange, int maxIterations, int minIter) { return iterateUntilConvergence(minFractionalErrorChange, maxIterations, minIter, 0); }
public void trainDocuments(double testFraction) { long start = System.nanoTime(); modelTrainer.start(); for (int docId = 0; docId < corpusWeights.numRows(); docId++) { if (testFraction == 0 || docId % (1 / testFraction) != 0) { Vector docTopics = new DenseVector(numTopics).assign(1.0 / numTopics); // docTopicCounts.getRow(docId) modelTrainer.trainSync(corpusWeights.viewRow(docId), docTopics , true, 10); } } modelTrainer.stop(); logTime("train documents", System.nanoTime() - start); }
private void initializeModel() { TopicModel topicModel = new TopicModel(numTopics, numTerms, eta, alpha, RandomUtils.getRandom(), terms, numUpdatingThreads, initialModelCorpusFraction == 0 ? 1 : initialModelCorpusFraction * totalCorpusWeight); topicModel.setConf(getConf()); TopicModel updatedModel = initialModelCorpusFraction == 0 ? new TopicModel(numTopics, numTerms, eta, alpha, null, terms, numUpdatingThreads, 1) : topicModel; updatedModel.setConf(getConf()); docTopicCounts = new DenseMatrix(numDocuments, numTopics); docTopicCounts.assign(1.0 / numTopics); modelTrainer = new ModelTrainer(topicModel, updatedModel, numTrainingThreads, numTopics, numTerms); }