/**
 * Command-line entry point. Delegates argument parsing and job execution to
 * {@link CVB0Driver#run(String[])} via Hadoop's {@code ToolRunner}, which also
 * handles generic Hadoop options (-D, -conf, etc.).
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from ToolRunner / the driver run
 */
public static void main(String[] args) throws Exception {
  Configuration configuration = new Configuration();
  ToolRunner.run(configuration, new CVB0Driver(), args);
}
}
/**
 * Launches a single map/reduce job that scores the model at {@code modelPath}
 * against the (held-out fraction of the) corpus and returns the resulting
 * perplexity for the given iteration.
 *
 * @param conf       Hadoop configuration used for output cleanup
 * @param corpusPath input corpus to score against
 * @param modelPath  model directory for the iteration being evaluated
 * @param iteration  iteration number, used to locate the perplexity output dir
 * @return the perplexity value read back from the job's output
 * @throws InterruptedException if the map/reduce job does not complete successfully
 */
private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
  String name = "Calculating perplexity for " + modelPath;
  log.info("About to run: {}", name);
  Path perplexityOutput = perplexityPath(modelPath.getParent(), iteration);
  Job perplexityJob = prepareJob(corpusPath, perplexityOutput,
      CachingCVB0PerplexityMapper.class, DoubleWritable.class, DoubleWritable.class,
      DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
  perplexityJob.setJobName(name);
  // The reducer doubles as a combiner; a single reduce task produces one total.
  perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
  perplexityJob.setNumReduceTasks(1);
  setModelPaths(perplexityJob, modelPath);
  // Remove any stale output from a previous attempt before launching.
  HadoopUtil.delete(conf, perplexityOutput);
  boolean succeeded = perplexityJob.waitForCompletion(true);
  if (!succeeded) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
// Option setup for the CVB0 LDA driver: input/output paths, topic count,
// vocabulary size, doc/topic and term/topic smoothing, iteration controls,
// per-mapper thread counts, and the held-out fraction for perplexity testing.
// NOTE(review): this span is truncated mid-statement — the final
// addOption(NUM_REDUCE_TASKS, ...) call has no closing arguments here, and the
// remainder of run() is outside this view. Keep byte-identical; do not edit
// without the full original file.
@Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", String.valueOf(DEFAULT_CONVERGENCE_DELTA)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(NUM_TOPICS, "k", "Number of topics to learn", true); addOption(NUM_TERMS, "nt", "Vocabulary size", false); addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING)); addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING)); addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false); addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false); addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false); addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE)); addOption(RANDOM_SEED, "seed", "Random seed", false); addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", String.valueOf(DEFAULT_TEST_SET_FRACTION)); addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", String.valueOf(DEFAULT_NUM_TRAIN_THREADS)); addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", String.valueOf(DEFAULT_NUM_UPDATE_THREADS)); addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning", String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC)); addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
// NOTE(review): garbled fragment of the run(Configuration, ...) training loop —
// several statements are cut mid-expression (two log.info calls are missing
// their closing arguments, declarations dangle inside them), so this span is
// not syntactically valid as shown. Visible intent: determine the current
// iteration number to resume from, optionally backfill missing perplexity
// values (calculatePerplexity when readPerplexity returns NaN), run iterations
// via runIteration until rateOfChange(perplexities) drops below
// convergenceDelta, and record per-iteration perplexity. Keep byte-identical;
// repair only against the full original file.
setConf(conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); Path modelPath = modelPath(topicModelStateTempPath, i); double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { perplexity = calculatePerplexity(conf, inputPath, modelPath, i); double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta); Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null
// Test-harness fragment: allocates a per-topic-count temp dir for model state,
// then invokes the full CVB0Driver.run(...) overload on the sample corpus.
// NOTE(review): the positional arguments (1234 seed?, 0.2f test fraction?,
// thread counts, block size) cannot be confirmed from this view — verify
// against the run(...) signature. Locals declared here (topicModelStateTempPath,
// conf) are likely referenced later in the enclosing test method; do not rename.
Path topicModelStateTempPath = getTestTempDirPath("topicTemp" + numTestTopics); Configuration conf = getConfiguration(); CVB0Driver cvb0Driver = new CVB0Driver(); cvb0Driver.run(conf, sampleCorpusPath, null, numTestTopics, numTerms, ALPHA, ETA, numIterations, 1, 0, null, null, topicModelStateTempPath, 1234, 0.2f, 2, 1, 3, 1, false);
/**
 * Scans consecutive iteration outputs starting at iteration 2 and returns the
 * smallest perplexity recorded. Scanning stops at the first iteration for
 * which {@code CVB0Driver.readPerplexity} yields NaN (i.e. no value written).
 *
 * @param conf           Hadoop configuration used to read perplexity outputs
 * @param topicModelTemp temp directory holding per-iteration model state
 * @return the minimum perplexity seen, or Double.MAX_VALUE if none was found
 */
private static double lowestPerplexity(Configuration conf, Path topicModelTemp) throws IOException {
  double best = Double.MAX_VALUE;
  for (int iteration = 2; ; iteration++) {
    double value = CVB0Driver.readPerplexity(conf, topicModelTemp, iteration);
    if (Double.isNaN(value)) {
      break;
    }
    best = Math.min(best, value);
  }
  return best;
}
// NOTE(review): garbled fragment of the run(Configuration, ...) training loop —
// several statements are cut mid-expression (two log.info calls are missing
// their closing arguments, declarations dangle inside them), so this span is
// not syntactically valid as shown. Visible intent: determine the current
// iteration number to resume from, optionally backfill missing perplexity
// values (calculatePerplexity when readPerplexity returns NaN), run iterations
// via runIteration until rateOfChange(perplexities) drops below
// convergenceDelta, and record per-iteration perplexity. Keep byte-identical;
// repair only against the full original file.
setConf(conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); Path modelPath = modelPath(topicModelStateTempPath, i); double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { perplexity = calculatePerplexity(conf, inputPath, modelPath, i); double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta); Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null
// Option setup for the CVB0 LDA driver: input/output paths, topic count,
// vocabulary size, doc/topic and term/topic smoothing, iteration controls,
// per-mapper thread counts, and the held-out fraction for perplexity testing.
// NOTE(review): this span is truncated mid-statement — the final
// addOption(NUM_REDUCE_TASKS, ...) call has no closing arguments here, and the
// remainder of run() is outside this view. Keep byte-identical; do not edit
// without the full original file.
@Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", String.valueOf(DEFAULT_CONVERGENCE_DELTA)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(NUM_TOPICS, "k", "Number of topics to learn", true); addOption(NUM_TERMS, "nt", "Vocabulary size", false); addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING)); addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING)); addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false); addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false); addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false); addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE)); addOption(RANDOM_SEED, "seed", "Random seed", false); addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", String.valueOf(DEFAULT_TEST_SET_FRACTION)); addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", String.valueOf(DEFAULT_NUM_TRAIN_THREADS)); addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", String.valueOf(DEFAULT_NUM_UPDATE_THREADS)); addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning", String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC)); addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
// NOTE(review): garbled fragment of the run(Configuration, ...) training loop —
// several statements are cut mid-expression (two log.info calls are missing
// their closing arguments, declarations dangle inside them), so this span is
// not syntactically valid as shown. Visible intent: determine the current
// iteration number to resume from, optionally backfill missing perplexity
// values (calculatePerplexity when readPerplexity returns NaN), run iterations
// via runIteration until rateOfChange(perplexities) drops below
// convergenceDelta, and record per-iteration perplexity. Keep byte-identical;
// repair only against the full original file.
setConf(conf); int iterationNumber = getCurrentIterationNumber(conf, topicModelStateTempPath, maxIterations); log.info("Current iteration number: {}", iterationNumber); Path modelPath = modelPath(topicModelStateTempPath, i); double perplexity = readPerplexity(conf, topicModelStateTempPath, i); if (Double.isNaN(perplexity)) { if (!(backfillPerplexity && i % iterationBlockSize == 0)) { perplexity = calculatePerplexity(conf, inputPath, modelPath, i); double delta = rateOfChange(perplexities); if (delta < convergenceDelta) { log.info("Convergence achieved at iteration {} with perplexity {} and delta {}", Path modelInputPath = modelPath(topicModelStateTempPath, iterationNumber - 1); Path modelOutputPath = modelPath(topicModelStateTempPath, iterationNumber); runIteration(conf, inputPath, modelInputPath, modelOutputPath, iterationNumber, maxIterations, numReduceTasks); perplexities.add(calculatePerplexity(conf, inputPath, modelOutputPath, iterationNumber)); log.info("Current perplexity = {}", perplexities.get(perplexities.size() - 1)); log.info("(p_{} - p_{}) / p_0 = {}; target = {}", iterationNumber, iterationNumber - iterationBlockSize, rateOfChange(perplexities), convergenceDelta); Path finalIterationData = modelPath(topicModelStateTempPath, iterationNumber); Job topicModelOutputJob = topicModelOutputPath != null
/**
 * Launches a single map/reduce job that scores the model at {@code modelPath}
 * against the (held-out fraction of the) corpus and returns the resulting
 * perplexity for the given iteration.
 *
 * @param conf       Hadoop configuration used for output cleanup
 * @param corpusPath input corpus to score against
 * @param modelPath  model directory for the iteration being evaluated
 * @param iteration  iteration number, used to locate the perplexity output dir
 * @return the perplexity value read back from the job's output
 * @throws InterruptedException if the map/reduce job does not complete successfully
 */
private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
  String name = "Calculating perplexity for " + modelPath;
  log.info("About to run: {}", name);
  Path perplexityOutput = perplexityPath(modelPath.getParent(), iteration);
  Job perplexityJob = prepareJob(corpusPath, perplexityOutput,
      CachingCVB0PerplexityMapper.class, DoubleWritable.class, DoubleWritable.class,
      DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
  perplexityJob.setJobName(name);
  // The reducer doubles as a combiner; a single reduce task produces one total.
  perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
  perplexityJob.setNumReduceTasks(1);
  setModelPaths(perplexityJob, modelPath);
  // Remove any stale output from a previous attempt before launching.
  HadoopUtil.delete(conf, perplexityOutput);
  boolean succeeded = perplexityJob.waitForCompletion(true);
  if (!succeeded) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
/**
 * Command-line entry point. Delegates argument parsing and job execution to
 * {@link CVB0Driver#run(String[])} via Hadoop's {@code ToolRunner}, which also
 * handles generic Hadoop options (-D, -conf, etc.).
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from ToolRunner / the driver run
 */
public static void main(String[] args) throws Exception {
  Configuration configuration = new Configuration();
  ToolRunner.run(configuration, new CVB0Driver(), args);
}
}
// Option setup for the CVB0 LDA driver: input/output paths, topic count,
// vocabulary size, doc/topic and term/topic smoothing, iteration controls,
// per-mapper thread counts, and the held-out fraction for perplexity testing.
// NOTE(review): this span is truncated mid-statement — the final
// addOption(NUM_REDUCE_TASKS, ...) call has no closing arguments here, and the
// remainder of run() is outside this view. Keep byte-identical; do not edit
// without the full original file.
@Override public int run(String[] args) throws Exception { addInputOption(); addOutputOption(); addOption(DefaultOptionCreator.maxIterationsOption().create()); addOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION, "cd", "The convergence delta value", String.valueOf(DEFAULT_CONVERGENCE_DELTA)); addOption(DefaultOptionCreator.overwriteOption().create()); addOption(NUM_TOPICS, "k", "Number of topics to learn", true); addOption(NUM_TERMS, "nt", "Vocabulary size", false); addOption(DOC_TOPIC_SMOOTHING, "a", "Smoothing for document/topic distribution", String.valueOf(DEFAULT_DOC_TOPIC_SMOOTHING)); addOption(TERM_TOPIC_SMOOTHING, "e", "Smoothing for topic/term distribution", String.valueOf(DEFAULT_TERM_TOPIC_SMOOTHING)); addOption(DICTIONARY, "dict", "Path to term-dictionary file(s) (glob expression supported)", false); addOption(DOC_TOPIC_OUTPUT, "dt", "Output path for the training doc/topic distribution", false); addOption(MODEL_TEMP_DIR, "mt", "Path to intermediate model path (useful for restarting)", false); addOption(ITERATION_BLOCK_SIZE, "block", "Number of iterations per perplexity check", String.valueOf(DEFAULT_ITERATION_BLOCK_SIZE)); addOption(RANDOM_SEED, "seed", "Random seed", false); addOption(TEST_SET_FRACTION, "tf", "Fraction of data to hold out for testing", String.valueOf(DEFAULT_TEST_SET_FRACTION)); addOption(NUM_TRAIN_THREADS, "ntt", "number of threads per mapper to train with", String.valueOf(DEFAULT_NUM_TRAIN_THREADS)); addOption(NUM_UPDATE_THREADS, "nut", "number of threads per mapper to update the model with", String.valueOf(DEFAULT_NUM_UPDATE_THREADS)); addOption(MAX_ITERATIONS_PER_DOC, "mipd", "max number of iterations per doc for p(topic|doc) learning", String.valueOf(DEFAULT_MAX_ITERATIONS_PER_DOC)); addOption(NUM_REDUCE_TASKS, null, "number of reducers to use during model estimation",
/**
 * Launches a single map/reduce job that scores the model at {@code modelPath}
 * against the (held-out fraction of the) corpus and returns the resulting
 * perplexity for the given iteration.
 *
 * @param conf       Hadoop configuration used for output cleanup
 * @param corpusPath input corpus to score against
 * @param modelPath  model directory for the iteration being evaluated
 * @param iteration  iteration number, used to locate the perplexity output dir
 * @return the perplexity value read back from the job's output
 * @throws InterruptedException if the map/reduce job does not complete successfully
 */
private double calculatePerplexity(Configuration conf, Path corpusPath, Path modelPath, int iteration)
    throws IOException, ClassNotFoundException, InterruptedException {
  String name = "Calculating perplexity for " + modelPath;
  log.info("About to run: {}", name);
  Path perplexityOutput = perplexityPath(modelPath.getParent(), iteration);
  Job perplexityJob = prepareJob(corpusPath, perplexityOutput,
      CachingCVB0PerplexityMapper.class, DoubleWritable.class, DoubleWritable.class,
      DualDoubleSumReducer.class, DoubleWritable.class, DoubleWritable.class);
  perplexityJob.setJobName(name);
  // The reducer doubles as a combiner; a single reduce task produces one total.
  perplexityJob.setCombinerClass(DualDoubleSumReducer.class);
  perplexityJob.setNumReduceTasks(1);
  setModelPaths(perplexityJob, modelPath);
  // Remove any stale output from a previous attempt before launching.
  HadoopUtil.delete(conf, perplexityOutput);
  boolean succeeded = perplexityJob.waitForCompletion(true);
  if (!succeeded) {
    throw new InterruptedException("Failed to calculate perplexity for: " + modelPath);
  }
  return readPerplexity(conf, modelPath.getParent(), iteration);
}
/**
 * Command-line entry point. Delegates argument parsing and job execution to
 * {@link CVB0Driver#run(String[])} via Hadoop's {@code ToolRunner}, which also
 * handles generic Hadoop options (-D, -conf, etc.).
 *
 * @param args command-line arguments forwarded to the driver
 * @throws Exception propagated from ToolRunner / the driver run
 */
public static void main(String[] args) throws Exception {
  Configuration configuration = new Configuration();
  ToolRunner.run(configuration, new CVB0Driver(), args);
}
}