/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List<String> ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i<ITERATION_LIMIT; i++) { int r = rand.nextInt(ngrams.size()); updateLangProb(prob, ngrams.get(r), 1, alpha); if (i % 5 == 0) { if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL; if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); } return langprob; }
/** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List<String> ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i<ITERATION_LIMIT; i++) { int r = rand.nextInt(ngrams.size()); updateLangProb(prob, ngrams.get(r), 1, alpha); if (i % 5 == 0) { if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL; if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); } return langprob; }