/**
 * Use the {@link LanguageDetectorBuilder}.
 */
LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData,
                     double alpha, Optional<Long> seed, int shortTextAlgorithm,
                     double prefixFactor, double suffixFactor,
                     double probabilityThreshold, double minimalConfidence,
                     @Nullable Map<LdLocale, Double> langWeightingMap,
                     @NotNull NgramExtractor ngramExtractor) {
    // Reject out-of-range tuning parameters up front, in the same order the caller passed them.
    if (alpha < 0d || alpha > 1d) {
        throw new IllegalArgumentException("alpha must be between 0 and 1, but was: " + alpha);
    }
    if (prefixFactor < 0d || prefixFactor > 10d) {
        throw new IllegalArgumentException("prefixFactor must be between 0 and 10, but was: " + prefixFactor);
    }
    if (suffixFactor < 0d || suffixFactor > 10d) {
        throw new IllegalArgumentException("suffixFactor must be between 0 and 10, but was: " + suffixFactor);
    }
    if (probabilityThreshold < 0d || probabilityThreshold > 1d) {
        throw new IllegalArgumentException("probabilityThreshold must be between 0 and 1, but was: " + probabilityThreshold);
    }
    if (minimalConfidence < 0d || minimalConfidence > 1d) {
        throw new IllegalArgumentException("minimalConfidence must be between 0 and 1, but was: " + minimalConfidence);
    }
    // An empty weighting map is treated the same as no weighting map at all.
    Map<LdLocale, Double> effectiveWeights =
            (langWeightingMap == null || langWeightingMap.isEmpty()) ? null : langWeightingMap;
    this.ngramFrequencyData = ngramFrequencyData;
    this.alpha = alpha;
    this.seed = seed;
    this.shortTextAlgorithm = shortTextAlgorithm;
    this.prefixFactor = prefixFactor;
    this.suffixFactor = suffixFactor;
    this.probabilityThreshold = probabilityThreshold;
    this.minimalConfidence = minimalConfidence;
    this.priorMap = (effectiveWeights == null)
            ? null
            : Util.makeInternalPrioMap(effectiveWeights, ngramFrequencyData.getLanguageList());
    this.ngramExtractor = ngramExtractor;
}
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Updates the per-language probabilities with a single n-gram string (N=1,2,3).
 *
 * @param count 1-n: how often the gram occurred.
 * @return true if the n-gram is known and the probabilities were updated, false otherwise.
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    final double[] gramProbs = ngramFrequencyData.getProbabilities(ngram);
    if (gramProbs == null) {
        // Unknown n-gram: nothing to update.
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):"
                + Util.wordProbToString(gramProbs, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    // Grams that start (end) with a space mark word prefixes (suffixes) and may be weighted extra.
    if (ngram.length() > 1) {
        if (prefixFactor != 1.0 && ngram.charAt(0) == ' ') {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && ngram.charAt(ngram.length() - 1) == ' ') {
            weight *= suffixFactor;
        }
    }
    for (int lang = 0; lang < prob.length; ++lang) {
        // The multiplier is constant per language; apply it once per occurrence.
        final double factor = weight + gramProbs[lang];
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= factor;
        }
    }
    return true;
}
/**
 * Flushes the current buffer into the profile when it belongs to the target tag,
 * is long enough, and is not whitespace-only; always clears the buffer state.
 */
public void closeTag(LangProfile profile) {
    // Guard clauses mirror the original conjunction (same short-circuit order).
    if (profile == null || !tag_.equals(target_) || buf_.length() <= threshold_ || isSpace()) {
        clear();
        return;
    }
    Util.addCharSequence(profile, textObjectFactory.forText(buf_));
    ++count_;
    clear();
}
/**
 * Loads a text file and generates a language profile from its content.
 * The input text file is supposed to be encoded in UTF-8; files whose name
 * ends in ".gz" are transparently decompressed.
 *
 * @param lang target language name.
 * @param textFile input text file.
 * @return Language profile instance
 * @throws RuntimeException wrapping the underlying IOException if the file cannot be read.
 */
public static LangProfile generate(String lang, File textFile) {
    LangProfile profile = new LangProfile(lang);
    // try-with-resources closes the reader and both streams even on error;
    // the original code leaked the BufferedReader (only the raw stream was closed).
    try (InputStream fileIn = new BufferedInputStream(new FileInputStream(textFile));
         InputStream in = textFile.getName().endsWith(".gz") ? new GZIPInputStream(fileIn) : fileIn;
         BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // Pad with spaces so word-boundary n-grams are produced at both ends of the line.
            TextObject textObject = textObjectFactory.forText(" " + line + " ");
            Util.addCharSequence(profile, textObject);
        }
    } catch (IOException e) {
        throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", e);
    }
    return profile;
} }
/**
 * Updates the per-language probabilities with a single n-gram string (N=1,2,3).
 *
 * @param count 1-n: how often the gram occurred.
 * @return true if the n-gram is known and the probabilities were updated, false otherwise.
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    final double[] gramProbs = ngramFrequencyData.getProbabilities(ngram);
    if (gramProbs == null) {
        // Unknown n-gram: nothing to update.
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):"
                + Util.wordProbToString(gramProbs, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    // Grams that start (end) with a space mark word prefixes (suffixes) and may be weighted extra.
    if (ngram.length() > 1) {
        if (prefixFactor != 1.0 && ngram.charAt(0) == ' ') {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && ngram.charAt(ngram.length() - 1) == ' ') {
            weight *= suffixFactor;
        }
    }
    for (int lang = 0; lang < prob.length; ++lang) {
        // The multiplier is constant per language; apply it once per occurrence.
        final double factor = weight + gramProbs[lang];
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= factor;
        }
    }
    return true;
}
/**
 * Loads a text file and generates a language profile from its content.
 * The input text file is supposed to be encoded in UTF-8; files whose name
 * ends in ".gz" are transparently decompressed.
 *
 * @param lang target language name.
 * @param textFile input text file.
 * @return Language profile instance
 * @throws RuntimeException wrapping the underlying IOException if the file cannot be read.
 */
public static LangProfile generate(String lang, File textFile) {
    LangProfile profile = new LangProfile(lang);
    // try-with-resources closes the reader and both streams even on error;
    // the original code leaked the BufferedReader (only the raw stream was closed).
    try (InputStream fileIn = new BufferedInputStream(new FileInputStream(textFile));
         InputStream in = textFile.getName().endsWith(".gz") ? new GZIPInputStream(fileIn) : fileIn;
         BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // Pad with spaces so word-boundary n-grams are produced at both ends of the line.
            TextObject textObject = textObjectFactory.forText(" " + line + " ");
            Util.addCharSequence(profile, textObject);
        }
    } catch (IOException e) {
        throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", e);
    }
    return profile;
} }
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Use the {@link LanguageDetectorBuilder}.
 */
LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData,
                     double alpha, Optional<Long> seed, int shortTextAlgorithm,
                     double prefixFactor, double suffixFactor,
                     double probabilityThreshold, double minimalConfidence,
                     @Nullable Map<LdLocale, Double> langWeightingMap,
                     @NotNull NgramExtractor ngramExtractor) {
    // Reject out-of-range tuning parameters up front, in the same order the caller passed them.
    if (alpha < 0d || alpha > 1d) {
        throw new IllegalArgumentException("alpha must be between 0 and 1, but was: " + alpha);
    }
    if (prefixFactor < 0d || prefixFactor > 10d) {
        throw new IllegalArgumentException("prefixFactor must be between 0 and 10, but was: " + prefixFactor);
    }
    if (suffixFactor < 0d || suffixFactor > 10d) {
        throw new IllegalArgumentException("suffixFactor must be between 0 and 10, but was: " + suffixFactor);
    }
    if (probabilityThreshold < 0d || probabilityThreshold > 1d) {
        throw new IllegalArgumentException("probabilityThreshold must be between 0 and 1, but was: " + probabilityThreshold);
    }
    if (minimalConfidence < 0d || minimalConfidence > 1d) {
        throw new IllegalArgumentException("minimalConfidence must be between 0 and 1, but was: " + minimalConfidence);
    }
    // An empty weighting map is treated the same as no weighting map at all.
    Map<LdLocale, Double> effectiveWeights =
            (langWeightingMap == null || langWeightingMap.isEmpty()) ? null : langWeightingMap;
    this.ngramFrequencyData = ngramFrequencyData;
    this.alpha = alpha;
    this.seed = seed;
    this.shortTextAlgorithm = shortTextAlgorithm;
    this.prefixFactor = prefixFactor;
    this.suffixFactor = suffixFactor;
    this.probabilityThreshold = probabilityThreshold;
    this.minimalConfidence = minimalConfidence;
    this.priorMap = (effectiveWeights == null)
            ? null
            : Util.makeInternalPrioMap(effectiveWeights, ngramFrequencyData.getLanguageList());
    this.ngramExtractor = ngramExtractor;
}
/**
 * Flushes the current buffer into the profile when it belongs to the target tag,
 * is long enough, and is not whitespace-only; always clears the buffer state.
 */
public void closeTag(LangProfile profile) {
    // Guard clauses mirror the original conjunction (same short-circuit order).
    if (profile == null || !tag_.equals(target_) || buf_.length() <= threshold_ || isSpace()) {
        clear();
        return;
    }
    Util.addCharSequence(profile, textObjectFactory.forText(buf_));
    ++count_;
    clear();
}
/** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List<String> ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i<ITERATION_LIMIT; i++) { int r = rand.nextInt(ngrams.size()); updateLangProb(prob, ngrams.get(r), 1, alpha); if (i % 5 == 0) { if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL; if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); } return langprob; }
/** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List<String> ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i<ITERATION_LIMIT; i++) { int r = rand.nextInt(ngrams.size()); updateLangProb(prob, ngrams.get(r), 1, alpha); if (i % 5 == 0) { if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL; if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); } return langprob; }