/**
 * Creates an extractor that collects the text content of a specific tag.
 *
 * @param tag the element name whose text content should be extracted (stored as the target)
 * @param threshold minimum buffered text length that must be exceeded for content to be kept
 */
public TagExtractor(String tag, int threshold) {
    target_ = tag;
    threshold_ = threshold;
    count_ = 0;
    clear();
}

/** Number of tag bodies extracted so far. */
public int count() {
/**
 * Creates a profile for the given language.
 *
 * @param name language name
 */
public LangProfile(String name) {
    setName(name);
}
/**
 * Finishes the current tag: if it is the target tag and enough non-blank text was
 * buffered, the buffered text is added to the profile. The buffer is always reset.
 *
 * @param profile profile to add the extracted text to; ignored when {@code null}
 */
public void closeTag(LangProfile profile) {
    // Evaluation order matches the original short-circuit chain.
    boolean shouldRecord = profile != null
            && tag_.equals(target_)
            && buf_.length() > threshold_
            && !isSpace();
    if (shouldRecord) {
        Util.addCharSequence(profile, textObjectFactory.forText(buf_));
        ++count_;
    }
    clear();
}
/**
 * Converts a legacy {@code LangProfile} into the new {@code LanguageProfile} format.
 *
 * @param langProfile the old-style profile to convert
 * @return the equivalent new-style profile
 * @throws RuntimeException if the profile name is not a valid locale string
 *         (the naming scheme changed in v0.5)
 */
public static LanguageProfile convert(LangProfile langProfile) {
    final LdLocale locale;
    try {
        locale = LdLocale.fromString(langProfile.getName());
    } catch (Exception e) {
        throw new RuntimeException("Profile file name logic was changed in v0.5, please update your custom profiles!", e);
    }
    LanguageProfileBuilder profileBuilder = new LanguageProfileBuilder(locale);
    for (Map.Entry<String, Integer> gram : langProfile.getFreq().entrySet()) {
        profileBuilder.addGram(gram.getKey(), gram.getValue());
    }
    return profileBuilder.build();
}
/**
 * Multiplies the per-language probability accumulator by the weighted frequency of
 * one n-gram (N=1,2,3), applied once per occurrence.
 *
 * @param prob per-language probabilities, updated in place
 * @param ngram the n-gram to look up
 * @param count 1-n: how often the gram occurred
 * @param alpha smoothing parameter for unseen grams
 * @return {@code false} if the n-gram is unknown (prob left untouched), {@code true} otherwise
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    double[] langProbMap = ngramFrequencyData.getProbabilities(ngram);
    if (langProbMap == null) {
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):" + Util.wordProbToString(langProbMap, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    if (ngram.length() > 1) {
        // Word-boundary grams can be weighted differently; prefix wins over suffix.
        boolean startsWithSpace = ngram.charAt(0) == ' ';
        boolean endsWithSpace = ngram.charAt(ngram.length() - 1) == ' ';
        if (prefixFactor != 1.0 && startsWithSpace) {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && endsWithSpace) {
            weight *= suffixFactor;
        }
    }
    // Repeated multiplication (not pow) keeps floating-point results identical.
    for (int lang = 0; lang < prob.length; lang++) {
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= weight + langProbMap[lang];
        }
    }
    return true;
}
/**
 * Normalizes every character and collapses runs of spaces into a single space.
 *
 * @param text input text
 * @return the normalized text with duplicate spaces removed
 */
@Override
public String filter(CharSequence text) {
    StringBuilder out = new StringBuilder(text.length());
    char previous = 0;
    for (int pos = 0; pos < text.length(); pos++) {
        char current = CharNormalizer.normalize(text.charAt(pos));
        boolean duplicateSpace = current == ' ' && previous == ' ';
        if (!duplicateSpace) {
            out.append(current);
        }
        previous = current;
    }
    return out.toString();
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Finishes the current tag: if it is the target tag and enough non-blank text was
 * buffered, the buffered text is added to the profile. The buffer is always reset.
 *
 * @param profile profile to add the extracted text to; ignored when {@code null}
 */
public void closeTag(LangProfile profile) {
    // Evaluation order matches the original short-circuit chain.
    boolean shouldRecord = profile != null
            && tag_.equals(target_)
            && buf_.length() > threshold_
            && !isSpace();
    if (shouldRecord) {
        Util.addCharSequence(profile, textObjectFactory.forText(buf_));
        ++count_;
    }
    clear();
}
/**
 * Converts a legacy {@code LangProfile} into the new {@code LanguageProfile} format.
 *
 * @param langProfile the old-style profile to convert
 * @return the equivalent new-style profile
 * @throws RuntimeException if the profile name is not a valid locale string
 *         (the naming scheme changed in v0.5)
 */
public static LanguageProfile convert(LangProfile langProfile) {
    final LdLocale locale;
    try {
        locale = LdLocale.fromString(langProfile.getName());
    } catch (Exception e) {
        throw new RuntimeException("Profile file name logic was changed in v0.5, please update your custom profiles!", e);
    }
    LanguageProfileBuilder profileBuilder = new LanguageProfileBuilder(locale);
    for (Map.Entry<String, Integer> gram : langProfile.getFreq().entrySet()) {
        profileBuilder.addGram(gram.getKey(), gram.getValue());
    }
    return profileBuilder.build();
}
/**
 * Multiplies the per-language probability accumulator by the weighted frequency of
 * one n-gram (N=1,2,3), applied once per occurrence.
 *
 * @param prob per-language probabilities, updated in place
 * @param ngram the n-gram to look up
 * @param count 1-n: how often the gram occurred
 * @param alpha smoothing parameter for unseen grams
 * @return {@code false} if the n-gram is unknown (prob left untouched), {@code true} otherwise
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    double[] langProbMap = ngramFrequencyData.getProbabilities(ngram);
    if (langProbMap == null) {
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):" + Util.wordProbToString(langProbMap, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    if (ngram.length() > 1) {
        // Word-boundary grams can be weighted differently; prefix wins over suffix.
        boolean startsWithSpace = ngram.charAt(0) == ' ';
        boolean endsWithSpace = ngram.charAt(ngram.length() - 1) == ' ';
        if (prefixFactor != 1.0 && startsWithSpace) {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && endsWithSpace) {
            weight *= suffixFactor;
        }
    }
    // Repeated multiplication (not pow) keeps floating-point results identical.
    for (int lang = 0; lang < prob.length; lang++) {
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= weight + langProbMap[lang];
        }
    }
    return true;
}
/**
 * Normalizes every character and collapses runs of spaces into a single space.
 *
 * @param text input text
 * @return the normalized text with duplicate spaces removed
 */
@Override
public String filter(CharSequence text) {
    StringBuilder out = new StringBuilder(text.length());
    char previous = 0;
    for (int pos = 0; pos < text.length(); pos++) {
        char current = CharNormalizer.normalize(text.charAt(pos));
        boolean duplicateSpace = current == ' ' && previous == ' ';
        if (!duplicateSpace) {
            out.append(current);
        }
        previous = current;
    }
    return out.toString();
}
/**
 * Creates an extractor that collects the text content of a specific tag.
 *
 * @param tag the element name whose text content should be extracted (stored as the target)
 * @param threshold minimum buffered text length that must be exceeded for content to be kept
 */
public TagExtractor(String tag, int threshold) {
    target_ = tag;
    threshold_ = threshold;
    count_ = 0;
    clear();
}

/** Number of tag bodies extracted so far. */
public int count() {
/**
 * Creates a profile for the given language.
 *
 * @param name language name
 */
public LangProfile(String name) {
    setName(name);
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Feeds one normalized character into the rolling n-gram buffer (window size N_GRAM)
 * and tracks whether the current word is fully capitalized.
 *
 * @param ch the next character of the input stream
 */
public void addChar(char ch) {
    ch = CharNormalizer.normalize(ch);
    char lastChar = grams_.charAt(grams_.length() - 1);
    if (lastChar == ' ') {
        // Previous char was a word boundary: restart the buffer with a leading space.
        grams_ = new StringBuilder(" ");
        capitalword_ = false;
        if (ch == ' ') {
            return; // collapse consecutive spaces
        }
    } else if (grams_.length() >= N_GRAM) {
        // Buffer is full: slide the window by dropping the oldest character.
        grams_.deleteCharAt(0);
    }
    grams_.append(ch);
    // Capital-word tracking: a lowercase char ends it; two uppercase in a row start it.
    if (!Character.isUpperCase(ch)) {
        capitalword_ = false;
    } else if (Character.isUpperCase(lastChar)) {
        capitalword_ = true;
    }
}
/**
 * Feeds one normalized character into the rolling n-gram buffer (window size N_GRAM)
 * and tracks whether the current word is fully capitalized.
 *
 * @param ch the next character of the input stream
 */
public void addChar(char ch) {
    ch = CharNormalizer.normalize(ch);
    char lastChar = grams_.charAt(grams_.length() - 1);
    if (lastChar == ' ') {
        // Previous char was a word boundary: restart the buffer with a leading space.
        grams_ = new StringBuilder(" ");
        capitalword_ = false;
        if (ch == ' ') {
            return; // collapse consecutive spaces
        }
    } else if (grams_.length() >= N_GRAM) {
        // Buffer is full: slide the window by dropping the oldest character.
        grams_.deleteCharAt(0);
    }
    grams_.append(ch);
    // Capital-word tracking: a lowercase char ends it; two uppercase in a row start it.
    if (!Character.isUpperCase(ch)) {
        capitalword_ = false;
    } else if (Character.isUpperCase(lastChar)) {
        capitalword_ = true;
    }
}
/** * Append the target text for language detection. * If the total size of target text exceeds the limit size , * the rest is cut down. * * @param text the target text to append */ @Override public TextObject append(CharSequence text) { if (maxTextLength>0 && stringBuilder.length()>=maxTextLength) return this; text = textFilter.filter(text); //unfortunately this code can't be put into a TextFilter because: //1) the limit could not be detected early, a lot of work would be done to waste time and memory //2) the last character of the existing string builder could not be seen. if it is a space, we don't want // to add yet another space. char pre = stringBuilder.length()==0 ? 0 : stringBuilder.charAt(stringBuilder.length()-1); for (int i=0; i<text.length() && (maxTextLength==0 || stringBuilder.length()<maxTextLength); i++) { char c = CharNormalizer.normalize(text.charAt(i)); if (c != ' ' || pre != ' ') { stringBuilder.append(c); } pre = c; } return this; }
/** * Append the target text for language detection. * If the total size of target text exceeds the limit size , * the rest is cut down. * * @param text the target text to append */ @Override public TextObject append(CharSequence text) { if (maxTextLength>0 && stringBuilder.length()>=maxTextLength) return this; text = textFilter.filter(text); //unfortunately this code can't be put into a TextFilter because: //1) the limit could not be detected early, a lot of work would be done to waste time and memory //2) the last character of the existing string builder could not be seen. if it is a space, we don't want // to add yet another space. char pre = stringBuilder.length()==0 ? 0 : stringBuilder.charAt(stringBuilder.length()-1); for (int i=0; i<text.length() && (maxTextLength==0 || stringBuilder.length()<maxTextLength); i++) { char c = CharNormalizer.normalize(text.charAt(i)); if (c != ' ' || pre != ' ') { stringBuilder.append(c); } pre = c; } return this; }