/**
 * Creates an extractor that collects the text content of a specific tag.
 *
 * @param tag the element name whose text content should be extracted (stored as the target)
 * @param threshold minimum buffered text length that must be exceeded for content to be kept
 */
public TagExtractor(String tag, int threshold) {
    target_ = tag;
    threshold_ = threshold;
    count_ = 0;
    clear();
}

/** Number of tag bodies extracted so far. */
public int count() {
/**
 * Creates a profile for the given language.
 *
 * @param name language name
 */
public LangProfile(String name) {
    setName(name);
}
/**
 * Finishes the current tag: if it is the target tag and enough non-blank text was
 * buffered, the buffered text is added to the profile. The buffer is always reset.
 *
 * @param profile profile to add the extracted text to; ignored when {@code null}
 */
public void closeTag(LangProfile profile) {
    // Evaluation order matches the original short-circuit chain.
    boolean shouldRecord = profile != null
            && tag_.equals(target_)
            && buf_.length() > threshold_
            && !isSpace();
    if (shouldRecord) {
        Util.addCharSequence(profile, textObjectFactory.forText(buf_));
        ++count_;
    }
    clear();
}
/**
 * Converts a legacy {@code LangProfile} into the new {@code LanguageProfile} format.
 *
 * @param langProfile the old-style profile to convert
 * @return the equivalent new-style profile
 * @throws RuntimeException if the profile name is not a valid locale string
 *         (the naming scheme changed in v0.5)
 */
public static LanguageProfile convert(LangProfile langProfile) {
    final LdLocale locale;
    try {
        locale = LdLocale.fromString(langProfile.getName());
    } catch (Exception e) {
        throw new RuntimeException("Profile file name logic was changed in v0.5, please update your custom profiles!", e);
    }
    LanguageProfileBuilder profileBuilder = new LanguageProfileBuilder(locale);
    for (Map.Entry<String, Integer> gram : langProfile.getFreq().entrySet()) {
        profileBuilder.addGram(gram.getKey(), gram.getValue());
    }
    return profileBuilder.build();
}
/**
 * Multiplies the per-language probability accumulator by the weighted frequency of
 * one n-gram (N=1,2,3), applied once per occurrence.
 *
 * @param prob per-language probabilities, updated in place
 * @param ngram the n-gram to look up
 * @param count 1-n: how often the gram occurred
 * @param alpha smoothing parameter for unseen grams
 * @return {@code false} if the n-gram is unknown (prob left untouched), {@code true} otherwise
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    double[] langProbMap = ngramFrequencyData.getProbabilities(ngram);
    if (langProbMap == null) {
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):" + Util.wordProbToString(langProbMap, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    if (ngram.length() > 1) {
        // Word-boundary grams can be weighted differently; prefix wins over suffix.
        boolean startsWithSpace = ngram.charAt(0) == ' ';
        boolean endsWithSpace = ngram.charAt(ngram.length() - 1) == ' ';
        if (prefixFactor != 1.0 && startsWithSpace) {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && endsWithSpace) {
            weight *= suffixFactor;
        }
    }
    // Repeated multiplication (not pow) keeps floating-point results identical.
    for (int lang = 0; lang < prob.length; lang++) {
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= weight + langProbMap[lang];
        }
    }
    return true;
}
/**
 * Normalizes every character and collapses runs of spaces into a single space.
 *
 * @param text input text
 * @return the normalized text with duplicate spaces removed
 */
@Override
public String filter(CharSequence text) {
    StringBuilder out = new StringBuilder(text.length());
    char previous = 0;
    for (int pos = 0; pos < text.length(); pos++) {
        char current = CharNormalizer.normalize(text.charAt(pos));
        boolean duplicateSpace = current == ' ' && previous == ' ';
        if (!duplicateSpace) {
            out.append(current);
        }
        previous = current;
    }
    return out.toString();
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Finishes the current tag: if it is the target tag and enough non-blank text was
 * buffered, the buffered text is added to the profile. The buffer is always reset.
 *
 * @param profile profile to add the extracted text to; ignored when {@code null}
 */
public void closeTag(LangProfile profile) {
    // Evaluation order matches the original short-circuit chain.
    boolean shouldRecord = profile != null
            && tag_.equals(target_)
            && buf_.length() > threshold_
            && !isSpace();
    if (shouldRecord) {
        Util.addCharSequence(profile, textObjectFactory.forText(buf_));
        ++count_;
    }
    clear();
}
/**
 * Converts a legacy {@code LangProfile} into the new {@code LanguageProfile} format.
 *
 * @param langProfile the old-style profile to convert
 * @return the equivalent new-style profile
 * @throws RuntimeException if the profile name is not a valid locale string
 *         (the naming scheme changed in v0.5)
 */
public static LanguageProfile convert(LangProfile langProfile) {
    final LdLocale locale;
    try {
        locale = LdLocale.fromString(langProfile.getName());
    } catch (Exception e) {
        throw new RuntimeException("Profile file name logic was changed in v0.5, please update your custom profiles!", e);
    }
    LanguageProfileBuilder profileBuilder = new LanguageProfileBuilder(locale);
    for (Map.Entry<String, Integer> gram : langProfile.getFreq().entrySet()) {
        profileBuilder.addGram(gram.getKey(), gram.getValue());
    }
    return profileBuilder.build();
}
/**
 * Multiplies the per-language probability accumulator by the weighted frequency of
 * one n-gram (N=1,2,3), applied once per occurrence.
 *
 * @param prob per-language probabilities, updated in place
 * @param ngram the n-gram to look up
 * @param count 1-n: how often the gram occurred
 * @param alpha smoothing parameter for unseen grams
 * @return {@code false} if the n-gram is unknown (prob left untouched), {@code true} otherwise
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    double[] langProbMap = ngramFrequencyData.getProbabilities(ngram);
    if (langProbMap == null) {
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):" + Util.wordProbToString(langProbMap, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    if (ngram.length() > 1) {
        // Word-boundary grams can be weighted differently; prefix wins over suffix.
        boolean startsWithSpace = ngram.charAt(0) == ' ';
        boolean endsWithSpace = ngram.charAt(ngram.length() - 1) == ' ';
        if (prefixFactor != 1.0 && startsWithSpace) {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && endsWithSpace) {
            weight *= suffixFactor;
        }
    }
    // Repeated multiplication (not pow) keeps floating-point results identical.
    for (int lang = 0; lang < prob.length; lang++) {
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= weight + langProbMap[lang];
        }
    }
    return true;
}
/**
 * Normalizes every character and collapses runs of spaces into a single space.
 *
 * @param text input text
 * @return the normalized text with duplicate spaces removed
 */
@Override
public String filter(CharSequence text) {
    StringBuilder out = new StringBuilder(text.length());
    char previous = 0;
    for (int pos = 0; pos < text.length(); pos++) {
        char current = CharNormalizer.normalize(text.charAt(pos));
        boolean duplicateSpace = current == ' ' && previous == ' ';
        if (!duplicateSpace) {
            out.append(current);
        }
        previous = current;
    }
    return out.toString();
}
/**
 * Creates an extractor that collects the text content of a specific tag.
 *
 * @param tag the element name whose text content should be extracted (stored as the target)
 * @param threshold minimum buffered text length that must be exceeded for content to be kept
 */
public TagExtractor(String tag, int threshold) {
    target_ = tag;
    threshold_ = threshold;
    count_ = 0;
    clear();
}

/** Number of tag bodies extracted so far. */
public int count() {
/**
 * Creates a profile for the given language.
 *
 * @param name language name
 */
public LangProfile(String name) {
    setName(name);
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Feeds one normalized character into the rolling n-gram buffer (window size N_GRAM)
 * and tracks whether the current word is fully capitalized.
 *
 * @param ch the next character of the input stream
 */
public void addChar(char ch) {
    ch = CharNormalizer.normalize(ch);
    char lastChar = grams_.charAt(grams_.length() - 1);
    if (lastChar == ' ') {
        // Previous char was a word boundary: restart the buffer with a leading space.
        grams_ = new StringBuilder(" ");
        capitalword_ = false;
        if (ch == ' ') {
            return; // collapse consecutive spaces
        }
    } else if (grams_.length() >= N_GRAM) {
        // Buffer is full: slide the window by dropping the oldest character.
        grams_.deleteCharAt(0);
    }
    grams_.append(ch);
    // Capital-word tracking: a lowercase char ends it; two uppercase in a row start it.
    if (!Character.isUpperCase(ch)) {
        capitalword_ = false;
    } else if (Character.isUpperCase(lastChar)) {
        capitalword_ = true;
    }
}
/**
 * Feeds one normalized character into the rolling n-gram buffer (window size N_GRAM)
 * and tracks whether the current word is fully capitalized.
 *
 * @param ch the next character of the input stream
 */
public void addChar(char ch) {
    ch = CharNormalizer.normalize(ch);
    char lastChar = grams_.charAt(grams_.length() - 1);
    if (lastChar == ' ') {
        // Previous char was a word boundary: restart the buffer with a leading space.
        grams_ = new StringBuilder(" ");
        capitalword_ = false;
        if (ch == ' ') {
            return; // collapse consecutive spaces
        }
    } else if (grams_.length() >= N_GRAM) {
        // Buffer is full: slide the window by dropping the oldest character.
        grams_.deleteCharAt(0);
    }
    grams_.append(ch);
    // Capital-word tracking: a lowercase char ends it; two uppercase in a row start it.
    if (!Character.isUpperCase(ch)) {
        capitalword_ = false;
    } else if (Character.isUpperCase(lastChar)) {
        capitalword_ = true;
    }
}
/** * Append the target text for language detection. * If the total size of target text exceeds the limit size , * the rest is cut down. * * @param text the target text to append */ @Override public TextObject append(CharSequence text) { if (maxTextLength>0 && stringBuilder.length()>=maxTextLength) return this; text = textFilter.filter(text); //unfortunately this code can't be put into a TextFilter because: //1) the limit could not be detected early, a lot of work would be done to waste time and memory //2) the last character of the existing string builder could not be seen. if it is a space, we don't want // to add yet another space. char pre = stringBuilder.length()==0 ? 0 : stringBuilder.charAt(stringBuilder.length()-1); for (int i=0; i<text.length() && (maxTextLength==0 || stringBuilder.length()<maxTextLength); i++) { char c = CharNormalizer.normalize(text.charAt(i)); if (c != ' ' || pre != ' ') { stringBuilder.append(c); } pre = c; } return this; }
/** * Append the target text for language detection. * If the total size of target text exceeds the limit size , * the rest is cut down. * * @param text the target text to append */ @Override public TextObject append(CharSequence text) { if (maxTextLength>0 && stringBuilder.length()>=maxTextLength) return this; text = textFilter.filter(text); //unfortunately this code can't be put into a TextFilter because: //1) the limit could not be detected early, a lot of work would be done to waste time and memory //2) the last character of the existing string builder could not be seen. if it is a space, we don't want // to add yet another space. char pre = stringBuilder.length()==0 ? 0 : stringBuilder.charAt(stringBuilder.length()-1); for (int i=0; i<text.length() && (maxTextLength==0 || stringBuilder.length()<maxTextLength); i++) { char c = CharNormalizer.normalize(text.charAt(i)); if (c != ' ' || pre != ' ') { stringBuilder.append(c); } pre = c; } return this; }