/**
 * Use the {@link LanguageDetectorBuilder}.
 */
LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData,
                     double alpha, Optional<Long> seed, int shortTextAlgorithm,
                     double prefixFactor, double suffixFactor,
                     double probabilityThreshold, double minimalConfidence,
                     @Nullable Map<LdLocale, Double> langWeightingMap,
                     @NotNull NgramExtractor ngramExtractor) {
    // Reject out-of-range tuning parameters up front, in the same order the caller passed them.
    if (alpha < 0d || alpha > 1d) {
        throw new IllegalArgumentException("alpha must be between 0 and 1, but was: " + alpha);
    }
    if (prefixFactor < 0d || prefixFactor > 10d) {
        throw new IllegalArgumentException("prefixFactor must be between 0 and 10, but was: " + prefixFactor);
    }
    if (suffixFactor < 0d || suffixFactor > 10d) {
        throw new IllegalArgumentException("suffixFactor must be between 0 and 10, but was: " + suffixFactor);
    }
    if (probabilityThreshold < 0d || probabilityThreshold > 1d) {
        throw new IllegalArgumentException("probabilityThreshold must be between 0 and 1, but was: " + probabilityThreshold);
    }
    if (minimalConfidence < 0d || minimalConfidence > 1d) {
        throw new IllegalArgumentException("minimalConfidence must be between 0 and 1, but was: " + minimalConfidence);
    }
    // An empty weighting map is treated the same as no weighting map at all.
    Map<LdLocale, Double> effectiveWeights =
            (langWeightingMap == null || langWeightingMap.isEmpty()) ? null : langWeightingMap;
    this.ngramFrequencyData = ngramFrequencyData;
    this.alpha = alpha;
    this.seed = seed;
    this.shortTextAlgorithm = shortTextAlgorithm;
    this.prefixFactor = prefixFactor;
    this.suffixFactor = suffixFactor;
    this.probabilityThreshold = probabilityThreshold;
    this.minimalConfidence = minimalConfidence;
    this.priorMap = (effectiveWeights == null)
            ? null
            : Util.makeInternalPrioMap(effectiveWeights, ngramFrequencyData.getLanguageList());
    this.ngramExtractor = ngramExtractor;
}
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Updates the per-language probabilities with a single n-gram string (N=1,2,3).
 *
 * @param count 1-n: how often the gram occurred.
 * @return true if the n-gram is known and the probabilities were updated, false otherwise.
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    final double[] gramProbs = ngramFrequencyData.getProbabilities(ngram);
    if (gramProbs == null) {
        // Unknown n-gram: nothing to update.
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):"
                + Util.wordProbToString(gramProbs, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    // Grams that start (end) with a space mark word prefixes (suffixes) and may be weighted extra.
    if (ngram.length() > 1) {
        if (prefixFactor != 1.0 && ngram.charAt(0) == ' ') {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && ngram.charAt(ngram.length() - 1) == ' ') {
            weight *= suffixFactor;
        }
    }
    for (int lang = 0; lang < prob.length; ++lang) {
        // The multiplier is constant per language; apply it once per occurrence.
        final double factor = weight + gramProbs[lang];
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= factor;
        }
    }
    return true;
}
/**
 * Flushes the current buffer into the profile when it belongs to the target tag,
 * is long enough, and is not whitespace-only; always clears the buffer state.
 */
public void closeTag(LangProfile profile) {
    // Guard clauses mirror the original conjunction (same short-circuit order).
    if (profile == null || !tag_.equals(target_) || buf_.length() <= threshold_ || isSpace()) {
        clear();
        return;
    }
    Util.addCharSequence(profile, textObjectFactory.forText(buf_));
    ++count_;
    clear();
}
/**
 * Loads a text file and generates a language profile from its content.
 * The input text file is supposed to be encoded in UTF-8; files whose name
 * ends in ".gz" are transparently decompressed.
 *
 * @param lang target language name.
 * @param textFile input text file.
 * @return Language profile instance
 * @throws RuntimeException wrapping the underlying IOException if the file cannot be read.
 */
public static LangProfile generate(String lang, File textFile) {
    LangProfile profile = new LangProfile(lang);
    // try-with-resources closes the reader and both streams even on error;
    // the original code leaked the BufferedReader (only the raw stream was closed).
    try (InputStream fileIn = new BufferedInputStream(new FileInputStream(textFile));
         InputStream in = textFile.getName().endsWith(".gz") ? new GZIPInputStream(fileIn) : fileIn;
         BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // Pad with spaces so word-boundary n-grams are produced at both ends of the line.
            TextObject textObject = textObjectFactory.forText(" " + line + " ");
            Util.addCharSequence(profile, textObject);
        }
    } catch (IOException e) {
        throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", e);
    }
    return profile;
} }
/**
 * Updates the per-language probabilities with a single n-gram string (N=1,2,3).
 *
 * @param count 1-n: how often the gram occurred.
 * @return true if the n-gram is known and the probabilities were updated, false otherwise.
 */
private boolean updateLangProb(@NotNull double[] prob, @NotNull String ngram, int count, double alpha) {
    final double[] gramProbs = ngramFrequencyData.getProbabilities(ngram);
    if (gramProbs == null) {
        // Unknown n-gram: nothing to update.
        return false;
    }
    if (logger.isTraceEnabled()) {
        logger.trace(ngram + "(" + Util.unicodeEncode(ngram) + "):"
                + Util.wordProbToString(gramProbs, ngramFrequencyData.getLanguageList()));
    }
    double weight = alpha / BASE_FREQ;
    // Grams that start (end) with a space mark word prefixes (suffixes) and may be weighted extra.
    if (ngram.length() > 1) {
        if (prefixFactor != 1.0 && ngram.charAt(0) == ' ') {
            weight *= prefixFactor;
        } else if (suffixFactor != 1.0 && ngram.charAt(ngram.length() - 1) == ' ') {
            weight *= suffixFactor;
        }
    }
    for (int lang = 0; lang < prob.length; ++lang) {
        // The multiplier is constant per language; apply it once per occurrence.
        final double factor = weight + gramProbs[lang];
        for (int occurrence = 0; occurrence < count; occurrence++) {
            prob[lang] *= factor;
        }
    }
    return true;
}
/**
 * Loads a text file and generates a language profile from its content.
 * The input text file is supposed to be encoded in UTF-8; files whose name
 * ends in ".gz" are transparently decompressed.
 *
 * @param lang target language name.
 * @param textFile input text file.
 * @return Language profile instance
 * @throws RuntimeException wrapping the underlying IOException if the file cannot be read.
 */
public static LangProfile generate(String lang, File textFile) {
    LangProfile profile = new LangProfile(lang);
    // try-with-resources closes the reader and both streams even on error;
    // the original code leaked the BufferedReader (only the raw stream was closed).
    try (InputStream fileIn = new BufferedInputStream(new FileInputStream(textFile));
         InputStream in = textFile.getName().endsWith(".gz") ? new GZIPInputStream(fileIn) : fileIn;
         BufferedReader reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // Pad with spaces so word-boundary n-grams are produced at both ends of the line.
            TextObject textObject = textObjectFactory.forText(" " + line + " ");
            Util.addCharSequence(profile, textObject);
        }
    } catch (IOException e) {
        throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", e);
    }
    return profile;
} }
/** */ private double[] detectBlockShortText(Map<String, Integer> ngrams) { double[] prob = initProbability(); double alpha = this.alpha; //TODO I don't understand what this does. for (Map.Entry<String, Integer> gramWithCount : ngrams.entrySet()) { updateLangProb(prob, gramWithCount.getKey(), gramWithCount.getValue(), alpha); if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 } Util.normalizeProb(prob); if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); return prob; }
/**
 * Use the {@link LanguageDetectorBuilder}.
 */
LanguageDetectorImpl(@NotNull NgramFrequencyData ngramFrequencyData,
                     double alpha, Optional<Long> seed, int shortTextAlgorithm,
                     double prefixFactor, double suffixFactor,
                     double probabilityThreshold, double minimalConfidence,
                     @Nullable Map<LdLocale, Double> langWeightingMap,
                     @NotNull NgramExtractor ngramExtractor) {
    // Reject out-of-range tuning parameters up front, in the same order the caller passed them.
    if (alpha < 0d || alpha > 1d) {
        throw new IllegalArgumentException("alpha must be between 0 and 1, but was: " + alpha);
    }
    if (prefixFactor < 0d || prefixFactor > 10d) {
        throw new IllegalArgumentException("prefixFactor must be between 0 and 10, but was: " + prefixFactor);
    }
    if (suffixFactor < 0d || suffixFactor > 10d) {
        throw new IllegalArgumentException("suffixFactor must be between 0 and 10, but was: " + suffixFactor);
    }
    if (probabilityThreshold < 0d || probabilityThreshold > 1d) {
        throw new IllegalArgumentException("probabilityThreshold must be between 0 and 1, but was: " + probabilityThreshold);
    }
    if (minimalConfidence < 0d || minimalConfidence > 1d) {
        throw new IllegalArgumentException("minimalConfidence must be between 0 and 1, but was: " + minimalConfidence);
    }
    // An empty weighting map is treated the same as no weighting map at all.
    Map<LdLocale, Double> effectiveWeights =
            (langWeightingMap == null || langWeightingMap.isEmpty()) ? null : langWeightingMap;
    this.ngramFrequencyData = ngramFrequencyData;
    this.alpha = alpha;
    this.seed = seed;
    this.shortTextAlgorithm = shortTextAlgorithm;
    this.prefixFactor = prefixFactor;
    this.suffixFactor = suffixFactor;
    this.probabilityThreshold = probabilityThreshold;
    this.minimalConfidence = minimalConfidence;
    this.priorMap = (effectiveWeights == null)
            ? null
            : Util.makeInternalPrioMap(effectiveWeights, ngramFrequencyData.getLanguageList());
    this.ngramExtractor = ngramExtractor;
}
/**
 * Flushes the current buffer into the profile when it belongs to the target tag,
 * is long enough, and is not whitespace-only; always clears the buffer state.
 */
public void closeTag(LangProfile profile) {
    // Guard clauses mirror the original conjunction (same short-circuit order).
    if (profile == null || !tag_.equals(target_) || buf_.length() <= threshold_ || isSpace()) {
        clear();
        return;
    }
    Util.addCharSequence(profile, textObjectFactory.forText(buf_));
    ++count_;
    clear();
}
/** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List<String> ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i<ITERATION_LIMIT; i++) { int r = rand.nextInt(ngrams.size()); updateLangProb(prob, ngrams.get(r), 1, alpha); if (i % 5 == 0) { if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL; if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); } return langprob; }
/** * This is the original algorithm used for all text length. * It is inappropriate for short text. */ private double[] detectBlockLongText(List<String> ngrams) { assert !ngrams.isEmpty(); double[] langprob = new double[ngramFrequencyData.getLanguageList().size()]; Random rand = new Random(seed.or(DEFAULT_SEED)); for (int t = 0; t < N_TRIAL; ++t) { double[] prob = initProbability(); double alpha = this.alpha + (rand.nextGaussian() * ALPHA_WIDTH); for (int i=0; i<ITERATION_LIMIT; i++) { int r = rand.nextInt(ngrams.size()); updateLangProb(prob, ngrams.get(r), 1, alpha); if (i % 5 == 0) { if (Util.normalizeProb(prob) > CONV_THRESHOLD) break; //this break ensures that we quit the loop before all probabilities reach 0 if (logger.isTraceEnabled()) logger.trace("> " + sortProbability(prob)); } } for(int j=0;j<langprob.length;++j) langprob[j] += prob[j] / N_TRIAL; if (logger.isDebugEnabled()) logger.debug("==> " + sortProbability(prob)); } return langprob; }