com.optimaize.langdetect.ngram Java code examples

/**
 * @param maxLength the maximum number of characters that will be considered - can help
 *                  with performance. Don't use values below 100, as this would decrease
 *                  accuracy.
 * @throws IllegalArgumentException if {@code maxLength} is less than 10
 * @since 4.2
 */
public LanguageIdentifier(int maxLength) {
 if (maxLength < 10) {
  throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + maxLength);
 }
 this.maxLength = maxLength;
 try {
  List<LanguageProfile> profiles = loadProfiles(getLanguageCodes());
  languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
   .minimalConfidence(MINIMAL_CONFIDENCE)
   .shortTextAlgorithm(SHORT_ALGO_THRESHOLD)
   .withProfiles(profiles)
   .build();
  textObjectFactory = new TextObjectFactoryBuilder()
   .maxTextLength(10000)
   .withTextFilter(UrlTextFilter.getInstance())
   .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3))
   .withTextFilter(new RemoveEMailSignatureFilter())
   .build();
 } catch (IOException e) {
  throw new RuntimeException("Could not set up language identifier", e);
 }
}

/**
 * Returns a new extractor with the same gram lengths and filter as this one, using the given padding character.
 *
 * <p>To ensure having border grams, the padding character is added to the left and right of the text.
 * Example: with a space ' ' as padding, the input "foo" becomes " foo ", so that n-grams like " f"
 * are created.</p>
 *
 * <p>If the text already has such a character in that position (eg starts with), it is not added there.</p>
 *
 * @param textPadding for example a space ' '.
 * @return a new extractor; this instance is not modified by this call
 */
public NgramExtractor textPadding(char textPadding) {
  final NgramExtractor padded = new NgramExtractor(gramLengths, filter, textPadding);
  return padded;
}

/**
 * Counts the n-grams of one length in {@code text} and accumulates the counts into {@code grams}.
 *
 * <p>A gram is skipped when a filter is set and the filter rejects it.</p>
 *
 * @param text       the text to slice into grams
 * @param gramLength the number of characters per gram
 * @param len        the number of leading characters of {@code text} to consider
 * @param grams      the accumulator; key = gram, value = number of occurrences so far
 */
private void _extractCounted(CharSequence text, int gramLength, int len, Map<String, Integer> grams) {
  // Exclusive upper bound for start positions, so that every gram fits within len.
  final int limit = len - (gramLength - 1);
  for (int start = 0; start < limit; start++) {
    final String gram = text.subSequence(start, start + gramLength).toString();
    if (filter != null && !filter.use(gram)) {
      continue;
    }
    final Integer seen = grams.get(gram);
    grams.put(gram, seen == null ? 1 : seen + 1);
  }
}

/**
 * Extracts all n-grams of the configured lengths from the padded text and counts them.
 *
 * @param text the text to analyse; padding is applied before extraction
 * @return Key = ngram, value = count.
 *         Iteration follows insertion order: the configured gram lengths are processed one
 *         after the other, and within one length the grams are ordered by first appearance.
 */
@NotNull
public Map<String,Integer> extractCountedGrams(@NotNull CharSequence text) {
  final CharSequence padded = applyPadding(text);
  final int paddedLength = padded.length();

  // Pre-size the map from the per-length estimates.
  int expectedDistinct = 0;
  for (Integer n : gramLengths) {
    expectedDistinct += guessNumDistinctiveGrams(paddedLength, n);
  }

  final Map<String,Integer> counts = new LinkedHashMap<>(expectedDistinct);
  for (Integer n : gramLengths) {
    _extractCounted(padded, n, paddedLength, counts);
  }
  return counts;
}

  /**
   * Adds every n-gram that {@code OldNgramExtractor} extracts from {@code text} to {@code langProfile}.
   *
   * @param langProfile the profile that receives the grams
   * @param text        the text to extract grams from
   */
  public static void addCharSequence(LangProfile langProfile, CharSequence text) {
    //TODO replace with new code.
    // The intended replacement is ngramExtractor.extractGrams(text); an old-vs-new comparison
    // experiment that used to be commented out here has been dropped.
    for (String gram : OldNgramExtractor.extractNGrams(text, null)) {
      langProfile.add(gram);
    }
  }

/**
 * Extracts the counted n-grams of {@code text} and adds each gram together with its count.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @param text the text to learn from
 * @return this builder
 * @throws IllegalStateException if no n-gram extractor has been set
 */
public LanguageProfileBuilder addText(CharSequence text) {
  if (ngramExtractor == null) {
    throw new IllegalStateException("NgramExtractor has not been set yet!");
  }
  final Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
  for (Map.Entry<String, Integer> gramAndCount : countedGrams.entrySet()) {
    addGram(gramAndCount.getKey(), gramAndCount.getValue());
  }
  return this;
}

/**
 * Builds the detector from the profiles and settings collected by this builder.
 *
 * @return a new {@code LanguageDetectorImpl} configured from this builder's current state
 * @throws IllegalStateException if no LanguageProfile was {@link #withProfile added}.
 */
public LanguageDetector build() throws IllegalStateException {
  if (languageProfiles.isEmpty()) {
    // Carry a message so that a misconfigured builder is diagnosable from the stack trace alone.
    throw new IllegalStateException("No LanguageProfile has been added; add at least one before calling build()");
  }
  return new LanguageDetectorImpl(
      NgramFrequencyData.create(languageProfiles, ngramExtractor.getGramLengths()),
      alpha, seed, shortTextAlgorithm,
      prefixFactor, suffixFactor,
      probabilityThreshold, minimalConfidence,
      langWeightingMap,
      ngramExtractor
  );
}

/**
 * Builds a detector for the given profiles, optionally weighting languages.
 *
 * @param languageProfiles      the profiles the detector should know
 * @param languageProbabilities language code to weight, or {@code null} to build without priorities
 * @return the configured detector
 */
private static com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles, Map<String, Float> languageProbabilities) {
  // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which
  // means you can often get 0 probabilities. So we pick a very short length for this limit.
  LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard())
      .shortTextAlgorithm(30)
      .withProfiles(languageProfiles);

  if (languageProbabilities == null) {
    return builder.build();
  }

  // Convert the String/Float weights into the LdLocale/Double map the builder takes.
  Map<LdLocale, Double> languageWeights = new HashMap<>(languageProbabilities.size());
  for (Map.Entry<String, Float> weighted : languageProbabilities.entrySet()) {
    // Unbox the weight before parsing the locale, matching the original evaluation order.
    Double weight = (double) weighted.getValue();
    languageWeights.put(LdLocale.fromString(weighted.getKey()), weight);
  }
  builder.languagePriorities(languageWeights);

  return builder.build();
}

/**
 * Returns a new extractor with the same gram lengths and filter as this one, using the given padding character.
 *
 * <p>To ensure having border grams, the padding character is added to the left and right of the text.
 * Example: with a space ' ' as padding, the input "foo" becomes " foo ", so that n-grams like " f"
 * are created.</p>
 *
 * <p>If the text already has such a character in that position (eg starts with), it is not added there.</p>
 *
 * @param textPadding for example a space ' '.
 * @return a new extractor; this instance is not modified by this call
 */
public NgramExtractor textPadding(char textPadding) {
  final NgramExtractor padded = new NgramExtractor(gramLengths, filter, textPadding);
  return padded;
}

/**
 * Counts the n-grams of one length in {@code text} and accumulates the counts into {@code grams}.
 *
 * <p>A gram is skipped when a filter is set and the filter rejects it.</p>
 *
 * @param text       the text to slice into grams
 * @param gramLength the number of characters per gram
 * @param len        the number of leading characters of {@code text} to consider
 * @param grams      the accumulator; key = gram, value = number of occurrences so far
 */
private void _extractCounted(CharSequence text, int gramLength, int len, Map<String, Integer> grams) {
  // Exclusive upper bound for start positions, so that every gram fits within len.
  final int limit = len - (gramLength - 1);
  for (int start = 0; start < limit; start++) {
    final String gram = text.subSequence(start, start + gramLength).toString();
    if (filter != null && !filter.use(gram)) {
      continue;
    }
    final Integer seen = grams.get(gram);
    grams.put(gram, seen == null ? 1 : seen + 1);
  }
}

  /**
   * Adds every n-gram that {@code OldNgramExtractor} extracts from {@code text} to {@code langProfile}.
   *
   * @param langProfile the profile that receives the grams
   * @param text        the text to extract grams from
   */
  public static void addCharSequence(LangProfile langProfile, CharSequence text) {
    //TODO replace with new code.
    // The intended replacement is ngramExtractor.extractGrams(text); an old-vs-new comparison
    // experiment that used to be commented out here has been dropped.
    for (String gram : OldNgramExtractor.extractNGrams(text, null)) {
      langProfile.add(gram);
    }
  }

/**
 * Extracts the counted n-grams of {@code text} and adds each gram together with its count.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @param text the text to learn from
 * @return this builder
 * @throws IllegalStateException if no n-gram extractor has been set
 */
public LanguageProfileBuilder addText(CharSequence text) {
  if (ngramExtractor == null) {
    throw new IllegalStateException("NgramExtractor has not been set yet!");
  }
  final Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
  for (Map.Entry<String, Integer> gramAndCount : countedGrams.entrySet()) {
    addGram(gramAndCount.getKey(), gramAndCount.getValue());
  }
  return this;
}

/**
 * Reads all language profiles found at {@code path} and rebuilds the shared detector and
 * text object factory from them.
 *
 * @param path location handed to {@code LanguageProfileReader.readAll} as a {@code File}
 * @throws IOException declared for profile reading; presumably raised by {@code readAll}
 */
public static void loadModels(Path path) throws IOException {
  final LanguageProfileReader profileReader = new LanguageProfileReader();
  languageProfiles = profileReader.readAll(path.toFile());

  final LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard())
      .withProfiles(languageProfiles);
  detector = detectorBuilder.build();

  textObjectFactory = buildTextObjectFactory();
}

/**
 * Returns a new extractor with the same gram lengths and text padding as this one, using the given filter.
 *
 * @param filter the filter the new extractor should use
 * @return a new extractor; this instance is not modified by this call
 */
public NgramExtractor filter(NgramFilter filter) {
  final NgramExtractor filtered = new NgramExtractor(gramLengths, filter, textPadding);
  return filtered;
}

/**
 * Reads all built-in language profiles and rebuilds the shared detector and
 * text object factory from them.
 *
 * @throws IOException declared for profile reading; presumably raised by {@code readAllBuiltIn}
 */
public static void loadBuiltInModels() throws IOException {
  final LanguageProfileReader profileReader = new LanguageProfileReader();
  languageProfiles = profileReader.readAllBuiltIn();

  final LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard())
      .withProfiles(languageProfiles);
  detector = detectorBuilder.build();

  textObjectFactory = buildTextObjectFactory();
}

/**
 * Returns a new extractor with the same gram lengths and text padding as this one, using the given filter.
 *
 * @param filter the filter the new extractor should use
 * @return a new extractor; this instance is not modified by this call
 */
public NgramExtractor filter(NgramFilter filter) {
  final NgramExtractor filtered = new NgramExtractor(gramLengths, filter, textPadding);
  return filtered;
}

/**
 * Creates an extractor for a single n-gram length, without filter and without text padding.
 *
 * @param gramLength the number of characters per gram
 * @return a new extractor
 */
public static NgramExtractor gramLength(int gramLength) {
  final ImmutableList<Integer> singleLength = ImmutableList.of(gramLength);
  return new NgramExtractor(singleLength, null, null);
}
public static NgramExtractor gramLengths(Integer... gramLength) {

public static NgramExtractor gramLengths(Integer... gramLength) {
  return new NgramExtractor(Arrays.asList(gramLength), null, null);
}

/**
 * Creates an extractor for a single n-gram length, without filter and without text padding.
 *
 * @param gramLength the number of characters per gram
 * @return a new extractor
 */
public static NgramExtractor gramLength(int gramLength) {
  final ImmutableList<Integer> singleLength = ImmutableList.of(gramLength);
  return new NgramExtractor(singleLength, null, null);
}
public static NgramExtractor gramLengths(Integer... gramLength) {

public static NgramExtractor gramLengths(Integer... gramLength) {
  return new NgramExtractor(Arrays.asList(gramLength), null, null);
}

How to use com.optimaize.langdetect.ngram

Best Java code snippets using com.optimaize.langdetect.ngram (Showing top 20 results out of 315)