com.optimaize.langdetect.ngram.NgramExtractor java code examples

/**
 * Returns a new extractor that pads the text with the given character on both sides, so that
 * border grams are produced.
 *
 * <p>Example: with a space ' ' as padding, the input "foo" becomes " foo ", which yields n-grams
 * such as " f".</p>
 *
 * <p>If the text already has that character in the position (for example it starts with it),
 * it is not added there.</p>
 *
 * @param textPadding the padding character, for example a space ' '.
 * @return a new extractor with this instance's gram lengths and filter plus the given padding.
 */
public NgramExtractor textPadding(char textPadding) {
  // Carry over gram lengths and filter; only the padding differs in the new instance.
  final NgramExtractor padded = new NgramExtractor(gramLengths, filter, textPadding);
  return padded;
}

/**
 * Extracts the n-grams of all configured gram lengths from the (padded) text and counts them.
 *
 * @param text the text to extract from; padding is applied before extraction.
 * @return Key = ngram, value = count.
 *         The order is as the n-grams appeared first in the string.
 */
@NotNull
public Map<String,Integer> extractCountedGrams(@NotNull CharSequence text) {
  final CharSequence padded = applyPadding(text);
  final int paddedLength = padded.length();

  // Sum the per-gram-length estimates to pre-size the result map.
  int capacityGuess = 0;
  for (Integer n : gramLengths) {
    capacityGuess += guessNumDistinctiveGrams(paddedLength, n);
  }

  final Map<String,Integer> counts = new LinkedHashMap<>(capacityGuess);
  for (Integer n : gramLengths) {
    _extractCounted(padded, n, paddedLength, counts);
  }
  return counts;
}

/**
 * Runs detection on one block of text. Texts whose length is at most {@code shortTextAlgorithm}
 * go through the counted-grams (short text) path, longer texts through the gram-list (long text) path.
 *
 * @return null if there are no "features" in the text (just noise), i.e. no n-grams were extracted.
 */
@Nullable
private double[] detectBlock(CharSequence text) {
  final boolean useShortTextPath = text.length() <= shortTextAlgorithm;
  if (!useShortTextPath) {
    List<String> grams = ngramExtractor.extractGrams(text);
    return grams.isEmpty() ? null : detectBlockLongText(grams);
  }
  Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
  return countedGrams.isEmpty() ? null : detectBlockShortText(countedGrams);
}

/**
 * Creates the {@link LanguageDetector} from the settings collected by this builder.
 *
 * <p>The n-gram frequency data is created from the added language profiles and the gram lengths
 * of the configured n-gram extractor.</p>
 *
 * @return the configured detector.
 * @throws IllegalStateException if no LanguageProfile was {@link #withProfile added}.
 */
public LanguageDetector build() throws IllegalStateException {
  if (languageProfiles.isEmpty()) {
    // Give the caller an actionable message instead of a bare exception.
    throw new IllegalStateException("No language profile was added; add at least one with withProfile() before calling build()!");
  }
  return new LanguageDetectorImpl(
      NgramFrequencyData.create(languageProfiles, ngramExtractor.getGramLengths()),
      alpha, seed, shortTextAlgorithm,
      prefixFactor, suffixFactor,
      probabilityThreshold, minimalConfidence,
      langWeightingMap,
      ngramExtractor
  );
}

/**
 * Extracts the counted n-grams from the text and adds each of them, with its count, to this builder.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @return this builder.
 * @throws IllegalStateException if the n-gram extractor has not been set.
 */
public LanguageProfileBuilder addText(CharSequence text) {
  if (ngramExtractor == null) {
    throw new IllegalStateException("NgramExtractor has not been set yet!");
  }
  Map<String, Integer> counted = ngramExtractor.extractCountedGrams(text);
  for (Map.Entry<String, Integer> gram : counted.entrySet()) {
    final String ngram = gram.getKey();
    final Integer occurrences = gram.getValue();
    addGram(ngram, occurrences);
  }
  return this;
}

// Excerpt (not a complete method): pad the input first, then read the length of the padded text.
text = applyPadding(text);
int len = text.length();

/**
 * Runs detection on one block of text. Texts whose length is at most {@code shortTextAlgorithm}
 * go through the counted-grams (short text) path, longer texts through the gram-list (long text) path.
 *
 * @return null if there are no "features" in the text (just noise), i.e. no n-grams were extracted.
 */
@Nullable
private double[] detectBlock(CharSequence text) {
  final boolean useShortTextPath = text.length() <= shortTextAlgorithm;
  if (!useShortTextPath) {
    List<String> grams = ngramExtractor.extractGrams(text);
    return grams.isEmpty() ? null : detectBlockLongText(grams);
  }
  Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
  return countedGrams.isEmpty() ? null : detectBlockShortText(countedGrams);
}

/**
 * Creates the {@link LanguageDetector} from the settings collected by this builder.
 *
 * <p>The n-gram frequency data is created from the added language profiles and the gram lengths
 * of the configured n-gram extractor.</p>
 *
 * @return the configured detector.
 * @throws IllegalStateException if no LanguageProfile was {@link #withProfile added}.
 */
public LanguageDetector build() throws IllegalStateException {
  if (languageProfiles.isEmpty()) {
    // Give the caller an actionable message instead of a bare exception.
    throw new IllegalStateException("No language profile was added; add at least one with withProfile() before calling build()!");
  }
  return new LanguageDetectorImpl(
      NgramFrequencyData.create(languageProfiles, ngramExtractor.getGramLengths()),
      alpha, seed, shortTextAlgorithm,
      prefixFactor, suffixFactor,
      probabilityThreshold, minimalConfidence,
      langWeightingMap,
      ngramExtractor
  );
}

/**
 * Extracts the counted n-grams from the text and adds each of them, with its count, to this builder.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @return this builder.
 * @throws IllegalStateException if the n-gram extractor has not been set.
 */
public LanguageProfileBuilder addText(CharSequence text) {
  if (ngramExtractor == null) {
    throw new IllegalStateException("NgramExtractor has not been set yet!");
  }
  Map<String, Integer> counted = ngramExtractor.extractCountedGrams(text);
  for (Map.Entry<String, Integer> gram : counted.entrySet()) {
    final String ngram = gram.getKey();
    final Integer occurrences = gram.getValue();
    addGram(ngram, occurrences);
  }
  return this;
}

// Excerpt (not a complete method): pad the input first, then read the length of the padded text.
text = applyPadding(text);
int len = text.length();

/**
 * Extracts the n-grams of all configured gram lengths from the (padded) text and counts them.
 *
 * @param text the text to extract from; padding is applied before extraction.
 * @return Key = ngram, value = count.
 *         The order is as the n-grams appeared first in the string.
 */
@NotNull
public Map<String,Integer> extractCountedGrams(@NotNull CharSequence text) {
  final CharSequence padded = applyPadding(text);
  final int paddedLength = padded.length();

  // Sum the per-gram-length estimates to pre-size the result map.
  int capacityGuess = 0;
  for (Integer n : gramLengths) {
    capacityGuess += guessNumDistinctiveGrams(paddedLength, n);
  }

  final Map<String,Integer> counts = new LinkedHashMap<>(capacityGuess);
  for (Integer n : gramLengths) {
    _extractCounted(padded, n, paddedLength, counts);
  }
  return counts;
}

/**
 * Returns a new extractor that pads the text with the given character on both sides, so that
 * border grams are produced.
 *
 * <p>Example: with a space ' ' as padding, the input "foo" becomes " foo ", which yields n-grams
 * such as " f".</p>
 *
 * <p>If the text already has that character in the position (for example it starts with it),
 * it is not added there.</p>
 *
 * @param textPadding the padding character, for example a space ' '.
 * @return a new extractor with this instance's gram lengths and filter plus the given padding.
 */
public NgramExtractor textPadding(char textPadding) {
  // Carry over gram lengths and filter; only the padding differs in the new instance.
  final NgramExtractor padded = new NgramExtractor(gramLengths, filter, textPadding);
  return padded;
}

/**
 * Registers a language profile with this builder.
 *
 * @param languageProfile the profile to add.
 * @return this builder.
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 * @throws IllegalArgumentException if the profile does not contain one of the gram lengths the
 *         n-gram extractor is set to handle.
 */
public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) throws IllegalStateException {
  final boolean alreadyAdded = langsAdded.contains(languageProfile.getLocale());
  if (alreadyAdded) {
    throw new IllegalStateException("A language profile for language "+languageProfile.getLocale()+" was added already!");
  }
  // Every gram length the extractor produces must be present in the profile.
  for (Integer requiredLength : ngramExtractor.getGramLengths()) {
    if (languageProfile.getGramLengths().contains(requiredLength)) {
      continue;
    }
    throw new IllegalArgumentException("The NgramExtractor is set to handle "+requiredLength+"-grams but the given language profile for "+languageProfile.getLocale()+" does not support this!");
  }
  // Only record the profile once all checks have passed.
  langsAdded.add(languageProfile.getLocale());
  languageProfiles.add(languageProfile);
  return this;
}
/**

public NgramExtractor filter(NgramFilter filter) {
  return new NgramExtractor(this.gramLengths, filter, this.textPadding);
}

/**
 * Registers a language profile with this builder.
 *
 * @param languageProfile the profile to add.
 * @return this builder.
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 * @throws IllegalArgumentException if the profile does not contain one of the gram lengths the
 *         n-gram extractor is set to handle.
 */
public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) throws IllegalStateException {
  final boolean alreadyAdded = langsAdded.contains(languageProfile.getLocale());
  if (alreadyAdded) {
    throw new IllegalStateException("A language profile for language "+languageProfile.getLocale()+" was added already!");
  }
  // Every gram length the extractor produces must be present in the profile.
  for (Integer requiredLength : ngramExtractor.getGramLengths()) {
    if (languageProfile.getGramLengths().contains(requiredLength)) {
      continue;
    }
    throw new IllegalArgumentException("The NgramExtractor is set to handle "+requiredLength+"-grams but the given language profile for "+languageProfile.getLocale()+" does not support this!");
  }
  // Only record the profile once all checks have passed.
  langsAdded.add(languageProfile.getLocale());
  languageProfiles.add(languageProfile);
  return this;
}
/**

public NgramExtractor filter(NgramFilter filter) {
  return new NgramExtractor(this.gramLengths, filter, this.textPadding);
}

/**
 * Creates an extractor for a single gram length, with neither a filter nor a text padding set.
 *
 * @param gramLength the one n-gram length to extract.
 */
public static NgramExtractor gramLength(int gramLength) {
  final ImmutableList<Integer> singleLength = ImmutableList.of(gramLength);
  return new NgramExtractor(singleLength, null, null);
}
public static NgramExtractor gramLengths(Integer... gramLength) {

public static NgramExtractor gramLengths(Integer... gramLength) {
  return new NgramExtractor(Arrays.asList(gramLength), null, null);
}

/**
 * Creates an extractor for a single gram length, with neither a filter nor a text padding set.
 *
 * @param gramLength the one n-gram length to extract.
 */
public static NgramExtractor gramLength(int gramLength) {
  final ImmutableList<Integer> singleLength = ImmutableList.of(gramLength);
  return new NgramExtractor(singleLength, null, null);
}
public static NgramExtractor gramLengths(Integer... gramLength) {

public static NgramExtractor gramLengths(Integer... gramLength) {
  return new NgramExtractor(Arrays.asList(gramLength), null, null);
}

Javadoc

Class for extracting n-grams out of a text. This class is immutable.

Most used methods

<init>
_extractCounted
applyPadding
extractCountedGrams
extractGrams
Creates the n-grams for a given text in the order they occur. Example: extractSortedGrams("Foo bar",
getGramLengths
guessNumDistinctiveGrams
This is trying to be smart. It also depends on script (alphabet less than ideographic). So I'm not s

Popular in Java

Parsing JSON documents to java classes using gson
getSupportFragmentManager (FragmentActivity)
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
Pointer (com.sun.jna)
An abstraction for a native pointer data type. A Pointer instance represents, on the Java side, a na
BigInteger (java.math)
An immutable arbitrary-precision signed integer. FAST CRYPTOGRAPHY This implementation is efficient f
SocketException (java.net)
This SocketException may be thrown during socket creation or setting options, and is the superclass
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
LogFactory (org.apache.commons.logging)
Factory for creating Log instances, with discovery and configuration features similar to that employ
Kernel (java.awt.image)
Top 12 Jupyter Notebook extensions

How to use NgramExtractor in com.optimaize.langdetect.ngram

Best Java code snippets using com.optimaize.langdetect.ngram.NgramExtractor (Showing top 20 results out of 315)

How to use
NgramExtractor
in
com.optimaize.langdetect.ngram