/**
 * Returns a new extractor that keeps this extractor's gram lengths and filter
 * but uses the given padding character.
 *
 * <p>The padding character is added to the left and right of the text so that
 * border grams exist. Example: with a space ' ' as padding, the input "foo"
 * becomes " foo ", which ensures that n-grams like " f" are created.</p>
 *
 * <p>If the text already has that character at the position in question
 * (e.g. it already starts with it), the character is not added there.</p>
 *
 * @param textPadding the padding character, for example a space ' '.
 * @return a new NgramExtractor; this instance is not modified by this call.
 */
public NgramExtractor textPadding(char textPadding) {
    return new NgramExtractor(
            gramLengths,
            filter,
            textPadding
    );
}
/**
 * Extracts the n-grams of the given text together with their counts.
 *
 * <p>The text is passed through {@code applyPadding} first; the grams are then
 * collected once per configured gram length, in the order of {@code gramLengths}.</p>
 *
 * @param text the text to extract the n-grams from.
 * @return Key = ngram, value = count.
 *         The order is as the n-grams appeared first in the string
 *         (the map is a {@link LinkedHashMap}, so iteration follows insertion order).
 */
@NotNull
public Map<String,Integer> extractCountedGrams(@NotNull CharSequence text) {
    final CharSequence padded = applyPadding(text);
    final int paddedLength = padded.length();

    // Pre-size the map with the summed per-length estimates.
    int capacityEstimate = 0;
    for (Integer gramLength : gramLengths) {
        capacityEstimate += guessNumDistinctiveGrams(paddedLength, gramLength);
    }

    final Map<String,Integer> counted = new LinkedHashMap<>(capacityEstimate);
    for (Integer gramLength : gramLengths) {
        _extractCounted(padded, gramLength, paddedLength, counted);
    }
    return counted;
}
/**
 * Runs detection for one block of text.
 *
 * <p>Texts whose length is at most {@code shortTextAlgorithm} are processed from
 * counted n-grams via {@code detectBlockShortText}; longer texts are processed
 * from the plain list of n-grams via {@code detectBlockLongText}.</p>
 *
 * @param text the text block to analyze.
 * @return null if there are no "features" in the text (just noise), i.e. when
 *         no n-grams could be extracted.
 */
@Nullable
private double[] detectBlock(CharSequence text) {
    final boolean isShortText = text.length() <= shortTextAlgorithm;

    if (isShortText) {
        Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
        return countedGrams.isEmpty() ? null : detectBlockShortText(countedGrams);
    }

    List<String> grams = ngramExtractor.extractGrams(text);
    return grams.isEmpty() ? null : detectBlockLongText(grams);
}
/**
 * Creates the {@link LanguageDetector} from the settings collected by this builder.
 *
 * <p>The n-gram frequency data is created from the added language profiles and
 * the gram lengths of the configured {@code ngramExtractor}.</p>
 *
 * @return a new detector instance.
 * @throws IllegalStateException if no LanguageProfile was {@link #withProfile added}.
 */
public LanguageDetector build() throws IllegalStateException {
    if (languageProfiles.isEmpty()) {
        // Give the caller an actionable message instead of a message-less exception.
        throw new IllegalStateException("Cannot build a LanguageDetector: no language profile was added.");
    }
    return new LanguageDetectorImpl(
            NgramFrequencyData.create(languageProfiles, ngramExtractor.getGramLengths()),
            alpha,
            seed,
            shortTextAlgorithm,
            prefixFactor,
            suffixFactor,
            probabilityThreshold,
            minimalConfidence,
            langWeightingMap,
            ngramExtractor
    );
}
/**
 * Extracts the counted n-grams from the given text and adds each of them,
 * with its count, to this builder.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @param text the text to take the n-grams from.
 * @return this builder.
 * @throws IllegalStateException if the ngramExtractor has not been set.
 */
public LanguageProfileBuilder addText(CharSequence text) {
    if (ngramExtractor == null) {
        throw new IllegalStateException("NgramExtractor has not been set yet!");
    }

    Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
    for (Map.Entry<String, Integer> gramAndCount : countedGrams.entrySet()) {
        addGram(gramAndCount.getKey(), gramAndCount.getValue());
    }
    return this;
}
// Run the text through applyPadding first, then take the length of the resulting (possibly padded) text.
text = applyPadding(text); int len = text.length();
/**
 * Runs detection for one block of text.
 *
 * <p>Texts whose length is at most {@code shortTextAlgorithm} are processed from
 * counted n-grams via {@code detectBlockShortText}; longer texts are processed
 * from the plain list of n-grams via {@code detectBlockLongText}.</p>
 *
 * @param text the text block to analyze.
 * @return null if there are no "features" in the text (just noise), i.e. when
 *         no n-grams could be extracted.
 */
@Nullable
private double[] detectBlock(CharSequence text) {
    final boolean isShortText = text.length() <= shortTextAlgorithm;

    if (isShortText) {
        Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
        return countedGrams.isEmpty() ? null : detectBlockShortText(countedGrams);
    }

    List<String> grams = ngramExtractor.extractGrams(text);
    return grams.isEmpty() ? null : detectBlockLongText(grams);
}
/**
 * Creates the {@link LanguageDetector} from the settings collected by this builder.
 *
 * <p>The n-gram frequency data is created from the added language profiles and
 * the gram lengths of the configured {@code ngramExtractor}.</p>
 *
 * @return a new detector instance.
 * @throws IllegalStateException if no LanguageProfile was {@link #withProfile added}.
 */
public LanguageDetector build() throws IllegalStateException {
    if (languageProfiles.isEmpty()) {
        // Give the caller an actionable message instead of a message-less exception.
        throw new IllegalStateException("Cannot build a LanguageDetector: no language profile was added.");
    }
    return new LanguageDetectorImpl(
            NgramFrequencyData.create(languageProfiles, ngramExtractor.getGramLengths()),
            alpha,
            seed,
            shortTextAlgorithm,
            prefixFactor,
            suffixFactor,
            probabilityThreshold,
            minimalConfidence,
            langWeightingMap,
            ngramExtractor
    );
}
/**
 * Extracts the counted n-grams from the given text and adds each of them,
 * with its count, to this builder.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @param text the text to take the n-grams from.
 * @return this builder.
 * @throws IllegalStateException if the ngramExtractor has not been set.
 */
public LanguageProfileBuilder addText(CharSequence text) {
    if (ngramExtractor == null) {
        throw new IllegalStateException("NgramExtractor has not been set yet!");
    }

    Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
    for (Map.Entry<String, Integer> gramAndCount : countedGrams.entrySet()) {
        addGram(gramAndCount.getKey(), gramAndCount.getValue());
    }
    return this;
}
// Run the text through applyPadding first, then take the length of the resulting (possibly padded) text.
text = applyPadding(text); int len = text.length();
/**
 * Extracts the n-grams of the given text together with their counts.
 *
 * <p>The text is passed through {@code applyPadding} first; the grams are then
 * collected once per configured gram length, in the order of {@code gramLengths}.</p>
 *
 * @param text the text to extract the n-grams from.
 * @return Key = ngram, value = count.
 *         The order is as the n-grams appeared first in the string
 *         (the map is a {@link LinkedHashMap}, so iteration follows insertion order).
 */
@NotNull
public Map<String,Integer> extractCountedGrams(@NotNull CharSequence text) {
    final CharSequence padded = applyPadding(text);
    final int paddedLength = padded.length();

    // Pre-size the map with the summed per-length estimates.
    int capacityEstimate = 0;
    for (Integer gramLength : gramLengths) {
        capacityEstimate += guessNumDistinctiveGrams(paddedLength, gramLength);
    }

    final Map<String,Integer> counted = new LinkedHashMap<>(capacityEstimate);
    for (Integer gramLength : gramLengths) {
        _extractCounted(padded, gramLength, paddedLength, counted);
    }
    return counted;
}
/**
 * Returns a new extractor that keeps this extractor's gram lengths and filter
 * but uses the given padding character.
 *
 * <p>The padding character is added to the left and right of the text so that
 * border grams exist. Example: with a space ' ' as padding, the input "foo"
 * becomes " foo ", which ensures that n-grams like " f" are created.</p>
 *
 * <p>If the text already has that character at the position in question
 * (e.g. it already starts with it), the character is not added there.</p>
 *
 * @param textPadding the padding character, for example a space ' '.
 * @return a new NgramExtractor; this instance is not modified by this call.
 */
public NgramExtractor textPadding(char textPadding) {
    return new NgramExtractor(
            gramLengths,
            filter,
            textPadding
    );
}
/**
 * Registers a language profile with this builder.
 *
 * <p>Every gram length reported by the configured {@code ngramExtractor} must
 * also be contained in the profile's gram lengths. The profile is only
 * registered after both checks have passed.</p>
 *
 * @param languageProfile the profile to add.
 * @return this builder.
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 * @throws IllegalArgumentException if the profile lacks one of the gram lengths the NgramExtractor handles.
 */
public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) throws IllegalStateException {
    final boolean alreadyAdded = langsAdded.contains(languageProfile.getLocale());
    if (alreadyAdded) {
        throw new IllegalStateException("A language profile for language "+languageProfile.getLocale()+" was added already!");
    }

    for (Integer requiredLength : ngramExtractor.getGramLengths()) {
        if (languageProfile.getGramLengths().contains(requiredLength)) {
            continue;
        }
        throw new IllegalArgumentException("The NgramExtractor is set to handle "+requiredLength+"-grams but the given language profile for "+languageProfile.getLocale()+" does not support this!");
    }

    langsAdded.add(languageProfile.getLocale());
    languageProfiles.add(languageProfile);
    return this;
}
/**
/**
 * Returns a new extractor that keeps this extractor's gram lengths and text
 * padding but uses the given filter.
 *
 * @param filter the filter the new extractor is created with.
 * @return a new NgramExtractor; this instance is not modified by this call.
 */
public NgramExtractor filter(NgramFilter filter) {
    return new NgramExtractor(
            gramLengths,
            filter,
            textPadding
    );
}
/**
 * Registers a language profile with this builder.
 *
 * <p>Every gram length reported by the configured {@code ngramExtractor} must
 * also be contained in the profile's gram lengths. The profile is only
 * registered after both checks have passed.</p>
 *
 * @param languageProfile the profile to add.
 * @return this builder.
 * @throws IllegalStateException if a profile for the same language was added already (must be a userland bug).
 * @throws IllegalArgumentException if the profile lacks one of the gram lengths the NgramExtractor handles.
 */
public LanguageDetectorBuilder withProfile(LanguageProfile languageProfile) throws IllegalStateException {
    final boolean alreadyAdded = langsAdded.contains(languageProfile.getLocale());
    if (alreadyAdded) {
        throw new IllegalStateException("A language profile for language "+languageProfile.getLocale()+" was added already!");
    }

    for (Integer requiredLength : ngramExtractor.getGramLengths()) {
        if (languageProfile.getGramLengths().contains(requiredLength)) {
            continue;
        }
        throw new IllegalArgumentException("The NgramExtractor is set to handle "+requiredLength+"-grams but the given language profile for "+languageProfile.getLocale()+" does not support this!");
    }

    langsAdded.add(languageProfile.getLocale());
    languageProfiles.add(languageProfile);
    return this;
}
/**
/**
 * Returns a new extractor that keeps this extractor's gram lengths and text
 * padding but uses the given filter.
 *
 * @param filter the filter the new extractor is created with.
 * @return a new NgramExtractor; this instance is not modified by this call.
 */
public NgramExtractor filter(NgramFilter filter) {
    return new NgramExtractor(
            gramLengths,
            filter,
            textPadding
    );
}
/**
 * Creates an extractor for a single gram length.
 *
 * <p>The extractor is created with neither a filter nor a text padding
 * (both are passed as null).</p>
 *
 * @param gramLength the only gram length the extractor is created with.
 * @return a new NgramExtractor.
 */
public static NgramExtractor gramLength(int gramLength) {
    return new NgramExtractor(
            ImmutableList.of(gramLength),
            null,
            null
    );
}
public static NgramExtractor gramLengths(Integer... gramLength) {
/**
 * Creates an extractor for several gram lengths.
 *
 * <p>The extractor is created with neither a filter nor a text padding
 * (both are passed as null). The given values are handed over as a list view
 * of the varargs array ({@link Arrays#asList}).</p>
 *
 * @param gramLength the gram lengths the extractor is created with.
 * @return a new NgramExtractor.
 */
public static NgramExtractor gramLengths(Integer... gramLength) {
    return new NgramExtractor(
            Arrays.asList(gramLength),
            null,
            null
    );
}
/**
 * Creates an extractor for a single gram length.
 *
 * <p>The extractor is created with neither a filter nor a text padding
 * (both are passed as null).</p>
 *
 * @param gramLength the only gram length the extractor is created with.
 * @return a new NgramExtractor.
 */
public static NgramExtractor gramLength(int gramLength) {
    return new NgramExtractor(
            ImmutableList.of(gramLength),
            null,
            null
    );
}
public static NgramExtractor gramLengths(Integer... gramLength) {
/**
 * Creates an extractor for several gram lengths.
 *
 * <p>The extractor is created with neither a filter nor a text padding
 * (both are passed as null). The given values are handed over as a list view
 * of the varargs array ({@link Arrays#asList}).</p>
 *
 * @param gramLength the gram lengths the extractor is created with.
 * @return a new NgramExtractor.
 */
public static NgramExtractor gramLengths(Integer... gramLength) {
    return new NgramExtractor(
            Arrays.asList(gramLength),
            null,
            null
    );
}