/** * @param maxLength the maximum number of characters that will be considered - can help * with performance. Don't use values below 100, as this would decrease * accuracy. * @throws IllegalArgumentException if {@code maxLength} is less than 10 * @since 4.2 */ public LanguageIdentifier(int maxLength) { if (maxLength < 10) { throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + maxLength); } this.maxLength = maxLength; try { List<LanguageProfile> profiles = loadProfiles(getLanguageCodes()); languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .minimalConfidence(MINIMAL_CONFIDENCE) .shortTextAlgorithm(SHORT_ALGO_THRESHOLD) .withProfiles(profiles) .build(); textObjectFactory = new TextObjectFactoryBuilder() .maxTextLength(10000) .withTextFilter(UrlTextFilter.getInstance()) .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3)) .withTextFilter(new RemoveEMailSignatureFilter()) .build(); } catch (IOException e) { throw new RuntimeException("Could not set up language identifier", e); } }
/**
 * Returns a new extractor that pads the text with the given character; gram lengths and filter
 * are taken over from this instance.
 *
 * <p>To ensure having border grams, this character is added to the left and right of the text.
 * Example: when textPadding is a space ' ' then a text input "foo" becomes " foo ", ensuring that
 * n-grams like " f" are created.</p>
 *
 * <p>If the text already has such a character in that position (eg starts with), it is not added there.</p>
 *
 * @param textPadding for example a space ' '.
 * @return a new {@link NgramExtractor} carrying the padding character
 */
public NgramExtractor textPadding(char textPadding) {
    // 'textPadding' is the parameter; the other two arguments are this instance's fields.
    return new NgramExtractor(gramLengths, filter, textPadding);
}
/**
 * Counts all n-grams of one fixed length in {@code text} and adds the counts to {@code grams}.
 *
 * @param text       the text to slide the window over
 * @param gramLength the window size, i.e. the length of each n-gram
 * @param len        the length of {@code text} as passed in by the caller
 * @param grams      receives the counts; existing entries are incremented
 */
private void _extractCounted(CharSequence text, int gramLength, int len, Map<String, Integer> grams) {
    // Exclusive upper bound for the start index, so that every window fits into the text.
    final int stop = len - gramLength + 1;
    for (int start = 0; start < stop; start++) {
        final String gram = text.subSequence(start, start + gramLength).toString();
        if (filter != null && !filter.use(gram)) {
            // rejected by the configured filter
            continue;
        }
        final Integer seen = grams.get(gram);
        grams.put(gram, seen == null ? 1 : seen + 1);
    }
}
/**
 * Extracts the n-grams of all configured lengths from the text (after padding has been applied)
 * and counts how often each one occurs.
 *
 * @param text the text to extract n-grams from
 * @return Key = ngram, value = count.
 *         Iteration follows insertion order: the configured gram lengths are processed one after
 *         the other, and within one length the n-grams are in the order of their first appearance.
 */
@NotNull
public Map<String,Integer> extractCountedGrams(@NotNull CharSequence text) {
    final CharSequence padded = applyPadding(text);
    final int paddedLength = padded.length();

    // Pre-size the map with the summed per-length estimates.
    int capacityHint = 0;
    for (Integer n : gramLengths) {
        capacityHint += guessNumDistinctiveGrams(paddedLength, n);
    }

    final Map<String,Integer> counts = new LinkedHashMap<>(capacityHint);
    for (Integer n : gramLengths) {
        _extractCounted(padded, n, paddedLength, counts);
    }
    return counts;
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/**
 * Extracts the counted n-grams from the text and adds each of them, with its count, to this builder.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @param text the text to take the n-grams from
 * @return this builder
 * @throws IllegalStateException if no n-gram extractor has been set
 */
public LanguageProfileBuilder addText(CharSequence text) {
    if (ngramExtractor == null) {
        throw new IllegalStateException("NgramExtractor has not been set yet!");
    }
    Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
    for (Map.Entry<String, Integer> gramAndCount : countedGrams.entrySet()) {
        addGram(gramAndCount.getKey(), gramAndCount.getValue());
    }
    return this;
}
/**
 * Creates the detector from the profiles and settings collected by this builder.
 *
 * @return a new detector instance
 * @throws IllegalStateException if no LanguageProfile was {@link #withProfile added}.
 */
public LanguageDetector build() throws IllegalStateException {
    if (languageProfiles.isEmpty()) {
        // Fail with an explanation instead of a message-less exception.
        throw new IllegalStateException("No language profile has been added; at least one is required to build a detector.");
    }
    return new LanguageDetectorImpl(
            // frequency data is restricted to the gram lengths the extractor is configured with
            NgramFrequencyData.create(languageProfiles, ngramExtractor.getGramLengths()),
            alpha,
            seed,
            shortTextAlgorithm,
            prefixFactor,
            suffixFactor,
            probabilityThreshold,
            minimalConfidence,
            langWeightingMap,
            ngramExtractor
    );
}
private static com.optimaize.langdetect.LanguageDetector createDetector(List<LanguageProfile> languageProfiles, Map<String, Float> languageProbabilities) { // FUTURE currently the short text algorithm doesn't normalize probabilities until the end, which // means you can often get 0 probabilities. So we pick a very short length for this limit. LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()) .shortTextAlgorithm(30) .withProfiles(languageProfiles); if (languageProbabilities != null) { Map<LdLocale, Double> languageWeights = new HashMap<>(languageProbabilities.size()); for (String language : languageProbabilities.keySet()) { Double priority = (double)languageProbabilities.get(language); languageWeights.put(LdLocale.fromString(language), priority); } builder.languagePriorities(languageWeights); } return builder.build(); }
/**
 * Returns a new extractor that pads the text with the given character; gram lengths and filter
 * are taken over from this instance.
 *
 * <p>To ensure having border grams, this character is added to the left and right of the text.
 * Example: when textPadding is a space ' ' then a text input "foo" becomes " foo ", ensuring that
 * n-grams like " f" are created.</p>
 *
 * <p>If the text already has such a character in that position (eg starts with), it is not added there.</p>
 *
 * @param textPadding for example a space ' '.
 * @return a new {@link NgramExtractor} carrying the padding character
 */
public NgramExtractor textPadding(char textPadding) {
    // 'textPadding' is the parameter; the other two arguments are this instance's fields.
    return new NgramExtractor(gramLengths, filter, textPadding);
}
/**
 * Counts all n-grams of one fixed length in {@code text} and adds the counts to {@code grams}.
 *
 * @param text       the text to slide the window over
 * @param gramLength the window size, i.e. the length of each n-gram
 * @param len        the length of {@code text} as passed in by the caller
 * @param grams      receives the counts; existing entries are incremented
 */
private void _extractCounted(CharSequence text, int gramLength, int len, Map<String, Integer> grams) {
    // Exclusive upper bound for the start index, so that every window fits into the text.
    final int stop = len - gramLength + 1;
    for (int start = 0; start < stop; start++) {
        final String gram = text.subSequence(start, start + gramLength).toString();
        if (filter != null && !filter.use(gram)) {
            // rejected by the configured filter
            continue;
        }
        final Integer seen = grams.get(gram);
        grams.put(gram, seen == null ? 1 : seen + 1);
    }
}
public static void addCharSequence(LangProfile langProfile, CharSequence text) { //TODO replace with new code. // List<String> old = OldNgramExtractor.extractNGrams(text, null); // List<String> nuu = ngramExtractor.extractGrams(text); // // Set<String> oldSet = new HashSet<>(old); // Set<String> nuuSet = new HashSet<>(nuu); // // ArrayList<String> justNuu = new ArrayList<>(nuu); // justNuu.removeAll(old); // // ArrayList<String> justOld = new ArrayList<>(old); // justOld.removeAll(nuu); // // System.out.println(text); // for (String s : ngramExtractor.extractGrams(text)) { // langProfile.add(s); // } for (String s : OldNgramExtractor.extractNGrams(text, null)) { langProfile.add(s); } }
/**
 * Extracts the counted n-grams from the text and adds each of them, with its count, to this builder.
 *
 * <p>In order to use this you must set the {@link #ngramExtractor} first.</p>
 *
 * @param text the text to take the n-grams from
 * @return this builder
 * @throws IllegalStateException if no n-gram extractor has been set
 */
public LanguageProfileBuilder addText(CharSequence text) {
    if (ngramExtractor == null) {
        throw new IllegalStateException("NgramExtractor has not been set yet!");
    }
    Map<String, Integer> countedGrams = ngramExtractor.extractCountedGrams(text);
    for (Map.Entry<String, Integer> gramAndCount : countedGrams.entrySet()) {
        addGram(gramAndCount.getKey(), gramAndCount.getValue());
    }
    return this;
}
/**
 * Reads all language profiles from the given location and re-creates the static detector and
 * text object factory from them.
 *
 * @param path the location handed to the profile reader (converted with {@code toFile()})
 * @throws IOException declared for failures while loading
 */
public static void loadModels(Path path) throws IOException {
    LanguageProfileReader profileReader = new LanguageProfileReader();
    languageProfiles = profileReader.readAll(path.toFile());

    LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .withProfiles(languageProfiles);
    detector = detectorBuilder.build();

    textObjectFactory = buildTextObjectFactory();
}
/**
 * Returns a new extractor that uses the given filter; gram lengths and text padding are taken
 * over from this instance.
 *
 * @param filter the filter for the new extractor
 * @return a new {@link NgramExtractor} carrying the filter
 */
public NgramExtractor filter(NgramFilter filter) {
    // 'filter' is the parameter; the other two arguments are this instance's fields.
    return new NgramExtractor(gramLengths, filter, textPadding);
}
/**
 * Reads all built-in language profiles and re-creates the static detector and text object
 * factory from them.
 *
 * @throws IOException declared for failures while loading
 */
public static void loadBuiltInModels() throws IOException {
    LanguageProfileReader profileReader = new LanguageProfileReader();
    languageProfiles = profileReader.readAllBuiltIn();

    LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .withProfiles(languageProfiles);
    detector = detectorBuilder.build();

    textObjectFactory = buildTextObjectFactory();
}
/**
 * Returns a new extractor that uses the given filter; gram lengths and text padding are taken
 * over from this instance.
 *
 * @param filter the filter for the new extractor
 * @return a new {@link NgramExtractor} carrying the filter
 */
public NgramExtractor filter(NgramFilter filter) {
    // 'filter' is the parameter; the other two arguments are this instance's fields.
    return new NgramExtractor(gramLengths, filter, textPadding);
}
/**
 * Creates an extractor that is configured with exactly one n-gram length.
 *
 * <p>The filter and text padding constructor arguments are both passed as {@code null}.</p>
 *
 * @param gramLength the single n-gram length for the new extractor
 * @return a new {@link NgramExtractor}
 */
public static NgramExtractor gramLength(int gramLength) {
    return new NgramExtractor(ImmutableList.of(gramLength), null, null);
}
public static NgramExtractor gramLengths(Integer... gramLength) {
/**
 * Creates an extractor that is configured with the given n-gram lengths.
 *
 * @param gramLength the n-gram lengths for the new extractor
 * @return a new {@link NgramExtractor}
 */
public static NgramExtractor gramLengths(Integer... gramLength) {
    return new NgramExtractor(
            Arrays.asList(gramLength),
            null,   // no filter
            null    // no text padding
    );
}
/**
 * Creates an extractor that is configured with exactly one n-gram length.
 *
 * <p>The filter and text padding constructor arguments are both passed as {@code null}.</p>
 *
 * @param gramLength the single n-gram length for the new extractor
 * @return a new {@link NgramExtractor}
 */
public static NgramExtractor gramLength(int gramLength) {
    return new NgramExtractor(ImmutableList.of(gramLength), null, null);
}
public static NgramExtractor gramLengths(Integer... gramLength) {
/**
 * Creates an extractor that is configured with the given n-gram lengths.
 *
 * @param gramLength the n-gram lengths for the new extractor
 * @return a new {@link NgramExtractor}
 */
public static NgramExtractor gramLengths(Integer... gramLength) {
    return new NgramExtractor(
            Arrays.asList(gramLength),
            null,   // no filter
            null    // no text padding
    );
}