/** * @return language or {@code null} if language could not be identified */ @Nullable private Map.Entry<String, Double> detectLanguageCode(String text) { List<com.optimaize.langdetect.DetectedLanguage> lang = languageDetector.getProbabilities(text); // comment in for debugging: //System.out.println(languageDetector.getProbabilities(textObject)); if (lang.size() > 0) { String code = lang.get(0).getLocale().getLanguage(); double prob = lang.get(0).getProbability(); return new AbstractMap.SimpleImmutableEntry<>(code, prob); } else { return null; } }
/**
 * Builds a language name for a locale by delegating to {@code LanguageNames.makeName}.
 *
 * @param locale locale supplying the language, and optionally a script and a region
 * @return whatever {@code LanguageNames.makeName} produces; an absent script or region is
 *         passed to it as {@code null}
 */
private static String makeLanguageName(LdLocale locale) {
    String language = locale.getLanguage();
    String script = locale.getScript().orNull();
    String region = locale.getRegion().orNull();
    return LanguageNames.makeName(language, script, region);
}
private String getLangString(DetectedLanguage detectedLanguage) { //So that we have mapping between lang id and common-tokens file names String lang = detectedLanguage.getLocale().getLanguage(); if ("zh".equals(lang)) { if (detectedLanguage.getLocale().getRegion().isPresent()) { lang += "-" + detectedLanguage.getLocale().getRegion().get().toLowerCase(Locale.US); } else { //hope for the best lang += "-cn"; } } return lang; }
@Test(timeout = 10000) public void testDefenseAgainstBadRegexInOptimaize() throws Exception { //TIKA-2777 StringBuilder sb = new StringBuilder(); for (int i = 0; i < 50000; i++) { sb.append("a"); } LanguageIDWrapper.loadBuiltInModels(); Optional<LdLocale> optional = LanguageIDWrapper.detect(sb.toString()); Assert.assertEquals("so", optional.get().getLanguage()); } }
/**
 * Accepts a profile when its language code equals {@code isoString}
 * (a variable captured from the enclosing scope, which is not visible here).
 */
@Override
public boolean apply(LanguageProfile languageProfile) {
    String profileLanguage = languageProfile.getLocale().getLanguage();
    return profileLanguage.equals(isoString);
}
});
/**
 * Accepts a detected language when its language code equals that of {@code languageProfile}
 * (a variable captured from the enclosing scope, which is not visible here).
 */
public boolean apply(DetectedLanguage language) {
    String detectedCode = language.getLocale().getLanguage();
    String profileCode = languageProfile.getLocale().getLanguage();
    return detectedCode.equals(profileCode);
}
});
/**
 * Matches profiles by language code: returns {@code true} when the profile's language
 * equals {@code isoString} (defined in the enclosing scope, outside this view).
 */
@Override
public boolean apply(LanguageProfile languageProfile) {
    String candidateCode = languageProfile.getLocale().getLanguage();
    boolean matches = candidateCode.equals(isoString);
    return matches;
}
});
/**
 * Matches detected languages by language code: returns {@code true} when the detected
 * language code equals the language code of {@code languageProfile} (defined in the
 * enclosing scope, outside this view).
 */
public boolean apply(DetectedLanguage language) {
    String candidateCode = language.getLocale().getLanguage();
    String expectedCode = languageProfile.getLocale().getLanguage();
    boolean matches = candidateCode.equals(expectedCode);
    return matches;
}
});
/**
 * Detects the language of the given text on a best-effort basis.
 *
 * @param text the text to analyze
 * @return the detected language code, or an empty string when no language was detected
 *         or when detection threw an exception
 */
public synchronized String detectLanguage(String text) {
    String detected = "";
    try {
        Optional<LdLocale> locale = languageDetector.detect(textObjectFactory.forText(text));
        if (locale.isPresent()) {
            detected = locale.get().getLanguage();
        }
    } catch (Exception ignored) {
        // Deliberately swallowed: any failure during detection is reported as "no language".
        // NOTE(review): consider logging here once a logger is available in this class.
    }
    return detected;
}
/**
 * Picks a language code from a list of detection candidates.
 *
 * @param languages detection candidates; only the first entry is ever reported
 * @return the language code of the first candidate, or {@code null} when the list is empty
 *         or when the first candidate's probability is below 0.50 while other candidates exist
 */
private static String detectLanguage(List<DetectedLanguage> languages) {
    if (languages.isEmpty()) {
        return null;
    }
    DetectedLanguage first = languages.get(0);
    // A low-probability first candidate is only rejected when there are competing candidates.
    boolean ambiguous = first.getProbability() < 0.50d && languages.size() > 1;
    return ambiguous ? null : first.getLocale().getLanguage();
}
/** * identify language of a text * @param text inserted * @return identified language (or enpty if no language could be detected) */ public String identifyLanguage(String text) { TextObject textObject = textObjectFactory.forText(text); Optional<LdLocale> lang = languageDetector.detect(textObject); // no language present? if (lang.isPresent()) return lang.get().getLanguage(); return ""; // fallback to none } }
/**
 * Guesses the language of the input using the optimaize detector.
 *
 * @param input the text to analyze
 * @return the {@code Language} obtained from a {@link Locale} built from the detected
 *         language code, or {@code Language.UNKNOWN} when the detector reports no language
 */
private static Language guessLanguageUsingOptimaize(String input) {
    Optional<LdLocale> detected = languageDetector.detect(textObjectFactory.forText(input));
    if (detected.isPresent()) {
        String code = detected.get().getLanguage();
        return Language.fromLocale(new Locale(code));
    }
    return Language.UNKNOWN;
}
/** * @return language or {@code null} if language could not be identified */ @Nullable private Map.Entry<String, Double> detectLanguageCode(String text) { List<com.optimaize.langdetect.DetectedLanguage> lang = languageDetector.getProbabilities(text); // comment in for debugging: //System.out.println(languageDetector.getProbabilities(textObject)); if (lang.size() > 0) { String code = lang.get(0).getLocale().getLanguage(); double prob = lang.get(0).getProbability(); return new AbstractMap.SimpleImmutableEntry<>(code, prob); } else { return null; } }
/**
 * Detects the language of the given text, initializing this object first if needed.
 *
 * @param text the text to analyze; may be {@code null}
 * @return the detected language code, or {@code LANGUAGE_NA} when {@code text} is
 *         {@code null} or the detector reports no language
 */
public String detectLanguage(String text) {
    if (!initialized) {
        initialize();
    }
    if (text == null) {
        return LANGUAGE_NA;
    }
    Optional<LdLocale> detected = languageDetector.detect(textObjectFactory.forText(text));
    return detected.isPresent() ? detected.get().getLanguage() : LANGUAGE_NA;
}
/**
 * Delegates to {@code LanguageNames.makeName} with the parts of the given locale.
 *
 * @param locale locale providing the language plus an optional script and region
 * @return the result of {@code LanguageNames.makeName}; a missing script or region is
 *         handed over as {@code null}
 */
private static String makeLanguageName(LdLocale locale) {
    String languageCode = locale.getLanguage();
    String scriptOrNull = locale.getScript().orNull();
    String regionOrNull = locale.getRegion().orNull();
    return LanguageNames.makeName(languageCode, scriptOrNull, regionOrNull);
}
/**
 * Creates a language name from the language, script and region of a locale via
 * {@code LanguageNames.makeName}.
 *
 * @param locale the locale to name
 * @return the result of {@code LanguageNames.makeName}; an absent script or region is
 *         passed as {@code null}
 */
private String makeLanguageName(LdLocale locale) {
    String language = locale.getLanguage();
    String script = locale.getScript().orNull();
    String region = locale.getRegion().orNull();
    return LanguageNames.makeName(language, script, region);
}
/**
 * Determines a language from a textual source.
 *
 * <p>Abbreviations are lower-cased with {@code Locale.ROOT} so that matching does not depend
 * on the JVM default locale (under a Turkish or Azeri default locale, {@code "I"} lower-cases
 * to a dotless {@code "ı"}, which would break the comparison against language codes).
 *
 * @param source textual source
 * @return the {@code Language} whose lower-cased abbreviation equals the detected language
 *         code, or {@code Constants.defaultLanguage} when none matches; when the detector
 *         reports no language, the default language's abbreviation is used as the code
 */
private static Language getLanguageFrom(String source) {
    TextObject textObject = CommonTextObjectFactories.forDetectingOnLargeText().forText(source);
    LdLocale lang = getLanguageDetector().detect(textObject)
            .or(LdLocale.fromString(
                    Constants.defaultLanguage.getAbrev().toLowerCase(java.util.Locale.ROOT)));
    String detectedCode = lang.getLanguage();
    for (Language lg : Language.values()) {
        if (detectedCode.equals(lg.getAbrev().toLowerCase(java.util.Locale.ROOT))) {
            return lg;
        }
    }
    return Constants.defaultLanguage;
}
// Build a profile from the current builder state and pass its language code to removeLanguageProfile.
this.removeLanguageProfile(this.languageProfileBuilder.build().getLocale().getLanguage());
/**
 * Detects the language of the given text, building the shared detector on first use.
 *
 * @param text the text to analyze
 * @return the language code of the first candidate returned by the detector,
 *         or {@code "N/A"} when the detector returns no candidates
 * @throws IOException declared by this method; presumably raised while reading the built-in
 *         language profiles (verify against the library)
 */
public static String detectLanguage(String text) throws IOException {
    if (languageDetector == null) {
        // First call: create the detector from all built-in profiles with standard n-gram extraction.
        languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                .shortTextAlgorithm(0)
                .withProfiles(new LanguageProfileReader().readAllBuiltIn())
                .build();
    }
    List<DetectedLanguage> candidates = languageDetector.getProbabilities(text);
    return candidates.isEmpty() ? "N/A" : candidates.get(0).getLocale().getLanguage();
}
// Language code taken from the locale of `lang`.
String code = lang.getLocale().getLanguage();
// Add that code to the metadata of the parse entry for `url`, under the key `mdKey`.
parse.get(url).getMetadata().addValue(mdKey, code);