/** * @param maxLength the maximum number of characters that will be considered - can help * with performance. Don't use values below 100, as this would decrease * accuracy. * @throws IllegalArgumentException if {@code maxLength} is less than 10 * @since 4.2 */ public LanguageIdentifier(int maxLength) { if (maxLength < 10) { throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + maxLength); } this.maxLength = maxLength; try { List<LanguageProfile> profiles = loadProfiles(getLanguageCodes()); languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .minimalConfidence(MINIMAL_CONFIDENCE) .shortTextAlgorithm(SHORT_ALGO_THRESHOLD) .withProfiles(profiles) .build(); textObjectFactory = new TextObjectFactoryBuilder() .maxTextLength(10000) .withTextFilter(UrlTextFilter.getInstance()) .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3)) .withTextFilter(new RemoveEMailSignatureFilter()) .build(); } catch (IOException e) { throw new RuntimeException("Could not set up language identifier", e); } }
/**
 * Builds a {@link TextObjectFactory} configured with two text filters, added
 * in this order: the shared {@code UrlTextFilter} instance, then a
 * {@code RemoveMinorityScriptsTextFilter} created with threshold {@code 0.3}.
 * No maximum text length is set on the builder.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forIndexing() {
  TextObjectFactoryBuilder factoryBuilder = new TextObjectFactoryBuilder();
  factoryBuilder = factoryBuilder.withTextFilter(UrlTextFilter.getInstance());
  factoryBuilder = factoryBuilder.withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return factoryBuilder.build();
}
/**
 * Builds a {@link TextObjectFactory} configured with two text filters, added
 * in this order: the shared {@code UrlTextFilter} instance, then a
 * {@code RemoveMinorityScriptsTextFilter} created with threshold {@code 0.3}.
 * No maximum text length is set on the builder.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forIndexing() {
  TextObjectFactoryBuilder factoryBuilder = new TextObjectFactoryBuilder();
  factoryBuilder = factoryBuilder.withTextFilter(UrlTextFilter.getInstance());
  factoryBuilder = factoryBuilder.withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return factoryBuilder.build();
}
/**
 * Builds a {@link TextObjectFactory} with a maximum text length of
 * {@code 10000} and two text filters, added in this order: the shared
 * {@code UrlTextFilter} instance, then a
 * {@code RemoveMinorityScriptsTextFilter} created with threshold {@code 0.3}.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forDetectingOnLargeText() {
  TextObjectFactoryBuilder factoryBuilder = new TextObjectFactoryBuilder().maxTextLength(10000);
  factoryBuilder = factoryBuilder.withTextFilter(UrlTextFilter.getInstance());
  factoryBuilder = factoryBuilder.withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return factoryBuilder.build();
}
/**
 * Builds a {@link TextObjectFactory} with a maximum text length of
 * {@code 10000} and two text filters, added in this order: the shared
 * {@code UrlTextFilter} instance, then a
 * {@code RemoveMinorityScriptsTextFilter} created with threshold {@code 0.3}.
 *
 * @return the newly built factory
 */
public static TextObjectFactory forDetectingOnLargeText() {
  TextObjectFactoryBuilder factoryBuilder = new TextObjectFactoryBuilder().maxTextLength(10000);
  factoryBuilder = factoryBuilder.withTextFilter(UrlTextFilter.getInstance());
  factoryBuilder = factoryBuilder.withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
  return factoryBuilder.build();
}
/** * @param maxLength the maximum number of characters that will be considered - can help * with performance. Don't use values below 100, as this would decrease * accuracy. * @throws IllegalArgumentException if {@code maxLength} is less than 10 * @since 4.2 */ public LanguageIdentifier(int maxLength) { if (maxLength < 10) { throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + maxLength); } this.maxLength = maxLength; try { List<LanguageProfile> profiles = loadProfiles(getLanguageCodes()); languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) .minimalConfidence(MINIMAL_CONFIDENCE) .shortTextAlgorithm(SHORT_ALGO_THRESHOLD) .withProfiles(profiles) .build(); textObjectFactory = new TextObjectFactoryBuilder() .maxTextLength(10000) .withTextFilter(UrlTextFilter.getInstance()) .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3)) .withTextFilter(new RemoveEMailSignatureFilter()) .build(); } catch (IOException e) { throw new RuntimeException("Could not set up language identifier", e); } }