/**
 * Builds the stopword {@link CharArraySet} for a comma separated stopword {@code String}.
 *
 * @param stopwords comma separated list of stopwords
 * @return the stopwords as a (case-insensitive) char array set
 */
private static CharArraySet getStopwords(String stopwords) {
    List<String> parsed = new ArrayList<>();
    for (String token : stopwords.split(",")) {
        // Surrounding whitespace around each comma-separated entry is ignored.
        parsed.add(token.trim());
    }
    return new CharArraySet(parsed, true);
}
/**
 * Creates a suffix-stripping rule with an explicit set of exception words.
 *
 * @param suffix      the suffix this rule strips
 * @param min         minimum stem length for the rule to apply
 * @param replacement replacement text for the stripped suffix
 * @param exceptions  words this rule must never be applied to; each must end with
 *                    {@code suffix}, otherwise the entry could never match
 * @throws IllegalArgumentException if an exception word does not end with {@code suffix}
 */
public RuleWithSetExceptions(String suffix, int min, String replacement, String[] exceptions) {
    super(suffix, min, replacement);
    for (String exception : exceptions) {
        // An exception that does not end with the suffix can never be hit by this
        // rule, so listing it indicates a configuration error.
        if (!exception.endsWith(suffix)) {
            throw new IllegalArgumentException(
                "useless exception '" + exception + "' does not end with '" + suffix + "'");
        }
    }
    // ignoreCase=false: exceptions are matched case-sensitively.
    this.exceptions = new CharArraySet(Arrays.asList(exceptions), false);
}
/**
 * Reads lines from a Reader and adds every line as an entry to a CharArraySet,
 * omitting leading and trailing whitespace. Every line of the Reader should contain
 * only one word. The words need to be in lowercase if you make use of an Analyzer
 * which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
    CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
    return getWordSet(reader, target);
}
/**
 * Creates a stopword set from the given stopword array.
 *
 * @param stopWords  an array of stopwords
 * @param ignoreCase if true, all words are lower cased first
 * @return a Set containing the words
 */
public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
    // Presized to the exact number of entries.
    final CharArraySet result = new CharArraySet(stopWords.length, ignoreCase);
    for (String word : stopWords) {
        result.add(word);
    }
    return result;
}
/**
 * Reads lines from a Reader and adds every non-comment line as an entry to a
 * CharArraySet, omitting leading and trailing whitespace. Every line of the Reader
 * should contain only one word. The words need to be in lowercase if you make use
 * of an Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader  Reader containing the wordlist
 * @param comment the string marking a comment line
 * @return a CharArraySet with the reader's words
 */
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
    CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
    return getWordSet(reader, comment, target);
}
/**
 * Creates a stopword set from the given stopword array.
 *
 * @param stopWords An array of stopwords
 * @param ignoreCase If true, all words are lower cased first.
 * @return a Set containing the words
 */
public static CharArraySet makeStopSet(String[] stopWords, boolean ignoreCase) {
    // Presize to the exact number of entries; addAll copies every word into the set.
    CharArraySet stopSet = new CharArraySet(stopWords.length, ignoreCase);
    stopSet.addAll(Arrays.asList(stopWords));
    return stopSet;
}
/**
 * Creates a stopword set from the given stopword list.
 *
 * @param stopWords  a List of Strings or char[] or any other toString()-able list
 *                   representing the stopwords
 * @param ignoreCase if true, all words are lower cased first
 * @return a Set ({@link CharArraySet}) containing the words
 */
public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase) {
    final CharArraySet result = new CharArraySet(stopWords.size(), ignoreCase);
    result.addAll(stopWords);
    return result;
}
/**
 * Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
 * leading and trailing whitespace). Every line of the Reader should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getWordSet(Reader reader) throws IOException {
    // Delegates to the accumulating overload with a fresh, case-sensitive set.
    return getWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
}
/**
 * Creates a stopword set from the given stopword list.
 *
 * @param stopWords A List of Strings or char[] or any other toString()-able list representing the stopwords
 * @param ignoreCase if true, all words are lower cased first
 * @return A Set ({@link CharArraySet}) containing the words
 */
public static CharArraySet makeStopSet(List<?> stopWords, boolean ignoreCase) {
    // Presize to the list length; addAll copies every entry into the set.
    CharArraySet stopSet = new CharArraySet(stopWords.size(), ignoreCase);
    stopSet.addAll(stopWords);
    return stopSet;
}
/**
 * Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
 * leading and trailing whitespace). Every line of the Reader should contain only
 * one word. The words need to be in lowercase if you make use of an
 * Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
 *
 * @param reader Reader containing the wordlist
 * @param comment The string representing a comment.
 * @return A CharArraySet with the reader's words
 */
public static CharArraySet getWordSet(Reader reader, String comment) throws IOException {
    // Delegates to the accumulating overload with a fresh, case-sensitive set.
    return getWordSet(reader, comment, new CharArraySet(INITIAL_CAPACITY, false));
}
/**
 * Creates a feature vector whose text analysis removes the given stop words.
 *
 * @param stopper source of stop words; may be {@code null} or empty, in which case
 *                the analyzer removes no stop words
 */
public FeatureVector(Stopper stopper) {
    this.stopper = stopper;
    if (stopper == null || stopper.asSet().isEmpty()) {
        // No stop words configured: analyze with an empty stopword set.
        analyzer = new StandardAnalyzer(Version.LUCENE_41, CharArraySet.EMPTY_SET);
    } else {
        // ignoreCase=true: stop words are matched case-insensitively.
        CharArraySet charArraySet = new CharArraySet(Version.LUCENE_41, stopper.asSet(), true);
        analyzer = new StandardAnalyzer(Version.LUCENE_41, charArraySet);
    }
    features = new HashMap<>();
}
/**
 * Reads stopwords from a stopword list in Snowball format.
 * <p>
 * The snowball format is the following:
 * <ul>
 * <li>Lines may contain multiple words separated by whitespace.
 * <li>The comment character is the vertical line (|).
 * <li>Lines may contain trailing comments.
 * </ul>
 *
 * @param reader Reader containing a Snowball stopword list
 * @return a {@link CharArraySet} with the reader's words
 */
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
    CharArraySet target = new CharArraySet(INITIAL_CAPACITY, false);
    return getSnowballWordSet(reader, target);
}
/**
 * Reads stopwords from a stopword list in Snowball format.
 * <p>
 * The snowball format is the following:
 * <ul>
 * <li>Lines may contain multiple words separated by whitespace.
 * <li>The comment character is the vertical line (|).
 * <li>Lines may contain trailing comments.
 * </ul>
 *
 * @param reader Reader containing a Snowball stopword list
 * @return A {@link CharArraySet} with the reader's words
 */
public static CharArraySet getSnowballWordSet(Reader reader) throws IOException {
    // Delegates to the accumulating overload with a fresh, case-sensitive set.
    return getSnowballWordSet(reader, new CharArraySet(INITIAL_CAPACITY, false));
}
/**
 * Returns an unmodifiable {@link CharArraySet}. This allows to provide
 * unmodifiable views of internal sets for "read-only" use.
 *
 * @param set a set for which the unmodifiable set is returned
 * @return a new unmodifiable {@link CharArraySet}
 * @throws NullPointerException if the given set is <code>null</code>
 */
public static CharArraySet unmodifiableSet(CharArraySet set) {
    if (set == null) {
        throw new NullPointerException("Given set is null");
    }
    // The shared empty set is already immutable; return it as-is.
    if (set == EMPTY_SET) {
        return EMPTY_SET;
    }
    // Avoid wrapping a set whose backing map is already unmodifiable.
    if (set.map instanceof CharArrayMap.UnmodifiableCharArrayMap) {
        return set;
    }
    return new CharArraySet(CharArrayMap.unmodifiableMap(set.map));
}
/**
 * Resolves the configured stopword list: loads the 'words' files in the requested
 * format, or falls back to the default English stop set when no files were given.
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    if (stopWordFiles == null) {
        // No explicit word files: 'format' must not be set, and the default
        // English stop set is used.
        if (null != format) {
            throw new IllegalArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
        }
        stopWords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
        return;
    }
    // Dispatch on the declared file format.
    if (FORMAT_WORDSET.equalsIgnoreCase(format)) {
        stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
    } else if (FORMAT_SNOWBALL.equalsIgnoreCase(format)) {
        stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
    } else {
        throw new IllegalArgumentException("Unknown 'format' specified for 'words' file: " + format);
    }
}
/**
 * Appends a per-field stop filter to the analysis chain, or returns the chain
 * unchanged when no stop words are registered for the field.
 */
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
    final Set<String> fieldStopWords = stopWordsPerField.get(fieldName);
    if (fieldStopWords == null) {
        // No stop words for this field; leave the token stream untouched.
        return components;
    }
    // ignoreCase=false: stop words are matched case-sensitively.
    final CharArraySet stopSet = new CharArraySet(fieldStopWords, false);
    final StopFilter filter = new StopFilter(components.getTokenStream(), stopSet);
    return new TokenStreamComponents(components.getTokenizer(), filter);
}
/** * Returns as {@link CharArraySet} from wordFiles, which * can be a comma-separated list of filenames */ protected final CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException { List<String> files = splitFileNames(wordFiles); CharArraySet words = null; if (files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it that // big to start words = new CharArraySet(files.size() * 10, ignoreCase); for (String file : files) { List<String> wlist = getLines(loader, file.trim()); words.addAll(StopFilter.makeStopSet(wlist, ignoreCase)); } } return words; }
/**
 * Builds the analysis chain: n-gram tokenizer, then standard filtering,
 * lower-casing, and case-insensitive stop word removal.
 */
@Override
protected TokenStreamComponents createComponents(final String field) {
    final Tokenizer source = new NGramTokenizer(minNgram(), maxNgram());
    final CharArraySet stopSet = new CharArraySet(asList(stopWords()), true);
    // Stage the filters explicitly instead of nesting the constructor calls.
    TokenStream filtered = new StandardFilter(source);
    filtered = new LowerCaseFilter(filtered);
    filtered = new StopFilter(filtered, stopSet);
    return new TokenStreamComponents(source, filtered);
}
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new ChineseTokenStream(reader); if (stopWordManager != null) { //走停止词过滤 CharArraySet stopWords = new CharArraySet(Version.LUCENE_CURRENT, stopWordManager.getStopWords(), true); TokenStream result = new StopFilter(Version.LUCENE_CURRENT, source, stopWords); return new TokenStreamComponents(source, result); } else { //走原始逻辑 return new TokenStreamComponents(source); } }
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new NewMMSegTokenizer(newSeg(), reader); if (stopWordManager != null) { //执行停止词逻辑 CharArraySet stopWords = new CharArraySet(Version.LUCENE_CURRENT, stopWordManager.getStopWords(), true); TokenStream result = new StopFilter(Version.LUCENE_CURRENT, source, stopWords); return new TokenStreamComponents(source, result); } else { //原有逻辑 return new TokenStreamComponents(source); } } }