/**
 * Builds a {@link CharArraySet} out of an array of common words. The result
 * is suitable for handing to the CommonGramsFilter constructor, which lets
 * the set be built once (for example when an Analyzer is constructed) and
 * then reused.
 *
 * <p>This overload simply delegates to
 * {@link #makeCommonSet(java.lang.String[], boolean)} with {@code false}
 * for the {@code ignoreCase} argument.
 *
 * @param commonWords the words to place in the set
 * @return the set produced by the two-argument overload
 * @see #makeCommonSet(java.lang.String[], boolean)
 */
public static final CharArraySet makeCommonSet(String[] commonWords) {
  // This convenience overload always asks for ignoreCase == false.
  final boolean ignoreCase = false;
  return makeCommonSet(commonWords, ignoreCase);
}
/** * Construct a token stream filtering the given input using an Array of common * words to create bigrams and is case-sensitive if ignoreCase is false. * * @param input Tokenstream in filter chain * @param commonWords words to be used in constructing bigrams * @param ignoreCase -Ignore case when constructing bigrams for common words. */ public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) { super(input); this.commonWords = (CharArraySet) makeCommonSet(commonWords, ignoreCase); init(); }
public void inform(ResourceLoader loader) { String commonWordFiles = args.get("words"); ignoreCase = getBoolean("ignoreCase", false); if (commonWordFiles != null) { try { List<String> files = StrUtils.splitFileNames(commonWordFiles); if (commonWords == null && files.size() > 0){ //default stopwords list has 35 or so words, but maybe don't make it that big to start commonWords = new CharArraySet(files.size() * 10, ignoreCase); } for (String file : files) { List<String> wlist = loader.getLines(file.trim()); //TODO: once StopFilter.makeStopSet(List) method is available, switch to using that so we can avoid a toArray() call commonWords.addAll(CommonGramsFilter.makeCommonSet((String[])wlist.toArray(new String[0]), ignoreCase)); } } catch (IOException e) { throw new RuntimeException(e); } } else { commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet(StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase); } }
public void inform(ResourceLoader loader) { String commonWordFiles = args.get("words"); ignoreCase = getBoolean("ignoreCase", false); if (commonWordFiles != null) { try { List<String> files = StrUtils.splitFileNames(commonWordFiles); if (commonWords == null && files.size() > 0) { // default stopwords list has 35 or so words, but maybe don't make it // that big to start commonWords = new CharArraySet(files.size() * 10, ignoreCase); } for (String file : files) { List<String> wlist = loader.getLines(file.trim()); // TODO: once StopFilter.makeStopSet(List) method is available, switch // to using that so we can avoid a toArray() call commonWords.addAll(CommonGramsFilter.makeCommonSet((String[]) wlist .toArray(new String[0]), ignoreCase)); } } catch (IOException e) { throw new RuntimeException(e); } } else { commonWords = (CharArraySet) CommonGramsFilter.makeCommonSet( StopAnalyzer.ENGLISH_STOP_WORDS, ignoreCase); } }