/** * Filters LowerCaseTokenizer with StopFilter. * @param fieldName name of field for which to create components * @return components for this analyzer */ @Override protected TokenStreamComponents createComponents(String fieldName) { JFlexTokenizer plainfull = new JFlexTokenizer(new PlainFullTokenizer( FileAnalyzer.dummyReader)); //we are counting position increments, this might affect the queries //later and need to be in sync, especially for highlighting of results return new TokenStreamComponents(plainfull, new StopFilter(plainfull, stopWords)); }
@Override protected TokenStreamComponents createComponents(final String fieldName) { final StandardTokenizer src = new StandardTokenizer(); src.setMaxTokenLength(maxTokenLength); TokenStream tok = new LowerCaseFilter(src); tok = new StopFilter(tok, stopwords); return new TokenStreamComponents(src, tok) { @Override protected void setReader(final Reader reader) { // So that if maxTokenLength was changed, the change takes // effect next time tokenStream is called: src.setMaxTokenLength(StandardAnalyzer.this.maxTokenLength); super.setReader(reader); } }; }
@Override public TokenStream create(TokenStream tokenStream) { if (removeTrailing) { return new StopFilter(tokenStream, stopWords); } else { return new SuggestStopFilter(tokenStream, stopWords); } }
/** * Creates * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link LowerCaseTokenizer} filtered with * {@link StopFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new LowerCaseTokenizer(); return new TokenStreamComponents(source, new StopFilter(source, stopwords)); }
@Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); // run the widthfilter first before bigramming, it sometimes combines characters. TokenStream result = new CJKWidthFilter(source); result = new LowerCaseFilter(result); result = new CJKBigramFilter(result); return new TokenStreamComponents(source, new StopFilter(result, stopwords)); }
@Override protected TokenStreamComponents createComponents(final String fieldName) { final UAX29URLEmailTokenizer src = new UAX29URLEmailTokenizer(); src.setMaxTokenLength(maxTokenLength); TokenStream tok = new LowerCaseFilter(src); tok = new StopFilter(tok, stopwords); return new TokenStreamComponents(src, tok) { @Override protected void setReader(final Reader reader) { // So that if maxTokenLength was changed, the change takes // effect next time tokenStream is called: src.setMaxTokenLength(UAX29URLEmailAnalyzer.this.maxTokenLength); super.setReader(reader); } }; }
@Override protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { Set<String> stopWords = stopWordsPerField.get(fieldName); if (stopWords == null) { return components; } StopFilter stopFilter = new StopFilter(components.getTokenStream(), new CharArraySet(stopWords, false)); return new TokenStreamComponents(components.getTokenizer(), stopFilter); }
@Override protected TokenStreamComponents createComponents(final String fieldName) { final ClassicTokenizer src = new ClassicTokenizer(); src.setMaxTokenLength(maxTokenLength); TokenStream tok = new ClassicFilter(src); tok = new LowerCaseFilter(tok); tok = new StopFilter(tok, stopwords); return new TokenStreamComponents(src, tok) { @Override protected void setReader(final Reader reader) { src.setMaxTokenLength(ClassicAnalyzer.this.maxTokenLength); super.setReader(reader); } }; }
/** * Creates * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link ThaiTokenizer} filtered with * {@link LowerCaseFilter}, {@link DecimalDigitFilter} and {@link StopFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new ThaiTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new DecimalDigitFilter(result); result = new StopFilter(result, stopwords); return new TokenStreamComponents(source, result); }
/** * Creates * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with * {@link GreekLowerCaseFilter}, * {@link StopFilter}, and {@link GreekStemFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new GreekLowerCaseFilter(source); result = new StopFilter(result, stopwords); result = new GreekStemFilter(result); return new TokenStreamComponents(source, result); }
/** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided and {@link LatvianStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if(!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new LatvianStemFilter(result); return new TokenStreamComponents(source, result); }
/** * Creates * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided, {@link GermanNormalizationFilter} and {@link GermanLightStemFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); result = new SetKeywordMarkerFilter(result, exclusionSet); result = new GermanNormalizationFilter(result); result = new GermanLightStemFilter(result); return new TokenStreamComponents(source, result); }
/** * Creates * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, * {@link StopFilter}, {@link SetKeywordMarkerFilter} * if a stem exclusion set is provided and {@link IndonesianStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if (!stemExclusionSet.isEmpty()) { result = new SetKeywordMarkerFilter(result, stemExclusionSet); } return new TokenStreamComponents(source, new IndonesianStemFilter(result)); }
/** * Creates * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , and {@link BrazilianStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if(excltable != null && !excltable.isEmpty()) result = new SetKeywordMarkerFilter(result, excltable); return new TokenStreamComponents(source, new BrazilianStemFilter(result)); }
/** * Creates * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from a {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided, and {@link SnowballFilter} */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new org.tartarus.snowball.ext.RussianStemmer()); return new TokenStreamComponents(source, result); }
/** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if(!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new NorwegianStemmer()); return new TokenStreamComponents(source, result); }
/** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if(!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new RomanianStemmer()); return new TokenStreamComponents(source, result); }
/** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if(!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new HungarianStemmer()); return new TokenStreamComponents(source, result); }
/** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new LowerCaseFilter(source); result = new StopFilter(result, stopwords); if(!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new ArmenianStemmer()); return new TokenStreamComponents(source, result); }
/** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link ElisionFilter}, {@link LowerCaseFilter}, * {@link StopFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is * provided and {@link SnowballFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source = new StandardTokenizer(); TokenStream result = new ElisionFilter(source, DEFAULT_ARTICLES); result = new LowerCaseFilter(result); result = new StopFilter(result, stopwords); if(!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet); result = new SnowballFilter(result, new CatalanStemmer()); return new TokenStreamComponents(source, result); }