/**
 * Wraps the given stream in a {@link PatternReplaceFilter}.
 *
 * <p>The filter is configured with the {@code pattern}, {@code replacement} and
 * {@code replaceAll} values that are in scope in the enclosing class.
 *
 * @param input the token stream to decorate
 * @return a new filter reading from {@code input}
 */
@Override
public PatternReplaceFilter create(TokenStream input) {
  final PatternReplaceFilter filter =
      new PatternReplaceFilter(input, pattern, replacement, replaceAll);
  return filter;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Decorates {@code tokenStream} with a {@link PatternReplaceFilter}.
 *
 * <p>The {@code pattern}, {@code replacement} and {@code all} values come from the
 * enclosing class.
 *
 * @param tokenStream the upstream token stream
 * @return the upstream stream wrapped in a new {@code PatternReplaceFilter}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  final TokenStream replacing =
      new PatternReplaceFilter(tokenStream, pattern, replacement, all);
  return replacing;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Creates a {@link PatternReplaceFilter} on top of {@code input}.
 *
 * <p>Uses the {@code pattern}, {@code replacement} and {@code replaceAll} values visible
 * in the enclosing class.
 *
 * @param input the token stream to filter
 * @return a freshly constructed filter over {@code input}
 */
@Override
public PatternReplaceFilter create(TokenStream input) {
  final PatternReplaceFilter wrapped =
      new PatternReplaceFilter(input, pattern, replacement, replaceAll);
  return wrapped;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Returns {@code tokenStream} wrapped in a {@link PatternReplaceFilter}.
 *
 * <p>The filter receives the enclosing class's {@code pattern}, {@code replacement} and
 * {@code all} values.
 *
 * @param tokenStream the stream whose tokens are passed through the filter
 * @return a new {@code PatternReplaceFilter} over {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  final TokenStream filtered =
      new PatternReplaceFilter(tokenStream, pattern, replacement, all);
  return filtered;
}
} // closes the enclosing type, whose header lies outside this excerpt
/**
 * Analyzer that emits the whole input as a single lower-cased token with every character
 * outside {@code [a-zA-Z0-9]} removed.
 *
 * <p>Chain, innermost first: {@code KeywordTokenizer} -> {@code LowerCaseFilter} ->
 * {@code PatternReplaceFilter} (replace all matches with the empty string) ->
 * {@code TrimFilter}.
 */
public class AlphanumericAnalyzer extends Analyzer {

  /**
   * Matches any single character that is not an ASCII letter or digit.
   *
   * <p>Compiled once and shared: {@link Pattern} instances are immutable and safe for
   * concurrent use, so there is no need to recompile on every {@code tokenStream} call.
   */
  private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9]");

  /**
   * Builds the token stream for one field value.
   *
   * @param fieldName name of the field being analyzed; not used by this analyzer
   * @param reader source of the text to analyze
   * @return the filter chain described on the class
   */
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream stream = new LowerCaseFilter(new KeywordTokenizer(reader));
    // "" as replacement with replaceAll == true deletes every non-alphanumeric character.
    stream = new PatternReplaceFilter(stream, NON_ALPHANUMERIC, "", true);
    return new TrimFilter(stream, true);
  }
}
/**
 * Assembles the analysis chain for a field.
 *
 * <p>Order of stages: {@code WhitespaceTokenizer} -> {@code InchFilter} ->
 * {@code PatternReplaceFilter} (keeps only capture group 1, i.e. the text between any
 * leading and trailing {@code \p{Punct}} runs) -> {@code WordDelimiterGraphFilter} ->
 * {@code LowerCaseFilter}.
 *
 * @param fieldName name of the field being analyzed; not used here
 * @return components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new WhitespaceTokenizer();
  final TokenStream inches = new InchFilter(tokenizer);
  final TokenStream punctuationStripped =
      new PatternReplaceFilter(
          inches, Pattern.compile("^\\p{Punct}*(.*?)\\p{Punct}*$"), "$1", true);
  final TokenStream delimited =
      new WordDelimiterGraphFilter(
          punctuationStripped, GermanSearchAnalyzer.DELIMITER_CONFIG_FLAGS, null);
  return new TokenStreamComponents(tokenizer, new LowerCaseFilter(delimited));
}
};
/**
 * Builds the filter chain for {@code tokenizer} according to the switches in
 * {@code metadata}.
 *
 * <p>{@code StandardFilter} and {@code LengthFilter} are always applied. The remaining
 * stages are added, in this order, only when enabled: stop-word filter (when
 * {@code stopWords} is non-empty), stemmer (when {@code applyStemmer > 0}),
 * {@code ASCIIFoldingFilter} (when {@code removeAccents}) and a first-match
 * {@code PatternReplaceFilter} using {@code NUMBER_PATTERN} and
 * {@code NUMBER_PLACEHOLDER} (when {@code replaceNumbers}).
 *
 * @param metadata configuration that selects and parameterizes the filters
 * @param tokenizer the tokenizer at the root of the chain
 * @return the last filter of the assembled chain
 */
private TokenStream createStream(CorpusMetadata metadata, Tokenizer tokenizer) {
  final TokenStream standard = new StandardFilter(tokenizer);
  // NOTE(review): the explicit narrowing casts suggest these fields are wider than int -
  // confirm that out-of-range values cannot occur.
  final int shortestAllowed = (int) metadata.minTokenLength;
  final int longestAllowed = (int) metadata.maxTokenLength;
  TokenStream chain = new LengthFilter(standard, shortestAllowed, longestAllowed);

  final boolean hasStopWords = !metadata.stopWords.isEmpty();
  if (hasStopWords) {
    chain = getStopFilter(metadata.language, metadata.stopWords, chain);
  }

  final boolean stemmingEnabled = metadata.applyStemmer > 0;
  if (stemmingEnabled) {
    chain = getStemmerFilter(metadata.language, (int) metadata.applyStemmer, chain);
  }

  if (metadata.removeAccents) {
    chain = new ASCIIFoldingFilter(chain);
  }

  if (metadata.replaceNumbers) {
    // replaceAll == false: only the first match in each token is replaced.
    chain = new PatternReplaceFilter(chain, NUMBER_PATTERN, NUMBER_PLACEHOLDER, false);
  }

  return chain;
}
/**
 * Assembles a chain that edits the tail of a slash-separated value by working on its
 * reversed form.
 *
 * <p>Stages: {@code KeywordTokenizer} -> {@code ReverseStringFilter} ->
 * {@code LengthFilter} (minimum length 2) -> two first-match
 * {@code PatternReplaceFilter}s -> {@code ReverseStringFilter}.
 *
 * @param fieldName name of the field being analyzed; not used here
 * @return components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new KeywordTokenizer();
  final TokenStream reversed = new ReverseStringFilter(tokenizer);
  final TokenStream longEnough = new LengthFilter(reversed, 2, Integer.MAX_VALUE);
  // First match of "<non-slash run><slash>" is replaced by the slash alone.
  final TokenStream segmentDropped =
      new PatternReplaceFilter(longEnough, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
  // First match of "<slash><rest>" is replaced by the rest, removing that slash.
  final TokenStream slashDropped =
      new PatternReplaceFilter(segmentDropped, Pattern.compile("(\\/)(.+)"), "$2", false);
  return new TokenStreamComponents(tokenizer, new ReverseStringFilter(slashDropped));
}
};
/**
 * Assembles a chain that removes the first "non-slash run followed by a slash" from the
 * reversed input and then reverses the result back.
 *
 * <p>Stages: {@code KeywordTokenizer} -> {@code ReverseStringFilter} -> first-match
 * {@code PatternReplaceFilter} with an empty replacement -> {@code ReverseStringFilter}.
 *
 * @param fieldName name of the field being analyzed; not used here
 * @return components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer tokenizer = new KeywordTokenizer();
  final TokenStream reversed = new ReverseStringFilter(tokenizer);
  // replaceAll == false: only the first match is deleted.
  final TokenStream segmentRemoved =
      new PatternReplaceFilter(reversed, Pattern.compile("[^\\/]+\\/"), "", false);
  return new TokenStreamComponents(tokenizer, new ReverseStringFilter(segmentRemoved));
}
};
/** Uses Lucene's StandardAnalyzer and tuns the tokens through several lucene filters - LengthFilter: Filter individual words to be of length > minWordSize - ShingleFilter: Converts word stream into n-gram stream - PatternReplaceFilter: Removes the 'filler' character that ShingleFilter puts in to replace stopwords */ public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; TokenStream stream = analyzer.tokenStream(NOFIELD, input.get(0).toString()); LengthFilter filtered = new LengthFilter(Version.LUCENE_44, stream, minWordSize, Integer.MAX_VALUE); // Let words be long DataBag result; if (minGramSize == 1 && maxGramSize == 1) { result = fillBag(filtered); } else { ShingleFilter nGramStream = new ShingleFilter(filtered, minGramSize, maxGramSize); nGramStream.setOutputUnigrams(outputUnigrams); PatternReplaceFilter replacer = new PatternReplaceFilter(nGramStream, SHINGLE_FILLER, NOFIELD, true); result = fillBag(replacer); } return result; }
// Wraps `source` in a PatternReplaceFilter: the pattern captures the text between optional
// leading and trailing \p{Punct} runs, and the "$1" replacement keeps only that capture.
// NOTE(review): as written the new filter is not assigned to anything - presumably this is
// an excerpt of a longer statement; confirm against the full source.
new PatternReplaceFilter(source, Pattern.compile("^\\p{Punct}*(.*?)\\p{Punct}*$"), "$1", true);
// Each statement re-wraps `result` in a PatternReplaceFilter: the pattern captures the text
// between optional leading and trailing \p{Punct} runs, and "$1" keeps only that capture.
// NOTE(review): the same statement appears four times in a row. If these really execute in
// sequence, the later wraps look redundant - confirm whether they are separate excerpts.
result = new PatternReplaceFilter(result, Pattern.compile("^\\p{Punct}*(.*?)\\p{Punct}*$"), "$1", true);
result = new PatternReplaceFilter(result, Pattern.compile("^\\p{Punct}*(.*?)\\p{Punct}*$"), "$1", true);
result = new PatternReplaceFilter(result, Pattern.compile("^\\p{Punct}*(.*?)\\p{Punct}*$"), "$1", true);
result = new PatternReplaceFilter(result, Pattern.compile("^\\p{Punct}*(.*?)\\p{Punct}*$"), "$1", true);