/**
 * Wraps the incoming stream in a {@link HunspellStemFilter}.
 *
 * The third constructor argument (the dedup flag of Lucene's four-argument
 * constructor) is hard-coded to {@code true}; {@code dictionary} and
 * {@code longestOnly} come from the enclosing instance.
 *
 * @param tokenStream the upstream token stream to filter
 * @return the stemming filter layered on top of {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final TokenStream stemmed = new HunspellStemFilter(tokenStream, dictionary, true, longestOnly);
    return stemmed;
}
}
/**
 * Wraps the incoming stream in a {@link HunspellStemFilter} configured with
 * this instance's {@code dictionary}, {@code dedup} and {@code longestOnly}
 * values.
 *
 * @param tokenStream the upstream token stream to filter
 * @return the stemming filter layered on top of {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    return new HunspellStemFilter(
            tokenStream,
            dictionary,
            dedup,
            longestOnly);
}
/**
 * Builds the Hunspell stemming stage for the given stream.
 *
 * The filter is always created with {@code true} as its third argument
 * (the dedup flag of Lucene's four-argument constructor), while
 * {@code dictionary} and {@code longestOnly} are read from the enclosing
 * instance.
 *
 * @param tokenStream the upstream token stream to filter
 * @return a new {@link HunspellStemFilter} reading from {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    return new HunspellStemFilter(
            tokenStream,
            dictionary,
            true,
            longestOnly);
}
}
/**
 * Builds the Hunspell stemming stage for the given stream, passing this
 * instance's {@code dictionary}, {@code dedup} and {@code longestOnly}
 * values straight through to the filter.
 *
 * @param tokenStream the upstream token stream to filter
 * @return a new {@link HunspellStemFilter} reading from {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final TokenStream stemmed = new HunspellStemFilter(tokenStream, dictionary, dedup, longestOnly);
    return stemmed;
}
/**
 * Wraps the incoming stream in a {@link HunspellStemFilter} using the
 * three-argument constructor (stream, dictionary, dedup flag).
 *
 * @param tokenStream the upstream token stream to filter
 * @return the stemming filter layered on top of {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final TokenStream stemmed = new HunspellStemFilter(tokenStream, dictionary, dedup);
    return stemmed;
}
/**
 * Creates the Hunspell stemming filter for a token stream.
 *
 * @param tokenStream the upstream token stream to filter
 * @return a {@link HunspellStemFilter} built from {@code tokenStream} and
 *         this instance's {@code dictionary}, {@code dedup} and
 *         {@code longestOnly} settings
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    return new HunspellStemFilter(
            tokenStream, dictionary,
            dedup, longestOnly);
}
/**
 * Creates the Hunspell stemming filter for a token stream.
 *
 * All configuration ({@code dictionary}, {@code dedup}, {@code longestOnly})
 * is taken from the enclosing instance; only the input stream varies per call.
 *
 * @param tokenStream the upstream token stream to filter
 * @return a new {@link HunspellStemFilter} reading from {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    final HunspellStemFilter filter = new HunspellStemFilter(tokenStream, dictionary, dedup, longestOnly);
    return filter;
}
@Override protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) throws IOException { StandardTokenizer tokenizer = new StandardTokenizer(); tokenizer.setReader(new StringReader(strOrig)); if (stemsAllowed) { Dictionary dictionary = getDict(); if (dictionary == null) { return tokenizer; } return new HunspellStemFilter(tokenizer, dictionary); /// TODO: implement stop words checks } else { return tokenizer; } }
/**
 * Tries to stem the provided primary word using a {@link HunspellStemFilter}.
 * <p>
 * The word is pushed through a one-token stream wrapped in the Hunspell
 * filter. If the filter produces a token whose text differs from the input,
 * that text is handed to {@code reAttachStemmedPrimaryWord}. If the filter
 * produces no token, or the token is unchanged, nothing is re-attached.
 *
 * @param primaryWordTermAttr a {@link CharTermAttribute} containing the primary word
 * @throws IOException if the underlying token stream fails
 */
private void tryStemWithHunspell(CharTermAttribute primaryWordTermAttr) throws IOException {
    final char[] primaryWord = primaryWordTermAttr.buffer();
    final int primaryWordLength = primaryWordTermAttr.length();

    try (TokenStream hunspellStream = new HunspellStemFilter(new SingleTermTokenStream(primaryWord, primaryWordLength), hunspellDict, true, hunspellLongestOnly)) {
        final CharTermAttribute hunspellTermAttr = hunspellStream.addAttribute(CharTermAttribute.class);
        hunspellStream.reset();

        // Only trust the term attribute when a token was actually emitted;
        // otherwise its contents are not a stemming result.
        if (hunspellStream.incrementToken()) {
            final char[] stemmedPrimaryWord = hunspellTermAttr.buffer();
            final int stemmedPrimaryWordLength = hunspellTermAttr.length();

            if (!Chars.arrayEquals(stemmedPrimaryWord, stemmedPrimaryWordLength, primaryWord, primaryWordLength)) {
                reAttachStemmedPrimaryWord(stemmedPrimaryWord, stemmedPrimaryWordLength, primaryWordLength);
            }
        }

        hunspellStream.end();
    }
}
private boolean stem() throws IOException { char[] tokenToStem = this.termAttr.buffer(); int tokenToStemLength = this.termAttr.length(); // trie to stem the current term using the hunspell-stemmer try (TokenStream hunspellStream = new HunspellStemFilter(new SingleTermTokenStream(termAttr.buffer(), termAttr.length()), hunspellDict, true, hunspellLongestOnly)) { final CharTermAttribute hunspellTermAttr = hunspellStream.addAttribute(CharTermAttribute.class); hunspellStream.reset(); if (!hunspellStream.incrementToken()) { return false; } char[] stemmed = hunspellTermAttr.buffer(); int stemmedLength = hunspellTermAttr.length(); if (Chars.arrayEquals(tokenToStem, tokenToStemLength, stemmed, stemmedLength)) { // if the token didn't change during stemming, we try to extract the primary word and stem this term instead if (!tryStemPrimaryWord()) { return false; } } else { // write the stem to the term attribute this.termAttr.copyBuffer(hunspellTermAttr.buffer(), 0, hunspellTermAttr.length()); } hunspellStream.end(); } return true; }