/**
 * Wraps the given token stream in a {@link Zemberek3StemFilter}.
 *
 * @param input the upstream token stream to decorate
 * @return a new {@code Zemberek3StemFilter} constructed from {@code input} together with
 *         the {@code morphology} and {@code strategy} values visible in the enclosing scope
 */
@Override
public TokenStream create(TokenStream input) {
    final TokenStream stemmingStream = new Zemberek3StemFilter(input, morphology, strategy);
    return stemmingStream;
}
/**
 * Picks a single stem for an analyzed word.
 *
 * <p>The analyses are first passed through {@code selectMorphemes} with the
 * {@code "minMorpheme"} selector, then turned into strings by {@code morphToString}
 * with the {@code "lemmas"} mode. The resulting candidates are reduced to one string
 * according to {@code aggregation}.
 *
 * @param results     the morphological analyses of one word
 * @param aggregation {@code "maxLength"} to keep the longest candidate,
 *                    {@code "minLength"} to keep the shortest one
 * @return the chosen candidate, or {@code null} when there are no candidates to choose from
 * @throws IllegalArgumentException if {@code aggregation} is not one of the supported values
 */
static String stem(WordAnalysis results, String aggregation) {
    List<SingleAnalysis> alternatives = selectMorphemes(results, "minMorpheme");
    List<String> candidates = morphToString(alternatives, "lemmas");
    // comparingInt avoids boxing every length; the ordering is the same as comparing(String::length).
    Comparator<String> byLength = Comparator.comparingInt(String::length);
    switch (aggregation) {
        case "maxLength":
            // Collections.max throws NoSuchElementException on an empty collection;
            // report "no stem" as null instead, which incrementToken already tolerates.
            return candidates.isEmpty() ? null : Collections.max(candidates, byLength);
        case "minLength":
            return candidates.isEmpty() ? null : Collections.min(candidates, byLength);
        default:
            // IllegalArgumentException is a RuntimeException, so existing catch sites keep working.
            throw new IllegalArgumentException("unknown strategy " + aggregation);
    }
}
/**
 * Debugging aid: analyzes {@code word} and dumps every analysis to standard output,
 * followed by the stem that {@code Zemberek3StemFilter.stem} selects with the
 * {@code "maxLength"} strategy. Prints only the solution count when the word has no analysis.
 *
 * @param word       the surface form to analyze
 * @param morphology the analyzer used to produce the analyses
 */
private static void parse(String word, TurkishMorphology morphology) {
    final WordAnalysis analyses = morphology.analyze(word);
    final int solutionCount = analyses.analysisCount();

    System.out.println("Word = " + word + " has " + solutionCount + " many solutions");
    if (solutionCount == 0) {
        // Nothing to list and nothing to stem.
        return;
    }

    System.out.println("Parses: ");
    for (SingleAnalysis analysis : analyses) {
        final int morphemeCount = analysis.getMorphemeDataList().size();
        System.out.println("number of morphemes = " + morphemeCount);
        System.out.println(analysis.formatLong());
        System.out.println("\tStems = " + analysis.getStems());
        System.out.println("\tLemmas = " + analysis.getLemmas());
        System.out.println("\tStemAndEnding = " + analysis.getStemAndEnding());
        System.out.println("-------------------");
    }

    System.out.println("final selected stem : " + Zemberek3StemFilter.stem(analyses, "maxLength"));
    System.out.println("==================================");
}
@Override public boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; if (keywordAttribute.isKeyword()) return true; /** * copied from {@link org.apache.lucene.analysis.br.BrazilianStemFilter#incrementToken} */ final String word = termAttribute.toString(); final WordAnalysis parses = morphology.analyze(word); if (parses.analysisCount() == 0) return true; final String s = stem(parses, aggregation); // If not stemmed, don't waste the time adjusting the token. if ((s != null) && !s.equals(word)) termAttribute.setEmpty().append(s); return true; } }