/**
 * Normalises {@code text} and returns the stem of every token that is not
 * rejected by {@code StopWordsRemover.isAnEnglishStopWords}.
 *
 * <p>Processing steps, in order:
 * <ol>
 *   <li>lower-case the text (locale-independent);</li>
 *   <li>pass it through {@code DiacriticsRemover.removeDiacritics};</li>
 *   <li>turn underscores and every whitespace character into {@code SPACE};</li>
 *   <li>drop every character other than {@code a-z}, digits, {@code -},
 *       {@code _}, {@code /} and the blank;</li>
 *   <li>split on {@code SPACE}, skip stop words and stem the remaining tokens
 *       with a {@code PorterStemmer}.</li>
 * </ol>
 *
 * @param text the raw text to process; {@code null} yields an empty list
 * @return the stems in the order their tokens occur in {@code text}; never {@code null}
 * @throws IOException declared for callers; presumably propagated from one of the
 *                     helper classes, which are not visible here (TODO confirm)
 */
public List<String> getStemmedPairs(final String text) throws IOException {
    List<String> stems = new ArrayList<String>();
    if (text == null) {
        return stems;
    }

    // Locale.ROOT keeps the result independent of the JVM default locale; with a
    // Turkish locale 'I' would become a dotless 'ı' and be stripped by the filter below.
    String normalized = text.toLowerCase(java.util.Locale.ROOT);
    normalized = DiacriticsRemover.removeDiacritics(normalized);

    // Map underscores and *all* whitespace (not only '\n') to the separator, so that
    // tabs or carriage returns do not get deleted by the filter and glue words together.
    normalized = normalized.replaceAll("[_\\s]", SPACE);

    // Character whitelist. NOTE(review): assumes SPACE is a single blank, which is
    // the only separator this class lets through — confirm against its declaration.
    normalized = normalized.replaceAll("[^a-z\\d-_/ ]", "");

    PorterStemmer stemmer = new PorterStemmer();
    for (String token : StringUtils.split(normalized, SPACE)) {
        if (StopWordsRemover.isAnEnglishStopWords(token)) {
            continue;
        }
        stemmer.add(token.toCharArray(), token.length());
        stemmer.stem();
        stems.add(stemmer.toString());
    }
    return stems;
}
ps.add(s.toCharArray(), s.length()); ps.stem(); strings.add(ps.toString());
stemmer.stem(); context.write(new TextArrayWritable(new Text[]{key, new Text(stemmer.toString())}), one);
String[] to = new String[]{key, ps.toString()}; alt.add(TupleFactory.getInstance().newTuple(Arrays.asList(to)));