// Supplies Lucene's built-in English stop-word set as this builder's result.
// NOTE(review): fragment — the trailing "}," closes an enclosing anonymous class
// or initializer that is not visible in this view.
@Override protected CharArraySet build() { return EnglishAnalyzer.getDefaultStopSet(); } },
public void map(Text key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop words. // We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, line); Iterator<String> it = token.iterator(); while (it.hasNext()) { word.set(it.next()); fileName.set(key); if (!mapTable.containsKey(fileName.toString() + word.toString())) { context.write(word, fileName); mapTable.put(fileName.toString() + word.toString(), new IntWritable(1)); } } }
public void map(Text key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop words. // We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, line); Iterator<String> it = token.iterator(); while (it.hasNext()) { word.set(it.next()); fileName.set(key); if (!mapTable.containsKey(fileName.toString() + word.toString())) { context.write(fileName, word); mapTable.put(fileName.toString() + word.toString(), new IntWritable(1)); } } } }
/**
 * Returns a token stream over {@code strOrig}.
 *
 * With stemming enabled, an EnglishAnalyzer is used (optionally with the
 * default English stop-word set); otherwise the plain standard token stream
 * is returned.
 *
 * @param strOrig          text to analyze
 * @param stemsAllowed     whether stemming analysis should be applied
 * @param stopWordsAllowed whether English stop words should be removed
 * @throws IOException on analysis failure
 */
@SuppressWarnings("resource") @Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) throws IOException {
    if (!stemsAllowed) {
        // No stemming requested: fall back to the plain standard tokenizer.
        return getStandardTokenStream(strOrig);
    }
    final CharArraySet stopWords;
    if (stopWordsAllowed) {
        stopWords = EnglishAnalyzer.getDefaultStopSet();
    } else {
        stopWords = CharArraySet.EMPTY_SET;
    }
    // The analyzer is intentionally left open (see @SuppressWarnings):
    // the caller consumes and closes the returned stream.
    return new EnglishAnalyzer(stopWords).tokenStream("", new StringReader(strOrig));
}
} // closes the enclosing class (header outside this view)
/**
 * Builds the analysis chain for {@code text}: an English analyzer with either
 * the configured stop words or Lucene's defaults, optionally wrapped in a
 * shingle filter that emits word n-grams in [minNGram, maxNGram].
 */
protected TokenStream createTokenStream(String text) {
    final Set<?> effectiveStopWords;
    if (this.stopWords == null) {
        // No custom list configured: fall back to Lucene's English defaults.
        effectiveStopWords = EnglishAnalyzer.getDefaultStopSet();
    } else {
        effectiveStopWords = StopFilter.makeStopSet(LUCENE_VERSION, stopWords);
    }
    Analyzer analyzer = new EnglishSpecialAnalyzer(LUCENE_VERSION, effectiveStopWords, this.stemExclusionsSet);
    TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
    if (this.nGram) {
        stream = new ShingleFilter(stream, this.minNGram, this.maxNGram);
    }
    return stream;
}
public EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new EnglishAnalyzer(Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
public EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new EnglishAnalyzer( Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, EnglishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET) ); analyzer.setVersion(version); }
EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new EnglishAnalyzer( Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, EnglishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET) ); analyzer.setVersion(version); }
// NOTE(review): truncated fragment — the for-loop body continues beyond this view.
// The default English stop set is fetched on every iteration; it is a constant,
// so consider hoisting it above the loop — TODO confirm against the full body.
for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new ASCIIFoldingFilter(result); result = new EnglishPossessiveFilter(matchVersion, result); result = new WordDelimiterFilter(result,WordDelimiterFilter.ALPHA,null); result = new StopFilter(matchVersion, result, EnglishAnalyzer.getDefaultStopSet()); result = new LowerCaseFilter(matchVersion, result); // result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); } }
/**
 * Assembles the analysis chain: standard tokenizer -> standard filter ->
 * ASCII folding -> possessive stripping -> word-delimiter split ->
 * lower-casing -> English stop-word removal -> Porter stemming.
 *
 * @param fieldName field being analyzed (unused by this chain)
 * @return the assembled tokenizer/filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new ASCIIFoldingFilter(stream);
    stream = new EnglishPossessiveFilter(stream);
    stream = new WordDelimiterFilter(stream, WordDelimiterFilter.ALPHA, null);
    // Lower-case before stop-word removal so the lower-cased stop set matches.
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, EnglishAnalyzer.getDefaultStopSet());
    stream = new PorterStemFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new ASCIIFoldingFilter(result); result = new EnglishPossessiveFilter(matchVersion, result); result = new WordDelimiterFilter(result,WordDelimiterFilter.ALPHA,null); result = new StopFilter(matchVersion, result, EnglishAnalyzer.getDefaultStopSet()); result = new LowerCaseFilter(matchVersion, result); // result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); } }
/**
 * Assembles the analysis chain: standard tokenizer -> standard filter ->
 * ASCII folding -> possessive stripping -> word-delimiter split ->
 * lower-casing -> English stop-word removal -> Porter stemming.
 *
 * @param fieldName field being analyzed (unused by this chain)
 * @param reader    source of the text to tokenize
 * @return the assembled tokenizer/filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
    TokenStream stream = new StandardFilter(matchVersion, tokenizer);
    stream = new ASCIIFoldingFilter(stream);
    stream = new EnglishPossessiveFilter(matchVersion, stream);
    stream = new WordDelimiterFilter(stream, WordDelimiterFilter.ALPHA, null);
    // Lower-case before stop-word removal so the lower-cased stop set matches.
    stream = new LowerCaseFilter(matchVersion, stream);
    stream = new StopFilter(matchVersion, stream, EnglishAnalyzer.getDefaultStopSet());
    stream = new PorterStemFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}
/**
 * Assembles the configured filter chain on top of {@code tokenizer}:
 * optional lower-casing, optional English stop-word removal, and optional
 * stemming (with keyword-marked exclusions protected from the stemmer).
 *
 * @param tokenizer        the source tokenizer to wrap
 * @param stemExclusionSet terms that must never be stemmed (may be empty)
 * @return the assembled token stream
 */
@Override
public TokenStream getTokenStream(Tokenizer tokenizer, CharArraySet stemExclusionSet) {
    TokenStream chain = new StandardFilter(matchVersion, tokenizer);
    if (caseInsensitive) {
        chain = new LowerCaseFilter(matchVersion, chain);
    }
    if (useStopWords) {
        chain = new StopFilter(matchVersion, chain, EnglishAnalyzer.getDefaultStopSet());
    }
    if (useStem) {
        // Mark exclusions as keywords so the Porter stemmer skips them.
        if (!stemExclusionSet.isEmpty()) {
            chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
        }
        chain = new EnglishPossessiveFilter(matchVersion, chain);
        chain = new PorterStemFilter(chain);
    }
    return chain;
}
} // closes the enclosing class (header outside this view)
/**
 * Assembles the analysis chain: standard tokenizer -> standard filter ->
 * ASCII folding -> possessive stripping -> word-delimiter split ->
 * lower-casing -> English stop-word removal -> Porter stemming.
 *
 * @param fieldName field being analyzed (unused by this chain)
 * @return the assembled tokenizer/filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = new StandardFilter(tokenizer);
    stream = new ASCIIFoldingFilter(stream);
    stream = new EnglishPossessiveFilter(stream);
    stream = new WordDelimiterFilter(stream, WordDelimiterFilter.ALPHA, null);
    // Lower-case before stop-word removal so the lower-cased stop set matches.
    stream = new LowerCaseFilter(stream);
    stream = new StopFilter(stream, EnglishAnalyzer.getDefaultStopSet());
    stream = new PorterStemFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}