/**
 * Creates a fresh English-language analyzer for each call.
 *
 * @return a new {@link EnglishAnalyzer} with its default stop-word set
 */
@Override
public Analyzer createAnalyzer() {
    final Analyzer englishAnalyzer = new EnglishAnalyzer();
    return englishAnalyzer;
}
// Builds the "english" analyzer for an index: stop words come from the index
// settings (falling back to Lucene's default English stop set) and stem
// exclusions default to the empty set. The analyzer is then pinned to the
// Lucene compatibility "version" (a field declared outside this view).
// NOTE(review): this uses a three-argument Analysis.parseStopWords overload,
// unlike the sibling variants that also pass the index-created version —
// confirm which Elasticsearch API level this fragment targets.
public EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new EnglishAnalyzer(Analysis.parseStopWords(env, settings, EnglishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
// Supplies Lucene's built-in English stop-word set. The trailing "}," closes
// an enclosing anonymous class (likely an enum constant or registry entry)
// whose declaration is outside this view.
@Override protected CharArraySet build() { return EnglishAnalyzer.getDefaultStopSet(); } },
// Returns a token stream over strOrig. When stemming is allowed, an
// EnglishAnalyzer tokenizes (optionally removing the default English stop
// words); otherwise a standard, non-stemming stream is produced by
// getStandardTokenStream (defined outside this view).
// @SuppressWarnings("resource"): the EnglishAnalyzer is intentionally not
// closed here — the caller consumes (and is expected to close) the returned
// TokenStream. The trailing "}" closes the enclosing class, whose header is
// outside this view.
@SuppressWarnings("resource") @Override protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) throws IOException { if (stemsAllowed) { CharArraySet stopWords = stopWordsAllowed ? EnglishAnalyzer.getDefaultStopSet() : CharArraySet.EMPTY_SET; return new EnglishAnalyzer(stopWords).tokenStream("", new StringReader(strOrig)); } else { return getStandardTokenStream(strOrig); } } }
// Chooses the tokenizer for this analyzer: a StandardTokenizer when the
// configured version is Lucene 4.7.0 or later. NOTE(review): this definition
// is truncated here — the pre-4.7 else-branch and the rest of the method are
// outside this view, so no code change is safe from this fragment.
protected TokenStreamComponents createComponents(String fieldName) { final Tokenizer source; if (getVersion().onOrAfter(Version.LUCENE_4_7_0)) { source = new StandardTokenizer(); } else {
public EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new EnglishAnalyzer( Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, EnglishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET) ); analyzer.setVersion(version); }
public void map(Text key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop words. // We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, line); Iterator<String> it = token.iterator(); while (it.hasNext()) { word.set(it.next()); fileName.set(key); if (!mapTable.containsKey(fileName.toString() + word.toString())) { context.write(word, fileName); mapTable.put(fileName.toString() + word.toString(), new IntWritable(1)); } } }
// Package-private provider constructor for the "english" analyzer: parses
// stop words (default: Lucene's English stop set) and stem exclusions
// (default: empty) from the index settings, then pins the analyzer to this
// index's Lucene compatibility version. "analyzer" and "version" are fields
// declared outside this view.
EnglishAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); analyzer = new EnglishAnalyzer( Analysis.parseStopWords(env, indexSettings.getIndexVersionCreated(), settings, EnglishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET) ); analyzer.setVersion(version); }
public void map(Text key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); CharArraySet noise = EnglishAnalyzer.getDefaultStopSet(); // We are using a standard tokenizer that eliminates the stop words. // We can use Stemming tokenizer such Porter // A set of English noise keywords is used that will eliminates // words such as "the, a, etc" Analyzer analyzer = new StandardAnalyzer(noise); List<String> token = Tokenizer.tokenizeString(analyzer, line); Iterator<String> it = token.iterator(); while (it.hasNext()) { word.set(it.next()); fileName.set(key); if (!mapTable.containsKey(fileName.toString() + word.toString())) { context.write(fileName, word); mapTable.put(fileName.toString() + word.toString(), new IntWritable(1)); } } } }
// Factory method returning a stock EnglishAnalyzer (default stop words, no
// stem exclusions). The trailing "}," closes an enclosing anonymous class
// whose declaration is outside this view.
@Override protected Analyzer build() { return new EnglishAnalyzer(); } },
/**
 * Builds the analysis chain for the given text.
 *
 * <p>An EnglishSpecialAnalyzer tokenizes the text using either the
 * caller-configured stop words or, when none were set, Lucene's default
 * English stop set. In n-gram mode the stream is additionally wrapped in a
 * ShingleFilter bounded by [minNGram, maxNGram].
 */
protected TokenStream createTokenStream(String text) {
    final Set<?> effectiveStopWords;
    if (this.stopWords == null) {
        effectiveStopWords = EnglishAnalyzer.getDefaultStopSet();
    } else {
        effectiveStopWords = StopFilter.makeStopSet(LUCENE_VERSION, stopWords);
    }
    Analyzer analyzer = new EnglishSpecialAnalyzer(LUCENE_VERSION, effectiveStopWords, this.stemExclusionsSet);
    TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
    if (this.nGram) {
        stream = new ShingleFilter(stream, this.minNGram, this.maxNGram);
    }
    return stream;
}
/**
 * Supplies the analyzer used by this component: a fresh EnglishAnalyzer
 * with Lucene's default English stop-word set.
 */
@Override
protected Analyzer getAnalyzer() {
    final Analyzer english = new EnglishAnalyzer();
    return english;
}
// Process every input line. NOTE(review): the default English stop set is
// loop-invariant, so fetching it on each iteration is redundant — consider
// hoisting it above the loop. The loop body is truncated here, so that
// change cannot be made safely from this view.
for (int i = 0; i < lines.size(); i++) { CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();
/**
 * Enables or disables analysis on this transformer and selects the analyzer
 * for the given language.
 *
 * @param doAnalyze whether text should be analyzed by this transformer
 * @param lang      ISO language code; only {@code "en"} is supported
 * @return this transformer, for call chaining
 * @throws IllegalArgumentException if {@code lang} is not {@code "en"}
 *         (including {@code null}, which previously caused an NPE)
 */
public TextTransformer withDoAnalyze(boolean doAnalyze, String lang) {
    this.doAnalyze = doAnalyze;
    // Constant-first equals: a null lang now falls through to the documented
    // IllegalArgumentException instead of throwing NullPointerException.
    if ("en".equals(lang)) {
        analyzer = new EnglishAnalyzer(Version.LUCENE_44);
    } else {
        throw new IllegalArgumentException("unsupported language:" + lang);
    }
    return this;
}
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new ASCIIFoldingFilter(result); result = new EnglishPossessiveFilter(matchVersion, result); result = new WordDelimiterFilter(result,WordDelimiterFilter.ALPHA,null); result = new StopFilter(matchVersion, result, EnglishAnalyzer.getDefaultStopSet()); result = new LowerCaseFilter(matchVersion, result); // result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); } }
// Query the "Body" field with the topic text (parsed with an
// EnglishAnalyzer), take the top 1000 hits, and collect each hit's Document
// together with its stored "Body" term vector into documentsList.
// NOTE(review): IndexReader.getTermVector returns null for documents indexed
// without term vectors — confirm the index stores them, or downstream code
// must null-check. "topic", "iSearcher", and "documentsList" are declared
// outside this view.
QueryParser parser = new QueryParser("Body", new EnglishAnalyzer()); Query query = parser.parse(topic); TopDocs hits = iSearcher.search(query, 1000); for (int i=0; i<hits.scoreDocs.length; i++){ Terms termVector = iSearcher.getIndexReader().getTermVector(hits.scoreDocs[i].doc, "Body"); Document doc = iSearcher.doc(hits.scoreDocs[i].doc); documentsList.put(doc, termVector); }
/**
 * Builds the analysis chain: standard tokenization, standard filtering,
 * ASCII folding, English possessive stripping, word-delimiter splitting
 * (alphabetic parts only), lower-casing, English stop-word removal, and
 * finally Porter stemming.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new ASCIIFoldingFilter(chain);
    chain = new EnglishPossessiveFilter(chain);
    chain = new WordDelimiterFilter(chain, WordDelimiterFilter.ALPHA, null);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, EnglishAnalyzer.getDefaultStopSet());
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
/**
 * Creates an EnglishAnalyzer pinned to the requested Lucene compatibility
 * version. The trailing "}," closes an enclosing anonymous class declared
 * outside this view.
 */
@Override
protected Analyzer create(Version version) {
    final Analyzer english = new EnglishAnalyzer();
    english.setVersion(version.luceneVersion);
    return english;
} },
@Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(matchVersion, reader); TokenStream result = new StandardFilter(matchVersion, source); result = new ASCIIFoldingFilter(result); result = new EnglishPossessiveFilter(matchVersion, result); result = new WordDelimiterFilter(result,WordDelimiterFilter.ALPHA,null); result = new StopFilter(matchVersion, result, EnglishAnalyzer.getDefaultStopSet()); result = new LowerCaseFilter(matchVersion, result); // result = new PorterStemFilter(result); return new TokenStreamComponents(source, result); } }
/**
 * Opens a searcher over an existing Lucene index directory.
 *
 * <p>Uses LMDirichletSimilarity (mu = 1000) with an EnglishAnalyzer;
 * tweet-search and rerank modes start disabled, and the default reranker is
 * installed.
 *
 * @param indexDir path to a readable directory containing the index
 * @throws IOException if the index cannot be opened
 * @throws IllegalArgumentException if {@code indexDir} is missing, not a
 *         directory, or not readable
 */
public SimpleSearcher(String indexDir) throws IOException {
    Path indexPath = Paths.get(indexDir);
    if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
        // Fixed message: the guard also rejects unreadable directories, but
        // the original message never mentioned that case.
        throw new IllegalArgumentException(indexDir + " does not exist, is not a directory, or is not readable.");
    }
    this.reader = DirectoryReader.open(FSDirectory.open(indexPath));
    this.similarity = new LMDirichletSimilarity(1000.0f);
    this.analyzer = new EnglishAnalyzer();
    this.searchtweets = false;
    this.isRerank = false;
    setDefaultReranker();
}