/**
 * Advances to the next token and lower-cases its term text in place.
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // Lower-case the term buffer in place; the token's length is unchanged.
  CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
  return true;
}
}
public final class LuceneUtil { private LuceneUtil() {} public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; } }
public final class LuceneUtils { public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) { List<String> result = new ArrayList<String>(); TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords)); try { while(stream.incrementToken()) { result.add(stream.getAttribute(TermAttribute.class).term()); } } catch(IOException e) { // not thrown b/c we're using a string reader... } return result; } }
private SToken[] getTokens(String text) throws IOException { //FIXME somehow integrate below cycle to getSummary to save the cloning and memory, //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter ArrayList<SToken> result = new ArrayList<>(); try (TokenStream ts = analyzer.tokenStream("full", text)) { CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); ts.reset(); while (ts.incrementToken()) { SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset()); result.add(t); } ts.end(); } return result.toArray(new SToken[result.size()]); }
/**
 * Analyzes {@code source} with the shared analyzer and collects every emitted term.
 *
 * <p>Fix: the original {@code finally} block called {@code ts.close()} without a
 * null check, so an exception thrown by {@code analyzer.tokenStream(...)} (leaving
 * {@code ts} null) would be masked by a NullPointerException. try-with-resources
 * closes safely and only when the stream was actually opened.
 *
 * @param source text to analyze
 * @return the extracted keywords, or a (possibly partial) list on I/O failure, which is logged
 */
public static List<String> keywords( String source ) {
  List<String> keywords = new ArrayList<String>();
  try ( TokenStream ts = analyzer.tokenStream( "keywords", new StringReader( source ) ) ) {
    ts.reset();
    while ( ts.incrementToken() ) {
      keywords.add( ts.getAttribute( CharTermAttribute.class ).toString() );
    }
    ts.end(); // contract: end() before close()
  } catch ( IOException e ) {
    logger.error( "Error getting keywords ", e );
  }
  return keywords;
}
}
/**
 * Creates simple boolean query from the cached tokenstream contents.
 * The caller owns the stream's lifecycle (end/close); this method only consumes it.
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  stream.reset();
  List<Term> collected = new ArrayList<>();
  while (stream.incrementToken()) {
    collected.add(new Term(field, bytesAtt.getBytesRef()));
  }
  Term[] terms = collected.toArray(new Term[collected.size()]);
  return newSynonymQuery(terms);
}
// Tokenizes `text` with `analyzer` and prints each term to stdout, following the
// full TokenStream lifecycle: reset() -> incrementToken() loop -> end() -> close().
// NOTE(review): free-standing statement run — relies on `analyzer` and `text`
// being in scope from surrounding code not visible here.
TokenStream stream = analyzer.tokenStream(null, new StringReader(text)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { System.out.println(cattr.toString()); } stream.end(); stream.close();
private void fillCache() throws IOException { while (input.incrementToken()) { cache.add(captureState()); } // capture final state input.end(); finalState = captureState(); }
// NOTE(review): truncated/garbled fragment — the `if` body is missing its closing
// brace and the snippet ends mid-method; left byte-identical pending the full source.
// Appears to track seen terms (uniqueTerms) while capping emitted tokens at
// maxOutputTokenSize — TODO confirm against the complete method.
uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (outputTokenSize > maxOutputTokenSize) { continue; final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); input.end(); inputEnded = true;
// NOTE(review): fragment — both `if` blocks close beyond this snippet; kept byte-identical.
// When the local token iterator is exhausted, pulls the next token from the wrapped
// stream and records its offsets; flags tokens whose start offset + term length does
// not equal the end offset (hasIllegalOffsets) — presumably due to upstream
// char-filter offset remapping; confirm in the full source.
if (tokenIter == null || !tokenIter.hasNext()) { if (input.incrementToken()) { tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
// NOTE(review): fragment — the enclosing method and the branches opened here are cut
// off; kept byte-identical. When the pending-token queue is empty, reads one token
// from the wrapped stream and, if its text contains a URL per
// UrlStringUtils.containsUrl, splits it on whitespace (presumably for re-emission —
// confirm downstream).
final LinkedList<String> tokens = getTokens(); final CharTermAttribute termAtt = getTermAtt(); if (tokens.isEmpty() && input.incrementToken()) { final String text = new String(termAtt.buffer(), 0, termAtt.length()); if (UrlStringUtils.containsUrl(text)) { final String[] parts = text.split("\\s");
// NOTE(review): garbled fragment — the zero-length-token `if` is missing its closing
// brace and the snippet ends mid-method at ts.end(); kept byte-identical.
// Concatenates every analyzed token of `text` into the `reuse` buffer, inserting a
// one-char gap before each term (setLength(+1) then arraycopy), and rejects
// zero-length tokens with IllegalArgumentException.
try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); reuse.clear(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); reuse.setLength(reuse.length() + 1); System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); reuse.setLength(reuse.length() + length); ts.end();
/**
 * Analyzes {@code text} and returns the distinct terms in first-seen order.
 *
 * @param text input text; empty/blank input yields an empty set
 * @return distinct analyzed terms (possibly partial and silent on I/O failure)
 */
public Set<String> getToken(String text) {
  Set<String> terms = new LinkedHashSet<>();
  if (!CommonUtils.notEmpty(text)) {
    return terms;
  }
  try (StringReader reader = new StringReader(text);
      TokenStream stream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, reader)) {
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      terms.add(termAttr.toString());
    }
    stream.end();
  } catch (IOException e) {
    // best-effort: fall through and return whatever was collected so far
  }
  return terms;
}
/**
 * Parses the query. Using this instead of a QueryParser in order
 * to avoid thread-safety issues with Lucene's query parser.
 *
 * <p>Fix: the original never called {@code end()}/{@code close()} on the token
 * stream and never closed the throwaway analyzer, leaking analysis resources on
 * every call.
 *
 * @param fieldName the name of the field
 * @param value the value of the field
 * @return the parsed query
 */
private Query parseTokens(String fieldName, String value) {
  BooleanQuery searchQuery = new BooleanQuery();
  if (value != null) {
    Analyzer analyzer = new KeywordAnalyzer();
    try (TokenStream tokenStream =
        analyzer.tokenStream(fieldName, new StringReader(value))) {
      tokenStream.reset();
      CharTermAttribute attr = tokenStream.getAttribute(CharTermAttribute.class);
      while (tokenStream.incrementToken()) {
        String term = attr.toString();
        searchQuery.add(new TermQuery(new Term(fieldName, term)), Occur.SHOULD);
      }
      tokenStream.end();
    } catch (IOException e) {
      // TODO(review): pass `e` as the cause if DukeException has a (String, Throwable) ctor
      throw new DukeException("Error parsing input string '" + value + "' " +
                              "in field " + fieldName);
    } finally {
      analyzer.close();
    }
  }
  return searchQuery;
}
/**
 * Creates simple term query from the cached tokenstream contents.
 * The stream is expected to hold at least one token; anything else is a programming error.
 */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  stream.reset();
  boolean hasToken = stream.incrementToken();
  if (!hasToken) {
    throw new AssertionError();
  }
  return newTermQuery(new Term(field, bytesAtt.getBytesRef()));
}
// NOTE(review): garbled fragment — starts mid-try (the call opening this
// try-with-resources is cut off) and the first IllegalStateException message is
// missing its closing concatenation and brace; kept byte-identical.
// Enforces that the normalization stream yields exactly one token: zero tokens or
// a second token both abort with IllegalStateException.
new StringTokenStream(attributeFactory, filteredText, text.length()))) { final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); ts.reset(); if (ts.incrementToken() == false) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 0 for analyzer " if (ts.incrementToken()) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 2+ for analyzer " + this + " and input \"" + text + "\""); ts.end(); return term;
/**
 * Emits the tokens of each source stream in sequence, shifting every token's
 * offsets by the accumulated end offset of the streams already exhausted.
 *
 * @return {@code true} if a token was produced, {@code false} once every source is drained
 * @throws IOException if a source stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  // Skip past exhausted sources until one yields a token.
  while (!sources[currentSource].incrementToken()) {
    if (currentSource >= sources.length - 1) {
      return false; // all sources exhausted
    }
    sources[currentSource].end();
    OffsetAttribute sourceOffset = sourceOffsets[currentSource];
    if (sourceOffset != null) {
      offsetIncrement += sourceOffset.endOffset();
    }
    currentSource++;
  }
  clearAttributes();
  sources[currentSource].copyTo(this);
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement,
      offsetAtt.endOffset() + offsetIncrement);
  return true;
}
// NOTE(review): truncated fragment — the while body never closes and the code after
// input.end() is cut off (the brace balance suggests input.end() actually follows
// the loop in the real source); kept byte-identical.
// Consumes the wrapped stream, recording via `found` whether any token appeared and
// snapshotting each term's text into `current`.
boolean found = false; while (input.incrementToken()) { found = true; String current = new String(termAttribute.buffer(), 0, termAttribute.length()); input.end();
/**
 * Tokenizes {@code in} with the analyzer and adds every term's text to {@code words}.
 *
 * <p>Fixes: the original never called {@code reset()} before consuming the stream
 * — modern Lucene TokenStreams throw IllegalStateException without it — and leaked
 * the stream by skipping {@code end()} and {@code close()}.
 *
 * @param analyzer analyzer producing the tokens
 * @param words    collection the term strings are added to
 * @param in       source of the text to analyze
 * @throws IOException if analysis fails
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("text", in)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      words.add(termAtt.toString());
    }
    ts.end();
  }
  /*overallCounts.addAll(words);*/
}
}
/**
 * Truncates each non-keyword token to at most {@code length} characters; tokens
 * flagged as keywords pass through untouched.
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  boolean shouldTruncate = !keywordAttr.isKeyword() && termAttribute.length() > length;
  if (shouldTruncate) {
    termAttribute.setLength(length);
  }
  return true;
}
}