public final class LuceneUtil { private LuceneUtil() {} public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; } }
// Prints every term produced by analyzing `text`, then correctly finishes the
// stream with end() followed by close(), as the TokenStream contract requires.
TokenStream stream = analyzer.tokenStream(null, new StringReader(text)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { System.out.println(cattr.toString()); } stream.end(); stream.close();
// Iterates the token stream reading each token's start/end offsets and term text.
// NOTE(review): startOffset/endOffset/term are computed but unused in this fragment
// (presumably consumed by surrounding code that is truncated here), and the stream
// is never end()ed or close()d within the visible span — verify the caller does so.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); }
// Legacy (pre-Lucene-4) variant of the offset/term loop using TermAttribute.
// NOTE(review): incrementToken() is called with no prior reset() — modern Lucene
// throws IllegalStateException for this; confirm the target Lucene version.
// NOTE(review): TermAttribute was removed in Lucene 4 — CharTermAttribute is the replacement.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader); OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class); while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = termAttribute.term(); }
/**
 * Analyzes {@code text} with the DAO's analyzer and returns the distinct
 * token terms in first-seen order.
 *
 * @param text the text to tokenize; blank input yields an empty set
 * @return the unique terms; on {@link IOException} the partial result is
 *         returned (best effort), never {@code null}
 */
public Set<String> getToken(String text) {
    Set<String> terms = new LinkedHashSet<>();
    if (!CommonUtils.notEmpty(text)) {
        return terms;
    }
    try (StringReader source = new StringReader(text);
            TokenStream stream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, source)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            terms.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        // deliberate best-effort: keep whatever was collected before the failure
    }
    return terms;
}
// Normalizes `text` through the analyzer's normalization chain and enforces that
// exactly one token is produced (throws IllegalStateException for 0 or 2+ tokens).
// NOTE(review): this fragment is truncated — the first exception message has an
// unterminated concatenation, braces are unbalanced, and `filteredText`/`buffer`
// appear without the code that fills them. Recover the full method before editing.
try (Reader reader = new StringReader(text)) { Reader filterReader = initReaderForNormalization(fieldName, reader); char[] buffer = new char[64]; StringBuilder builder = new StringBuilder(); final AttributeFactory attributeFactory = attributeFactory(fieldName); try (TokenStream ts = normalize(fieldName, new StringTokenStream(attributeFactory, filteredText, text.length()))) { final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); ts.reset(); if (ts.incrementToken() == false) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 0 for analyzer " if (ts.incrementToken()) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 2+ for analyzer " + this + " and input \"" + text + "\""); ts.end(); return term;
// Tokenizes a document field, applying the analyzer's position-increment gap
// between field instances and accumulating start/end offsets (shifted by lastOffset).
// NOTE(review): truncated fragment — the while-loop body has no braces here and a
// dangling `.addAttribute(PositionIncrementAttribute.class)` lost its receiver.
// NOTE(review): reusableTokenStream is the pre-Lucene-4 API; tokenStream replaced it.
tokReader = new StringReader(field.stringValue()); tokens = analyzer.reusableTokenStream(field.name(), tokReader); if (position > 0) position += analyzer.getPositionIncrementGap(field.name()); tokens.reset(); // reset the TokenStream to the first token offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class); .addAttribute(PositionIncrementAttribute.class); CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class); while (tokens.incrementToken()) position += (posIncrAttribute.getPositionIncrement() - 1); offsetVector.add(lastOffset + offsetAttribute.startOffset()); offsetVector.add(lastOffset + offsetAttribute.endOffset());
// Lucene 3.6 example: wraps a StandardTokenizer in a ShingleFilter emitting
// 1- to 3-token shingles, then iterates the terms.
// NOTE(review): no reset() is called before incrementToken(); later Lucene
// versions require it — acceptable only under the 3.x contract shown here.
// NOTE(review): the stream is never end()ed/close()d in this fragment.
Reader reader = new StringReader("This is a test string"); TokenStream tokenizer = new StandardTokenizer(Version.LUCENE_36, reader); tokenizer = new ShingleFilter(tokenizer, 1, 3); CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); while (tokenizer.incrementToken()) { String token = charTermAttribute.toString(); //Do something }
// Opens a token stream over `content` and registers term, position-increment and
// offset attributes, presumably to drive an AToken iterator.
// NOTE(review): the method body is truncated here — no reset()/iteration/close is
// visible. TermAttribute is the pre-Lucene-4 API (replaced by CharTermAttribute).
public Iterator<AToken> parseDocumentField(String fieldName, String content) { final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content)); final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class); final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class); final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);
// Field-inversion loop: accumulates positions from position increments and
// validates that offsets are non-negative, non-decreasing, and end >= start;
// after end(), folds in the analyzer's position-increment and offset gaps.
// NOTE(review): truncated fragment — the IllegalArgumentException message is cut
// mid-concatenation and several braces are missing; do not edit without the
// complete method.
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) { stream.reset(); invertState.setAttributeSource(stream); termsHashPerField.start(field, first); while (stream.incrementToken()) { int posIncr = invertState.posIncrAttribute.getPositionIncrement(); invertState.position += posIncr; if (invertState.position < invertState.lastPosition) { int startOffset = invertState.offset + invertState.offsetAttribute.startOffset(); int endOffset = invertState.offset + invertState.offsetAttribute.endOffset(); if (startOffset < invertState.lastStartOffset || endOffset < startOffset) { throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards " stream.end(); invertState.position += invertState.posIncrAttribute.getPositionIncrement(); invertState.offset += invertState.offsetAttribute.endOffset(); invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name); invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
/**
 * Forwards the next token from the wrapped stream, truncating the term to at
 * most {@code length} characters unless the token is flagged as a keyword.
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public final boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    // Keywords are exempt from truncation.
    boolean overLong = termAttribute.length() > length;
    if (overLong && !keywordAttr.isKeyword()) {
        termAttribute.setLength(length);
    }
    return true;
}
}
/**
 * Advances the wrapped stream by one token and copies every attribute of the
 * current position (term, position increment, flags, offsets, type, payload)
 * into the supplied reusable {@link Token}.
 *
 * @param token the instance to populate
 * @return the populated token, or {@code null} when the input is exhausted
 * @throws IOException if the underlying stream fails
 */
private Token getNextInputToken(Token token) throws IOException {
    boolean hasNext = input.incrementToken();
    if (!hasNext) {
        return null;
    }
    // Mirror each stream attribute onto the Token, one by one.
    token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
    token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
    token.setFlags(in_flagsAtt.getFlags());
    token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
    token.setType(in_typeAtt.type());
    token.setPayload(in_payloadAtt.getPayload());
    return token;
}
private SToken[] getTokens(String text) throws IOException { //FIXME somehow integrate below cycle to getSummary to save the cloning and memory, //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter ArrayList<SToken> result = new ArrayList<>(); try (TokenStream ts = analyzer.tokenStream("full", text)) { CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); ts.reset(); while (ts.incrementToken()) { SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset()); result.add(t); } ts.end(); } return result.toArray(new SToken[result.size()]); }
// Fingerprint-style filter loop: collects terms into a case-sensitive
// CharArraySet, then emits a single synthetic token typed "fingerprint" spanning
// offset 0..endOffset with position increment/length 1.
// NOTE(review): truncated fragment — the `continue` guard and the loop are missing
// closing braces, and the collected `term`/`length` are never visibly stored.
uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (outputTokenSize > maxOutputTokenSize) { continue; final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); input.end(); inputEnded = true; offsetAtt.setOffset(0, offsetAtt.endOffset()); posLenAtt.setPositionLength(1); posIncrAtt.setPositionIncrement(1); typeAtt.setType("fingerprint"); termAttribute.setEmpty(); return false;
// Filter loop that accumulates the position increments of empty-text tokens in
// skipCounter and folds the accumulated gap into the next non-empty token's
// position increment, so removed tokens still count as position gaps.
// NOTE(review): truncated fragment — `return true` appears inside the empty-text
// branch before the skipCounter update, and several closing braces are missing;
// the visible control flow cannot be taken at face value.
String[] parts; skipCounter = 0; while (input.incrementToken()) { final String text = new String(termAtt.buffer(), 0, termAtt.length()); if (text.isEmpty()) { return true; skipCounter += posIncrAttribute.getPositionIncrement(); } else { if (skipCounter != 0) { posIncrAttribute.setPositionIncrement(posIncrAttribute.getPositionIncrement() + skipCounter);
// Walks a TokenStream (term bytes, position increment, position length) and —
// judging by builder.setAccept — feeds it into an automaton builder; rejects a
// malformed stream whose first token has a position increment below 1.
// NOTE(review): truncated fragment — the loop body and the code between
// incrementToken() and in.end() are cut; braces are unbalanced.
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); in.reset(); while (in.incrementToken()) { int currentIncr = posIncAtt.getPositionIncrement(); if (pos == -1 && currentIncr < 1) { throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1"); in.end(); if (state != -1) { builder.setAccept(state, true);
// Analyzes `text` and concatenates the resulting terms into the reusable char
// buffer `reuse`, separated by one-char gaps; rejects zero-length tokens and any
// token whose position increment differs from 1.
// NOTE(review): truncated fragment — the if-bodies and while-loop lack closing
// braces, and `end` is used without a visible definition; recover the full
// method (this resembles an analyzer-boost/utility routine) before editing.
try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); reuse.clear(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); if (posIncAtt.getPositionIncrement() != 1) { throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt + ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")"); reuse.setLength(reuse.length() + 1); System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); reuse.setLength(reuse.length() + length); ts.end();
private ArrayList<Data> analyze(Analyzer analyzer1) throws IOException { ArrayList<Data> results = new ArrayList<>(50); TokenStream ts = analyzer1.tokenStream("foo", text); ts.reset(); while (ts.incrementToken()) { Data data = new Data(); OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class); data.startOffset = offsetAttribute.startOffset(); data.endOffset = offsetAttribute.endOffset(); data.positionLength = ts.getAttribute(PositionLengthAttribute.class).getPositionLength(); data.positionIncGap = ts.getAttribute(PositionIncrementAttribute.class).getPositionIncrement(); data.tokenType = ts.getAttribute(HebrewTokenTypeAttribute.class).getType().toString(); data.term = ts.getAttribute(CharTermAttribute.class).toString(); if (ts.getAttribute(KeywordAttribute.class) != null) data.isKeyword = ts.getAttribute(KeywordAttribute.class).isKeyword(); // System.out.println(data.term + " " + data.tokenType); results.add(data); } ts.close(); return results; } }
// Start of a filter's incrementToken: pulls tokens while not exhausted, reading
// the term buffer/length and remembering the last end offset.
// NOTE(review): truncated fragment — the loop body, the return paths, and the
// method's closing brace are missing; nothing past the visible reads can be
// inferred from here.
@Override public boolean incrementToken() throws IOException { while (!exhausted && input.incrementToken()) { char[] term = termAttribute.buffer(); int termLength = termAttribute.length(); lastEndOffset = offsetAttribute.endOffset();
/**
 * Creates complex boolean query from the cached tokenstream contents.
 *
 * <p>Tokens with a position increment of 0 are synonyms of the previous token:
 * each group of terms sharing a position is flushed as one clause via
 * {@code add}. The caller owns the stream's lifecycle (end/close).
 *
 * @param field    the field all generated terms belong to
 * @param stream   the (cached, resettable) token stream to consume
 * @param operator the occur flag applied to each generated clause
 * @return the assembled boolean query
 * @throws IOException if the token stream fails
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder builder = newBooleanQuery();
    List<Term> samePositionTerms = new ArrayList<>();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        boolean startsNewPosition = posIncrAtt.getPositionIncrement() != 0;
        if (startsNewPosition) {
            // Flush the synonym group gathered at the previous position.
            add(builder, samePositionTerms, operator);
            samePositionTerms.clear();
        }
        samePositionTerms.add(new Term(field, termAtt.getBytesRef()));
    }
    // Flush the final position's group.
    add(builder, samePositionTerms, operator);
    return builder.build();
}