public final class LuceneUtil { private LuceneUtil() {} public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; } }
// Prints every term produced by analyzing `text`, then correctly finishes the
// stream with end() followed by close(), as the TokenStream contract requires.
TokenStream stream = analyzer.tokenStream(null, new StringReader(text)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { System.out.println(cattr.toString()); } stream.end(); stream.close();
// Iterates the token stream reading each token's start/end offsets and term text.
// NOTE(review): startOffset/endOffset/term are computed but unused in this fragment
// (presumably consumed by surrounding code that is truncated here), and the stream
// is never end()ed or close()d within the visible span — verify the caller does so.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader); OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = charTermAttribute.toString(); }
// Legacy (pre-Lucene-4) variant of the offset/term loop using TermAttribute.
// NOTE(review): incrementToken() is called with no prior reset() — modern Lucene
// throws IllegalStateException for this; confirm the target Lucene version.
// NOTE(review): TermAttribute was removed in Lucene 4 — CharTermAttribute is the replacement.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader); OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class); TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class); while (tokenStream.incrementToken()) { int startOffset = offsetAttribute.startOffset(); int endOffset = offsetAttribute.endOffset(); String term = termAttribute.term(); }
/**
 * Analyzes {@code text} with the DAO's analyzer and returns the distinct
 * token terms in first-seen order.
 *
 * @param text the text to tokenize; blank input yields an empty set
 * @return the unique terms; on {@link IOException} the partial result is
 *         returned (best effort), never {@code null}
 */
public Set<String> getToken(String text) {
    Set<String> terms = new LinkedHashSet<>();
    if (!CommonUtils.notEmpty(text)) {
        return terms;
    }
    try (StringReader source = new StringReader(text);
            TokenStream stream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, source)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            terms.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        // deliberate best-effort: keep whatever was collected before the failure
    }
    return terms;
}
// Normalizes `text` through the analyzer's normalization chain and enforces that
// exactly one token is produced (throws IllegalStateException for 0 or 2+ tokens).
// NOTE(review): this fragment is truncated — the first exception message has an
// unterminated concatenation, braces are unbalanced, and `filteredText`/`buffer`
// appear without the code that fills them. Recover the full method before editing.
try (Reader reader = new StringReader(text)) { Reader filterReader = initReaderForNormalization(fieldName, reader); char[] buffer = new char[64]; StringBuilder builder = new StringBuilder(); final AttributeFactory attributeFactory = attributeFactory(fieldName); try (TokenStream ts = normalize(fieldName, new StringTokenStream(attributeFactory, filteredText, text.length()))) { final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); ts.reset(); if (ts.incrementToken() == false) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 0 for analyzer " if (ts.incrementToken()) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 2+ for analyzer " + this + " and input \"" + text + "\""); ts.end(); return term;
// Tokenizes a document field, applying the analyzer's position-increment gap
// between field instances and accumulating start/end offsets (shifted by lastOffset).
// NOTE(review): truncated fragment — the while-loop body has no braces here and a
// dangling `.addAttribute(PositionIncrementAttribute.class)` lost its receiver.
// NOTE(review): reusableTokenStream is the pre-Lucene-4 API; tokenStream replaced it.
tokReader = new StringReader(field.stringValue()); tokens = analyzer.reusableTokenStream(field.name(), tokReader); if (position > 0) position += analyzer.getPositionIncrementGap(field.name()); tokens.reset(); // reset the TokenStream to the first token offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class); .addAttribute(PositionIncrementAttribute.class); CharTermAttribute termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class); while (tokens.incrementToken()) position += (posIncrAttribute.getPositionIncrement() - 1); offsetVector.add(lastOffset + offsetAttribute.startOffset()); offsetVector.add(lastOffset + offsetAttribute.endOffset());
// Lucene 3.6 example: wraps a StandardTokenizer in a ShingleFilter emitting
// 1- to 3-token shingles, then iterates the terms.
// NOTE(review): no reset() is called before incrementToken(); later Lucene
// versions require it — acceptable only under the 3.x contract shown here.
// NOTE(review): the stream is never end()ed/close()d in this fragment.
Reader reader = new StringReader("This is a test string"); TokenStream tokenizer = new StandardTokenizer(Version.LUCENE_36, reader); tokenizer = new ShingleFilter(tokenizer, 1, 3); CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); while (tokenizer.incrementToken()) { String token = charTermAttribute.toString(); //Do something }
// Opens a token stream over `content` and registers term, position-increment and
// offset attributes, presumably to drive an AToken iterator.
// NOTE(review): the method body is truncated here — no reset()/iteration/close is
// visible. TermAttribute is the pre-Lucene-4 API (replaced by CharTermAttribute).
public Iterator<AToken> parseDocumentField(String fieldName, String content) { final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content)); final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class); final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class); final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);
// Field-inversion loop: accumulates positions from position increments and
// validates that offsets are non-negative, non-decreasing, and end >= start;
// after end(), folds in the analyzer's position-increment and offset gaps.
// NOTE(review): truncated fragment — the IllegalArgumentException message is cut
// mid-concatenation and several braces are missing; do not edit without the
// complete method.
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) { stream.reset(); invertState.setAttributeSource(stream); termsHashPerField.start(field, first); while (stream.incrementToken()) { int posIncr = invertState.posIncrAttribute.getPositionIncrement(); invertState.position += posIncr; if (invertState.position < invertState.lastPosition) { int startOffset = invertState.offset + invertState.offsetAttribute.startOffset(); int endOffset = invertState.offset + invertState.offsetAttribute.endOffset(); if (startOffset < invertState.lastStartOffset || endOffset < startOffset) { throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards " stream.end(); invertState.position += invertState.posIncrAttribute.getPositionIncrement(); invertState.offset += invertState.offsetAttribute.endOffset(); invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name); invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
/**
 * Forwards the next token from the wrapped stream, truncating the term to at
 * most {@code length} characters unless the token is flagged as a keyword.
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public final boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    // Keywords are exempt from truncation.
    boolean overLong = termAttribute.length() > length;
    if (overLong && !keywordAttr.isKeyword()) {
        termAttribute.setLength(length);
    }
    return true;
}
}
/**
 * Advances the wrapped stream by one token and copies every attribute of the
 * current position (term, position increment, flags, offsets, type, payload)
 * into the supplied reusable {@link Token}.
 *
 * @param token the instance to populate
 * @return the populated token, or {@code null} when the input is exhausted
 * @throws IOException if the underlying stream fails
 */
private Token getNextInputToken(Token token) throws IOException {
    boolean hasNext = input.incrementToken();
    if (!hasNext) {
        return null;
    }
    // Mirror each stream attribute onto the Token, one by one.
    token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
    token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
    token.setFlags(in_flagsAtt.getFlags());
    token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
    token.setType(in_typeAtt.type());
    token.setPayload(in_payloadAtt.getPayload());
    return token;
}
private SToken[] getTokens(String text) throws IOException { //FIXME somehow integrate below cycle to getSummary to save the cloning and memory, //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter ArrayList<SToken> result = new ArrayList<>(); try (TokenStream ts = analyzer.tokenStream("full", text)) { CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); ts.reset(); while (ts.incrementToken()) { SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset()); result.add(t); } ts.end(); } return result.toArray(new SToken[result.size()]); }
// Fingerprint-style filter loop: collects terms into a case-sensitive
// CharArraySet, then emits a single synthetic token typed "fingerprint" spanning
// offset 0..endOffset with position increment/length 1.
// NOTE(review): truncated fragment — the `continue` guard and the loop are missing
// closing braces, and the collected `term`/`length` are never visibly stored.
uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (outputTokenSize > maxOutputTokenSize) { continue; final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); input.end(); inputEnded = true; offsetAtt.setOffset(0, offsetAtt.endOffset()); posLenAtt.setPositionLength(1); posIncrAtt.setPositionIncrement(1); typeAtt.setType("fingerprint"); termAttribute.setEmpty(); return false;
// Filter loop that accumulates the position increments of empty-text tokens in
// skipCounter and folds the accumulated gap into the next non-empty token's
// position increment, so removed tokens still count as position gaps.
// NOTE(review): truncated fragment — `return true` appears inside the empty-text
// branch before the skipCounter update, and several closing braces are missing;
// the visible control flow cannot be taken at face value.
String[] parts; skipCounter = 0; while (input.incrementToken()) { final String text = new String(termAtt.buffer(), 0, termAtt.length()); if (text.isEmpty()) { return true; skipCounter += posIncrAttribute.getPositionIncrement(); } else { if (skipCounter != 0) { posIncrAttribute.setPositionIncrement(posIncrAttribute.getPositionIncrement() + skipCounter);
// Walks a TokenStream (term bytes, position increment, position length) and —
// judging by builder.setAccept — feeds it into an automaton builder; rejects a
// malformed stream whose first token has a position increment below 1.
// NOTE(review): truncated fragment — the loop body and the code between
// incrementToken() and in.end() are cut; braces are unbalanced.
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); in.reset(); while (in.incrementToken()) { int currentIncr = posIncAtt.getPositionIncrement(); if (pos == -1 && currentIncr < 1) { throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1"); in.end(); if (state != -1) { builder.setAccept(state, true);
// Analyzes `text` and concatenates the resulting terms into the reusable char
// buffer `reuse`, separated by one-char gaps; rejects zero-length tokens and any
// token whose position increment differs from 1.
// NOTE(review): truncated fragment — the if-bodies and while-loop lack closing
// braces, and `end` is used without a visible definition; recover the full
// method (this resembles an analyzer-boost/utility routine) before editing.
try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); reuse.clear(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); if (posIncAtt.getPositionIncrement() != 1) { throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt + ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")"); reuse.setLength(reuse.length() + 1); System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); reuse.setLength(reuse.length() + length); ts.end();
private ArrayList<Data> analyze(Analyzer analyzer1) throws IOException { ArrayList<Data> results = new ArrayList<>(50); TokenStream ts = analyzer1.tokenStream("foo", text); ts.reset(); while (ts.incrementToken()) { Data data = new Data(); OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class); data.startOffset = offsetAttribute.startOffset(); data.endOffset = offsetAttribute.endOffset(); data.positionLength = ts.getAttribute(PositionLengthAttribute.class).getPositionLength(); data.positionIncGap = ts.getAttribute(PositionIncrementAttribute.class).getPositionIncrement(); data.tokenType = ts.getAttribute(HebrewTokenTypeAttribute.class).getType().toString(); data.term = ts.getAttribute(CharTermAttribute.class).toString(); if (ts.getAttribute(KeywordAttribute.class) != null) data.isKeyword = ts.getAttribute(KeywordAttribute.class).isKeyword(); // System.out.println(data.term + " " + data.tokenType); results.add(data); } ts.close(); return results; } }
// Start of a filter's incrementToken: pulls tokens while not exhausted, reading
// the term buffer/length and remembering the last end offset.
// NOTE(review): truncated fragment — the loop body, the return paths, and the
// method's closing brace are missing; nothing past the visible reads can be
// inferred from here.
@Override public boolean incrementToken() throws IOException { while (!exhausted && input.incrementToken()) { char[] term = termAttribute.buffer(); int termLength = termAttribute.length(); lastEndOffset = offsetAttribute.endOffset();
/**
 * Creates complex boolean query from the cached tokenstream contents.
 *
 * <p>Tokens with a position increment of 0 are synonyms of the previous token:
 * each group of terms sharing a position is flushed as one clause via
 * {@code add}. The caller owns the stream's lifecycle (end/close).
 *
 * @param field    the field all generated terms belong to
 * @param stream   the (cached, resettable) token stream to consume
 * @param operator the occur flag applied to each generated clause
 * @return the assembled boolean query
 * @throws IOException if the token stream fails
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder builder = newBooleanQuery();
    List<Term> samePositionTerms = new ArrayList<>();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        boolean startsNewPosition = posIncrAtt.getPositionIncrement() != 0;
        if (startsNewPosition) {
            // Flush the synonym group gathered at the previous position.
            add(builder, samePositionTerms, operator);
            samePositionTerms.clear();
        }
        samePositionTerms.add(new Term(field, termAtt.getBytesRef()));
    }
    // Flush the final position's group.
    add(builder, samePositionTerms, operator);
    return builder.build();
}