/**
 * Advances to the next token and lower-cases its term text in place.
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // Lower-case the term buffer in place; the token's length is unchanged.
  CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
  return true;
}
}
public final class LuceneUtil { private LuceneUtil() {} public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; } }
public final class LuceneUtils { public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) { List<String> result = new ArrayList<String>(); TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords)); try { while(stream.incrementToken()) { result.add(stream.getAttribute(TermAttribute.class).term()); } } catch(IOException e) { // not thrown b/c we're using a string reader... } return result; } }
private SToken[] getTokens(String text) throws IOException { //FIXME somehow integrate below cycle to getSummary to save the cloning and memory, //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter ArrayList<SToken> result = new ArrayList<>(); try (TokenStream ts = analyzer.tokenStream("full", text)) { CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); ts.reset(); while (ts.incrementToken()) { SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset()); result.add(t); } ts.end(); } return result.toArray(new SToken[result.size()]); }
/**
 * Analyzes {@code source} with the shared analyzer and collects every emitted term.
 *
 * <p>Fix: the original {@code finally} block called {@code ts.close()} without a
 * null check, so an exception thrown by {@code analyzer.tokenStream(...)} (leaving
 * {@code ts} null) would be masked by a NullPointerException. try-with-resources
 * closes safely and only when the stream was actually opened.
 *
 * @param source text to analyze
 * @return the extracted keywords, or a (possibly partial) list on I/O failure, which is logged
 */
public static List<String> keywords( String source ) {
  List<String> keywords = new ArrayList<String>();
  try ( TokenStream ts = analyzer.tokenStream( "keywords", new StringReader( source ) ) ) {
    ts.reset();
    while ( ts.incrementToken() ) {
      keywords.add( ts.getAttribute( CharTermAttribute.class ).toString() );
    }
    ts.end(); // contract: end() before close()
  } catch ( IOException e ) {
    logger.error( "Error getting keywords ", e );
  }
  return keywords;
}
}
/**
 * Creates simple boolean query from the cached tokenstream contents.
 * The caller owns the stream's lifecycle (end/close); this method only consumes it.
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  stream.reset();
  List<Term> collected = new ArrayList<>();
  while (stream.incrementToken()) {
    collected.add(new Term(field, bytesAtt.getBytesRef()));
  }
  Term[] terms = collected.toArray(new Term[collected.size()]);
  return newSynonymQuery(terms);
}
// Tokenizes `text` with `analyzer` and prints each term to stdout, following the
// full TokenStream lifecycle: reset() -> incrementToken() loop -> end() -> close().
// NOTE(review): free-standing statement run — relies on `analyzer` and `text`
// being in scope from surrounding code not visible here.
TokenStream stream = analyzer.tokenStream(null, new StringReader(text)); CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { System.out.println(cattr.toString()); } stream.end(); stream.close();
private void fillCache() throws IOException { while (input.incrementToken()) { cache.add(captureState()); } // capture final state input.end(); finalState = captureState(); }
// NOTE(review): truncated/garbled fragment — the `if` body is missing its closing
// brace and the snippet ends mid-method; left byte-identical pending the full source.
// Appears to track seen terms (uniqueTerms) while capping emitted tokens at
// maxOutputTokenSize — TODO confirm against the complete method.
uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (outputTokenSize > maxOutputTokenSize) { continue; final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); input.end(); inputEnded = true;
// NOTE(review): fragment — both `if` blocks close beyond this snippet; kept byte-identical.
// When the local token iterator is exhausted, pulls the next token from the wrapped
// stream and records its offsets; flags tokens whose start offset + term length does
// not equal the end offset (hasIllegalOffsets) — presumably due to upstream
// char-filter offset remapping; confirm in the full source.
if (tokenIter == null || !tokenIter.hasNext()) { if (input.incrementToken()) { tokStart = offsetAtt.startOffset(); tokEnd = offsetAtt.endOffset(); hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
// NOTE(review): fragment — the enclosing method and the branches opened here are cut
// off; kept byte-identical. When the pending-token queue is empty, reads one token
// from the wrapped stream and, if its text contains a URL per
// UrlStringUtils.containsUrl, splits it on whitespace (presumably for re-emission —
// confirm downstream).
final LinkedList<String> tokens = getTokens(); final CharTermAttribute termAtt = getTermAtt(); if (tokens.isEmpty() && input.incrementToken()) { final String text = new String(termAtt.buffer(), 0, termAtt.length()); if (UrlStringUtils.containsUrl(text)) { final String[] parts = text.split("\\s");
// NOTE(review): garbled fragment — the zero-length-token `if` is missing its closing
// brace and the snippet ends mid-method at ts.end(); kept byte-identical.
// Concatenates every analyzed token of `text` into the `reuse` buffer, inserting a
// one-char gap before each term (setLength(+1) then arraycopy), and rejects
// zero-length tokens with IllegalArgumentException.
try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); reuse.clear(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); reuse.setLength(reuse.length() + 1); System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); reuse.setLength(reuse.length() + length); ts.end();
/**
 * Analyzes {@code text} and returns the distinct terms in first-seen order.
 *
 * @param text input text; empty/blank input yields an empty set
 * @return distinct analyzed terms (possibly partial and silent on I/O failure)
 */
public Set<String> getToken(String text) {
  Set<String> terms = new LinkedHashSet<>();
  if (!CommonUtils.notEmpty(text)) {
    return terms;
  }
  try (StringReader reader = new StringReader(text);
      TokenStream stream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, reader)) {
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      terms.add(termAttr.toString());
    }
    stream.end();
  } catch (IOException e) {
    // best-effort: fall through and return whatever was collected so far
  }
  return terms;
}
/**
 * Parses the query. Using this instead of a QueryParser in order
 * to avoid thread-safety issues with Lucene's query parser.
 *
 * <p>Fix: the original never called {@code end()}/{@code close()} on the token
 * stream and never closed the throwaway analyzer, leaking analysis resources on
 * every call.
 *
 * @param fieldName the name of the field
 * @param value the value of the field
 * @return the parsed query
 */
private Query parseTokens(String fieldName, String value) {
  BooleanQuery searchQuery = new BooleanQuery();
  if (value != null) {
    Analyzer analyzer = new KeywordAnalyzer();
    try (TokenStream tokenStream =
        analyzer.tokenStream(fieldName, new StringReader(value))) {
      tokenStream.reset();
      CharTermAttribute attr = tokenStream.getAttribute(CharTermAttribute.class);
      while (tokenStream.incrementToken()) {
        String term = attr.toString();
        searchQuery.add(new TermQuery(new Term(fieldName, term)), Occur.SHOULD);
      }
      tokenStream.end();
    } catch (IOException e) {
      // TODO(review): pass `e` as the cause if DukeException has a (String, Throwable) ctor
      throw new DukeException("Error parsing input string '" + value + "' " +
                              "in field " + fieldName);
    } finally {
      analyzer.close();
    }
  }
  return searchQuery;
}
/**
 * Creates simple term query from the cached tokenstream contents.
 * The stream is expected to hold at least one token; anything else is a programming error.
 */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
  TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  stream.reset();
  boolean hasToken = stream.incrementToken();
  if (!hasToken) {
    throw new AssertionError();
  }
  return newTermQuery(new Term(field, bytesAtt.getBytesRef()));
}
// NOTE(review): garbled fragment — starts mid-try (the call opening this
// try-with-resources is cut off) and the first IllegalStateException message is
// missing its closing concatenation and brace; kept byte-identical.
// Enforces that the normalization stream yields exactly one token: zero tokens or
// a second token both abort with IllegalStateException.
new StringTokenStream(attributeFactory, filteredText, text.length()))) { final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class); ts.reset(); if (ts.incrementToken() == false) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 0 for analyzer " if (ts.incrementToken()) { throw new IllegalStateException("The normalization token stream is " + "expected to produce exactly 1 token, but got 2+ for analyzer " + this + " and input \"" + text + "\""); ts.end(); return term;
/**
 * Emits the tokens of each source stream in sequence, shifting every token's
 * offsets by the accumulated end offset of the streams already exhausted.
 *
 * @return {@code true} if a token was produced, {@code false} once every source is drained
 * @throws IOException if a source stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  // Skip past exhausted sources until one yields a token.
  while (!sources[currentSource].incrementToken()) {
    if (currentSource >= sources.length - 1) {
      return false; // all sources exhausted
    }
    sources[currentSource].end();
    OffsetAttribute sourceOffset = sourceOffsets[currentSource];
    if (sourceOffset != null) {
      offsetIncrement += sourceOffset.endOffset();
    }
    currentSource++;
  }
  clearAttributes();
  sources[currentSource].copyTo(this);
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement,
      offsetAtt.endOffset() + offsetIncrement);
  return true;
}
// NOTE(review): truncated fragment — the while body never closes and the code after
// input.end() is cut off (the brace balance suggests input.end() actually follows
// the loop in the real source); kept byte-identical.
// Consumes the wrapped stream, recording via `found` whether any token appeared and
// snapshotting each term's text into `current`.
boolean found = false; while (input.incrementToken()) { found = true; String current = new String(termAttribute.buffer(), 0, termAttribute.length()); input.end();
/**
 * Tokenizes {@code in} with the analyzer and adds every term's text to {@code words}.
 *
 * <p>Fixes: the original never called {@code reset()} before consuming the stream
 * — modern Lucene TokenStreams throw IllegalStateException without it — and leaked
 * the stream by skipping {@code end()} and {@code close()}.
 *
 * @param analyzer analyzer producing the tokens
 * @param words    collection the term strings are added to
 * @param in       source of the text to analyze
 * @throws IOException if analysis fails
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("text", in)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      words.add(termAtt.toString());
    }
    ts.end();
  }
  /*overallCounts.addAll(words);*/
}
}
/**
 * Truncates each non-keyword token to at most {@code length} characters; tokens
 * flagged as keywords pass through untouched.
 *
 * @return {@code true} if a token was produced, {@code false} at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  boolean shouldTruncate = !keywordAttr.isKeyword() && termAttribute.length() > length;
  if (shouldTruncate) {
    termAttribute.setLength(length);
  }
  return true;
}
}