skipCounter = 0;
while (input.incrementToken()) {
    final String text = new String(termAtt.buffer(), 0, termAtt.length());
    if (text.isEmpty()) {
        // Empty token: remember its position increment and keep scanning.
        skipCounter += posIncrAttribute.getPositionIncrement();
    } else {
        if (skipCounter != 0) {
            // Credit the increments of the skipped empty tokens to this token.
            posIncrAttribute.setPositionIncrement(
                    posIncrAttribute.getPositionIncrement() + skipCounter);
            skipCounter = 0;
        }
        return true;
    }
}
return false;
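// A self-contained sketch of the filter this loop could live in (class name
// and wiring are assumptions, not from the original source): a TokenFilter
// that drops empty tokens while preserving their position increments.
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class SkipEmptyTokensFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAttribute =
            addAttribute(PositionIncrementAttribute.class);

    public SkipEmptyTokensFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        int skipCounter = 0;
        while (input.incrementToken()) {
            if (termAtt.length() == 0) {
                // Empty token: accumulate its increment and keep scanning.
                skipCounter += posIncrAttribute.getPositionIncrement();
            } else {
                if (skipCounter != 0) {
                    posIncrAttribute.setPositionIncrement(
                            posIncrAttribute.getPositionIncrement() + skipCounter);
                }
                return true;
            }
        }
        return false;
    }
}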
private void setAttribs(PendingToken tok) {
    clearAttributes();
    this.posIncrAtt.setPositionIncrement(tok.nonpos ? 0 : 1);
    this.termAtt.setEmpty();
    this.termAtt.append(tok.str);
    this.offsetAtt.setOffset(tok.start, tok.end);
}
if (dot) {
    // Emit the pending '.' as a token of its own.
    dot = false;
    termAtt.setEmpty();
    termAtt.append(cdot);
    termAtt.setLength(1);
    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + 1));
    startPosition++;
    return true;
}
// Emit the buffered, lower-cased term of length i.
termAtt.append(bufLcase);
termAtt.setLength(i);
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + i));
startPosition = startPosition + i + 1;
return true;
@Override
public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        // Lower-case the term in place.
        CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
        return true;
    } else {
        return false;
    }
}
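// The method above is the core of Lucene's LowerCaseFilter. A minimal wiring
// sketch (package locations of these classes vary slightly across Lucene
// major versions):
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream sink = new LowerCaseFilter(source);   // the filter shown above
        return new TokenStreamComponents(source, sink);
    }
};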
// Inside the collection loop: the current term's buffer and length.
final char term[] = termAttribute.buffer();
final int length = termAttribute.length();
// ... collect unique terms ...

// Once the input is exhausted, close it out and emit a single token.
input.end();
inputEnded = true;
offsetAtt.setOffset(0, offsetAtt.endOffset());
posLenAtt.setPositionLength(1);
posIncrAtt.setPositionIncrement(1);
typeAtt.setType("fingerprint");

// Exit branches (conditions reconstructed from context):
if (uniqueTerms.isEmpty()) {
    // No terms were collected: emit nothing.
    termAttribute.setEmpty();
    uniqueTerms.clear();
    return false;
}
if (uniqueTerms.size() == 1) {
    // Exactly one unique term: emit it as-is.
    termAttribute.setEmpty().append(new String(clonedLastTerm));
    uniqueTerms.clear();
    return true;
}
// Otherwise emit the sorted, separator-joined concatenation built in sb.
termAttribute.setEmpty().append(sb);
uniqueTerms.clear();
return true;
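// The branches above correspond to Lucene's FingerprintFilter, which collapses
// the whole stream into one sorted, de-duplicated token. A hedged usage sketch
// (the "a b c" output assumes the default space separator):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("b c a b"));
try (TokenStream ts = new FingerprintFilter(source)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term);   // expected single token: "a b c"
    }
    ts.end();
}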
private Token getNextInputToken(Token token) throws IOException {
    if (!input.incrementToken()) {
        return null;
    }
    token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
    token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
    token.setFlags(in_flagsAtt.getFlags());
    token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
    token.setType(in_typeAtt.type());
    token.setPayload(in_payloadAtt.getPayload());
    return token;
}
private SToken[] getTokens(String text) throws IOException {
    // FIXME: somehow integrate the loop below into getSummary to save the
    // cloning and memory; also, creating Tokens is suboptimal with 3.0.0 -
    // this whole class could be replaced by the highlighter.
    ArrayList<SToken> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("full", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            SToken t = new SToken(term.buffer(), 0, term.length(),
                    offset.startOffset(), offset.endOffset());
            result.add(t);
        }
        ts.end();
    }
    return result.toArray(new SToken[result.size()]);
}
if (tokenIter == null || !tokenIter.hasNext()) {
    // No sub-tokens pending: pull the next surface token from the input.
    if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // If term length and offsets disagree, a CharFilter shifted the
        // offsets, so sub-token offsets must not be derived from indices.
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        // ... segment the token and initialize tokenIter ...
    } else {
        return false;
    }
}
// Emit the next sub-token: nextWord starts at index idx with POS tag pos.
termAtt.append(nextWord);
posAtt.setPartOfSpeech(pos);
int end = idx + nextWord.length();
if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
} else {
    offsetAtt.setOffset(idx, end);
}
typeAtt.setType("word");
return true;
/**
 * Stems the term of the current token, unless it is marked as a keyword.
 *
 * @return {@code true} if a token was available (its term then holds the
 *         stemmed form), {@code false} at end of stream.
 * @throws IOException If there is a low-level I/O error.
 */
@Override
public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    char[] term = termAttribute.buffer();
    int len = termAttribute.length();
    if ((!keywordAtt.isKeyword()) && stemmer.stem(term, len)) {
        termAttribute.setEmpty().append(stemmer.asCharSequence());
    }
    return true;
}
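// A minimal wiring sketch for the keyword check above (field and set contents
// are assumptions; SetKeywordMarkerFilter and PorterStemFilter are standard
// Lucene classes, though package locations vary between versions): terms in
// the keyword set are flagged via KeywordAttribute, so a keyword-aware stem
// filter like the one above leaves them unstemmed.
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;

Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        CharArraySet keywords = new CharArraySet(Arrays.asList("lucene"), true);
        TokenStream sink = new SetKeywordMarkerFilter(source, keywords);
        sink = new PorterStemFilter(sink);  // "running" -> "run"; "lucene" passes through
        return new TokenStreamComponents(source, sink);
    }
};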
/**
 * Saves the existing attribute states.
 */
private void saveState() {
    // otherwise, we have delimiters, save state
    savedStartOffset = offsetAttribute.startOffset();
    savedEndOffset = offsetAttribute.endOffset();
    // if length by start + end offsets doesn't match the term text,
    // then assume this is a synonym and don't adjust the offsets.
    hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
    savedType = typeAttribute.type();

    if (savedBuffer.length < termAttribute.length()) {
        savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), Character.BYTES)];
    }
    System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
    iterator.text = savedBuffer;
    hasSavedState = true;
}
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
reuse.clear();
while (ts.incrementToken()) {
    int length = termAtt.length();
    if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
    }
    if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token ("
                + termAtt + ") with position increment != 1 (got: "
                + posIncAtt.getPositionIncrement() + ")");
    }
    reuse.grow(reuse.length() + length + 1);   // current + word + separator
    int end = reuse.length();
    if (reuse.length() > 0) {
        // Separator write restored per Lucene's SynonymMap.Parser#analyze,
        // which this snippet matches.
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
    }
    System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
    reuse.setLength(reuse.length() + length);
}
/**
 * Writes the concatenation to the attributes.
 */
void write() {
    clearAttributes();
    if (termAttribute.length() < buffer.length()) {
        termAttribute.resizeBuffer(buffer.length());
    }
    char[] termbuffer = termAttribute.buffer();
    buffer.getChars(0, buffer.length(), termbuffer, 0);
    termAttribute.setLength(buffer.length());

    if (hasIllegalOffsets) {
        offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
    } else {
        offsetAttribute.setOffset(startOffset, endOffset);
    }
    posIncAttribute.setPositionIncrement(position(true));
    typeAttribute.setType(savedType);
    accumPosInc = 0;
}
/**
 * Copies the inner stream's attribute values to the main stream's attributes.
 * This filter uses an inner stream, which therefore needs to be cleared so
 * that other filters see clean attribute data. Because of that, the
 * datatypeURI and node attributes have to be saved first and restored
 * afterwards.
 */
private void copyInnerStreamAttributes() {
    // back up datatype and node path
    final IntsRef nodePath = IntsRef.deepCopyOf(nodeAtt.node());
    final char[] dt = dtypeAtt.datatypeURI();
    // clear attributes
    input.clearAttributes();
    // copy inner attributes
    final int len = tokenTermAtt.length();
    termAtt.copyBuffer(tokenTermAtt.buffer(), 0, len);
    offsetAtt.setOffset(tokenOffsetAtt.startOffset(), tokenOffsetAtt.endOffset());
    posIncrAtt.setPositionIncrement(tokenPosIncrAtt.getPositionIncrement());
    typeAtt.setType(tokenTypeAtt.type());
    // TupleTokenizer handles the setting of tuple/cell values and the datatype URI
    // restore datatype and node
    nodeAtt.copyNode(nodePath);
    dtypeAtt.setDatatypeURI(dt);
}
// Append the next token to the shingle under construction.
gramBuilder.append(tokenSeparator);
gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
++builtGramSize;
// ...
// Emit the completed shingle.
posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
    typeAtt.setType(tokenType);
    noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
if (outputUnigrams) {
    posLenAtt.setPositionLength(builtGramSize);
}
if (!input.incrementToken()) {
    return false;
}
final char[] buffer = termAtt.buffer();
final int bufferLength = termAtt.length();
final String type = typeAtt.type();

if (type == APOSTROPHE_TYPE          // remove 's
        && bufferLength >= 2
        && buffer[bufferLength - 2] == '\''
        && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) {
    // Strip the trailing "'s" off the term.
    termAtt.setLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) {   // remove dots
    int upto = 0;
    for (int i = 0; i < bufferLength; i++) {
        char c = buffer[i];
        if (c != '.') {
            buffer[upto++] = c;
        }
    }
    termAtt.setLength(upto);
}
return true;
// Consume a new input token.
if (input.incrementToken() == false) {
    return false;
}
if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
    // Keywords pass through untouched.
    return true;
}
int termLength = termAttribute.length();
char[] termBuffer = termAttribute.buffer();
accumPosInc += posIncAttribute.getPositionIncrement();

// ... if the token yields no sub-parts, emit it as-is, flushing the
// accumulated position increment:
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
return true;

// ... when emitting a buffered sub-part, never let offsets go backwards:
endOffset = Math.max(endOffset, lastStartOffset);
offsetAttribute.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
if (termPart == null) {
    termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
} else {
    termAttribute.copyBuffer(termPart, 0, termPart.length);
}
posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
accumPosInc = 0;
posLenAttribute.setPositionLength(endPos - startPos);
if (numFillerTokensToInsert > 0) {
    // Insert a filler token for a position gap; it occupies no space.
    newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
            newTarget.offsetAtt.startOffset());
    newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
    newTarget.isFiller = true;
    --numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
    // ... consume the previously saved input stream token ...
    newTarget.isFiller = false;
} else if (!exhausted) {
    if (input.incrementToken()) {
        if (null == target) {
            newTarget = new InputWindowToken(cloneAttributes());
        } else {
            this.copyTo(target.attSource);
        }
        if (posIncrAtt.getPositionIncrement() > 1) {
            // Each output shingle must contain at least one input token,
            // so no more than (maxShingleSize - 1) filler tokens are inserted.
            numFillerTokensToInsert
                    = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
            // ... save the current token as the next input stream token ...
            // A filler token occupies no space.
            newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
            newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
            newTarget.isFiller = true;
            --numFillerTokensToInsert;
        } else {
            newTarget.isFiller = false;
        }
    } else {
        // The input is exhausted; trailing filler tokens may still be needed.
        exhausted = true;
        input.end();
        endState = captureState();
        numFillerTokensToInsert
                = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
        if (numFillerTokensToInsert > 0) {
            nextInputStreamToken = new AttributeSource(getAttributeFactory());
            // ...
        }
    }
}
@Override
public boolean incrementToken() throws IOException {
    if (savedToken != null) {
        // Emit the last token's type at the same position.
        restoreState(savedToken);
        savedToken = null;
        termAtt.setEmpty();
        if (prefix != null) {
            termAtt.append(prefix);
        }
        termAtt.append(typeAtt.type());
        posIncrAtt.setPositionIncrement(0);
        return true;
    } else if (input.incrementToken()) {
        // No pending token type to emit: save this token's state for the next call.
        savedToken = captureState();
        return true;
    }
    return false;
}
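// This is the pattern used by Lucene's TypeAsSynonymFilter. A hedged usage
// sketch (the expected token types are assumptions based on
// StandardTokenizer's defaults):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

Tokenizer source = new StandardTokenizer();
source.setReader(new StringReader("test 1234"));
try (TokenStream ts = new TypeAsSynonymFilter(source)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    ts.end();
}
// Expected: test (posInc=1), <ALPHANUM> (posInc=0), 1234 (posInc=1), <NUM> (posInc=0)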
if (addSingleTerm) {
    // Emit the previously buffered word on its own.
    addSingleTerm = false;
    clearAttributes();
    termAtt.append(previousWord);
    return true;
} else if (input.incrementToken()) {
    final String word = new String(termAtt.buffer(), 0, termAtt.length());
    if (word.isEmpty()) {
        // Pass empty tokens through untouched.
        return true;
    }
    if (previousWord == null) {
        // First word: emit it alone (branch condition reconstructed from context).
        clearAttributes();
        termAtt.append(word);
        previousWord = word;
        addSingleTerm = false;
    } else {
        // Emit the concatenation of the previous and current word.
        clearAttributes();
        termAtt.append(previousWord).append(word);
        previousWord = word;
        addSingleTerm = true;
    }
    return true;
}
return false;