@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    // No words remain from the current sentence: pull the next one from the input.
    if (!input.incrementToken()) {
      return false;
    }
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If the term's length doesn't match its offsets, the token is likely a synonym
    // or other injected token; keep the incoming offsets rather than computing new ones.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    // ... segment the sentence and (re)initialize tokenIter and idx here ...
  }
  // Emit the next segmented word as its own token.
  clearAttributes();
  String nextWord = tokenIter.next();
  termAtt.append(nextWord);
  posAtt.setPartOfSpeech(pos); // pos: the part-of-speech tag produced by the segmenter
  int end = idx + nextWord.length();
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(idx, end);
  }
  idx = end; // advance the offset cursor past this word
  typeAtt.setType("word");
  return true;
}
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
try {
  tokenStream.reset();
  while (tokenStream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    // ... use term and its offsets here ...
  }
  tokenStream.end();   // record the final offset/position state
} finally {
  tokenStream.close(); // release analysis resources
}
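// A self-contained, runnable version of the consume loop above - a minimal
// sketch assuming Lucene 5+; the field name "body" and sample text are
// illustrative, not from the original.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenDump {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "Offsets index into the original text.")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // startOffset/endOffset are character positions in the original text
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      ts.end();
    }
  }
}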
// The same loop against the pre-4.0 API, where TermAttribute (replaced by
// CharTermAttribute in Lucene 4.0) exposes the term text. addAttribute() is
// used instead of getAttribute(), which throws if the attribute is absent.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
  int startOffset = offsetAttribute.startOffset();
  int endOffset = offsetAttribute.endOffset();
  String term = termAttribute.term();
}
private SToken[] getTokens(String text) throws IOException {
  // FIXME somehow integrate the cycle below into getSummary to save the cloning and memory;
  // also, creating Tokens is suboptimal with 3.0.0 - this whole class could be replaced by highlighter
  ArrayList<SToken> result = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream("full", text)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      SToken t = new SToken(term.buffer(), 0, term.length(),
          offset.startOffset(), offset.endOffset());
      result.add(t);
    }
    ts.end();
  }
  return result.toArray(new SToken[result.size()]);
}
// Field inversion loop (this matches Lucene's DefaultIndexingChain.PerField.invert;
// elided pieces are marked with "..."):
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {
  stream.reset();
  invertState.setAttributeSource(stream);
  termsHashPerField.start(field, first);
  while (stream.incrementToken()) {
    int posIncr = invertState.posIncrAttribute.getPositionIncrement();
    invertState.position += posIncr;
    if (invertState.position < invertState.lastPosition) {
      // ... positions went backwards: report the illegal position increment ...
    }
    int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
    int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
    if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
      throw new IllegalArgumentException(
          "startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
              + "startOffset=" + startOffset + ",endOffset=" + endOffset
              + ",lastStartOffset=" + invertState.lastStartOffset);
    }
    // ... index the term ...
  }
  // Trigger the stream's end-of-stream handling, then fold the final
  // position/offset state into the invert state.
  stream.end();
  invertState.position += invertState.posIncrAttribute.getPositionIncrement();
  invertState.offset += invertState.offsetAttribute.endOffset();
}
// Gaps between multiple values of the same field:
invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
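// The two gap calls at the end come from the Analyzer. A minimal sketch
// (assuming Lucene 5+; the gap value 10 is illustrative) of an analyzer that
// inserts a position gap between multiple values of the same field:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

Analyzer gapAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    return new TokenStreamComponents(new WhitespaceTokenizer());
  }

  @Override
  public int getPositionIncrementGap(String fieldName) {
    // A phrase query can then no longer match across value boundaries.
    return 10;
  }
};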
// Fingerprint construction: accumulate all unique terms of the input into a
// single combined token (elided pieces are marked with "..."):
uniqueTerms = new CharArraySet(8, false);
int outputTokenSize = 0;
while (input.incrementToken()) {
  if (outputTokenSize > maxOutputTokenSize) {
    continue;
  }
  final char[] term = termAttribute.buffer();
  final int length = termAttribute.length();
  // ... if unseen, copy the term into uniqueTerms and grow outputTokenSize ...
}
input.end();
inputEnded = true;
if (uniqueTerms.isEmpty()) {
  // Nothing was collected: emit no token.
  termAttribute.setEmpty();
  return false;
}
// ... sort the collected terms and concatenate them into termAttribute ...
offsetAtt.setOffset(0, offsetAtt.endOffset());
posLenAtt.setPositionLength(1);
posIncrAtt.setPositionIncrement(1);
typeAtt.setType("fingerprint");
return true;
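// Usage sketch for this kind of filter (this matches Lucene's
// FingerprintFilter, which emits one token holding the sorted, de-duplicated
// terms of the whole stream, space-separated by default; the input is
// illustrative):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("B A B C"));
TokenStream fingerprint = new FingerprintFilter(source);
// consuming this stream yields the single token "A B C"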
private Attributes[] parseText(String text) throws IOException {
  if (text == null || text.trim().equals("")) {
    return new Attributes[0];
  }
  final List<Attributes> result = new LinkedList<>();
  TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(text));
  OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
  try {
    ts.reset();
    while (ts.incrementToken()) {
      result.add(new Attributes(charTermAttribute.toString(),
          offsetAttribute.startOffset(), offsetAttribute.endOffset()));
    }
    ts.end();
  } finally {
    ts.close();
  }
  return result.toArray(new Attributes[result.size()]);
}
private Token getNextSuffixInputToken(Token token) throws IOException {
  if (!suffix.incrementToken()) {
    return null;
  }
  token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
  token.setPositionIncrement(posIncrAtt.getPositionIncrement());
  token.setFlags(flagsAtt.getFlags());
  token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
  token.setType(typeAtt.type());
  token.setPayload(payloadAtt.getPayload());
  return token;
}
/**
 * Copies the inner stream's attribute values to the main stream's ones. This filter
 * uses an inner stream, which therefore needs to be cleared so that other filters
 * see clean attribute data. Because of that, the datatypeURI and node attributes
 * have to be saved first and restored afterwards.
 */
private void copyInnerStreamAttributes() {
  // backup datatype and node path
  final IntsRef nodePath = IntsRef.deepCopyOf(nodeAtt.node());
  final char[] dt = dtypeAtt.datatypeURI();
  // clear attributes
  input.clearAttributes();
  // copy inner attributes
  final int len = tokenTermAtt.length();
  termAtt.copyBuffer(tokenTermAtt.buffer(), 0, len);
  offsetAtt.setOffset(tokenOffsetAtt.startOffset(), tokenOffsetAtt.endOffset());
  posIncrAtt.setPositionIncrement(tokenPosIncrAtt.getPositionIncrement());
  typeAtt.setType(tokenTypeAtt.type());
  // TupleTokenizer handles the setting of tuple/cell values and the datatype URI
  // restore datatype and node
  nodeAtt.copyNode(nodePath);
  dtypeAtt.setDatatypeURI(dt);
}
private ArrayList<Data> analyze(Analyzer analyzer1) throws IOException {
  ArrayList<Data> results = new ArrayList<>(50);
  TokenStream ts = analyzer1.tokenStream("foo", text);
  // Look the attributes up once, before the loop; getAttribute() throws if the
  // attribute is absent, so the optional KeywordAttribute is guarded with hasAttribute().
  OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class);
  PositionLengthAttribute posLenAttribute = ts.getAttribute(PositionLengthAttribute.class);
  PositionIncrementAttribute posIncAttribute = ts.getAttribute(PositionIncrementAttribute.class);
  HebrewTokenTypeAttribute hebTypeAttribute = ts.getAttribute(HebrewTokenTypeAttribute.class);
  CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
  KeywordAttribute keywordAttribute =
      ts.hasAttribute(KeywordAttribute.class) ? ts.getAttribute(KeywordAttribute.class) : null;
  ts.reset();
  while (ts.incrementToken()) {
    Data data = new Data();
    data.startOffset = offsetAttribute.startOffset();
    data.endOffset = offsetAttribute.endOffset();
    data.positionLength = posLenAttribute.getPositionLength();
    data.positionIncGap = posIncAttribute.getPositionIncrement();
    data.tokenType = hebTypeAttribute.getType().toString();
    data.term = termAttribute.toString();
    if (keywordAttribute != null) {
      data.isKeyword = keywordAttribute.isKeyword();
    }
    // System.out.println(data.term + " " + data.tokenType);
    results.add(data);
  }
  ts.end();
  ts.close();
  return results;
}
/**
 * Saves the existing attribute states.
 */
private void saveState() {
  savedTermLength = termAttribute.length();
  savedStartOffset = offsetAttribute.startOffset();
  savedEndOffset = offsetAttribute.endOffset();
  savedState = captureState();

  if (savedTermBuffer.length < savedTermLength) {
    savedTermBuffer = new char[ArrayUtil.oversize(savedTermLength, Character.BYTES)];
  }
  System.arraycopy(termAttribute.buffer(), 0, savedTermBuffer, 0, savedTermLength);
}
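// captureState() above snapshots every attribute; the extra char[] copy keeps
// the term text readable while the stream's own attributes move on. A minimal
// sketch of the underlying AttributeSource API:
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

AttributeSource src = new AttributeSource();
CharTermAttribute term = src.addAttribute(CharTermAttribute.class);
term.append("abc");
AttributeSource.State snapshot = src.captureState();
term.setEmpty().append("xyz");
src.restoreState(snapshot); // term reads "abc" again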
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer,
    String fieldName, String contents) throws IOException {
  try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
    if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
      // Can't split on term boundaries without offsets
      return -1;
    }
    int end = -1;
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
      if (attr.endOffset() >= noMatchSize) {
        // Jump to the end of this token if it wouldn't put us past the boundary
        if (attr.endOffset() == noMatchSize) {
          end = noMatchSize;
        }
        return end;
      }
      end = attr.endOffset();
    }
    tokenStream.end();
    // We've exhausted the token stream so we should just highlight everything.
    return end;
  }
}
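// A usage sketch (the WhitespaceAnalyzer and literal values are assumptions,
// not from the original): with contents "one two three" and noMatchSize 7,
// "one" ends at offset 3 (< 7) and "two" ends exactly at 7, so 7 is returned.
int excerptEnd = findGoodEndForNoHighlightExcerpt(7, new WhitespaceAnalyzer(), "body", "one two three");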
private void setAttribs(PendingToken tok) {
  clearAttributes();
  this.posIncrAtt.setPositionIncrement(tok.nonpos ? 0 : 1);
  this.termAtt.setEmpty();
  this.termAtt.append(tok.str);
  this.offsetAtt.setOffset(tok.start, tok.end);
}
// Pre-4.0 style: reusableTokenStream() was removed in Lucene 4, where
// tokenStream() reuses components internally.
tokReader = new StringReader(field.stringValue());
tokens = analyzer.reusableTokenStream(field.name(), tokReader);
if (position > 0) {
  position += analyzer.getPositionIncrementGap(field.name());
}
tokens.reset(); // reset the TokenStream to the first token
offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
while (tokens.incrementToken()) {
  position += (posIncrAttribute.getPositionIncrement() - 1);
  offsetVector.add(lastOffset + offsetAttribute.startOffset());
  offsetVector.add(lastOffset + offsetAttribute.endOffset());
}
// TokenStream-to-automaton traversal (elided pieces are marked with "..."):
builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
int maxOffset = 0;
while (in.incrementToken()) {
  int posInc = posIncAtt.getPositionIncrement();
  if (preservePositionIncrements == false && posInc > 1) {
    posInc = 1;
  }
  // ... advance pos by posInc and add transitions for the term's bytes ...
  final int endPos = pos + posLengthAtt.getPositionLength();
  maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
  // ...
}
in.end();
int endPosInc = posIncAtt.getPositionIncrement();
if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
  endPosInc = 1;
} else if (endPosInc > 0 && preservePositionIncrements == false) {
  endPosInc = 0;
}
// Shingle composition (elided pieces are marked with "..."):
gramBuilder.append(tokenSeparator);
gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
++builtGramSize;
// ... once the gram is fully built, populate this stream's attributes:
posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
  typeAtt.setType(tokenType);
  noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
if (outputUnigrams) {
  posLenAtt.setPositionLength(builtGramSize);
} else {
  // This shingle also spans the positions of the smaller shingles it subsumes.
  posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
}
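// Usage sketch: wiring a ShingleFilter over a whitespace tokenizer (the
// min/max shingle sizes 2 and 3 and the sample text are illustrative):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

Tokenizer shingleSource = new WhitespaceTokenizer();
shingleSource.setReader(new StringReader("please divide this sentence"));
TokenStream shingles = new ShingleFilter(shingleSource, 2, 3);
// with unigrams on (the default) this yields: "please", "please divide",
// "please divide this", "divide", "divide this", ...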
@Override
public void copyTo(AttributeImpl target) {
  if (target instanceof PackedTokenAttributeImpl) {
    final PackedTokenAttributeImpl to = (PackedTokenAttributeImpl) target;
    to.copyBuffer(buffer(), 0, length());
    to.positionIncrement = positionIncrement;
    to.positionLength = positionLength;
    to.startOffset = startOffset;
    to.endOffset = endOffset;
    to.type = type;
    to.termFrequency = termFrequency;
  } else {
    super.copyTo(target);
    ((OffsetAttribute) target).setOffset(startOffset, endOffset);
    ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
    ((PositionLengthAttribute) target).setPositionLength(positionLength);
    ((TypeAttribute) target).setType(type);
    ((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
  }
}
// Word-delimiter splitting (this matches Lucene's WordDelimiterGraphFilter;
// elided pieces are marked with "..."):
if (savedState == null) {
  // Process a new input token.
  if (input.incrementToken() == false) {
    return false;
  }
  int termLength = termAttribute.length();
  char[] termBuffer = termAttribute.buffer();
  accumPosInc += posIncAttribute.getPositionIncrement();
  if (/* ... the token has no delimiters, or is protected ... */) {
    // Pass it through unchanged, carrying any accumulated position increment.
    posIncAttribute.setPositionIncrement(accumPosInc);
    accumPosInc = 0;
    return true;
  }
  // ... otherwise buffer the word parts and saveState() ...
}
// Emit the next buffered part; startOffset/endOffset derive from the saved
// offsets, and offsets are never allowed to go backwards:
endOffset = Math.max(endOffset, lastStartOffset);
offsetAttribute.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
if (termPart == null) {
  // A null part means "a slice of the original word": copy from the saved buffer.
  termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
} else {
  termAttribute.copyBuffer(termPart, 0, termPart.length);
}
posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
accumPosInc = 0;
posLenAttribute.setPositionLength(endPos - startPos);
wordPos = startPos;
return true;
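// Usage sketch for WordDelimiterGraphFilter (the flag combination and the
// sample input are illustrative; null means no protected words):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

Tokenizer wdgfSource = new WhitespaceTokenizer();
wdgfSource.setReader(new StringReader("PowerShot500"));
TokenStream wdgf = new WordDelimiterGraphFilter(
    wdgfSource,
    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS,
    null);
// consuming wdgf yields "Power", "Shot", "500", with offsets kept
// monotonic exactly as in the emit logic above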
// Shingle filler-token handling (fragments of ShingleFilter.getNextToken;
// elided pieces are marked with "..."):
if (numFillerTokensToInsert > 0) {
  // ... clone or copy the stashed attributes into newTarget, then overwrite:
  // a filler token occupies a position but no text, so it gets a zero-length
  // offset span and the filler term.
  newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
  newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
  newTarget.isFiller = true;
  --numFillerTokensToInsert;
} else if (input.incrementToken()) {
  this.copyTo(target.attSource);
  if (posIncrAtt.getPositionIncrement() > 1) {
    // Each output shingle must contain at least one input token, so at most
    // (maxShingleSize - 1) filler tokens are inserted for a position gap.
    numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
    newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
    newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
    newTarget.isFiller = true;
    --numFillerTokensToInsert;
  }
} else {
  // The input is exhausted: capture the end state and flush trailing fillers.
  input.end();
  endState = captureState();
  numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
  if (numFillerTokensToInsert > 0) {
    nextInputStreamToken = new AttributeSource(getAttributeFactory());
    nextInputStreamToken.addAttribute(CharTermAttribute.class);
    OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
    newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
  }
}
@Override
protected AToken computeNext() {
  try {
    if (!tkstream.incrementToken()) {
      tkstream.end();
      tkstream.close();
      return endOfData();
    }
    currentPosition += posIncrAttribute.getPositionIncrement();
    final int position = currentPosition;
    final int startOffset = offsetAtt.startOffset();
    final int endOffset = offsetAtt.endOffset();
    final String text = termAtt.term(); // pre-4.0 TermAttribute API
    return new AToken() {
      // ... anonymous AToken implementation backed by the captured values
      // (elided in the original) ...
    };
  } catch (IOException e) {
    // The original catch clause is not shown; rethrowing unchecked is one
    // plausible handling.
    throw new RuntimeException(e);
  }
}