@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    // No words remain from the current sentence: pull the next one from the input.
    if (!input.incrementToken()) {
      return false;
    }
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If the term's length doesn't match its offsets, the token is likely a synonym
    // or other injected token; keep the incoming offsets rather than computing new ones.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    // ... segment the sentence and (re)initialize tokenIter and idx here ...
  }
  // Emit the next segmented word as its own token.
  clearAttributes();
  String nextWord = tokenIter.next();
  termAtt.append(nextWord);
  posAtt.setPartOfSpeech(pos); // pos: the part-of-speech tag produced by the segmenter
  int end = idx + nextWord.length();
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(idx, end);
  }
  idx = end; // advance the offset cursor past this word
  typeAtt.setType("word");
  return true;
}
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
try {
  tokenStream.reset();
  while (tokenStream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
    // ... use term and its offsets here ...
  }
  tokenStream.end();   // record the final offset/position state
} finally {
  tokenStream.close(); // release analysis resources
}
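// A self-contained, runnable version of the consume loop above - a minimal
// sketch assuming Lucene 5+; the field name "body" and sample text are
// illustrative, not from the original.
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenDump {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "Offsets index into the original text.")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // startOffset/endOffset are character positions in the original text
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
      }
      ts.end();
    }
  }
}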
// The same loop against the pre-4.0 API, where TermAttribute (replaced by
// CharTermAttribute in Lucene 4.0) exposes the term text. addAttribute() is
// used instead of getAttribute(), which throws if the attribute is absent.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
  int startOffset = offsetAttribute.startOffset();
  int endOffset = offsetAttribute.endOffset();
  String term = termAttribute.term();
}
private SToken[] getTokens(String text) throws IOException {
  // FIXME somehow integrate the cycle below into getSummary to save the cloning and memory;
  // also, creating Tokens is suboptimal with 3.0.0 - this whole class could be replaced by highlighter
  ArrayList<SToken> result = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream("full", text)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      SToken t = new SToken(term.buffer(), 0, term.length(),
          offset.startOffset(), offset.endOffset());
      result.add(t);
    }
    ts.end();
  }
  return result.toArray(new SToken[result.size()]);
}
// Field inversion loop (this matches Lucene's DefaultIndexingChain.PerField.invert;
// elided pieces are marked with "..."):
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {
  stream.reset();
  invertState.setAttributeSource(stream);
  termsHashPerField.start(field, first);
  while (stream.incrementToken()) {
    int posIncr = invertState.posIncrAttribute.getPositionIncrement();
    invertState.position += posIncr;
    if (invertState.position < invertState.lastPosition) {
      // ... positions went backwards: report the illegal position increment ...
    }
    int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
    int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
    if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
      throw new IllegalArgumentException(
          "startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards "
              + "startOffset=" + startOffset + ",endOffset=" + endOffset
              + ",lastStartOffset=" + invertState.lastStartOffset);
    }
    // ... index the term ...
  }
  // Trigger the stream's end-of-stream handling, then fold the final
  // position/offset state into the invert state.
  stream.end();
  invertState.position += invertState.posIncrAttribute.getPositionIncrement();
  invertState.offset += invertState.offsetAttribute.endOffset();
}
// Gaps between multiple values of the same field:
invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
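// The two gap calls at the end come from the Analyzer. A minimal sketch
// (assuming Lucene 5+; the gap value 10 is illustrative) of an analyzer that
// inserts a position gap between multiple values of the same field:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

Analyzer gapAnalyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    return new TokenStreamComponents(new WhitespaceTokenizer());
  }

  @Override
  public int getPositionIncrementGap(String fieldName) {
    // A phrase query can then no longer match across value boundaries.
    return 10;
  }
};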
// Fingerprint construction: accumulate all unique terms of the input into a
// single combined token (elided pieces are marked with "..."):
uniqueTerms = new CharArraySet(8, false);
int outputTokenSize = 0;
while (input.incrementToken()) {
  if (outputTokenSize > maxOutputTokenSize) {
    continue;
  }
  final char[] term = termAttribute.buffer();
  final int length = termAttribute.length();
  // ... if unseen, copy the term into uniqueTerms and grow outputTokenSize ...
}
input.end();
inputEnded = true;
if (uniqueTerms.isEmpty()) {
  // Nothing was collected: emit no token.
  termAttribute.setEmpty();
  return false;
}
// ... sort the collected terms and concatenate them into termAttribute ...
offsetAtt.setOffset(0, offsetAtt.endOffset());
posLenAtt.setPositionLength(1);
posIncrAtt.setPositionIncrement(1);
typeAtt.setType("fingerprint");
return true;
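// Usage sketch for this kind of filter (this matches Lucene's
// FingerprintFilter, which emits one token holding the sorted, de-duplicated
// terms of the whole stream, space-separated by default; the input is
// illustrative):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("B A B C"));
TokenStream fingerprint = new FingerprintFilter(source);
// consuming this stream yields the single token "A B C"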
private Attributes[] parseText(String text) throws IOException {
  if (text == null || text.trim().equals("")) {
    return new Attributes[0];
  }
  final List<Attributes> result = new LinkedList<>();
  TokenStream ts = analyzer.tokenStream(BaseDocumentBuilder.FieldName.CONTENT_BODY, new StringReader(text));
  OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
  try {
    ts.reset();
    while (ts.incrementToken()) {
      result.add(new Attributes(charTermAttribute.toString(),
          offsetAttribute.startOffset(), offsetAttribute.endOffset()));
    }
    ts.end();
  } finally {
    ts.close();
  }
  return result.toArray(new Attributes[result.size()]);
}
private Token getNextSuffixInputToken(Token token) throws IOException {
  if (!suffix.incrementToken()) {
    return null;
  }
  token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
  token.setPositionIncrement(posIncrAtt.getPositionIncrement());
  token.setFlags(flagsAtt.getFlags());
  token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
  token.setType(typeAtt.type());
  token.setPayload(payloadAtt.getPayload());
  return token;
}
/**
 * Copies the inner stream's attribute values to the main stream's ones. This filter
 * uses an inner stream, which therefore needs to be cleared so that other filters
 * see clean attribute data. Because of that, the datatypeURI and node attributes
 * have to be saved first and restored afterwards.
 */
private void copyInnerStreamAttributes() {
  // backup datatype and node path
  final IntsRef nodePath = IntsRef.deepCopyOf(nodeAtt.node());
  final char[] dt = dtypeAtt.datatypeURI();
  // clear attributes
  input.clearAttributes();
  // copy inner attributes
  final int len = tokenTermAtt.length();
  termAtt.copyBuffer(tokenTermAtt.buffer(), 0, len);
  offsetAtt.setOffset(tokenOffsetAtt.startOffset(), tokenOffsetAtt.endOffset());
  posIncrAtt.setPositionIncrement(tokenPosIncrAtt.getPositionIncrement());
  typeAtt.setType(tokenTypeAtt.type());
  // TupleTokenizer handles the setting of tuple/cell values and the datatype URI
  // restore datatype and node
  nodeAtt.copyNode(nodePath);
  dtypeAtt.setDatatypeURI(dt);
}
private ArrayList<Data> analyze(Analyzer analyzer1) throws IOException {
  ArrayList<Data> results = new ArrayList<>(50);
  TokenStream ts = analyzer1.tokenStream("foo", text);
  // Look the attributes up once, before the loop; getAttribute() throws if the
  // attribute is absent, so the optional KeywordAttribute is guarded with hasAttribute().
  OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class);
  PositionLengthAttribute posLenAttribute = ts.getAttribute(PositionLengthAttribute.class);
  PositionIncrementAttribute posIncAttribute = ts.getAttribute(PositionIncrementAttribute.class);
  HebrewTokenTypeAttribute hebTypeAttribute = ts.getAttribute(HebrewTokenTypeAttribute.class);
  CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
  KeywordAttribute keywordAttribute =
      ts.hasAttribute(KeywordAttribute.class) ? ts.getAttribute(KeywordAttribute.class) : null;
  ts.reset();
  while (ts.incrementToken()) {
    Data data = new Data();
    data.startOffset = offsetAttribute.startOffset();
    data.endOffset = offsetAttribute.endOffset();
    data.positionLength = posLenAttribute.getPositionLength();
    data.positionIncGap = posIncAttribute.getPositionIncrement();
    data.tokenType = hebTypeAttribute.getType().toString();
    data.term = termAttribute.toString();
    if (keywordAttribute != null) {
      data.isKeyword = keywordAttribute.isKeyword();
    }
    // System.out.println(data.term + " " + data.tokenType);
    results.add(data);
  }
  ts.end();
  ts.close();
  return results;
}
/**
 * Saves the existing attribute states.
 */
private void saveState() {
  savedTermLength = termAttribute.length();
  savedStartOffset = offsetAttribute.startOffset();
  savedEndOffset = offsetAttribute.endOffset();
  savedState = captureState();

  if (savedTermBuffer.length < savedTermLength) {
    savedTermBuffer = new char[ArrayUtil.oversize(savedTermLength, Character.BYTES)];
  }
  System.arraycopy(termAttribute.buffer(), 0, savedTermBuffer, 0, savedTermLength);
}
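// captureState() above snapshots every attribute; the extra char[] copy keeps
// the term text readable while the stream's own attributes move on. A minimal
// sketch of the underlying AttributeSource API:
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeSource;

AttributeSource src = new AttributeSource();
CharTermAttribute term = src.addAttribute(CharTermAttribute.class);
term.append("abc");
AttributeSource.State snapshot = src.captureState();
term.setEmpty().append("xyz");
src.restoreState(snapshot); // term reads "abc" again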
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer,
    String fieldName, String contents) throws IOException {
  try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
    if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
      // Can't split on term boundaries without offsets
      return -1;
    }
    int end = -1;
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
      if (attr.endOffset() >= noMatchSize) {
        // Jump to the end of this token if it wouldn't put us past the boundary
        if (attr.endOffset() == noMatchSize) {
          end = noMatchSize;
        }
        return end;
      }
      end = attr.endOffset();
    }
    tokenStream.end();
    // We've exhausted the token stream so we should just highlight everything.
    return end;
  }
}
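// A usage sketch (the WhitespaceAnalyzer and literal values are assumptions,
// not from the original): with contents "one two three" and noMatchSize 7,
// "one" ends at offset 3 (< 7) and "two" ends exactly at 7, so 7 is returned.
int excerptEnd = findGoodEndForNoHighlightExcerpt(7, new WhitespaceAnalyzer(), "body", "one two three");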
private void setAttribs(PendingToken tok) {
  clearAttributes();
  this.posIncrAtt.setPositionIncrement(tok.nonpos ? 0 : 1);
  this.termAtt.setEmpty();
  this.termAtt.append(tok.str);
  this.offsetAtt.setOffset(tok.start, tok.end);
}
// Pre-4.0 style: reusableTokenStream() was removed in Lucene 4, where
// tokenStream() reuses components internally.
tokReader = new StringReader(field.stringValue());
tokens = analyzer.reusableTokenStream(field.name(), tokReader);
if (position > 0) {
  position += analyzer.getPositionIncrementGap(field.name());
}
tokens.reset(); // reset the TokenStream to the first token
offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);
while (tokens.incrementToken()) {
  position += (posIncrAttribute.getPositionIncrement() - 1);
  offsetVector.add(lastOffset + offsetAttribute.startOffset());
  offsetVector.add(lastOffset + offsetAttribute.endOffset());
}
// TokenStream-to-automaton traversal (elided pieces are marked with "..."):
builder.createState();
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
int maxOffset = 0;
while (in.incrementToken()) {
  int posInc = posIncAtt.getPositionIncrement();
  if (preservePositionIncrements == false && posInc > 1) {
    posInc = 1;
  }
  // ... advance pos by posInc and add transitions for the term's bytes ...
  final int endPos = pos + posLengthAtt.getPositionLength();
  maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
  // ...
}
in.end();
int endPosInc = posIncAtt.getPositionIncrement();
if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
  endPosInc = 1;
} else if (endPosInc > 0 && preservePositionIncrements == false) {
  endPosInc = 0;
}
// Shingle composition (elided pieces are marked with "..."):
gramBuilder.append(tokenSeparator);
gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
++builtGramSize;
// ... once the gram is fully built, populate this stream's attributes:
posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
  typeAtt.setType(tokenType);
  noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
if (outputUnigrams) {
  posLenAtt.setPositionLength(builtGramSize);
} else {
  // This shingle also spans the positions of the smaller shingles it subsumes.
  posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
}
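// Usage sketch: wiring a ShingleFilter over a whitespace tokenizer (the
// min/max shingle sizes 2 and 3 and the sample text are illustrative):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

Tokenizer shingleSource = new WhitespaceTokenizer();
shingleSource.setReader(new StringReader("please divide this sentence"));
TokenStream shingles = new ShingleFilter(shingleSource, 2, 3);
// with unigrams on (the default) this yields: "please", "please divide",
// "please divide this", "divide", "divide this", ...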
@Override
public void copyTo(AttributeImpl target) {
  if (target instanceof PackedTokenAttributeImpl) {
    final PackedTokenAttributeImpl to = (PackedTokenAttributeImpl) target;
    to.copyBuffer(buffer(), 0, length());
    to.positionIncrement = positionIncrement;
    to.positionLength = positionLength;
    to.startOffset = startOffset;
    to.endOffset = endOffset;
    to.type = type;
    to.termFrequency = termFrequency;
  } else {
    super.copyTo(target);
    ((OffsetAttribute) target).setOffset(startOffset, endOffset);
    ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
    ((PositionLengthAttribute) target).setPositionLength(positionLength);
    ((TypeAttribute) target).setType(type);
    ((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
  }
}
// Word-delimiter splitting (this matches Lucene's WordDelimiterGraphFilter;
// elided pieces are marked with "..."):
if (savedState == null) {
  // Process a new input token.
  if (input.incrementToken() == false) {
    return false;
  }
  int termLength = termAttribute.length();
  char[] termBuffer = termAttribute.buffer();
  accumPosInc += posIncAttribute.getPositionIncrement();
  if (/* ... the token has no delimiters, or is protected ... */) {
    // Pass it through unchanged, carrying any accumulated position increment.
    posIncAttribute.setPositionIncrement(accumPosInc);
    accumPosInc = 0;
    return true;
  }
  // ... otherwise buffer the word parts and saveState() ...
}
// Emit the next buffered part; startOffset/endOffset derive from the saved
// offsets, and offsets are never allowed to go backwards:
endOffset = Math.max(endOffset, lastStartOffset);
offsetAttribute.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
if (termPart == null) {
  // A null part means "a slice of the original word": copy from the saved buffer.
  termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
} else {
  termAttribute.copyBuffer(termPart, 0, termPart.length);
}
posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
accumPosInc = 0;
posLenAttribute.setPositionLength(endPos - startPos);
wordPos = startPos;
return true;
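// Usage sketch for WordDelimiterGraphFilter (the flag combination and the
// sample input are illustrative; null means no protected words):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;

Tokenizer wdgfSource = new WhitespaceTokenizer();
wdgfSource.setReader(new StringReader("PowerShot500"));
TokenStream wdgf = new WordDelimiterGraphFilter(
    wdgfSource,
    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
        | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS,
    null);
// consuming wdgf yields "Power", "Shot", "500", with offsets kept
// monotonic exactly as in the emit logic above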
// Shingle filler-token handling (fragments of ShingleFilter.getNextToken;
// elided pieces are marked with "..."):
if (numFillerTokensToInsert > 0) {
  // ... clone or copy the stashed attributes into newTarget, then overwrite:
  // a filler token occupies a position but no text, so it gets a zero-length
  // offset span and the filler term.
  newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(), newTarget.offsetAtt.startOffset());
  newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
  newTarget.isFiller = true;
  --numFillerTokensToInsert;
} else if (input.incrementToken()) {
  this.copyTo(target.attSource);
  if (posIncrAtt.getPositionIncrement() > 1) {
    // Each output shingle must contain at least one input token, so at most
    // (maxShingleSize - 1) filler tokens are inserted for a position gap.
    numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
    newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
    newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
    newTarget.isFiller = true;
    --numFillerTokensToInsert;
  }
} else {
  // The input is exhausted: capture the end state and flush trailing fillers.
  input.end();
  endState = captureState();
  numFillerTokensToInsert = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
  if (numFillerTokensToInsert > 0) {
    nextInputStreamToken = new AttributeSource(getAttributeFactory());
    nextInputStreamToken.addAttribute(CharTermAttribute.class);
    OffsetAttribute newOffsetAtt = nextInputStreamToken.addAttribute(OffsetAttribute.class);
    newOffsetAtt.setOffset(offsetAtt.endOffset(), offsetAtt.endOffset());
  }
}
@Override
protected AToken computeNext() {
  try {
    if (!tkstream.incrementToken()) {
      tkstream.end();
      tkstream.close();
      return endOfData();
    }
    currentPosition += posIncrAttribute.getPositionIncrement();
    final int position = currentPosition;
    final int startOffset = offsetAtt.startOffset();
    final int endOffset = offsetAtt.endOffset();
    final String text = termAtt.term(); // pre-4.0 TermAttribute API
    return new AToken() {
      // ... anonymous AToken implementation backed by the captured values
      // (elided in the original) ...
    };
  } catch (IOException e) {
    // The original catch clause is not shown; rethrowing unchecked is one
    // plausible handling.
    throw new RuntimeException(e);
  }
}