skipCounter = 0;
while (input.incrementToken()) {
    final String text = new String(termAtt.buffer(), 0, termAtt.length());
    if (text.isEmpty()) {
        // Empty token: remember its position increment and keep scanning.
        skipCounter += posIncrAttribute.getPositionIncrement();
    } else {
        if (skipCounter != 0) {
            // Credit the increments of the skipped empty tokens to this token.
            posIncrAttribute.setPositionIncrement(
                    posIncrAttribute.getPositionIncrement() + skipCounter);
            skipCounter = 0;
        }
        return true;
    }
}
return false;
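// A self-contained sketch of the filter this loop could live in (class name
// and wiring are assumptions, not from the original source): a TokenFilter
// that drops empty tokens while preserving their position increments.
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public final class SkipEmptyTokensFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAttribute =
            addAttribute(PositionIncrementAttribute.class);

    public SkipEmptyTokensFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        int skipCounter = 0;
        while (input.incrementToken()) {
            if (termAtt.length() == 0) {
                // Empty token: accumulate its increment and keep scanning.
                skipCounter += posIncrAttribute.getPositionIncrement();
            } else {
                if (skipCounter != 0) {
                    posIncrAttribute.setPositionIncrement(
                            posIncrAttribute.getPositionIncrement() + skipCounter);
                }
                return true;
            }
        }
        return false;
    }
}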
private void setAttribs(PendingToken tok) {
    clearAttributes();
    this.posIncrAtt.setPositionIncrement(tok.nonpos ? 0 : 1);
    this.termAtt.setEmpty();
    this.termAtt.append(tok.str);
    this.offsetAtt.setOffset(tok.start, tok.end);
}
if (dot) {
    // Emit the pending '.' as a token of its own.
    dot = false;
    termAtt.setEmpty();
    termAtt.append(cdot);
    termAtt.setLength(1);
    offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + 1));
    startPosition++;
    return true;
}
// Emit the buffered, lower-cased term of length i.
termAtt.append(bufLcase);
termAtt.setLength(i);
offsetAtt.setOffset(correctOffset(startPosition), correctOffset(startPosition + i));
startPosition = startPosition + i + 1;
return true;
@Override
public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        // Lower-case the term in place.
        CharacterUtils.toLowerCase(termAtt.buffer(), 0, termAtt.length());
        return true;
    } else {
        return false;
    }
}
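// The method above is the core of Lucene's LowerCaseFilter. A minimal wiring
// sketch (package locations of these classes vary slightly across Lucene
// major versions):
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream sink = new LowerCaseFilter(source);   // the filter shown above
        return new TokenStreamComponents(source, sink);
    }
};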
// Inside the collection loop: the current term's buffer and length.
final char term[] = termAttribute.buffer();
final int length = termAttribute.length();
// ... collect unique terms ...

// Once the input is exhausted, close it out and emit a single token.
input.end();
inputEnded = true;
offsetAtt.setOffset(0, offsetAtt.endOffset());
posLenAtt.setPositionLength(1);
posIncrAtt.setPositionIncrement(1);
typeAtt.setType("fingerprint");

// Exit branches (conditions reconstructed from context):
if (uniqueTerms.isEmpty()) {
    // No terms were collected: emit nothing.
    termAttribute.setEmpty();
    uniqueTerms.clear();
    return false;
}
if (uniqueTerms.size() == 1) {
    // Exactly one unique term: emit it as-is.
    termAttribute.setEmpty().append(new String(clonedLastTerm));
    uniqueTerms.clear();
    return true;
}
// Otherwise emit the sorted, separator-joined concatenation built in sb.
termAttribute.setEmpty().append(sb);
uniqueTerms.clear();
return true;
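// The branches above correspond to Lucene's FingerprintFilter, which collapses
// the whole stream into one sorted, de-duplicated token. A hedged usage sketch
// (the "a b c" output assumes the default space separator):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

Tokenizer source = new WhitespaceTokenizer();
source.setReader(new StringReader("b c a b"));
try (TokenStream ts = new FingerprintFilter(source)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term);   // expected single token: "a b c"
    }
    ts.end();
}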
private Token getNextInputToken(Token token) throws IOException {
    if (!input.incrementToken()) {
        return null;
    }
    token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
    token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
    token.setFlags(in_flagsAtt.getFlags());
    token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
    token.setType(in_typeAtt.type());
    token.setPayload(in_payloadAtt.getPayload());
    return token;
}
private SToken[] getTokens(String text) throws IOException {
    // FIXME: somehow integrate the loop below into getSummary to save the
    // cloning and memory; also, creating Tokens is suboptimal with 3.0.0 -
    // this whole class could be replaced by the highlighter.
    ArrayList<SToken> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("full", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            SToken t = new SToken(term.buffer(), 0, term.length(),
                    offset.startOffset(), offset.endOffset());
            result.add(t);
        }
        ts.end();
    }
    return result.toArray(new SToken[result.size()]);
}
if (tokenIter == null || !tokenIter.hasNext()) {
    // No sub-tokens pending: pull the next surface token from the input.
    if (input.incrementToken()) {
        tokStart = offsetAtt.startOffset();
        tokEnd = offsetAtt.endOffset();
        // If term length and offsets disagree, a CharFilter shifted the
        // offsets, so sub-token offsets must not be derived from indices.
        hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
        // ... segment the token and initialize tokenIter ...
    } else {
        return false;
    }
}
// Emit the next sub-token: nextWord starts at index idx with POS tag pos.
termAtt.append(nextWord);
posAtt.setPartOfSpeech(pos);
int end = idx + nextWord.length();
if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
} else {
    offsetAtt.setOffset(idx, end);
}
typeAtt.setType("word");
return true;
/**
 * Stems the term of the current token, unless it is marked as a keyword.
 *
 * @return {@code true} if a token was available (its term then holds the
 *         stemmed form), {@code false} at end of stream.
 * @throws IOException If there is a low-level I/O error.
 */
@Override
public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    char[] term = termAttribute.buffer();
    int len = termAttribute.length();
    if ((!keywordAtt.isKeyword()) && stemmer.stem(term, len)) {
        termAttribute.setEmpty().append(stemmer.asCharSequence());
    }
    return true;
}
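// A minimal wiring sketch for the keyword check above (field and set contents
// are assumptions; SetKeywordMarkerFilter and PorterStemFilter are standard
// Lucene classes, though package locations vary between versions): terms in
// the keyword set are flagged via KeywordAttribute, so a keyword-aware stem
// filter like the one above leaves them unstemmed.
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;

Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        CharArraySet keywords = new CharArraySet(Arrays.asList("lucene"), true);
        TokenStream sink = new SetKeywordMarkerFilter(source, keywords);
        sink = new PorterStemFilter(sink);  // "running" -> "run"; "lucene" passes through
        return new TokenStreamComponents(source, sink);
    }
};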
/**
 * Saves the existing attribute states.
 */
private void saveState() {
    // otherwise, we have delimiters, save state
    savedStartOffset = offsetAttribute.startOffset();
    savedEndOffset = offsetAttribute.endOffset();
    // if length by start + end offsets doesn't match the term text,
    // then assume this is a synonym and don't adjust the offsets.
    hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
    savedType = typeAttribute.type();

    if (savedBuffer.length < termAttribute.length()) {
        savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), Character.BYTES)];
    }
    System.arraycopy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
    iterator.text = savedBuffer;
    hasSavedState = true;
}
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
reuse.clear();
while (ts.incrementToken()) {
    int length = termAtt.length();
    if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
    }
    if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token ("
                + termAtt + ") with position increment != 1 (got: "
                + posIncAtt.getPositionIncrement() + ")");
    }
    reuse.grow(reuse.length() + length + 1);   // current + word + separator
    int end = reuse.length();
    if (reuse.length() > 0) {
        // Separator write restored per Lucene's SynonymMap.Parser#analyze,
        // which this snippet matches.
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
    }
    System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
    reuse.setLength(reuse.length() + length);
}
/**
 * Writes the concatenation to the attributes.
 */
void write() {
    clearAttributes();
    if (termAttribute.length() < buffer.length()) {
        termAttribute.resizeBuffer(buffer.length());
    }
    char[] termbuffer = termAttribute.buffer();
    buffer.getChars(0, buffer.length(), termbuffer, 0);
    termAttribute.setLength(buffer.length());

    if (hasIllegalOffsets) {
        offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
    } else {
        offsetAttribute.setOffset(startOffset, endOffset);
    }
    posIncAttribute.setPositionIncrement(position(true));
    typeAttribute.setType(savedType);
    accumPosInc = 0;
}
/**
 * Copies the inner stream's attribute values to the main stream's attributes.
 * This filter uses an inner stream, which therefore needs to be cleared so
 * that other filters see clean attribute data. Because of that, the
 * datatypeURI and node attributes have to be saved first and restored
 * afterwards.
 */
private void copyInnerStreamAttributes() {
    // back up datatype and node path
    final IntsRef nodePath = IntsRef.deepCopyOf(nodeAtt.node());
    final char[] dt = dtypeAtt.datatypeURI();
    // clear attributes
    input.clearAttributes();
    // copy inner attributes
    final int len = tokenTermAtt.length();
    termAtt.copyBuffer(tokenTermAtt.buffer(), 0, len);
    offsetAtt.setOffset(tokenOffsetAtt.startOffset(), tokenOffsetAtt.endOffset());
    posIncrAtt.setPositionIncrement(tokenPosIncrAtt.getPositionIncrement());
    typeAtt.setType(tokenTypeAtt.type());
    // TupleTokenizer handles the setting of tuple/cell values and the datatype URI
    // restore datatype and node
    nodeAtt.copyNode(nodePath);
    dtypeAtt.setDatatypeURI(dt);
}
// Append the next token to the shingle under construction.
gramBuilder.append(tokenSeparator);
gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length());
++builtGramSize;
// ...
// Emit the completed shingle.
posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1);
termAtt.setEmpty().append(gramBuilder);
if (gramSize.getValue() > 1) {
    typeAtt.setType(tokenType);
    noShingleOutput = false;
}
offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset());
if (outputUnigrams) {
    posLenAtt.setPositionLength(builtGramSize);
}
if (!input.incrementToken()) {
    return false;
}
final char[] buffer = termAtt.buffer();
final int bufferLength = termAtt.length();
final String type = typeAtt.type();

if (type == APOSTROPHE_TYPE          // remove 's
        && bufferLength >= 2
        && buffer[bufferLength - 2] == '\''
        && (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S')) {
    // Strip the trailing "'s" off the term.
    termAtt.setLength(bufferLength - 2);
} else if (type == ACRONYM_TYPE) {   // remove dots
    int upto = 0;
    for (int i = 0; i < bufferLength; i++) {
        char c = buffer[i];
        if (c != '.') {
            buffer[upto++] = c;
        }
    }
    termAtt.setLength(upto);
}
return true;
// Consume a new input token.
if (input.incrementToken() == false) {
    return false;
}
if (has(IGNORE_KEYWORDS) && keywordAttribute.isKeyword()) {
    // Keywords pass through untouched.
    return true;
}
int termLength = termAttribute.length();
char[] termBuffer = termAttribute.buffer();
accumPosInc += posIncAttribute.getPositionIncrement();

// ... if the token yields no sub-parts, emit it as-is, flushing the
// accumulated position increment:
posIncAttribute.setPositionIncrement(accumPosInc);
accumPosInc = 0;
return true;

// ... when emitting a buffered sub-part, never let offsets go backwards:
endOffset = Math.max(endOffset, lastStartOffset);
offsetAttribute.setOffset(startOffset, endOffset);
lastStartOffset = startOffset;
if (termPart == null) {
    termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart);
} else {
    termAttribute.copyBuffer(termPart, 0, termPart.length);
}
posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos);
accumPosInc = 0;
posLenAttribute.setPositionLength(endPos - startPos);
if (numFillerTokensToInsert > 0) {
    // Insert a filler token for a position gap; it occupies no space.
    newTarget.offsetAtt.setOffset(newTarget.offsetAtt.startOffset(),
            newTarget.offsetAtt.startOffset());
    newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
    newTarget.isFiller = true;
    --numFillerTokensToInsert;
} else if (isNextInputStreamToken) {
    // ... consume the previously saved input stream token ...
    newTarget.isFiller = false;
} else if (!exhausted) {
    if (input.incrementToken()) {
        if (null == target) {
            newTarget = new InputWindowToken(cloneAttributes());
        } else {
            this.copyTo(target.attSource);
        }
        if (posIncrAtt.getPositionIncrement() > 1) {
            // Each output shingle must contain at least one input token,
            // so no more than (maxShingleSize - 1) filler tokens are inserted.
            numFillerTokensToInsert
                    = Math.min(posIncrAtt.getPositionIncrement() - 1, maxShingleSize - 1);
            // ... save the current token as the next input stream token ...
            // A filler token occupies no space.
            newTarget.offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
            newTarget.termAtt.copyBuffer(fillerToken, 0, fillerToken.length);
            newTarget.isFiller = true;
            --numFillerTokensToInsert;
        } else {
            newTarget.isFiller = false;
        }
    } else {
        // The input is exhausted; trailing filler tokens may still be needed.
        exhausted = true;
        input.end();
        endState = captureState();
        numFillerTokensToInsert
                = Math.min(posIncrAtt.getPositionIncrement(), maxShingleSize - 1);
        if (numFillerTokensToInsert > 0) {
            nextInputStreamToken = new AttributeSource(getAttributeFactory());
            // ...
        }
    }
}
@Override
public boolean incrementToken() throws IOException {
    if (savedToken != null) {
        // Emit the last token's type at the same position.
        restoreState(savedToken);
        savedToken = null;
        termAtt.setEmpty();
        if (prefix != null) {
            termAtt.append(prefix);
        }
        termAtt.append(typeAtt.type());
        posIncrAtt.setPositionIncrement(0);
        return true;
    } else if (input.incrementToken()) {
        // No pending token type to emit: save this token's state for the next call.
        savedToken = captureState();
        return true;
    }
    return false;
}
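// This is the pattern used by Lucene's TypeAsSynonymFilter. A hedged usage
// sketch (the expected token types are assumptions based on
// StandardTokenizer's defaults):
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.TypeAsSynonymFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

Tokenizer source = new StandardTokenizer();
source.setReader(new StringReader("test 1234"));
try (TokenStream ts = new TypeAsSynonymFilter(source)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
    }
    ts.end();
}
// Expected: test (posInc=1), <ALPHANUM> (posInc=0), 1234 (posInc=1), <NUM> (posInc=0)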
if (addSingleTerm) {
    // Emit the previously buffered word on its own.
    addSingleTerm = false;
    clearAttributes();
    termAtt.append(previousWord);
    return true;
} else if (input.incrementToken()) {
    final String word = new String(termAtt.buffer(), 0, termAtt.length());
    if (word.isEmpty()) {
        // Pass empty tokens through untouched.
        return true;
    }
    if (previousWord == null) {
        // First word: emit it alone (branch condition reconstructed from context).
        clearAttributes();
        termAtt.append(word);
        previousWord = word;
        addSingleTerm = false;
    } else {
        // Emit the concatenation of the previous and current word.
        clearAttributes();
        termAtt.append(previousWord).append(word);
        previousWord = word;
        addSingleTerm = true;
    }
    return true;
}
return false;