// NOTE(review): truncated fragment — looks like a field-inversion loop that consumes a
// TokenStream, accumulates position/offset state, and validates that offsets never go
// backwards; the IllegalArgumentException message and several closing braces are cut off
// mid-expression. TODO: restore the missing tail from the upstream source before use.
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) { stream.reset(); invertState.setAttributeSource(stream); termsHashPerField.start(field, first); while (stream.incrementToken()) { int posIncr = invertState.posIncrAttribute.getPositionIncrement(); invertState.position += posIncr; if (invertState.position < invertState.lastPosition) { int startOffset = invertState.offset + invertState.offsetAttribute.startOffset(); int endOffset = invertState.offset + invertState.offsetAttribute.endOffset(); if (startOffset < invertState.lastStartOffset || endOffset < startOffset) { throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, and offsets must not go backwards " stream.end(); invertState.position += invertState.posIncrAttribute.getPositionIncrement(); invertState.offset += invertState.offsetAttribute.endOffset(); invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name); invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
// NOTE(review): truncated fragment of a token filter's incrementToken() body — it appears
// to drop empty-text tokens while accumulating their position increments in skipCounter,
// then fold the accumulated skips into the next emitted token's increment. The `return true`
// before `skipCounter += ...` makes the accumulation unreachable as written — presumably a
// collapse/paste artifact; closing braces are also missing. TODO: restore from the original file.
String[] parts; skipCounter = 0; while (input.incrementToken()) { final String text = new String(termAtt.buffer(), 0, termAtt.length()); if (text.isEmpty()) { return true; skipCounter += posIncrAttribute.getPositionIncrement(); } else { if (skipCounter != 0) { posIncrAttribute.setPositionIncrement(posIncrAttribute.getPositionIncrement() + skipCounter);
/** {@inheritDoc} */
@Override
public void end() throws IOException {
    super.end();
    // Fold the position increments of tokens skipped at the tail of the
    // stream into the final increment, so consumers see the true gap.
    final int finalIncrement = posIncrAttribute.getPositionIncrement() + skipCounter;
    posIncrAttribute.setPositionIncrement(finalIncrement);
}
@Override public final void end() throws IOException { super.end(); // set final offset int finalOffset = correctOffset(this.endPosition); offsetAtt.setOffset(finalOffset, finalOffset); posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); } }
@Override public final boolean incrementToken() throws IOException { skippedPositions = 0; while (input.incrementToken()) { if (accept()) { if (skippedPositions != 0) { posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); } return true; } skippedPositions += posIncrAtt.getPositionIncrement(); } // reached EOS -- return false return false; }
// NOTE(review): truncated fragment — appears to be a fingerprint-style filter that collects
// unique terms (CharArraySet), stops accumulating past maxOutputTokenSize, then on end of
// input emits a single synthetic "fingerprint" token with increment/length 1. Braces for the
// inner branches are missing and the flow between the collection loop and the final emit is
// cut off. TODO: restore the missing structure from the original file.
uniqueTerms = new CharArraySet(8, false); int outputTokenSize = 0; while (input.incrementToken()) { if (outputTokenSize > maxOutputTokenSize) { continue; final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); input.end(); inputEnded = true; offsetAtt.setOffset(0, offsetAtt.endOffset()); posLenAtt.setPositionLength(1); posIncrAtt.setPositionIncrement(1); typeAtt.setType("fingerprint"); termAttribute.setEmpty(); return false;
/**
 * Advances the suffix stream one token and copies all of its attribute
 * values (term, increment, flags, offsets, type, payload) into the
 * supplied reusable token.
 *
 * @param token reusable token to fill
 * @return the filled token, or {@code null} when the suffix stream is exhausted
 * @throws IOException if reading the suffix stream fails
 */
private Token getNextSuffixInputToken(Token token) throws IOException {
    if (suffix.incrementToken() == false) {
        return null;
    }
    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    token.setPositionIncrement(posIncrAtt.getPositionIncrement());
    token.setFlags(flagsAtt.getFlags());
    token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
    token.setType(typeAtt.type());
    token.setPayload(payloadAtt.getPayload());
    return token;
}
// NOTE(review): truncated fragment — analyzes a term string and groups the normalized
// BytesRef terms by position (a positive increment starts a new position bucket appended
// to tlist; increment 0 stacks synonyms into the current bucket). The loop body's closing
// braces and the final flush of currentPos into tlist are cut off. TODO: restore the tail.
try (TokenStream source = analyzer.tokenStream(field, termStr)) { source.reset(); List<BytesRef> currentPos = new ArrayList<>(); CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posAtt = source.addAttribute(PositionIncrementAttribute.class); boolean hasMoreTokens = source.incrementToken(); while (hasMoreTokens) { if (currentPos.isEmpty() == false && posAtt.getPositionIncrement() > 0) { tlist.add(currentPos); currentPos = new ArrayList<>(); final BytesRef term = analyzer.normalize(field, termAtt.toString()); currentPos.add(term); hasMoreTokens = source.incrementToken();
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException { Collection<Token> result = new ArrayList<Token>(); assert analyzer != null; TokenStream ts = analyzer.tokenStream("", q); try { ts.reset(); // TODO: support custom attributes CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class); TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class); FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class); PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); while (ts.incrementToken()){ Token token = new Token(); token.copyBuffer(termAtt.buffer(), 0, termAtt.length()); token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset()); token.setType(typeAtt.type()); token.setFlags(flagsAtt.getFlags()); token.setPayload(payloadAtt.getPayload()); token.setPositionIncrement(posIncAtt.getPositionIncrement()); result.add(token); } ts.end(); return result; } finally { IOUtils.closeWhileHandlingException(ts); } }
/**
 * Builds a boolean query from the cached token stream: tokens sharing a
 * position (increment 0, e.g. synonyms) are grouped into one clause via
 * {@code add(...)}, and each new position starts a new group.
 *
 * @param field    field name for the generated terms
 * @param stream   token stream to consume (already cached/filled)
 * @param operator occur to join the per-position groups with
 * @return the assembled boolean query
 * @throws IOException if the stream fails
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator)
        throws IOException {
    final BooleanQuery.Builder builder = newBooleanQuery();
    final List<Term> samePosition = new ArrayList<>();
    final TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        // A non-zero increment begins a new position: flush the synonyms gathered so far.
        if (posIncrAtt.getPositionIncrement() != 0) {
            add(builder, samePosition, operator);
            samePosition.clear();
        }
        samePosition.add(new Term(field, termAtt.getBytesRef()));
    }
    // Flush the final position group.
    add(builder, samePosition, operator);
    return builder.build();
}
// NOTE(review): truncated fragment — walks a TokenStream and appears to build automaton
// states from token positions/lengths (builder.createState()), rejecting streams whose
// first token has an increment below 1. The branch braces and the code between `pos`
// bookkeeping and state creation are cut off; `pos` itself is declared outside this
// fragment. TODO: restore the missing body from the original file.
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); in.reset(); int state = -1; while (in.incrementToken()) { int currentIncr = posIncAtt.getPositionIncrement(); if (pos == -1 && currentIncr < 1) { throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1"); int endPos = pos + posLengthAtt.getPositionLength(); while (state < endPos) { state = builder.createState();
/** * Copy the inner's stream attributes values to the main stream's ones. This filter * uses an inner stream, therefore it needs to be cleared so that other filters * have clean attributes data. Because of that, the attributes datatypeURI and * node have to saved in order to be restored after. */ private void copyInnerStreamAttributes() { // backup datatype and node path final IntsRef nodePath = IntsRef.deepCopyOf(nodeAtt.node()); final char[] dt = dtypeAtt.datatypeURI(); // clear attributes input.clearAttributes(); // copy inner attributes final int len = tokenTermAtt.length(); termAtt.copyBuffer(tokenTermAtt.buffer(), 0, len); offsetAtt.setOffset(tokenOffsetAtt.startOffset(), tokenOffsetAtt.endOffset()); posIncrAtt.setPositionIncrement(tokenPosIncrAtt.getPositionIncrement()); typeAtt.setType(tokenTypeAtt.type()); // TupleTokenizer handles the setting of tuple/cell values and the datatype URI // restore datatype and node nodeAtt.copyNode(nodePath); dtypeAtt.setDatatypeURI(dt); }
// NOTE(review): truncated fragment — analyzes `text` and appears to concatenate the
// resulting tokens into `reuse` (a char builder), rejecting zero-length tokens and any
// token whose position increment differs from 1. The branch braces are missing and the
// variable `end` used by the arraycopy is defined outside this fragment; the separator
// written by the first setLength(+1) is also cut off. TODO: restore from the original file.
try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); reuse.clear(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); if (posIncAtt.getPositionIncrement() != 1) { throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt + ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")"); reuse.setLength(reuse.length() + 1); System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); reuse.setLength(reuse.length() + length);
private ArrayList<Data> analyze(Analyzer analyzer1) throws IOException { ArrayList<Data> results = new ArrayList<>(50); TokenStream ts = analyzer1.tokenStream("foo", text); ts.reset(); while (ts.incrementToken()) { Data data = new Data(); OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class); data.startOffset = offsetAttribute.startOffset(); data.endOffset = offsetAttribute.endOffset(); data.positionLength = ts.getAttribute(PositionLengthAttribute.class).getPositionLength(); data.positionIncGap = ts.getAttribute(PositionIncrementAttribute.class).getPositionIncrement(); data.tokenType = ts.getAttribute(HebrewTokenTypeAttribute.class).getType().toString(); data.term = ts.getAttribute(CharTermAttribute.class).toString(); if (ts.getAttribute(KeywordAttribute.class) != null) data.isKeyword = ts.getAttribute(KeywordAttribute.class).isKeyword(); // System.out.println(data.term + " " + data.tokenType); results.add(data); } ts.close(); return results; } }
/**
 * Tokenizes {@code fieldValue} with the given analyzer and produces one
 * {@link Span} per token, carrying both the analyzed term and the original
 * text slice it was derived from, plus the token's absolute position.
 *
 * @param attributeName  attribute name stored on every produced span
 * @param fieldValue     raw field text to tokenize
 * @param luceneAnalyzer analyzer used for tokenization
 * @return spans in token order
 * @throws DataflowException if tokenization fails
 */
public static List<Span> generatePayload(String attributeName, String fieldValue, Analyzer luceneAnalyzer) {
    List<Span> payload = new ArrayList<>();
    // try-with-resources: the original only closed the stream on the success path,
    // leaking it whenever reset()/incrementToken() threw.
    try (TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(fieldValue))) {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute positionIncrementAttribute =
                tokenStream.addAttribute(PositionIncrementAttribute.class);
        int tokenPositionCounter = -1; // becomes 0 on the first token's increment
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokenPositionCounter += positionIncrementAttribute.getPositionIncrement();
            int charStart = offsetAttribute.startOffset();
            int charEnd = offsetAttribute.endOffset();
            String analyzedTermStr = charTermAttribute.toString();
            String originalTermStr = fieldValue.substring(charStart, charEnd);
            payload.add(new Span(attributeName, charStart, charEnd, analyzedTermStr,
                    originalTermStr, tokenPositionCounter));
        }
        tokenStream.end(); // TokenStream contract: end() must be called before close()
    } catch (IOException e) {
        throw new DataflowException(e);
    }
    return payload;
}
// NOTE(review): truncated fragment — legacy (pre-Lucene-4 `reusableTokenStream`) code that
// tokenizes a stored field value and accumulates start/end offsets into offsetVector,
// applying the analyzer's position-increment gap between field instances. The line contains
// a stray `// reset the TokenStream...` comment that, in this collapsed form, comments out
// the rest of the statement, and `.addAttribute(PositionIncrementAttribute.class)` has lost
// its receiver/assignment. TODO: restore the original multi-line form before use.
tokReader = new StringReader(field.stringValue()); tokens = analyzer.reusableTokenStream(field.name(), tokReader); if (position > 0) position += analyzer.getPositionIncrementGap(field.name()); tokens.reset(); // reset the TokenStream to the first token offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class); .addAttribute(PositionIncrementAttribute.class); position += (posIncrAttribute.getPositionIncrement() - 1); offsetVector.add(lastOffset + offsetAttribute.startOffset()); offsetVector.add(lastOffset + offsetAttribute.endOffset());
@Override
public void copyTo(AttributeImpl target) {
    if (target instanceof PackedTokenAttributeImpl) {
        // Fast path: sibling packed implementation — copy every field directly.
        final PackedTokenAttributeImpl dest = (PackedTokenAttributeImpl) target;
        dest.copyBuffer(buffer(), 0, length());
        dest.positionIncrement = positionIncrement;
        dest.positionLength = positionLength;
        dest.startOffset = startOffset;
        dest.endOffset = endOffset;
        dest.type = type;
        dest.termFrequency = termFrequency;
        return;
    }
    // Slow path: push each value through the individual attribute interfaces.
    super.copyTo(target);
    ((OffsetAttribute) target).setOffset(startOffset, endOffset);
    ((PositionIncrementAttribute) target).setPositionIncrement(positionIncrement);
    ((PositionLengthAttribute) target).setPositionLength(positionLength);
    ((TypeAttribute) target).setType(type);
    ((TermFrequencyAttribute) target).setTermFrequency(termFrequency);
}
// NOTE(review): truncated fragment — shingle-style gram assembly: appends the separator and
// next token's term to gramBuilder, sets increment 0/1 depending on whether output occurs at
// this position, stretches the offset to the last token in the gram, and computes position
// length from the gram size. Braces for the `gramSize.getValue() > 1` branch and the if/else
// tail are cut off. TODO: restore the missing structure from the original file.
gramBuilder.append(tokenSeparator); gramBuilder.append(nextToken.termAtt.buffer(), 0, nextToken.termAtt.length()); ++builtGramSize; posIncrAtt.setPositionIncrement(isOutputHere ? 0 : 1); termAtt.setEmpty().append(gramBuilder); if (gramSize.getValue() > 1) { typeAtt.setType(tokenType); noShingleOutput = false; offsetAtt.setOffset(offsetAtt.startOffset(), nextToken.offsetAtt.endOffset()); if (outputUnigrams) { posLenAtt.setPositionLength(builtGramSize); } else { posLenAtt.setPositionLength(Math.max(1, (builtGramSize - minShingleSize) + 1));
/**
 * Loads a pending token's values into this stream's attributes.
 * A token flagged {@code nonpos} stacks on the previous position
 * (increment 0); all others advance by one.
 */
private void setAttribs(PendingToken tok) {
    clearAttributes();
    final int increment = tok.nonpos ? 0 : 1;
    this.posIncrAtt.setPositionIncrement(increment);
    this.termAtt.setEmpty().append(tok.str);
    this.offsetAtt.setOffset(tok.start, tok.end);
}
// NOTE(review): truncated fragment — word-delimiter-style token emission: it buffers a term,
// accumulates position increments, and emits either a saved sub-part (copyBuffer from
// savedTermBuffer with start/end part bounds) or a concatenated part, with offset clamping
// (endOffset >= lastStartOffset) and position-length bookkeeping. Multiple unreachable
// `return` statements and missing braces indicate several branches were collapsed together.
// TODO: restore the original control flow before use.
if (input.incrementToken() == false) { return false; return true; int termLength = termAttribute.length(); char[] termBuffer = termAttribute.buffer(); accumPosInc += posIncAttribute.getPositionIncrement(); posIncAttribute.setPositionIncrement(accumPosInc); accumPosInc = 0; return true; endOffset = Math.max(endOffset, lastStartOffset); offsetAttribute.setOffset(startOffset, endOffset); lastStartOffset = startOffset; termAttribute.copyBuffer(savedTermBuffer, startPart, endPart - startPart); } else { termAttribute.copyBuffer(termPart, 0, termPart.length); posIncAttribute.setPositionIncrement(accumPosInc + startPos - wordPos); accumPosInc = 0; posLenAttribute.setPositionLength(endPos - startPos); wordPos = startPos; return true;