/** Returns the token's start offset, or -1 when no offset information is attached. */
@Override
public int getStartOffset() {
    return (this.word instanceof OffsetAttribute)
            ? ((OffsetAttribute) this.word).startOffset()
            : -1;
}
/** Returns the token's start offset, or -1 when no offset information is attached. */
@Override
public int getStartOffset() {
    final Object token = this.word;
    if (!(token instanceof OffsetAttribute)) {
        return -1;
    }
    return ((OffsetAttribute) token).startOffset();
}
// Tokenize the field's content, reading each term's text and character offsets.
// NOTE: TermAttribute is the pre-4.0 Lucene term API; CharTermAttribute replaces it.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
// addAttribute() registers the attribute when absent, whereas getAttribute()
// throws IllegalArgumentException if the stream does not already carry it.
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
// The TokenStream contract requires reset() before the first incrementToken().
tokenStream.reset();
while (tokenStream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = termAttribute.term();
}
tokenStream.end();   // records the offset state past the final token
tokenStream.close(); // releases the analyzer's per-stream resources
// Tokenize the field's content, reading each term's text and character offsets.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// The TokenStream contract is reset() -> incrementToken()* -> end() -> close();
// skipping end()/close() leaks the stream and loses the final-offset state.
tokenStream.reset();
while (tokenStream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = charTermAttribute.toString();
}
tokenStream.end();
tokenStream.close();
// Appends this term occurrence's character offsets to the postings byte stream,
// delta-encoded against the previous occurrence of the same term.
void writeOffsets(int termID, int offsetAccum) {
    // offsetAccum rebases token-local offsets into field-absolute coordinates
    // (accumulated across earlier values of a multi-valued field — TODO confirm).
    final int startOffset = offsetAccum + offsetAttribute.startOffset();
    final int endOffset = offsetAccum + offsetAttribute.endOffset();
    // Start offsets must never move backwards within one term's postings.
    assert startOffset - freqProxPostingsArray.lastOffsets[termID] >= 0;
    // Stream 1 carries offsets: delta from the previous start, then the length.
    writeVInt(1, startOffset - freqProxPostingsArray.lastOffsets[termID]);
    writeVInt(1, endOffset - startOffset);
    // The next delta for this term is taken against this start offset.
    freqProxPostingsArray.lastOffsets[termID] = startOffset;
}
private SToken[] getTokens(String text) throws IOException { //FIXME somehow integrate below cycle to getSummary to save the cloning and memory, //also creating Tokens is suboptimal with 3.0.0 , this whole class could be replaced by highlighter ArrayList<SToken> result = new ArrayList<>(); try (TokenStream ts = analyzer.tokenStream("full", text)) { CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); ts.reset(); while (ts.incrementToken()) { SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset()); result.add(t); } ts.end(); } return result.toArray(new SToken[result.size()]); }
// Capture the current token's character offsets as reported by the stream.
tokStart = offsetAtt.startOffset();
tokEnd = offsetAtt.endOffset();
tokens.add(termValue);
// Slice the original input at the token's reported offsets; the emitted term
// is expected to equal exactly the substring it was cut from.
String cutValue = input.substring(offs.startOffset(), offs.endOffset());
// 1-based token number in the failure message
assertEquals("cut term" + (1 + count), cutValue, termValue);
/**
 * Helper method for {@link #testOffsetAttribute()} that runs the test on
 * one single implementation class with the specified input text and
 * expected tokens.
 *
 * <p>Expected offsets are located with {@link String#indexOf(String, int)},
 * resuming just past the previous match so that a token that occurs more
 * than once in {@code inputText} is checked against the correct occurrence
 * rather than always the first one.
 */
private void testOffsetAttribute(Class<? extends JFlexSymbolMatcher> klass,
        String inputText, String[] expectedTokens) throws Exception {
    JFlexSymbolMatcher matcher = klass.getConstructor(Reader.class)
            .newInstance(new StringReader(inputText));
    JFlexTokenizer tokenizer = new JFlexTokenizer(matcher);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
    int count = 0;
    int searchFrom = 0; // resume here so repeated tokens match in document order
    while (tokenizer.incrementToken()) {
        assertTrue("too many tokens", count < expectedTokens.length);
        String expected = expectedTokens[count];
        int at = inputText.indexOf(expected, searchFrom);
        assertEquals("term" + count, expected, term.toString());
        assertEquals("start" + count, at, offset.startOffset());
        assertEquals("end" + count, at + expected.length(), offset.endOffset());
        // advance only past the match START so overlapping symbols still resolve
        searchFrom = at + 1;
        count++;
    }
    assertEquals("wrong number of tokens", expectedTokens.length, count);
}
/** * Helper method for {@link #testOffsetAttribute()} that runs the test on * one single implementation class with the specified input text and * expected tokens. */ private void testOffsetAttribute(Class<? extends JFlexSymbolMatcher> klass, String inputText, String[] expectedTokens) throws Exception { JFlexSymbolMatcher matcher = klass.getConstructor(Reader.class). newInstance(new StringReader(inputText)); JFlexTokenizer tokenizer = new JFlexTokenizer(matcher); CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class); OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class); int count = 0; while (tokenizer.incrementToken()) { assertTrue("too many tokens", count < expectedTokens.length); String expected = expectedTokens[count]; // 0-based offset to accord with String[] assertEquals("term" + count, expected, term.toString()); assertEquals("start" + count, inputText.indexOf(expected), offset.startOffset()); assertEquals("end" + count, inputText.indexOf(expected) + expected.length(), offset.endOffset()); count++; } assertEquals("wrong number of tokens", expectedTokens.length, count); }
// Record this token's offsets shifted by lastOffset, which rebases the
// token-local offsets into the coordinates of the concatenated field value.
offsetVector.add(lastOffset + offsetAttribute.startOffset());
offsetVector.add(lastOffset + offsetAttribute.endOffset());
// Appends this term occurrence's offsets and/or positions (with optional
// payload) to the term-vector postings streams for termID.
void writeProx(TermVectorsPostingsArray postings, int termID) {
    if (doVectorOffsets) {
        // fieldState.offset rebases token-local offsets into field coordinates.
        int startOffset = fieldState.offset + offsetAttribute.startOffset();
        int endOffset = fieldState.offset + offsetAttribute.endOffset();
        // Stream 1 carries offsets: delta from the last stored offset, then the length.
        writeVInt(1, startOffset - postings.lastOffsets[termID]);
        writeVInt(1, endOffset - startOffset);
        // NOTE: here the END offset becomes the base for the next delta
        // (unlike the freq/prox writer, which bases deltas on the start offset).
        postings.lastOffsets[termID] = endOffset;
    }
    if (doVectorPositions) {
        final BytesRef payload;
        if (payloadAttribute == null) {
            payload = null;
        } else {
            payload = payloadAttribute.getPayload();
        }
        // Position delta from the previous occurrence of this term.
        final int pos = fieldState.position - postings.lastPositions[termID];
        if (payload != null && payload.length > 0) {
            // Low bit set signals that a payload length + bytes follow.
            writeVInt(0, (pos<<1)|1);
            writeVInt(0, payload.length);
            writeBytes(0, payload.bytes, payload.offset, payload.length);
            hasPayloads = true;
        } else {
            writeVInt(0, pos<<1);
        }
        postings.lastPositions[termID] = fieldState.position;
    }
}
// NOTE(review): fragment — the final assertEquals continues past this excerpt.
// Expected offsets are derived via indexOf, which assumes the FIRST occurrence
// of each expected token in inputText is the one being matched — TODO confirm
// for inputs containing repeated tokens.
assertEquals("term", expected, term.toString());
assertEquals("start", inputText.indexOf(expected), offset.startOffset());
assertEquals("end", inputText.indexOf(expected) + expected.length(),
// NOTE(review): fragment — the assertion call that consumes the message
// expression below begins before this excerpt.
String termValue = term.toString();
// Substring of the source at the token's reported character offsets.
String cutValue = source.substring(offs.startOffset(), offs.endOffset());
(offs.startOffset()) + "-" + (offs.endOffset()) + "[" + cutValue + "] vs [" + termValue + "]", cutContainsTerm);
// Rebase token-local offsets by the field's accumulated offset (multi-valued fields).
int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
// Validity check: start offsets must not go backwards and a token's length must
// be non-negative; the violation branch continues past this excerpt.
if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
/**
 * Snapshots the current token so it can later form the left part of a gram:
 * copies the term text (plus separator) into {@code buffer} and records the
 * token's start offset and whether it is a common word.
 */
private void saveTermBuffer() {
    buffer.setLength(0);
    buffer.append(termAttribute.buffer(), 0, termAttribute.length()).append(SEPARATOR);
    lastStartOffset = offsetAttribute.startOffset();
    lastWasCommon = isCommon();
}
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */ public CompoundToken(int offset, int length) { this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length); // offsets of the original word this.startOffset = CompoundWordTokenFilterBase.this.offsetAtt.startOffset(); this.endOffset = CompoundWordTokenFilterBase.this.offsetAtt.endOffset(); }
// NOTE(review): fragment — this is the tail of an anonymous-class argument;
// the enclosing call begins before this excerpt.
@Override public void nextToken() {
    // Deep-copy the bytes: the builder's buffer is reused between tokens.
    Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));
    result.add(new Token(term, offsetAttr.startOffset(), offsetAttr.endOffset()));
} }, spare);
private void finishInnerStream() throws IOException { input.end(); inputStreamExhausted = true; // check for gaps at the end of the tokenstream endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement()); OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class); endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset()); }
/**
 * Captures the current token's full attribute state, along with its start
 * offset and position increment, into the look-behind buffer, growing the
 * backing arrays when they are full.
 */
private void buffer() {
    if (bufferedLen == buffered.length) {
        final int grown = ArrayUtil.oversize(bufferedLen + 1, 8);
        buffered = ArrayUtil.growExact(buffered, grown);
        startOff = ArrayUtil.growExact(startOff, grown);
        posInc = ArrayUtil.growExact(posInc, grown);
    }
    startOff[bufferedLen] = offsetAttribute.startOffset();
    posInc[bufferedLen] = posIncAttribute.getPositionIncrement();
    buffered[bufferedLen] = captureState();
    bufferedLen++;
}