void setTerm(String term, int startOffset, int endOffset, int position) { if (config.lowercase) { term = term.toLowerCase(); } if (config.trimWhitespace) { term = term.trim(); } //ignore empty term if(term.length()==0){ return; } termAtt.setEmpty(); termAtt.append(term); if (startOffset < 0) { startOffset = 0; } if (endOffset < startOffset) { endOffset = startOffset + term.length(); } int offset = position - lastIncrementPosition; if (offset < 0) { offset = 0; } positionAttr.setPositionIncrement(offset); lastIncrementPosition = position; }
// NOTE(review): fragment — the enclosing if/else and method braces are outside
// this view. The else branch emits the buffered sentence as a single token:
// term text comes from `buffer`, offsets are tokenStart/tokenEnd mapped through
// correctOffset(), and the token type is set to "sentence".
return false; else { termAtt.setEmpty().append(buffer); offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd)); typeAtt.setType("sentence");
/**
 * Resets all attributes and then populates them from the given pending token.
 *
 * @param tok the pending token whose text, offsets, and position flag to apply
 */
private void setAttribs(PendingToken tok) {
    clearAttributes();
    // A "nonpos" token is stacked at the previous token's position (increment 0).
    posIncrAtt.setPositionIncrement(tok.nonpos ? 0 : 1);
    termAtt.setEmpty().append(tok.str);
    offsetAtt.setOffset(tok.start, tok.end);
}
void setTerm(String term, int startOffset, int endOffset, int position) { if (config.lowercase) { term = term.toLowerCase(); } if (config.trimWhitespace) { term = term.trim(); } //ignore empty term if(term.length()==0){ return; } termAtt.setEmpty(); termAtt.append(term); if (startOffset < 0) { startOffset = 0; } if (endOffset < startOffset) { endOffset = startOffset + term.length(); } if(!config.ignorePinyinOffset){ offsetAtt.setOffset(correctOffset(startOffset), correctOffset(endOffset)); } int offset = position - lastIncrementPosition; if (offset < 0) { offset = 0; } positionAttr.setPositionIncrement(offset); lastIncrementPosition = position; }
/** * Clears, and then resets the instances attributes per the specified * arguments. * @param str the matched symbol * @param start the match start position * @param end the match end position */ protected void setAttribs(String str, int start, int end) { clearAttributes(); //FIXME increasing below by one(default) might be tricky, need more analysis // after lucene upgrade to 3.5 below is most probably not even needed this.posIncrAtt.setPositionIncrement(1); this.termAtt.setEmpty(); this.termAtt.append(str); this.offsetAtt.setOffset(start, end); } }
// NOTE(review): fragment — the enclosing method and the rest of this if-body
// are outside this view. When a pending dot was flagged, the flag is cleared
// and the single character `cdot` is emitted as the term text (length forced
// to 1).
if (dot) { dot = false; termAtt.setEmpty(); termAtt.append(cdot); termAtt.setLength(1);
@Override public boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; m.reset(); if (m.find()) { // replaceAll/replaceFirst will reset() this previous find. String transformed = all ? m.replaceAll(replacement) : m.replaceFirst(replacement); termAtt.setEmpty().append(transformed); } return true; }
/** * Creates a FixedShingleFilter over an input token stream * * @param input the input tokenstream * @param shingleSize the shingle size * @param tokenSeparator a String to use as a token separator * @param fillerToken a String to use to represent gaps in the input stream (due to eg stopwords) */ public FixedShingleFilter(TokenStream input, int shingleSize, String tokenSeparator, String fillerToken) { super(input); if (shingleSize <= 1 || shingleSize > MAX_SHINGLE_SIZE) { throw new IllegalArgumentException("Shingle size must be between 2 and " + MAX_SHINGLE_SIZE + ", got " + shingleSize); } this.shingleSize = shingleSize; this.tokenSeparator = tokenSeparator; this.gapToken.termAtt.setEmpty().append(fillerToken); this.currentShingleTokens = new Token[shingleSize]; }
@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { final String term = termAtt.toString(); // Check the exclusion table. if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) { final String s = stemmer.stem(term); // If not stemmed, don't waste the time adjusting the token. if ((s != null) && !s.equals(term)) termAtt.setEmpty().append(s); } return true; } else { return false; } } }
/** * @return Returns true for next token in the stream, or false at EOS */ @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { String term = termAtt.toString(); if (!keywordAttr.isKeyword()) { String s = stemmer.stem(term); // If not stemmed, don't waste the time adjusting the token. if ((s != null) && !s.equals(term)) termAtt.setEmpty().append(s); } return true; } else { return false; } }
// NOTE(review): fragment — the enclosing conditional and method are outside
// this view. The first branch copies the [start, end) slice of termBuffer into
// the term attribute; the else branch clears the attribute entirely.
termAtt.copyBuffer(termBuffer, start, (end - start)); } else { termAtt.setEmpty();
/**
 * Emits any queued lemmas as zero-increment tokens stacked on the current
 * token; once the queue is empty, advances the input and queues new lemmas.
 *
 * @return true if a token was produced, false at end of stream
 * @throws IOException if the underlying stream fails
 */
@Override
public final boolean incrementToken() throws IOException {
    if (!lemmas.isEmpty()) {
        String lemma = lemmas.remove();
        // Restore the captured state so the lemma inherits the source token's
        // other attributes (offsets, type, ...).
        restoreState(current);
        termAttribute.setEmpty().append(lemma);
        // Stack the lemma at the same position as the originating token.
        posIncrAtt.setPositionIncrement(0);
        return true;
    }
    if (!input.incrementToken()) {
        return false;
    }
    // Capture state only when addLemmas() actually queued lemmas to emit.
    if (addLemmas()) {
        current = captureState();
    }
    return true;
}
// NOTE(review): fragment — pieces of several branches of a pattern-based
// tokenizer appear concatenated here with their enclosing control flow
// (loop/braces) outside this view; the `return true;` followed by further
// statements only makes sense with that missing structure. The pattern in each
// piece: emit str[index, bound) as the term and set corrected offsets for the
// same span — first for a capture group, then for text before a match, then
// for the trailing text after the last match.
final int endIndex = matcher.end(group); if (index == endIndex) continue; termAtt.setEmpty().append(str, index, endIndex); offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex)); return true; if (matcher.start() - index > 0) { termAtt.setEmpty().append(str, index, matcher.start()); offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.start())); index = matcher.end(); termAtt.setEmpty().append(str, index, str.length()); offsetAtt.setOffset(correctOffset(index), correctOffset(str.length()));
@Override public final boolean incrementToken() throws IOException { if (!tokens.isEmpty()) { assert current != null; CompoundToken token = tokens.removeFirst(); restoreState(current); // keep all other attributes untouched termAtt.setEmpty().append(token.txt); offsetAtt.setOffset(token.startOffset, token.endOffset); posIncAtt.setPositionIncrement(0); return true; } current = null; // not really needed, but for safety if (input.incrementToken()) { // Only words longer than minWordSize get processed if (termAtt.length() >= this.minWordSize) { decompose(); // only capture the state if we really need it for producing new tokens if (!tokens.isEmpty()) { current = captureState(); } } // return original token: return true; } else { return false; } }
/**
 * Returns the next, stemmed, input token.
 *
 * @return true if a (possibly stemmed) token was produced, false at EOS
 * @throws IOException if there is a low-level I/O error
 */
@Override
public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
        return false;
    }
    final char[] buffer = termAttribute.buffer();
    final int length = termAttribute.length();
    // Keywords pass through untouched; otherwise rewrite the term only when
    // the stemmer reports that it changed something.
    if (!keywordAtt.isKeyword() && stemmer.stem(buffer, length)) {
        termAttribute.setEmpty().append(stemmer.asCharSequence());
    }
    return true;
}
}
@Override public boolean incrementToken() throws IOException { if (savedToken != null) { // Emit last token's type at the same position restoreState(savedToken); savedToken = null; termAtt.setEmpty(); if (prefix != null) { termAtt.append(prefix); } termAtt.append(typeAtt.type()); posIncrAtt.setPositionIncrement(0); return true; } else if (input.incrementToken()) { // Ho pending token type to emit savedToken = captureState(); return true; } return false; }
// NOTE(review): fragment — the enclosing method is outside this view.
// Publishes `s` as the term text, sets corrected offsets spanning `s` starting
// at theStart, and flags the token as untokenized.
termAtt.setEmpty().append(s); offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length())); flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
// NOTE(review): fragment (duplicate of an earlier snippet) — the enclosing
// method is outside this view. Publishes `s` as the term text, sets corrected
// offsets spanning `s` starting at theStart, and flags the token as
// untokenized.
termAtt.setEmpty().append(s); offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length())); flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
/**
 * Produces the next shingle: either shifts the current shingle window forward,
 * or — when the window is exhausted — roots a new shingle at the next input
 * token. The emitted term joins the window's tokens with the separator.
 *
 * @return true if a shingle was produced, false at end of stream
 * @throws IOException if the underlying stream fails
 */
@Override
public boolean incrementToken() throws IOException {
    int positionIncrement = 0;
    if (!nextShingle()) {
        Token nextRoot = nextTokenInStream(currentShingleTokens[0]);
        if (nextRoot == endToken) {
            return false;
        }
        recycleToken(currentShingleTokens[0]);
        if (!resetShingleRoot(nextRoot)) {
            return false;
        }
        // Only a freshly rooted shingle advances the position.
        positionIncrement = currentShingleTokens[0].posInc();
    }
    clearAttributes();
    incAtt.setPositionIncrement(positionIncrement);
    offsetAtt.setOffset(currentShingleTokens[0].startOffset(), lastTokenInShingle().endOffset());
    // Join the window's terms with the configured separator.
    termAtt.setEmpty();
    termAtt.append(currentShingleTokens[0].term());
    typeAtt.setType("shingle");
    for (int i = 1; i < shingleSize; i++) {
        termAtt.append(tokenSeparator).append(currentShingleTokens[i].term());
    }
    return true;
}