/**
 * Rewrites the current token's text by applying the configured pattern
 * replacement (all occurrences or only the first, per {@code all}).
 *
 * @return true if a token was produced, false at end of stream
 * @throws IOException if the underlying stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // Wrap the term buffer directly so matching avoids an extra copy.
  final Matcher matcher =
      p.matcher(CharBuffer.wrap(termAtt.termBuffer(), 0, termAtt.termLength()));
  final String replaced =
      all ? matcher.replaceAll(replacement) : matcher.replaceFirst(replacement);
  termAtt.setTermBuffer(replaced);
  return true;
}
/**
 * Passes through only tokens whose text is contained in the keep-word set;
 * all other tokens are silently dropped.
 */
@Override
public boolean incrementToken() throws IOException {
  for (;;) {
    if (!input.incrementToken()) {
      return false; // input exhausted
    }
    if (words.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
      return true; // token is in the keep set
    }
    // otherwise skip this token and examine the next one
  }
} }
/**
 * Appends the wildcard operator character to every token that passes
 * through, growing the term buffer as needed.
 */
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final int newLen = termAtt.termLength() + 1;
  // resizeTermBuffer returns the (possibly reallocated) buffer; write the
  // operator into the new final slot before publishing the new length.
  final char[] buf = termAtt.resizeTermBuffer(newLen);
  buf[newLen - 1] = WILDCARD_OPERATOR;
  termAtt.setTermLength(newLen);
  return true;
}
/**
 * Tokenizes the input with the standard chain (standard tokenizer, standard
 * filter, lower-casing, stop-word removal), keeps only purely alphabetic
 * terms of at least three characters, and re-tokenizes the surviving words
 * on whitespace.
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
  TokenStream stream = new StandardTokenizer(Version.LUCENE_CURRENT, reader);
  stream = new StandardFilter(stream);
  stream = new LowerCaseFilter(stream);
  stream = new StopFilter(true, stream, StandardAnalyzer.STOP_WORDS_SET);
  TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
  StringBuilder kept = new StringBuilder();
  try {
    while (stream.incrementToken()) {
      int len = term.termLength();
      if (len < 3) {
        continue; // drop very short terms
      }
      String word = new String(term.termBuffer(), 0, len);
      // keep only words that are entirely alphabetic
      if (alphabets.matcher(word).matches()) {
        kept.append(word).append(" ");
      }
    }
  } catch (IOException e) {
    // NOTE(review): the exception is only printed, so a failing stream
    // silently yields a truncated result — preserved to keep behavior
    // identical, but consider propagating instead.
    e.printStackTrace();
  }
  return new WhitespaceTokenizer(new StringReader(kept.toString()));
} }
/**
 * Demo entry point: tokenizes a short piece of text with the standard
 * analyzer, folds each term into a random-access sparse vector via a
 * static word-value encoder, then prints the sequential-access copy.
 *
 * @param args unused
 * @throws IOException if tokenization fails
 */
public static void main(String[] args) throws IOException {
  FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
  TokenStream tokens =
      analyzer.tokenStream("body", new StringReader("text to magically vectorize"));
  TermAttribute term = tokens.addAttribute(TermAttribute.class);
  Vector vector = new RandomAccessSparseVector(100);
  while (tokens.incrementToken()) {
    // materialize the term text and add it with unit weight
    String word = new String(term.termBuffer(), 0, term.termLength());
    encoder.addToVector(word, 1, vector);
  }
  System.out.printf("%s\n", new SequentialAccessSparseVector(vector));
}
@Override public boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; char[] termBuffer = termAtt.termBuffer(); int len = termAtt.termLength(); // if protected, don't stem. use this to avoid stemming collisions. if (protWords != null && protWords.contains(termBuffer, 0, len)) { return true; } stemmer.setCurrent(new String(termBuffer, 0, len));//ugh, wish the Stemmer took a char array stemmer.stem(); String newstr = stemmer.getCurrent(); termAtt.setTermBuffer(newstr.toCharArray(), 0, newstr.length()); return true; } }
/**
 * Builds a standard-tokenized, stop-filtered, Porter-stemmed stream,
 * encodes each surviving term with the configured filter, and re-tokenizes
 * the encoded words on whitespace.
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
  final TokenStream stemmed = new PorterStemFilter(
      new StopFilter(true,
          new StandardTokenizer(Version.LUCENE_CURRENT, reader),
          StandardAnalyzer.STOP_WORDS_SET));
  TermAttribute term = (TermAttribute) stemmed.addAttribute(TermAttribute.class);
  StringBuilder encoded = new StringBuilder();
  try {
    while (stemmed.incrementToken()) {
      String word = new String(term.termBuffer(), 0, term.termLength());
      encoded.append(filter.encode(word)).append(" ");
    }
  } catch (IOException e) {
    // NOTE(review): exception is only printed, so the resulting stream may
    // be silently truncated — preserved as-is for identical behavior.
    e.printStackTrace();
  }
  return new WhitespaceTokenizer(new StringReader(encoded.toString()));
} }
/**
 * Replaces each term with an index-safe character encoding of its raw ICU
 * collation key, so binary term order matches collation order.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final char[] buf = termAtt.termBuffer();
  // Compute the collation key for the current term into the reusable key.
  collator.getRawCollationKey(new String(buf, 0, termAtt.termLength()), reusableKey);
  final ByteBuffer keyBytes = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size);
  final int encodedLen = IndexableBinaryStringTools.getEncodedLength(keyBytes);
  if (encodedLen > buf.length) {
    termAtt.resizeTermBuffer(encodedLen); // grow before writing the encoding
  }
  termAtt.setTermLength(encodedLen);
  // Encode the key bytes directly into the (possibly resized) term buffer.
  IndexableBinaryStringTools.encode(keyBytes, CharBuffer.wrap(termAtt.termBuffer()));
  return true;
} }
// Snapshot the current term length up front; 'backup' will only be
// populated (with a copy of the term chars) inside the guarded branch when
// the factory's word-count limit is below the default.
int termBufferLength = termAtt.termLength(); char[] backup = null; if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
/**
 * Replaces each term with an index-safe character encoding of its JDK
 * {@code Collator} collation key, so binary term order matches collation
 * order.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final char[] buf = termAtt.termBuffer();
  // Derive the collation key bytes for the current term text.
  final byte[] keyBytes =
      collator.getCollationKey(new String(buf, 0, termAtt.termLength())).toByteArray();
  final ByteBuffer wrapped = ByteBuffer.wrap(keyBytes);
  final int encodedLen = IndexableBinaryStringTools.getEncodedLength(wrapped);
  if (encodedLen > buf.length) {
    termAtt.resizeTermBuffer(encodedLen); // ensure room for the encoded key
  }
  termAtt.setTermLength(encodedLen);
  IndexableBinaryStringTools.encode(wrapped, CharBuffer.wrap(termAtt.termBuffer()));
  return true;
} }
/**
 * Copies the scanner's current token text into the term attribute and sets
 * the offset attribute from the scanner position, running both ends through
 * {@code correctOffset}.
 */
private void setupToken() {
  scanner.getText(termAtt);
  final int begin = scanner.yychar();
  final int end = begin + termAtt.termLength();
  offsetAtt.setOffset(correctOffset(begin), correctOffset(end));
}
// Cache the current term length; zero-length terms are emitted unchanged
// without any further processing.
int len = termAtt.termLength(); if (len==0) return true; // pass through zero length terms
// Zero-length terms are emitted unchanged.
if (termAtt.termLength()==0) return true;
// Length of the current term in the shared term buffer.
int len = termAtt.termLength();
@Override public boolean incrementToken() throws IOException { if( save != null ) { // clearAttributes(); // not currently necessary restoreState(save); save = null; return true; } if (!input.incrementToken()) return false; // pass through zero-length terms int oldLen = termAtt.termLength(); if (oldLen ==0) return true; int origOffset = posAtt.getPositionIncrement(); if (withOriginal == true){ posAtt.setPositionIncrement(0); save = captureState(); } char [] buffer = termAtt.resizeTermBuffer(oldLen + 1); buffer[oldLen] = markerChar; //String reversed = reverseAndMark(value, markerChar); ReverseStringFilter.reverse(buffer, oldLen + 1); posAtt.setPositionIncrement(origOffset); termAtt.setTermBuffer(buffer, 0, oldLen +1); return true; }
// Advance the stream once and copy out the first token's text, then wrap it
// (with a clone of its payload) in a single-token stream for the new field.
// NOTE(review): getPayload() may return null if the first token carries no
// payload, which would NPE on clone() — confirm upstream guarantees one.
PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class); tokenStream.incrementToken(); String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength()); tokenStream.reset(); field = new Field(f.name(), new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
// Advance the stream once and copy out the first token's text, then wrap it
// (with a clone of its payload) in a single-token stream for the new field.
// NOTE(review): getPayload() may return null if the first token carries no
// payload, which would NPE on clone() — confirm upstream guarantees one.
PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class); tokenStream.incrementToken(); String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength()); tokenStream.reset(); field = new Field(f.name(), new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
// Close the stream, then materialize the last-read term and check whether
// it carries the expected name prefix.
tokenStream.close(); String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength()); if (value.startsWith(namePrefix)) {
// Close the stream, then materialize the last-read term and check whether
// it carries the expected name prefix.
tokenStream.close(); String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength()); if (value.startsWith(namePrefix)) {