/**
 * Creates a new {@link ThaiTokenizer} that uses the supplied attribute factory.
 *
 * @param factory the {@link AttributeFactory} the new tokenizer should use
 * @return a freshly constructed {@link ThaiTokenizer}
 */
@Override
public Tokenizer create(AttributeFactory factory) {
  final Tokenizer tokenizer = new ThaiTokenizer(factory);
  return tokenizer;
}
}
/**
 * Emits the next word token within the current sentence.
 *
 * <p>The word {@code BreakIterator} is advanced past boundary pairs whose
 * first code point is not a letter or digit, so runs that contain no
 * letter/digit (e.g. punctuation or whitespace between words) are skipped
 * rather than emitted as tokens.
 *
 * @return {@code true} if a token was produced into the term/offset
 *         attributes, {@code false} once the BreakIterator has no more
 *         boundaries in this sentence
 */
@Override
protected boolean incrementWord() {
  int start = wordBreaker.current();
  if (start == BreakIterator.DONE) {
    return false; // BreakIterator exhausted
  }
  // find the next set of boundaries, skipping over non-tokens
  int end = wordBreaker.next();
  // codePointAt(buffer, index, limit): inspects the first code point of the
  // candidate token; sentenceEnd bounds the read within the current sentence
  while (end != BreakIterator.DONE
      && !Character.isLetterOrDigit(
          Character.codePointAt(buffer, sentenceStart + start, sentenceEnd))) {
    start = end;
    end = wordBreaker.next();
  }
  if (end == BreakIterator.DONE) {
    return false; // BreakIterator exhausted
  }
  clearAttributes();
  // Term text is the [start, end) slice of the sentence, relative to buffer.
  termAtt.copyBuffer(buffer, sentenceStart + start, end - start);
  // Offsets are corrected to the original input coordinates via the
  // tokenizer's running offset.
  offsetAtt.setOffset(
      correctOffset(offset + sentenceStart + start),
      correctOffset(offset + sentenceStart + end));
  return true;
}
}
/**
 * (Re)initializes this adapter for a new input {@link Reader}.
 *
 * <p>A fresh {@link ThaiTokenizer} is created, bound to {@code input}, its
 * term attribute is captured for later reads, and the tokenizer is reset so
 * tokens can be pulled immediately. Any exception raised during setup is
 * rethrown as an unchecked exception.
 *
 * @param input the character source to tokenize; must not be {@code null}
 * @throws IOException declared for interface compatibility
 */
public void reset(Reader input) throws IOException {
  assert input != null;
  try {
    this.tokenizer = new ThaiTokenizer();
    this.tokenizer.setReader(input);
    this.term = this.tokenizer.addAttribute(CharTermAttribute.class);
    this.tokenizer.reset();
  } catch (Exception e) {
    throw ExceptionUtils.wrapAsRuntimeException(e);
  }
}
/**
 * Advances the wrapped tokenizer by one token.
 *
 * @return {@code ITokenizer.TT_TERM} when a token was produced (its text is
 *         exposed through {@code tempCharSequence}), or
 *         {@code ITokenizer.TT_EOF} once the stream is exhausted
 * @throws IOException if the underlying tokenizer fails while reading
 */
public short nextToken() throws IOException {
  if (!tokenizer.incrementToken()) {
    return ITokenizer.TT_EOF;
  }
  // Expose the current term's backing buffer without copying it.
  tempCharSequence.reset(term.buffer(), 0, term.length());
  return ITokenizer.TT_TERM;
}
/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link ThaiTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link DecimalDigitFilter} and
 *         {@link StopFilter}
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new ThaiTokenizer();
  // Filter chain: lowercase -> decimal-digit folding -> stopword removal.
  final TokenStream sink =
      new StopFilter(new DecimalDigitFilter(new LowerCaseFilter(source)), stopwords);
  return new TokenStreamComponents(source, sink);
}
/**
 * Emits the next word token within the current sentence.
 *
 * <p>The word {@code BreakIterator} is advanced past boundary pairs whose
 * first code point is not a letter or digit, so runs that contain no
 * letter/digit (e.g. punctuation or whitespace between words) are skipped
 * rather than emitted as tokens.
 *
 * @return {@code true} if a token was produced into the term/offset
 *         attributes, {@code false} once the BreakIterator has no more
 *         boundaries in this sentence
 */
@Override
protected boolean incrementWord() {
  int start = wordBreaker.current();
  if (start == BreakIterator.DONE) {
    return false; // BreakIterator exhausted
  }
  // find the next set of boundaries, skipping over non-tokens
  int end = wordBreaker.next();
  // codePointAt(buffer, index, limit): inspects the first code point of the
  // candidate token; sentenceEnd bounds the read within the current sentence
  while (end != BreakIterator.DONE
      && !Character.isLetterOrDigit(
          Character.codePointAt(buffer, sentenceStart + start, sentenceEnd))) {
    start = end;
    end = wordBreaker.next();
  }
  if (end == BreakIterator.DONE) {
    return false; // BreakIterator exhausted
  }
  clearAttributes();
  // Term text is the [start, end) slice of the sentence, relative to buffer.
  termAtt.copyBuffer(buffer, sentenceStart + start, end - start);
  // Offsets are corrected to the original input coordinates via the
  // tokenizer's running offset.
  offsetAtt.setOffset(
      correctOffset(offset + sentenceStart + start),
      correctOffset(offset + sentenceStart + end));
  return true;
}
}
/**
 * Creates a new default-configured {@link ThaiTokenizer}.
 *
 * @return a freshly constructed {@link ThaiTokenizer}
 */
@Override
public Tokenizer create() {
  final Tokenizer tokenizer = new ThaiTokenizer();
  return tokenizer;
}
}
/**
 * Creates a new {@link ThaiTokenizer} that uses the supplied attribute factory.
 *
 * @param factory the {@link AttributeFactory} the new tokenizer should use
 * @return a freshly constructed {@link ThaiTokenizer}
 */
@Override
public Tokenizer create(AttributeFactory factory) {
  final Tokenizer tokenizer = new ThaiTokenizer(factory);
  return tokenizer;
}
}
/**
 * Creates a new default-configured {@link ThaiTokenizer}.
 *
 * @return a freshly constructed {@link ThaiTokenizer}
 */
@Override
public Tokenizer create() {
  final Tokenizer tokenizer = new ThaiTokenizer();
  return tokenizer;
}
}
/**
 * Creates the tokenizer for the given match version.
 *
 * @param version the Lucene match version; not consulted by this factory
 * @return a new default-configured {@link ThaiTokenizer}
 */
@Override
protected Tokenizer create(Version version) {
  final Tokenizer tokenizer = new ThaiTokenizer();
  return tokenizer;
}
}
/**
 * Creates a new default-configured {@link ThaiTokenizer}.
 *
 * @return a freshly constructed {@link ThaiTokenizer}
 */
@Override
public Tokenizer create() {
  final Tokenizer tokenizer = new ThaiTokenizer();
  return tokenizer;
}
}
/**
 * Builds the analysis chain for a field: a {@link ThaiTokenizer} reading
 * from {@code reader}, wrapped in a {@link StandardFilter}.
 *
 * @param fieldName not consulted; the same chain is built for every field
 * @param reader the character source to tokenize
 * @return components pairing the tokenizer with its filtered stream
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  final ThaiTokenizer source = new ThaiTokenizer(reader);
  return new TokenStreamComponents(source, new StandardFilter(source));
}
}
protected TokenStreamComponents createComponents(String fieldName) { if (getVersion().onOrAfter(Version.LUCENE_4_8_0)) { final Tokenizer source = new ThaiTokenizer(); TokenStream result = new LowerCaseFilter(source); if (getVersion().onOrAfter(Version.LUCENE_5_4_0)) {