/** Creates the {@link TokenStream} of n-grams from the given {@link Reader} and {@link AttributeFactory}. */
@Override
public Tokenizer create(AttributeFactory factory) {
  return new NGramTokenizer(factory, minGramSize, maxGramSize);
}
}
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  // Every iteration either consumes a code point or grows gramSize, so the loop terminates.
  while (true) {
    // (buffer compaction and refill from the input elided)
    // Move to the next start offset once the current gram window is spent.
    if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
      if (bufferStart + 1 + minGram > bufferEnd) {
        return false; // input exhausted
      }
      consume();
      gramSize = minGram;
    }
    updateLastNonTokenChar();
    // Skip grams that would contain a non-token char (or, in edge-only mode,
    // grams that do not start right after a non-token char).
    final boolean termContainsNonTokenChar =
        lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
    final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
    if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
      consume();
      gramSize = minGram;
      continue;
    }
    final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
    termAtt.setLength(length);
    posIncAtt.setPositionIncrement(1);
    posLenAtt.setPositionLength(1);
    offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
    ++gramSize;
    return true;
  }
}
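// A minimal, self-contained sketch of driving the tokenizer above; it assumes
// a Lucene version with the NGramTokenizer(int, int) constructor plus
// setReader, as used elsewhere in this section. For "abcd" with minGram=2 and
// maxGram=3 it prints: ab, abc, bc, bcd, cd (with their offsets).
import java.io.StringReader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class NGramDemo {
  public static void main(String[] args) throws Exception {
    try (NGramTokenizer tokenizer = new NGramTokenizer(2, 3)) {
      tokenizer.setReader(new StringReader("abcd"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsets = tokenizer.addAttribute(OffsetAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        // grams are emitted per start offset, smallest size first
        System.out.println(term + " [" + offsets.startOffset() + "," + offsets.endOffset() + ")");
      }
      tokenizer.end();
    }
  }
}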
NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
  super(factory);
  init(minGram, maxGram, edgesOnly);
}
private static PhraseQuery build(String fieldName, String fieldValue, int gramSize) {
  Preconditions.checkArgument(fieldValue.length() >= gramSize);
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  try (NGramTokenizer tokenizer = new NGramTokenizer(gramSize, gramSize)) {
    tokenizer.setReader(new StringReader(fieldValue.toLowerCase()));
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      builder.add(new Term(fieldName, tokenizer.getAttribute(CharTermAttribute.class).toString()));
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return builder.build();
}
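// A hypothetical call site for build(...) above; since build is private, this
// sketch assumes it lives in the same class. The field name "title" and the
// input "Lucene" are made up for illustration.
public static void main(String[] args) {
  PhraseQuery query = build("title", "Lucene", 3);
  // The lower-cased trigrams of "lucene" are luc, uce, cen, ene, added at
  // consecutive positions, so the query prints as: title:"luc uce cen ene"
  System.out.println(query);
}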
try {
  tokenizer = new NGramTokenizer(2, maxNGramLength);
  tokenizer.setReader(reader);
  tokenizer.reset();
  tokenStream = new ClassicFilter(tokenizer);
  tokenStream = new LowerCaseFilter(tokenStream);
  tokenizer.end();
  tokenizer.close();
} catch (final IOException ignore) {
  // the stream is discarded on failure
}
@Override
public final boolean incrementToken() throws IOException {
  clearAttributes();
  if (!started) {
    started = true;
    gramSize = minGram;
    // (reading the whole input into inStr and recording inLen elided)
  }
  if (pos + gramSize > inLen) { // hit the end: restart with a larger gram
    pos = 0;
    gramSize++;
    if (gramSize > maxGram || pos + gramSize > inLen) {
      return false; // done
    }
  }
  int oldPos = pos;
  pos++;
  termAtt.setEmpty().append(inStr, oldPos, oldPos + gramSize);
  offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos + gramSize));
  return true;
}
@Override
public final void end() throws IOException {
  super.end();
  assert bufferStart <= bufferEnd;
  int endOffset = offset;
  for (int i = bufferStart; i < bufferEnd; ++i) {
    endOffset += Character.charCount(buffer[i]);
  }
  endOffset = correctOffset(endOffset);
  // set final offset
  offsetAtt.setOffset(endOffset, endOffset);
}
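// A minimal sketch of what end() above reports: after the stream is fully
// drained, both offsets point one past the last input character. (Assumes the
// same NGramTokenizer(int, int) + setReader API used elsewhere in this section.)
import java.io.StringReader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class FinalOffsetDemo {
  public static void main(String[] args) throws Exception {
    try (NGramTokenizer tokenizer = new NGramTokenizer(1, 2)) {
      tokenizer.setReader(new StringReader("abc"));
      OffsetAttribute offsets = tokenizer.addAttribute(OffsetAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        // drain all grams so end() sees the true end of input
      }
      tokenizer.end();
      System.out.println(offsets.startOffset() + "," + offsets.endOffset()); // prints 3,3
    }
  }
}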
if (null == ngram) {
  try {
    if ((null != tokenizer) && tokenizer.incrementToken()) {
      final CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
      if (null != charTermAttribute) {
        ngram = charTermAttribute.toString();
      }
    }
  } catch (final IOException e) {
    // (handling of the failed read elided)
  }
}
@Override
public void end() {
  // set final offset
  final int finalOffset = correctOffset(charsRead);
  this.offsetAtt.setOffset(finalOffset, finalOffset);
}
/** Creates the {@link TokenStream} of n-grams from the given {@link Reader}. */
public NGramTokenizer create(Reader input) {
  return new NGramTokenizer(input, minGramSize, maxGramSize);
}
}
NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) {
  init(minGram, maxGram, edgesOnly);
}
@Override
protected Tokenizer create(Version version) {
  return new NGramTokenizer();
}
},
/**
 * Creates NGramTokenizer with given min and max n-grams.
 * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
 * @param input {@link Reader} holding the input to be tokenized
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) {
  super(factory, input);
  init(minGram, maxGram);
}
@Override
public Tokenizer create() {
  if (matcher == null) {
    return new NGramTokenizer(minGram, maxGram);
  } else {
    return new NGramTokenizer(minGram, maxGram) {
      @Override
      protected boolean isTokenChar(int chr) {
        return matcher.isTokenChar(chr);
      }
    };
  }
}
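// A minimal sketch of the same anonymous-subclass trick used by the factory
// above, with a hardcoded letters-only rule standing in for the matcher.
// For "ab cd" with 2-grams it prints only ab and cd; any gram spanning the
// space is skipped because isTokenChar rejects it.
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LetterOnlyNGrams {
  public static void main(String[] args) throws Exception {
    try (Tokenizer tokenizer = new NGramTokenizer(2, 2) {
      @Override
      protected boolean isTokenChar(int chr) {
        return Character.isLetter(chr); // keep grams made of letters only
      }
    }) {
      tokenizer.setReader(new StringReader("ab cd"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term);
      }
      tokenizer.end();
    }
  }
}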
/**
 * Creates NGramTokenizer with given min and max n-grams.
 * @param input {@link Reader} holding the input to be tokenized
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public NGramTokenizer(Reader input, int minGram, int maxGram) {
  super(input);
  init(minGram, maxGram);
}
/**
 * Creates NGramTokenizer with given min and max n-grams.
 * @param source {@link AttributeSource} to use
 * @param input {@link Reader} holding the input to be tokenized
 * @param minGram the smallest n-gram to generate
 * @param maxGram the largest n-gram to generate
 */
public NGramTokenizer(AttributeSource source, Reader input, int minGram, int maxGram) {
  super(source, input);
  init(minGram, maxGram);
}