/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */ public CompoundToken(int offset, int length) { this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length); // offsets of the original word this.startOffset = CompoundWordTokenFilterBase.this.offsetAtt.startOffset(); this.endOffset = CompoundWordTokenFilterBase.this.offsetAtt.endOffset(); }
/**
 * Builds a compound token covering {@code length} chars of the current
 * {@link CompoundWordTokenFilterBase#termAtt}, starting at {@code offset}.
 *
 * @param offset slice start within the term buffer
 * @param length slice length
 */
public CompoundToken(int offset, int length) {
  // Subtokens inherit the offsets of the original (whole) word.
  this.startOffset = CompoundWordTokenFilterBase.this.offsetAtt.startOffset();
  this.endOffset = CompoundWordTokenFilterBase.this.offsetAtt.endOffset();
  this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
}
private void addSubtokensToStack(char separatorChar, String newTokenType) { char[] termBuffer = termAttribute.buffer(); int termLength = termAttribute.length(); int offset = 0; // We iterate over the array, trying to find the separatorChar ('.' or ',') for (int index = 0; index <= termLength; index++) { // Note that we actually iterate past the last character in the array. At this point index == termLength. // We must check for this condition first to stop ArrayIndexOutOfBoundsException. // Being at the end of the array is a subtoken border just like the separator character ('.'), except we don't want to // add a duplicate token if no separator was already found. Hence we also check for offset > 0. if ((index < termLength && termBuffer[index] == separatorChar) || (index == termLength && offset > 0)) { int subtokenLength = index - offset; // Check that this is not an "empty" subtoken if (subtokenLength > 0) { if (subtokenStack.isEmpty()) { nextType = newTokenType; current = captureState(); } subtokenStack.add(termAttribute.subSequence(offset, subtokenLength + offset)); } offset = index + 1; } } }
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */ public CompoundToken(int offset, int length) { this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length); // offsets of the original word int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset(); int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset(); if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) { // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. this.startOffset = startOff; this.endOffset = endOff; } else { final int newStart = startOff + offset; this.startOffset = newStart; this.endOffset = newStart + length; } }
/** Construct the compound token based on a slice of the current {@link Lucene43CompoundWordTokenFilterBase#termAtt}. */ public CompoundToken(int offset, int length) { this.txt = Lucene43CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length); // offsets of the original word int startOff = Lucene43CompoundWordTokenFilterBase.this.offsetAtt.startOffset(); int endOff = Lucene43CompoundWordTokenFilterBase.this.offsetAtt.endOffset(); if (endOff - startOff != Lucene43CompoundWordTokenFilterBase.this.termAtt.length()) { // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. this.startOffset = startOff; this.endOffset = endOff; } else { final int newStart = startOff + offset; this.startOffset = newStart; this.endOffset = newStart + length; } }
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. * * @param offset the initial offset * @param length the token length * */ public CompoundToken(int offset, int length) { this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length); // offsets of the original word int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset(); int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset(); if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) { // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. this.startOffset = startOff; this.endOffset = endOff; } else { final int newStart = startOff + offset; this.startOffset = newStart; this.endOffset = newStart + length; } }
/** * Default Constructor. * * @param queryString Query String to analyse * @param fieldName Name of field to query * @param analyzer Lucene Analyzer that creates tokens/terms * @param subQueryCreator subQuery to use to perform the query */ QueryCreator(String queryString, final String fieldName, final Analyzer analyzer, final SubQuery subQueryCreator) { this.fieldName = notNull("You must provide a field name", fieldName); notNull("You must provide a Analyzer", analyzer); this.subQueryCreator = notNull("subQueryCreator", subQueryCreator); queryString = (queryString == null ? "" : queryString); try { final TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(queryString)); CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { int termLength = termAttribute.length(); tokens.add(termAttribute.subSequence(0, termLength).toString()); } } catch (final IOException e) { // wont happen } }
/** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. * * @param offset the initial offset * @param length the token length * */ public CompoundToken(int offset, int length) { this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length); // offsets of the original word int startOff = CompoundWordTokenFilterBase.this.offsetAtt.startOffset(); int endOff = CompoundWordTokenFilterBase.this.offsetAtt.endOffset(); if (endOff - startOff != CompoundWordTokenFilterBase.this.termAtt.length()) { // if length by start + end offsets doesn't match the term text then assume // this is a synonym and don't adjust the offsets. this.startOffset = startOff; this.endOffset = endOff; } else { final int newStart = startOff + offset; this.startOffset = newStart; this.endOffset = newStart + length; } }
termAtt.copyBuffer(curTermBuffer, start, end - start); termAtt.append(suffix); if ((curGramSize == curTermLength - curPos) && !seenSuffixes.add(termAtt.subSequence(0, termAtt.length()))) { curTermBuffer = null; continue; if (!seenInfixes.add(termAtt.subSequence(0, termAtt.length()))) { curGramSize = 0; continue;