/**
 * Convenience method which consumes the remaining items of this iterator and collects
 * the string each of them extracts from {@code input}.
 *
 * @return an unmodifiable list of the extracted gram strings, in iteration order
 */
public List<String> toExtractedList() {
    List<String> extracted = new ArrayList<>();
    while (hasNext()) {
        String gramText = next().extractFrom(input);
        extracted.add(gramText);
    }
    // Wrap the result so callers cannot modify the returned list
    return Collections.unmodifiableList(extracted);
}
}
@Override protected void doExecute(ExecutionContext ctx) { StringFieldValue input = (StringFieldValue)ctx.getValue(); SpanList spanList = input.setSpanTree(new SpanTree(SpanTrees.LINGUISTICS)).spanList(); int lastPosition = 0; for (Iterator<GramSplitter.Gram> it = linguistics.getGramSplitter().split(input.getString(), gramSize); it.hasNext();) { GramSplitter.Gram gram = it.next(); // if there is a gap before this gram, then annotate the gram as punctuation // (technically it may be of various types, but it does not matter - we just // need to annotate it somehow (as a non-term) to make sure it is added to the summary) if (lastPosition < gram.getStart()) { typedSpan(lastPosition, gram.getStart() - lastPosition, TokenType.PUNCTUATION, spanList); } // annotate gram as a word term String gramString = gram.extractFrom(input.getString()); typedSpan(gram.getStart(), gram.getLength(), TokenType.ALPHABETIC, spanList). annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString)); lastPosition = gram.getStart() + gram.getLength(); } // handle punctuation at the end if (lastPosition < input.toString().length()) { typedSpan(lastPosition, input.toString().length() - lastPosition, TokenType.PUNCTUATION, spanList); } }
/** * Splits the given item into n-grams and adds them as a CompositeItem containing WordItems searching the * index of the input term. If the result is a single gram, that single WordItem is returned rather than the AndItem * * @param term the term to split, must be an item which implement the IndexedItem and BlockItem "mixins" * @param text the text of the item, just stringValue() if the item is a TermItem * @param gramSize the gram size to split to * @param query the query in which this rewriting is done * @return the root of the query subtree produced by this, containing the split items */ protected Item splitToGrams(Item term, String text, int gramSize, Query query) { String index = ((HasIndexItem)term).getIndexName(); CompositeItem gramsItem = createGramRoot(query); gramsItem.setIndexName(index); Substring origin = ((BlockItem)term).getOrigin(); for (Iterator<GramSplitter.Gram> i = getGramSplitter().split(text,gramSize); i.hasNext(); ) { GramSplitter.Gram gram = i.next(); WordItem gramWord = new WordItem(gram.extractFrom(text), index, false, origin); gramWord.setWeight(term.getWeight()); gramWord.setProtected(true); gramsItem.addItem(gramWord); } return gramsItem.getItemCount()==1 ? gramsItem.getItem(0) : gramsItem; // return the AndItem, or just the single gram if not multiple }