Refine search
// NOTE(review): incomplete fragment — the braces are unbalanced and the enclosing method is
// not visible, so this cannot be safely restructured. It appears to validate a field's
// postings: terms are enumerated while recomputing sumTotalTermFreq and sumDocFreq, and a
// RuntimeException is thrown when the totals reported by Terms disagree with the recomputed
// values — resembles Lucene's CheckIndex field verification; TODO confirm against full source.
final Terms terms = fields.terms(field); if (terms == null) { continue; final boolean hasFreqs = terms.hasFreqs(); final boolean hasPositions = terms.hasPositions(); final boolean hasPayloads = terms.hasPayloads(); assert bb.isValid(); while(true) { postings = termsEnum.postings(postings, PostingsEnum.ALL); final Terms fieldTerms = fields.terms(field); if (fieldTerms == null) { final long v = fields.terms(field).getSumTotalTermFreq(); if (v != -1 && sumTotalTermFreq != v) { throw new RuntimeException("sumTotalTermFreq for field " + field + "=" + v + " != recomputed sumTotalTermFreq=" + sumTotalTermFreq); final long v = fields.terms(field).getSumDocFreq(); if (v != -1 && sumDocFreq != v) { throw new RuntimeException("sumDocFreq for field " + field + "=" + v + " != recomputed sumDocFreq=" + sumDocFreq); termCount = fields.terms(field).size(); int fieldCount = fields.size();
// NOTE(review): incomplete fragment (unbalanced braces; enclosing method not visible).
// Iterates the fields of a term-vectors instance — counting fields manually when size()
// reports -1 — then enumerates each field's terms via terms.iterator(), asserting that
// payloads imply positions, and feeds term/frequency pairs to startTerm(). Presumably part
// of a term-vector copy/merge routine — TODO confirm against the full source.
int numFields = vectors.size(); if (numFields == -1) { for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) { it.next(); numFields++; lastFieldName = fieldName; final Terms terms = vectors.terms(fieldName); if (terms == null) { final boolean hasPositions = terms.hasPositions(); final boolean hasOffsets = terms.hasOffsets(); final boolean hasPayloads = terms.hasPayloads(); assert !hasPayloads || hasPositions; termsEnum = terms.iterator(); while(termsEnum.next() != null) { numTerms++; while(termsEnum.next() != null) { termCount++; final int freq = (int) termsEnum.totalTermFreq(); startTerm(termsEnum.term(), freq);
/**
 * Collects the distinct terms of the "string" field, mapped to their document
 * frequencies, for the index identified by the given label/key pair.
 *
 * @param label the label used to resolve the sorted index reader
 * @param key   the key used to resolve the sorted index reader
 * @return a map from term text to document frequency; empty when the field has no terms
 * @throws RuntimeException wrapping any failure while reading the index
 */
private Map<String, Integer> distinctTermsCount(@Name("label") String label, @Name("key") String key) {
    try {
        SortedIndexReader sortedIndexReader = getSortedIndexReader(label, key, 0, Sort.INDEXORDER);
        Fields fields = MultiFields.getFields(sortedIndexReader.getIndexSearcher().getIndexReader());
        Map<String, Integer> values = new HashMap<>();
        Terms terms = fields.terms("string");
        if (terms != null) {
            // Enumerate every distinct term of the field and record its docFreq.
            TermsEnum termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                values.put(termsEnum.term().utf8ToString(), termsEnum.docFreq());
            }
        }
        return values;
    } catch (Exception e) {
        throw new RuntimeException("Error collecting distinct terms of label: " + label + " and key: " + key, e);
    }
}
public void listTokens(int freq) throws IOException { IndexReader ireader = null; TermsEnum iter = null; Terms terms; try { ireader = DirectoryReader.open(indexDirectory); int numDocs = ireader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(ireader);//reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.DEFS); iter = terms.iterator(); // init uid iterator } while (iter != null && iter.term() != null) { //if (iter.term().field().startsWith("f")) { if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) { LOGGER.warning(iter.term().utf8ToString()); } BytesRef next = iter.next(); if (next==null) {iter=null;} } } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { LOGGER.log(Level.WARNING, "An error occurred while closing index reader", e); } } } }
// NOTE(review): incomplete fragment (unbalanced braces; enclosing method not visible).
// Seeks a uid TermsEnum to the first term >= startuid and then walks terms while they still
// share the startuid prefix; uidIter is nulled when seekCeil reports END. Looks like
// OpenGrok-style incremental index maintenance — TODO confirm against the full source.
int numDocs = reader.numDocs(); if (numDocs > 0) { terms = uFields.terms(QueryBuilder.U); uidIter = terms.iterator(); TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); //init uid if (stat == TermsEnum.SeekStatus.END) { uidIter = null; while (uidIter != null && uidIter.term() != null && uidIter.term().utf8ToString().startsWith(startuid)) { reader.close();
// NOTE(review): incomplete fragment (unbalanced braces; enclosing loop/method not visible).
// Caches a per-field TermsEnum (refreshed only when deleteTerm changes field), then
// seekExact()s the delete term and opens its postings to apply segment-level deletes up to
// delDocLimit. Resembles Lucene's in-memory delete application — TODO confirm.
if (deleteTerm.field().equals(lastField) == false) { lastField = deleteTerm.field(); Terms terms = fields.terms(lastField); if (terms != null) { termsEnum = terms.iterator(); } else { termsEnum = null; if (termsEnum != null && termsEnum.seekExact(deleteTerm.bytes())) { postingsEnum = termsEnum.postings(postingsEnum, 0); int delDocLimit = segDeletes.get(deleteTerm); assert delDocLimit < PostingsEnum.NO_MORE_DOCS;
private void createTermsEnum(Term t) throws IOException { String fieldName = t.field(); // TODO: get atomic sub readers and iterate values from those /* From: http://lucene.apache.org/core/4_0_0-BETA/MIGRATE.html Note that the MultiFields approach entails a performance hit on MultiReaders, as it must merge terms/docs/positions on the fly. It's generally better to instead get the sequential readers (use oal.util.ReaderUtil) and then step through those readers yourself, if you can (this is how Lucene drives searches). */ Fields fields = MultiFields.getFields(eval.getSearcher().getIndexReader()); if (fields != null) { Terms fieldTerms = fields.terms(fieldName); if (fieldTerms != null) { terms = fieldTerms.iterator(null); if (t != null) { if (terms.seekCeil(new BytesRef(t.text().getBytes("utf-8"))) != TermsEnum.SeekStatus.END) { next = terms.term().utf8ToString(); } } } } }
// NOTE(review): incomplete fragment (unbalanced braces; 'term', 'freq' and 'score' are
// defined outside this view). Enumerates a per-document field's terms, asserts each exists
// in the top-level (index-wide) TermsEnum, skips noise terms, and pushes scored terms into
// a bounded ScoreTermsQueue — looks like "more like this"/significant-terms scoring; TODO
// confirm against the full source.
Terms terms = fields.terms(fieldName); Terms topLevelTerms = topLevelFields.terms(fieldName); ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size())); TermsEnum termsEnum = terms.iterator(); TermsEnum topLevelTermsEnum = topLevelTerms.iterator(); while (termsEnum.next() != null) { BytesRef termBytesRef = termsEnum.term(); boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef); assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!"; if (isNoise(term.bytes().utf8ToString(), freq)) { continue; queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
// NOTE(review): incomplete fragment (unbalanced braces; the surrounding loop over
// queryTerms and the trailing else-branch are not visible). For each query term it looks up
// the term in the current leaf's Terms, and on a seekExact hit either creates a new
// TermContext (recording termState/docFreq/totalTermFreq for this leaf ordinal) or —
// presumably, in the cut-off else branch — accumulates into the existing one. TODO confirm.
Term term = queryTerms[i]; TermContext termContext = contextArray[i]; final Terms terms = fields.terms(term.field()); if (terms == null) { termsEnum = terms.iterator(); assert termsEnum != null; if (termsEnum.seekExact(term.bytes())) { if (termContext == null) { contextArray[i] = new TermContext(reader.getContext(), termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq()); } else {
// NOTE(review): incomplete fragment (unbalanced braces; 'payload'/'postingsPayload' are
// defined outside this view). Cross-checks a document's term-vector field against the
// inverted index: every vector term must seekExact() in the postings, and payloads must
// match — throwing RuntimeException on any mismatch. Resembles Lucene's CheckIndex term
// vector verification — TODO confirm against the full source.
Terms terms = tfv.terms(field); TermsEnum termsEnum = terms.iterator(); final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; final boolean postingsHasPayload = fieldInfo.hasPayloads(); final boolean vectorsHasPayload = terms.hasPayloads(); Terms postingsTerms = postingsFields.terms(field); if (postingsTerms == null) { throw new RuntimeException("vector field=" + field + " does not exist in postings; doc=" + j); TermsEnum postingsTermsEnum = postingsTerms.iterator(); final boolean hasProx = terms.hasOffsets() || terms.hasPositions(); BytesRef term = null; while ((term = termsEnum.next()) != null) { postings = termsEnum.postings(postings, PostingsEnum.ALL); assert postings != null; if (!postingsTermsEnum.seekExact(term)) { throw new RuntimeException("vector term=" + term + " field=" + field + " does not exist in postings; doc=" + j); if (!payload.equals(postingsPayload)) { throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
// NOTE(review): incomplete fragment — it contains a dangling '} else {' whose matching 'if'
// is not visible, so the control flow cannot be reconstructed here. Serializes a field's
// term-vector terms: flag-gated positions/offsets/payloads are resolved up front, then each
// term's statistics are written either from a precomputed TermStatistics or by seeking the
// top-level iterator. Resembles Elasticsearch's term-vectors response writer — TODO confirm.
Terms fieldTermVector = termVectorsByField.terms(field); Terms topLevelTerms = topLevelFields.terms(field); TermsEnum topLevelIterator = topLevelTerms.iterator(); boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions(); boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets(); boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads(); TermsEnum iterator = fieldTermVector.iterator(); final boolean useDocsAndPos = positions || offsets || payloads; while (iterator.next() != null) { // iterate all terms of the current field BytesRef termBytesRef = iterator.term(); Term term = new Term(field, termBytesRef); writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics); } else { boolean foundTerm = topLevelIterator.seekExact(termBytesRef); if (foundTerm) { writeTermStatistics(topLevelIterator);
// NOTE(review): incomplete method — the body is cut off mid-statement (the loop over 'ids'
// that defines 'idx', the result-array population, and the closing braces are missing), so
// it is documented in place rather than rewritten. Looks up document ids by primary key:
// one TermsEnum per leaf reader over the "id" field, seekExact per id, then the first
// matching live doc via DocsEnum.nextDoc(). Uses pre-5.0 Lucene APIs (AtomicReader,
// DocsEnum, seekExact(BytesRef, boolean)) — TODO confirm against the full source.
protected int[] lookupDocIdByPK( final IndexSearcher searcher, final String... ids ) throws IOException { final List<AtomicReaderContext> subReaders = searcher.getIndexReader().leaves(); final TermsEnum[] termsEnums = new TermsEnum[ subReaders.size() ]; final DocsEnum[] docsEnums = new DocsEnum[ subReaders.size() ]; for ( int subIDX = 0; subIDX < subReaders.size(); subIDX++ ) { termsEnums[ subIDX ] = subReaders.get( subIDX ).reader().fields().terms( "id" ).iterator( null ); final BytesRef id = new BytesRef( ids[ idx ] ); for ( int subIDX = 0; subIDX < subReaders.size(); subIDX++ ) { final AtomicReader sub = subReaders.get( subIDX ).reader(); final TermsEnum termsEnum = termsEnums[ subIDX ]; if ( termsEnum.seekExact( id, false ) ) { final DocsEnum docs = docsEnums[ subIDX ] = termsEnum.docs( sub.getLiveDocs(), docsEnums[ subIDX ], 0 ); if ( docs != null ) { final int docID = docs.nextDoc();
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException { String fieldName = fieldIter.next(); builder.startObject(fieldName); Terms curTerms = theFields.terms(fieldName); // write field statistics buildFieldStatistics(builder, curTerms); builder.startObject(FieldStrings.TERMS); TermsEnum termIter = curTerms.iterator(); BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class); for (int i = 0; i < curTerms.size(); i++) { buildTerm(builder, spare, curTerms, termIter, boostAtt); } builder.endObject(); builder.endObject(); }
private void runQuery(String query, int expectedPosition) throws ParseException, IOException { HebrewQueryParser hqp = new HebrewQueryParser("Text", analyzer); Query q = hqp.parse(query); TopDocs td = searcher.search(q, searcher.getIndexReader().maxDoc()); int num = td.scoreDocs[0].doc; Terms terms = searcher.getIndexReader().getTermVectors(num).terms("Text"); Set<Term> trms_list = new HashSet<>(); searcher.createWeight(q,true, 1.0f).extractTerms(trms_list); // q.extractTerms(trms_list); for (Term t : trms_list) { TermsEnum termsEnum = terms.iterator(); boolean isFound = termsEnum.seekExact(t.bytes()); Assert.assertTrue(isFound); PostingsEnum dpEnum = termsEnum.postings(null); assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); int pos = dpEnum.nextPosition(); //assertEquals(expectedPosition, dpEnum.startOffset()); //assertEquals(??, dpEnum.endOffset()); assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc()); assertEquals(pos, expectedPosition); } } }
private Query buildFilterClause(LeafReader reader) throws IOException { Terms terms = reader.fields().terms(field); if (terms == null) return null; BooleanQuery.Builder bq = new BooleanQuery.Builder(); int docsInBatch = reader.maxDoc(); BytesRef term; TermsEnum te = terms.iterator(); while ((term = te.next()) != null) { // we need to check that every document in the batch has the same field values, otherwise // this filtering will not work if (te.docFreq() != docsInBatch) throw new IllegalArgumentException("Some documents in this batch do not have a term value of " + field + ":" + Term.toString(term)); bq.add(new TermQuery(new Term(field, BytesRef.deepCopyOf(term))), BooleanClause.Occur.SHOULD); } BooleanQuery built = bq.build(); if (built.clauses().size() == 0) return null; return built; }
// NOTE(review): incomplete fragment — the for/while/if statements have no visible braces or
// bodies beyond single statements, and 'fields'/'collector' come from outside this view, so
// the exact nesting cannot be reconstructed. Per leaf reader it enumerates the field's
// terms (via LuceneDocumentStructure.originalTerms) and, for terms with docFreq > 1,
// initializes the collector and searches for that term — presumably collecting duplicate
// values for a uniqueness check. TODO confirm against the full source.
for ( LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves() ) TermsEnum terms = LuceneDocumentStructure.originalTerms( fields.terms( field ), field ); BytesRef termsRef; while ( (termsRef = terms.next()) != null ) if ( terms.docFreq() > 1 ) collector.init( terms.docFreq() ); searcher.search( new TermQuery( new Term( field, termsRef ) ), collector );
// NOTE(review): incomplete fragment — it opens with a dangling branch whose 'if' is not
// visible and has unbalanced braces. Fills a TermStatsQueue either from a single field's
// TermsEnum or, in the else branch, by resolving the field from MultiFields (throwing when
// the index has no fields). Resembles Lucene's HighFreqTerms utility — TODO confirm.
TermsEnum termsEnum = terms.iterator(); tiq = new TermStatsQueue(numTerms, comparator); tiq.fill(field, termsEnum); } else { Fields fields = MultiFields.getFields(reader); if (fields.size() == 0) { throw new RuntimeException("no fields found for this index"); Terms terms = fields.terms(fieldName); if (terms != null) { tiq.fill(fieldName, terms.iterator());
/**
 * Loads into {@code values} the text of every indexed term of {@code term.field()}
 * matching the prefix {@code term.bytes()}.
 *
 * @param reader index reader to use
 * @param values the list that receives the matching term texts
 * @param term   the term supplying the field and the prefix bytes
 * @throws IOException if an error is thrown by the prefix term enumeration
 */
public static void prefix(IndexReader reader, List<String> values, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
        return;
    }
    org.apache.lucene.index.Terms terms = fields.terms(term.field());
    if (terms == null) {
        return;
    }
    // Intersect the field's term dictionary with a prefix automaton built from the term.
    TermsEnum matches = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), term.bytes());
    for (BytesRef hit = matches.next(); hit != null; hit = matches.next()) {
        values.add(hit.utf8ToString());
    }
}
/**
 * Counts the number of distinct terms in every field of the index, accumulating the
 * grand total into {@code numTerms} as a side effect.
 *
 * @return one {@link FieldTermCount} per field (TreeSet ordering)
 * @throws IOException on index access failure
 */
public Set<FieldTermCount> getFieldTermCounts() throws IOException {
    Set<FieldTermCount> termCounts = new TreeSet<FieldTermCount>();
    numTerms = 0;
    Fields fields = MultiFields.getFields(reader);
    if (fields != null) {
        // A single TermsEnum instance is reused across fields via terms.iterator(reuse).
        TermsEnum reusableEnum = null;
        for (Iterator<String> fieldNames = fields.iterator(); fieldNames.hasNext(); ) {
            String fieldName = fieldNames.next();
            long perFieldCount = 0L;
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                reusableEnum = terms.iterator(reusableEnum);
                while (reusableEnum.next() != null) {
                    perFieldCount++;
                    numTerms++;
                }
            }
            termCounts.add(new FieldTermCount(fieldName, perFieldCount));
        }
    }
    return termCounts;
}