/**
 * Retrieve term vector for this document and field, or
 * null if term vectors were not indexed. The returned
 * Fields instance acts like a single-document inverted
 * index (the docID will be 0).
 */
public final Terms getTermVector(int docID, String field) throws IOException {
  Fields vectors = getTermVectors(docID);
  if (vectors == null) {
    return null;
  }
  return vectors.terms(field);
}
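// A minimal usage sketch (not part of the source above): reading back the terms of
// one document's vector via getTermVector. Assumes an open IndexReader "reader", a
// document id "docID", and a field "body" that was indexed with term vectors.
Terms vector = reader.getTermVector(docID, "body");
if (vector != null) {
  TermsEnum termsEnum = vector.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // since the returned Terms covers exactly one document,
    // totalTermFreq() is the within-document frequency
    System.out.println(term.utf8ToString() + " -> " + termsEnum.totalTermFreq());
  }
}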
@Override
public final Fields getTermVectors(int docID) throws IOException {
  ensureOpen();
  final int i = readerIndex(docID); // find subreader num
  return subReaders[i].getTermVectors(docID - starts[i]); // dispatch to subreader
}
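// A sketch of what readerIndex(docID) amounts to (in Lucene it delegates to
// ReaderUtil.subIndex): a binary search over "starts", the array of docBase
// offsets, for the subreader whose doc range contains the global docID.
private int readerIndex(int docID) {
  int lo = 0, hi = subReaders.length - 1;
  while (lo <= hi) {
    int mid = (lo + hi) >>> 1;
    if (docID < starts[mid]) {
      hi = mid - 1; // docID belongs to an earlier subreader
    } else if (mid < subReaders.length - 1 && docID >= starts[mid + 1]) {
      lo = mid + 1; // docID belongs to a later subreader
    } else {
      return mid;   // starts[mid] <= docID < starts[mid + 1]
    }
  }
  return lo; // unreachable for valid docIDs
}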
Map<String, Int> termFreqMap = new HashMap<>();
for (String fieldName : fieldNames) {
  final Fields vectors = ir.getTermVectors(docNum);
  final Terms vector;
  if (vectors != null) {
    vector = vectors.terms(fieldName);
  } else {
    vector = null;
  }
  // ...
}
private boolean tryExtractTermsFromTermVector(int docNum, String indexedField,
    IndexReader ir, StringBuilder sb) throws IOException {
  final Fields vectors = ir.getTermVectors(docNum);
  if (vectors != null) {
    Terms vector = vectors.terms(indexedField);
    if (vector == null) {
      return false;
    }
    List<String> terms = TermExtractionHelper.getTermsFromTermVectorField(vector);
    for (String term : terms) {
      sb.append(term).append(DELIM);
    }
    return true;
  }
  return false;
}
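// TermExtractionHelper is project-specific and its body is not shown here; a
// plausible sketch of getTermsFromTermVectorField, assuming it simply walks the
// single-document TermsEnum and collects the term texts:
public static List<String> getTermsFromTermVectorField(Terms vector) throws IOException {
  List<String> terms = new ArrayList<>();
  TermsEnum termsEnum = vector.iterator();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    terms.add(text.utf8ToString());
  }
  return terms;
}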
public static HitEnum fromTermVectors(IndexReader reader, int docId, String fieldName,
    CompiledAutomaton acceptable, TermWeigher<BytesRef> queryWeigher,
    TermWeigher<BytesRef> corpusWeigher, TermSourceFinder<BytesRef> sourceFinder)
    throws IOException {
  Fields vectors = reader.getTermVectors(docId);
  if (vectors == null) {
    // No term vectors so no hits
    return EmptyHitEnum.INSTANCE;
  }
  return fromTerms(vectors.terms(fieldName), acceptable, -1, queryWeigher,
      corpusWeigher, sourceFinder);
}
/**
 * Checks that term vectors across all fields are equivalent.
 */
public void assertTermVectorsEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException {
  assert leftReader.maxDoc() == rightReader.maxDoc();
  for (int i = 0; i < leftReader.maxDoc(); i++) {
    Fields leftFields = leftReader.getTermVectors(i);
    Fields rightFields = rightReader.getTermVectors(i);
    assertFieldsEquals(info, leftReader, leftFields, rightFields, rarely());
  }
}
@Override
public int intVal(int docNum) {
  try {
    // SH: Fastest method to do this is if the field has term vectors stored,
    // else we have to re-analyze the field, which is not efficient.
    // http://stackoverflow.com/questions/3574106/how-to-count-the-number-of-terms-for-each-document-in-lucene-index
    final Fields vectors = ir.getTermVectors(docNum);
    if (vectors != null) {
      Terms vector = vectors.terms(indexedField);
      if (vector != null) {
        return (int) vector.size();
      }
    }
  } catch (java.io.IOException ex) {
    throw new RuntimeException("caught exception in function " + description()
        + " while reading term vectors for doc: doc=" + docNum, ex);
  }
  return getFieldLengthFromAnalysisChain(docNum, indexedField, ir);
}
};
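// getFieldLengthFromAnalysisChain is not shown here; a plausible sketch of the
// re-analysis fallback, assuming the field's raw text is stored and "analyzer"
// is the analyzer used at index time:
private int getFieldLengthFromAnalysisChain(int docNum, String field, IndexReader ir) {
  try {
    String text = ir.document(docNum).get(field);
    if (text == null) {
      return 0;
    }
    int count = 0;
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
      ts.reset();
      while (ts.incrementToken()) {
        count++; // one token per term emitted by the analysis chain
      }
      ts.end();
    }
    return count;
  } catch (java.io.IOException ex) {
    throw new RuntimeException("failed to re-analyze field " + field + " for doc " + docNum, ex);
  }
}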
/**
 * A convenience method that tries a number of approaches to getting a token
 * stream. The cost of finding there are no termVectors in the index is
 * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
 * approach to coding is probably acceptable.
 *
 * @return null if field not stored correctly
 * @throws IOException If there is a low-level I/O error
 */
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
  TokenStream ts = null;
  Fields vectors = reader.getTermVectors(docId);
  if (vectors != null) {
    Terms vector = vectors.terms(field);
    if (vector != null) {
      ts = getTokenStream(vector);
    }
  }
  // No token info stored so fall back to analyzing raw content
  if (ts == null) {
    ts = getTokenStream(reader, docId, field, analyzer);
  }
  return ts;
}
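// A minimal usage sketch: feeding the resulting stream into a Highlighter, much as
// the benchmark code further below does. Assumes "reader", "docId", "analyzer", a
// configured "highlighter", and the stored text of the field in "text".
TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
// getBestFragment also throws InvalidTokenOffsetsException, elided here
String fragment = highlighter.getBestFragment(ts, text);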
Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
for (String fieldName : fieldNames) {
  final Fields vectors = ir.getTermVectors(docNum);
  final Terms vector;
  if (vectors != null) {
    vector = vectors.terms(fieldName);
  } else {
    vector = null;
  }
  // ...
}
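// A sketch (not drop-in code: "Int" above is a project-specific mutable counter,
// Integer is used here) of how such a vector can be turned into a term-frequency map:
Map<String, Integer> termFreqMap = new HashMap<>();
if (vector != null) {
  TermsEnum termsEnum = vector.iterator();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    // for a single-document vector, totalTermFreq() is the in-document frequency
    termFreqMap.put(text.utf8ToString(), (int) termsEnum.totalTermFreq());
  }
}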
private void extractFeaturesFromDocument(int docNum, IndexReader ir, String featureLabel,
    Map<String, Map<String, Integer>> featureMap) throws IOException {
  if (fields == null || fields.length == 0) {
    return;
  }
  final Fields vectors = ir.getTermVectors(docNum);
  final Document document = ir.document(docNum);
  for (String fieldName : fields) {
    Terms vector = null;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    }
    // The field does not store term vector info; even with term vectors enabled,
    // we need to extract the payload from the regular field reader.
    if (vector == null) {
      IndexableField[] docFields = document.getFields(fieldName);
      for (IndexableField field : docFields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          List<String> lstTerms = TermExtractionHelper.getTermsFromString(analyzer, fieldName, stringValue);
          Set<String> newTerms = new HashSet<>(lstTerms);
          this.addFeaturesToMap(fieldName, featureLabel, newTerms, featureMap);
        }
      }
    } else {
      List<String> lstTerms = TermExtractionHelper.getTermsFromTermVectorField(vector);
      Set<String> newTerms = new HashSet<>(lstTerms);
      this.addFeaturesToMap(fieldName, featureLabel, newTerms, featureMap);
    }
  }
}
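// getTermsFromString is likewise project-specific; a plausible sketch, assuming it
// runs the string through the analyzer and collects the emitted token texts:
public static List<String> getTermsFromString(Analyzer analyzer, String fieldName, String value) throws IOException {
  List<String> terms = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream(fieldName, value)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      terms.add(termAtt.toString());
    }
    ts.end();
  }
  return terms;
}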
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
  IndexReader reader = searcher.getIndexReader();
  highlighter.setFragmentScorer(new QueryScorer(q));
  // highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. Default here is trivial.
  for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
    Document document = reader.document(scoreDoc.doc, hlFields);
    Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
    for (IndexableField indexableField : document) {
      TokenStream tokenStream;
      if (termVecs) {
        tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields,
            indexableField.stringValue(), analyzer, maxDocCharsToAnalyze);
      } else {
        tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
      }
      // will close TokenStream:
      String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
      preventOptimizeAway = fragments.length;
    }
  }
}
}
private void runQuery(String query, int expectedPosition) throws ParseException, IOException {
  HebrewQueryParser hqp = new HebrewQueryParser("Text", analyzer);
  Query q = hqp.parse(query);
  TopDocs td = searcher.search(q, searcher.getIndexReader().maxDoc());
  int num = td.scoreDocs[0].doc;
  Terms terms = searcher.getIndexReader().getTermVectors(num).terms("Text");
  Set<Term> trms_list = new HashSet<>();
  searcher.createWeight(q, true, 1.0f).extractTerms(trms_list);
  // q.extractTerms(trms_list);
  for (Term t : trms_list) {
    TermsEnum termsEnum = terms.iterator();
    boolean isFound = termsEnum.seekExact(t.bytes());
    Assert.assertTrue(isFound);
    PostingsEnum dpEnum = termsEnum.postings(null);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int pos = dpEnum.nextPosition();
    // assertEquals(expectedPosition, dpEnum.startOffset());
    // assertEquals(??, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    assertEquals(expectedPosition, pos); // JUnit convention: expected value first
  }
}
}