/**
 * Retrieve term vector for this document and field, or
 * null if term vectors were not indexed. The returned
 * Fields instance acts like a single-document inverted
 * index (the docID will be 0).
 */
public final Terms getTermVector(int docID, String field) throws IOException {
  Fields vectors = getTermVectors(docID);
  if (vectors == null) {
    return null;
  }
  return vectors.terms(field);
}
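// A minimal usage sketch (not part of the source above): reading back the terms of
// one document's vector via getTermVector. Assumes an open IndexReader "reader", a
// document id "docID", and a field "body" that was indexed with term vectors.
Terms vector = reader.getTermVector(docID, "body");
if (vector != null) {
  TermsEnum termsEnum = vector.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // since the returned Terms covers exactly one document,
    // totalTermFreq() is the within-document frequency
    System.out.println(term.utf8ToString() + " -> " + termsEnum.totalTermFreq());
  }
}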
@Override
public final Fields getTermVectors(int docID) throws IOException {
  ensureOpen();
  final int i = readerIndex(docID); // find subreader num
  return subReaders[i].getTermVectors(docID - starts[i]); // dispatch to subreader
}
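// A sketch of what readerIndex(docID) amounts to (in Lucene it delegates to
// ReaderUtil.subIndex): a binary search over "starts", the array of docBase
// offsets, for the subreader whose doc range contains the global docID.
private int readerIndex(int docID) {
  int lo = 0, hi = subReaders.length - 1;
  while (lo <= hi) {
    int mid = (lo + hi) >>> 1;
    if (docID < starts[mid]) {
      hi = mid - 1; // docID belongs to an earlier subreader
    } else if (mid < subReaders.length - 1 && docID >= starts[mid + 1]) {
      lo = mid + 1; // docID belongs to a later subreader
    } else {
      return mid;   // starts[mid] <= docID < starts[mid + 1]
    }
  }
  return lo; // unreachable for valid docIDs
}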
Map<String, Int> termFreqMap = new HashMap<>();
for (String fieldName : fieldNames) {
  final Fields vectors = ir.getTermVectors(docNum);
  final Terms vector;
  if (vectors != null) {
    vector = vectors.terms(fieldName);
  } else {
    vector = null;
  }
  // ...
}
private boolean tryExtractTermsFromTermVector(int docNum, String indexedField,
    IndexReader ir, StringBuilder sb) throws IOException {
  final Fields vectors = ir.getTermVectors(docNum);
  if (vectors != null) {
    Terms vector = vectors.terms(indexedField);
    if (vector == null) {
      return false;
    }
    List<String> terms = TermExtractionHelper.getTermsFromTermVectorField(vector);
    for (String term : terms) {
      sb.append(term).append(DELIM);
    }
    return true;
  }
  return false;
}
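// TermExtractionHelper is project-specific and its body is not shown here; a
// plausible sketch of getTermsFromTermVectorField, assuming it simply walks the
// single-document TermsEnum and collects the term texts:
public static List<String> getTermsFromTermVectorField(Terms vector) throws IOException {
  List<String> terms = new ArrayList<>();
  TermsEnum termsEnum = vector.iterator();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    terms.add(text.utf8ToString());
  }
  return terms;
}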
public static HitEnum fromTermVectors(IndexReader reader, int docId, String fieldName,
    CompiledAutomaton acceptable, TermWeigher<BytesRef> queryWeigher,
    TermWeigher<BytesRef> corpusWeigher, TermSourceFinder<BytesRef> sourceFinder)
    throws IOException {
  Fields vectors = reader.getTermVectors(docId);
  if (vectors == null) {
    // No term vectors so no hits
    return EmptyHitEnum.INSTANCE;
  }
  return fromTerms(vectors.terms(fieldName), acceptable, -1, queryWeigher,
      corpusWeigher, sourceFinder);
}
/**
 * Checks that term vectors across all fields are equivalent.
 */
public void assertTermVectorsEquals(String info, IndexReader leftReader, IndexReader rightReader) throws IOException {
  assert leftReader.maxDoc() == rightReader.maxDoc();
  for (int i = 0; i < leftReader.maxDoc(); i++) {
    Fields leftFields = leftReader.getTermVectors(i);
    Fields rightFields = rightReader.getTermVectors(i);
    assertFieldsEquals(info, leftReader, leftFields, rightFields, rarely());
  }
}
@Override
public int intVal(int docNum) {
  try {
    // SH: Fastest method to do this is if the field has term vectors stored,
    // else we have to re-analyze the field, which is not efficient.
    // http://stackoverflow.com/questions/3574106/how-to-count-the-number-of-terms-for-each-document-in-lucene-index
    final Fields vectors = ir.getTermVectors(docNum);
    if (vectors != null) {
      Terms vector = vectors.terms(indexedField);
      if (vector != null) {
        return (int) vector.size();
      }
    }
  } catch (java.io.IOException ex) {
    throw new RuntimeException("caught exception in function " + description()
        + " while reading term vectors for doc: doc=" + docNum, ex);
  }
  return getFieldLengthFromAnalysisChain(docNum, indexedField, ir);
}
};
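// getFieldLengthFromAnalysisChain is not shown here; a plausible sketch of the
// re-analysis fallback, assuming the field's raw text is stored and "analyzer"
// is the analyzer used at index time:
private int getFieldLengthFromAnalysisChain(int docNum, String field, IndexReader ir) {
  try {
    String text = ir.document(docNum).get(field);
    if (text == null) {
      return 0;
    }
    int count = 0;
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
      ts.reset();
      while (ts.incrementToken()) {
        count++; // one token per term emitted by the analysis chain
      }
      ts.end();
    }
    return count;
  } catch (java.io.IOException ex) {
    throw new RuntimeException("failed to re-analyze field " + field + " for doc " + docNum, ex);
  }
}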
/**
 * A convenience method that tries a number of approaches to getting a token
 * stream. The cost of finding there are no termVectors in the index is
 * minimal (1000 invocations still registers 0 ms). So this "lazy" (flexible?)
 * approach to coding is probably acceptable.
 *
 * @return null if field not stored correctly
 * @throws IOException If there is a low-level I/O error
 */
@Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getAnyTokenStream(IndexReader reader, int docId, String field, Analyzer analyzer) throws IOException {
  TokenStream ts = null;
  Fields vectors = reader.getTermVectors(docId);
  if (vectors != null) {
    Terms vector = vectors.terms(field);
    if (vector != null) {
      ts = getTokenStream(vector);
    }
  }
  // No token info stored so fall back to analyzing raw content
  if (ts == null) {
    ts = getTokenStream(reader, docId, field, analyzer);
  }
  return ts;
}
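// A minimal usage sketch: feeding the resulting stream into a Highlighter, much as
// the benchmark code further below does. Assumes "reader", "docId", "analyzer", a
// configured "highlighter", and the stored text of the field in "text".
TokenStream ts = TokenSources.getAnyTokenStream(reader, docId, "body", analyzer);
// getBestFragment also throws InvalidTokenOffsetsException, elided here
String fragment = highlighter.getBestFragment(ts, text);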
Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
for (String fieldName : fieldNames) {
  final Fields vectors = ir.getTermVectors(docNum);
  final Terms vector;
  if (vectors != null) {
    vector = vectors.terms(fieldName);
  } else {
    vector = null;
  }
  // ...
}
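// A sketch (not drop-in code: "Int" above is a project-specific mutable counter,
// Integer is used here) of how such a vector can be turned into a term-frequency map:
Map<String, Integer> termFreqMap = new HashMap<>();
if (vector != null) {
  TermsEnum termsEnum = vector.iterator();
  BytesRef text;
  while ((text = termsEnum.next()) != null) {
    // for a single-document vector, totalTermFreq() is the in-document frequency
    termFreqMap.put(text.utf8ToString(), (int) termsEnum.totalTermFreq());
  }
}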
private void extractFeaturesFromDocument(int docNum, IndexReader ir, String featureLabel,
    Map<String, Map<String, Integer>> featureMap) throws IOException {
  if (fields == null || fields.length == 0) {
    return;
  }
  final Fields vectors = ir.getTermVectors(docNum);
  final Document document = ir.document(docNum);
  for (String fieldName : fields) {
    Terms vector = null;
    if (vectors != null) {
      vector = vectors.terms(fieldName);
    }
    // The field does not store term vector info; even with term vectors enabled,
    // we need to extract the payload from the regular field reader.
    if (vector == null) {
      IndexableField[] docFields = document.getFields(fieldName);
      for (IndexableField field : docFields) {
        final String stringValue = field.stringValue();
        if (stringValue != null) {
          List<String> lstTerms = TermExtractionHelper.getTermsFromString(analyzer, fieldName, stringValue);
          Set<String> newTerms = new HashSet<>(lstTerms);
          this.addFeaturesToMap(fieldName, featureLabel, newTerms, featureMap);
        }
      }
    } else {
      List<String> lstTerms = TermExtractionHelper.getTermsFromTermVectorField(vector);
      Set<String> newTerms = new HashSet<>(lstTerms);
      this.addFeaturesToMap(fieldName, featureLabel, newTerms, featureMap);
    }
  }
}
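// getTermsFromString is likewise project-specific; a plausible sketch, assuming it
// runs the string through the analyzer and collects the emitted token texts:
public static List<String> getTermsFromString(Analyzer analyzer, String fieldName, String value) throws IOException {
  List<String> terms = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream(fieldName, value)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      terms.add(termAtt.toString());
    }
    ts.end();
  }
  return terms;
}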
@Override
public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
  IndexReader reader = searcher.getIndexReader();
  highlighter.setFragmentScorer(new QueryScorer(q));
  // highlighter.setTextFragmenter(); unfortunately no sentence mechanism, not even regex. Default here is trivial.
  for (ScoreDoc scoreDoc : docIdOrder(hits.scoreDocs)) {
    Document document = reader.document(scoreDoc.doc, hlFields);
    Fields tvFields = termVecs ? reader.getTermVectors(scoreDoc.doc) : null;
    for (IndexableField indexableField : document) {
      TokenStream tokenStream;
      if (termVecs) {
        tokenStream = TokenSources.getTokenStream(indexableField.name(), tvFields,
            indexableField.stringValue(), analyzer, maxDocCharsToAnalyze);
      } else {
        tokenStream = analyzer.tokenStream(indexableField.name(), indexableField.stringValue());
      }
      // will close TokenStream:
      String[] fragments = highlighter.getBestFragments(tokenStream, indexableField.stringValue(), maxFrags);
      preventOptimizeAway = fragments.length;
    }
  }
}
}
private void runQuery(String query, int expectedPosition) throws ParseException, IOException {
  HebrewQueryParser hqp = new HebrewQueryParser("Text", analyzer);
  Query q = hqp.parse(query);
  TopDocs td = searcher.search(q, searcher.getIndexReader().maxDoc());
  int num = td.scoreDocs[0].doc;
  Terms terms = searcher.getIndexReader().getTermVectors(num).terms("Text");
  Set<Term> trms_list = new HashSet<>();
  searcher.createWeight(q, true, 1.0f).extractTerms(trms_list);
  // q.extractTerms(trms_list);
  for (Term t : trms_list) {
    TermsEnum termsEnum = terms.iterator();
    boolean isFound = termsEnum.seekExact(t.bytes());
    Assert.assertTrue(isFound);
    PostingsEnum dpEnum = termsEnum.postings(null);
    assertTrue(dpEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int pos = dpEnum.nextPosition();
    // assertEquals(expectedPosition, dpEnum.startOffset());
    // assertEquals(??, dpEnum.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, dpEnum.nextDoc());
    assertEquals(expectedPosition, pos); // JUnit convention: expected value first
  }
}
}