/** * Extracts terms of the documents; Adds them to vector in the same order * * @param hits - from which to extract terms * * @return docsTerms docs must be in order */ public Vector<Terms> getDocsTerms( Vector<Document> hits) throws IOException, ParseException { Vector<Terms> docsTerms = new Vector<>(); // Process each of the documents for ( int i = 0; i < hits.size(); i++ ) { Document doc = hits.elementAt( i ); int docid = num2id.get(doc.get("docnum")); Terms t = reader.getTermVector(docid, "all"); docsTerms.add(t); } return docsTerms; }
/**
 * Searches the index for {@code queryString} (escaped, so user input is taken
 * literally) and returns the "text" term vector of each hit, after resolving
 * possible redirects.
 */
public ArrayList<Terms> getTermFreqVectors(String queryString) throws ParseException, IOException{
    // Escape Lucene query syntax before parsing.
    Query query = queryParser.parse(QueryParser.escape(queryString));
    ScoreDoc[] hits = indexSearcher.search(query, maxHits).scoreDocs;
    ArrayList<Terms> termFreqVectors = new ArrayList<Terms>(hits.length);
    for (int i = 0; i < hits.length; i++) {
        // Follow a redirect (if any) to the document whose vector we want.
        ScoreDoc resolved = handlePossibleRedirect(hits[i]);
        termFreqVectors.add(indexReader.getTermVector(resolved.doc, "text"));
    }
    return termFreqVectors;
}
/**
 * Looks up documents matching {@code queryString} and collects the stored
 * "text" field term vector for every result (redirects resolved first).
 */
public ArrayList<Terms> getTermFreqVectors(String queryString) throws ParseException, IOException{
    String literalQuery = QueryParser.escape(queryString); // neutralize query operators
    Query parsed = queryParser.parse(literalQuery);
    ScoreDoc[] results = indexSearcher.search(parsed, maxHits).scoreDocs;
    ArrayList<Terms> vectors = new ArrayList<Terms>();
    for (ScoreDoc hit : results) {
        ScoreDoc target = handlePossibleRedirect(hit);
        vectors.add(indexReader.getTermVector(target.doc, "text"));
    }
    return vectors;
}
/**
 * Scores a document by the number of distinct terms in its {@code _field}
 * term vector; the sub-query and value-source scores are ignored.
 *
 * @param doc           Lucene-internal document id (segment-relative)
 * @param subQueryScore score of the wrapped query (unused)
 * @param valSrcScores  value-source scores (unused)
 * @return the distinct-term count of the document's term vector, or 0 if none stored
 * @throws IOException if the term vector cannot be read
 */
public float customScore(int doc, float subQueryScore, float valSrcScores[]) throws IOException {
    IndexReader r = context.reader();
    Terms tv = r.getTermVector(doc, _field);
    if (tv == null) {
        // getTermVector returns null when no term vector was stored for this
        // doc/field; the original dereferenced it and threw NPE. Score as 0 terms.
        return 0f;
    }
    TermsEnum termsEnum = tv.iterator();
    int numTerms = 0;
    while (termsEnum.next() != null) {
        numTerms++;
    }
    return (float) numTerms;
}
/**
 * Gets the frequency of each term contained in the document, and records every
 * term seen in the instance-level {@code terms} set as a side effect.
 *
 * @param reader index reader providing stored term vectors
 * @param docId  Lucene-internal document id
 * @return term text mapped to its in-document frequency; empty if no term
 *         vector is stored for the CONTENT field
 * @throws IOException if the term vector cannot be read
 */
private Map<String, Integer> getTermFrequencies(IndexReader reader, int docId) throws IOException {
    Map<String, Integer> frequencies = new HashMap<>();
    Terms vector = reader.getTermVector(docId, CONTENT);
    if (vector == null) {
        // No stored term vector for this doc/field: the original threw NPE here.
        return frequencies;
    }
    TermsEnum termsEnum = vector.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        // For a single-document Terms (a term vector), totalTermFreq is the
        // within-document frequency.
        int freq = (int) termsEnum.totalTermFreq();
        frequencies.put(term, freq);
        terms.add(term); // side effect: accumulate the global term set
    }
    return frequencies;
}
/** * returns term freq for a given doc. * * @param reader * @param field * @return * @throws IOException */ public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException { Map<String, Float> termFrequencies = new HashMap<>(); Terms terms = reader.getTermVector(docID, field); TermsEnum itr = terms.iterator(); BytesRef term = null; while ((term = itr.next()) != null) { String termText = term.utf8ToString(); long termFreq = itr.totalTermFreq(); // term freq in doc with docID termFrequencies.put(termText, (float) termFreq); } return termFrequencies; }
@Override public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext<Integer> context) { Document[] documents = docs.documents; IndexReader reader = context.getIndexSearcher().getIndexReader(); int qid = context.getQueryId(); LOG.info("Beginning rerank"); for (int i =0; i < docs.documents.length; i++ ) { try { Terms terms = reader.getTermVector(docs.ids[i], LuceneDocumentGenerator.FIELD_BODY); float[] features = this.extractorChain.extractAll(documents[i], terms, context); String docId = documents[i].get(LuceneDocumentGenerator.FIELD_ID); // QREL 0 in this case, will be assigned if needed later //qid BaseFeatureExtractor.writeFeatureVector(out, qid, this.qrels.getRelevanceGrade(qid, docId), docId, features); LOG.info("Finished writing vectors"); } catch (IOException e) { LOG.error(String.format("IOExecption trying to retrieve feature vector for %d doc", docs.ids[i])); continue; } } // Does nothing to the actual docs, we just need to extract the feature vector return docs; }
/** * returns term freq for a given doc. * * @param reader * @param field * @return * @throws IOException */ public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException { Map<String, Float> termFrequencies = new HashMap<>(); Terms terms = reader.getTermVector(docID, field); TermsEnum itr = terms.iterator(); BytesRef term = null; while ((term = itr.next()) != null) { String termText = term.utf8ToString(); long termFreq = itr.totalTermFreq(); // term freq in doc with docID termFrequencies.put(termText, (float) termFreq); } return termFrequencies; }
/** * returns term freq for a given doc. * * @param reader * @param field * @param tfidfSIM * @return * @throws IOException */ public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException { Map<String, Float> termFrequencies = new HashMap<>(); Terms terms = reader.getTermVector(docID, field); TermsEnum itr = terms.iterator(null); BytesRef term = null; while ((term = itr.next()) != null) { String termText = term.utf8ToString(); long termFreq = itr.totalTermFreq(); // term freq in doc with docID termFrequencies.put(termText, (float) termFreq); } return termFrequencies; }
/**
 * Writes one RankLib-style training line per candidate document to {@code out}:
 * relevance grade, qid, the retrieval score as feature 1, then the extractor
 * chain's features numbered from 2, ending with a "# docid:" comment.
 * The ranking itself is returned unmodified.
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    for (int i = 0; i < docs.documents.length; i++) {
        Terms terms = null;
        try {
            terms = reader.getTermVector(docs.ids[i], TweetGenerator.FIELD_BODY);
        } catch (IOException e) {
            // Skip documents whose term vector cannot be read.
            continue;
        }
        // Strip the "MB" prefix and leading zeros from the topic id
        // (e.g. "MB001" -> "1") to match the qrels key format.
        String qid = ((String) context.getQueryId()).replaceFirst("^MB0*", "");
        String docid = docs.documents[i].getField(TweetGenerator.FIELD_ID).stringValue();
        out.print(qrels.getRelevanceGrade(qid, docid));
        out.print(" qid:" + qid);
        // Feature 1 is the original retrieval score.
        out.print(" 1:" + docs.scores[i]);
        float[] intFeatures = this.extractorChain.extractAll(docs.documents[i], terms, context);
        // Remaining features are numbered starting at 2.
        for (int j = 0; j < intFeatures.length; j++) {
            out.print(" " + (j + 2) + ":" + intFeatures[j]);
        }
        out.print(" # docid:" + docid);
        out.print("\n");
    }
    return docs;
}
/**
 * Converts the stored body-field term vector of {@code docid} into a map of
 * term text to within-document frequency.
 *
 * @param reader index reader providing stored term vectors
 * @param docid  external (collection) document id; translated to a Lucene id
 * @return term-to-frequency map; empty (or partial) on error or when no term
 *         vector is stored
 */
private Map<String, Long> convertDocVectorToMap(IndexReader reader, String docid) {
    Map<String, Long> m = new HashMap<>();
    try {
        Terms terms = reader.getTermVector(
            NewsBackgroundLinkingTopicReader.convertDocidToLuceneDocid(reader, docid), FIELD_BODY);
        if (terms == null) {
            // Explicit guard: the original relied on the catch-all below to
            // swallow the resulting NullPointerException (with a stack trace).
            return m;
        }
        TermsEnum it = terms.iterator();
        while (it.next() != null) {
            m.put(it.term().utf8ToString(), it.totalTermFreq());
        }
    } catch (Exception e) {
        // Best-effort: log and return whatever was collected so far.
        e.printStackTrace();
    }
    return m;
}
/**
 * Prints diagnostic length information for a document: the "title" field's
 * sum-doc-freq and term list, the "content" field's distinct-term count, and
 * the combined total. Requires stored term vectors for both fields.
 *
 * @param docid Lucene-internal document id
 * @throws IOException if a term vector cannot be read
 */
public void docLength(int docid) throws IOException {
    /* The direct index must be stored for this to work... how do we store it, though? */
    Terms t = reader.getTermVector(docid, "title");
    long tot = 0;
    if ((t != null) && (t.size() > 0)) {
        // BUG FIX: the original printed t.getSumDocFreq() BEFORE the null
        // check, so a missing title vector caused an NPE. Moved inside.
        System.out.println("title Length: " + t.getSumDocFreq());
        tot = tot + t.size();
        System.out.println("title: " + t.size());
        TermsEnum te = t.iterator();
        BytesRef term = null;
        System.out.println(t.size());
        while ((term = te.next()) != null) {
            System.out.println("terms: " + term.utf8ToString());
        }
    }
    t = reader.getTermVector(docid, "content");
    if ((t != null) && (t.size() > 0)) {
        tot = tot + t.size();
        System.out.println("content: " + t.size());
    }
    System.out.println("Doc Length: " + tot);
}
/**
 * Accumulates the weighted term frequencies of one document into the shared
 * {@code termCounts} map and adds the document's weighted length to
 * {@code doc_len}.
 *
 * @param doc_id Lucene-internal document id whose term vector is read
 * @param weight multiplier applied to every term frequency of this document
 */
private void updateTermCountMap(int doc_id, double weight) {
    try {
        Terms t = reader.getTermVector(doc_id, field);
        if ((t != null) && (t.size() > 0)) {
            TermsEnum te = t.iterator();
            BytesRef term;
            PostingsEnum p = null;
            while ((term = te.next()) != null) {
                String termText = term.utf8ToString();
                // Hoist the repeated totalTermFreq()*weight computation.
                double weighted = te.totalTermFreq() * weight;
                // Map.merge replaces the containsKey/get/put sequence.
                termCounts.merge(termText, weighted, Double::sum);
                doc_len = doc_len + weighted;
                p = te.postings(p, PostingsEnum.ALL); // reuse postings across terms
            }
        }
    } catch (IOException e) {
        // NOTE(review): terminating the JVM on an index read error is drastic;
        // consider propagating instead. Original behavior preserved.
        e.printStackTrace();
        System.exit(1);
    }
}
Terms termVector = luceneIndexReader.getTermVector(i, predicate.getAttribute());
/**
 * Prints the relevance grade and qid of every candidate and extracts its
 * feature vector; the model-based reranking step itself is still a TODO.
 * The incoming ranking is returned unchanged.
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext<Integer> context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    for (int i = 0; i < docs.documents.length; i++) {
        Terms terms = null;
        try {
            terms = reader.getTermVector(docs.ids[i], TweetGenerator.FIELD_BODY);
        } catch (IOException e) {
            // Skip documents whose term vector cannot be read.
            continue;
        }
        int qid = context.getQueryId();
        String docid = docs.documents[i].getField(
            TweetGenerator.FIELD_ID).stringValue();
        out.print(qrels.getRelevanceGrade(qid, docid));
        out.print(" qid:" + qid);
        // Features are computed but not yet consumed — see TODO below.
        float[] intFeatures = this.extractors.extractAll(docs.documents[i], terms, context);
        // TODO use model to rerank
    }
    return docs;
}
/**
 * Adjusts per-term weights using the term vector of one training document.
 * For every term in the document, the current weight stored in the FST is
 * shifted by {@code modifier * termFreq} (clamped at zero) and written into
 * {@code weights}; the FST is then optionally rebuilt from the updated map.
 *
 * @param indexReader   reader providing the document's stored term vector
 * @param docId         Lucene-internal id of the training document
 * @param assignedClass class label; when null, no weight updates are applied
 * @param weights       destination map of term -> updated weight
 * @param modifier      signed learning step applied per occurrence
 * @param updateFST     whether to rebuild the FST from {@code weights} at the end
 * @throws IOException if term vectors are missing or the index cannot be read
 */
private void updateWeights(IndexReader indexReader, int docId, Boolean assignedClass,
    SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
    TermsEnum cte = textTerms.iterator();
    // get the doc term vectors
    Terms terms = indexReader.getTermVector(docId, textFieldName);
    if (terms == null) {
        throw new IOException("term vectors must be stored for field " + textFieldName);
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        // Position the class-terms enum on this term (required before FST ops).
        cte.seekExact(term);
        if (assignedClass != null) {
            long termFreqLocal = termsEnum.totalTermFreq();
            // update weights
            Long previousValue = Util.get(fst, term);
            String termString = term.utf8ToString();
            // NOTE(review): a term absent from the FST gets weight 0 rather than
            // modifier * termFreqLocal — confirm this asymmetry is intended.
            weights.put(termString, previousValue == null ? 0
                : Math.max(0, previousValue + modifier * termFreqLocal));
        }
    }
    if (updateFST) {
        updateFST(weights);
    }
}
String luceneName, Map<String, Integer> freq) { try { org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName); if (terms == null) { throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
Terms t = reader.getTermVector(docid, "title");
@Test public void testNoTermVector() throws Exception { FullTextSession s = Search.getFullTextSession( openSession() ); Transaction tx = s.beginTransaction(); Employee e1 = new Employee( 1000, "Griffin", "ITech" ); s.save( e1 ); tx.commit(); s.clear(); tx = s.beginTransaction(); // Here's how to get a reader from a FullTextSession SearchFactory searchFactory = s.getSearchFactory(); IndexReader reader = searchFactory.getIndexReaderAccessor().open( Employee.class ); Terms termVector = reader.getTermVector( 0, "dept" ); assertNull( "should not find a term position vector", termVector ); // cleanup for ( Object element : s.createQuery( "from " + ElectricalProperties.class.getName() ).list() ) { s.delete( element ); } searchFactory.getIndexReaderAccessor().close( reader ); tx.commit(); s.close(); }
Terms termVector = reader.getTermVector( x, "content" ); assertNotNull( termVector ); TermsEnum iterator = termVector.iterator();