/** * Extracts terms of the documents; Adds them to vector in the same order * * @param hits - from which to extract terms * * @return docsTerms docs must be in order */ public Vector<Terms> getDocsTerms( Vector<Document> hits) throws IOException, ParseException { Vector<Terms> docsTerms = new Vector<>(); // Process each of the documents for ( int i = 0; i < hits.size(); i++ ) { Document doc = hits.elementAt( i ); int docid = num2id.get(doc.get("docnum")); Terms t = reader.getTermVector(docid, "all"); docsTerms.add(t); } return docsTerms; }
/**
 * Searches the index for {@code queryString} (escaped, so user input is taken
 * literally) and returns the "text" term vector of each hit, after resolving
 * possible redirects.
 */
public ArrayList<Terms> getTermFreqVectors(String queryString) throws ParseException, IOException{
    // Escape Lucene query syntax before parsing.
    Query query = queryParser.parse(QueryParser.escape(queryString));
    ScoreDoc[] hits = indexSearcher.search(query, maxHits).scoreDocs;
    ArrayList<Terms> termFreqVectors = new ArrayList<Terms>(hits.length);
    for (int i = 0; i < hits.length; i++) {
        // Follow a redirect (if any) to the document whose vector we want.
        ScoreDoc resolved = handlePossibleRedirect(hits[i]);
        termFreqVectors.add(indexReader.getTermVector(resolved.doc, "text"));
    }
    return termFreqVectors;
}
/**
 * Looks up documents matching {@code queryString} and collects the stored
 * "text" field term vector for every result (redirects resolved first).
 */
public ArrayList<Terms> getTermFreqVectors(String queryString) throws ParseException, IOException{
    String literalQuery = QueryParser.escape(queryString); // neutralize query operators
    Query parsed = queryParser.parse(literalQuery);
    ScoreDoc[] results = indexSearcher.search(parsed, maxHits).scoreDocs;
    ArrayList<Terms> vectors = new ArrayList<Terms>();
    for (ScoreDoc hit : results) {
        ScoreDoc target = handlePossibleRedirect(hit);
        vectors.add(indexReader.getTermVector(target.doc, "text"));
    }
    return vectors;
}
/**
 * Scores a document by the number of distinct terms in its {@code _field}
 * term vector; the sub-query and value-source scores are ignored.
 *
 * @param doc           Lucene-internal document id (segment-relative)
 * @param subQueryScore score of the wrapped query (unused)
 * @param valSrcScores  value-source scores (unused)
 * @return the distinct-term count of the document's term vector, or 0 if none stored
 * @throws IOException if the term vector cannot be read
 */
public float customScore(int doc, float subQueryScore, float valSrcScores[]) throws IOException {
    IndexReader r = context.reader();
    Terms tv = r.getTermVector(doc, _field);
    if (tv == null) {
        // getTermVector returns null when no term vector was stored for this
        // doc/field; the original dereferenced it and threw NPE. Score as 0 terms.
        return 0f;
    }
    TermsEnum termsEnum = tv.iterator();
    int numTerms = 0;
    while (termsEnum.next() != null) {
        numTerms++;
    }
    return (float) numTerms;
}
/**
 * Gets the frequency of each term contained in the document, and records every
 * term seen in the instance-level {@code terms} set as a side effect.
 *
 * @param reader index reader providing stored term vectors
 * @param docId  Lucene-internal document id
 * @return term text mapped to its in-document frequency; empty if no term
 *         vector is stored for the CONTENT field
 * @throws IOException if the term vector cannot be read
 */
private Map<String, Integer> getTermFrequencies(IndexReader reader, int docId) throws IOException {
    Map<String, Integer> frequencies = new HashMap<>();
    Terms vector = reader.getTermVector(docId, CONTENT);
    if (vector == null) {
        // No stored term vector for this doc/field: the original threw NPE here.
        return frequencies;
    }
    TermsEnum termsEnum = vector.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        // For a single-document Terms (a term vector), totalTermFreq is the
        // within-document frequency.
        int freq = (int) termsEnum.totalTermFreq();
        frequencies.put(term, freq);
        terms.add(term); // side effect: accumulate the global term set
    }
    return frequencies;
}
/** * returns term freq for a given doc. * * @param reader * @param field * @return * @throws IOException */ public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException { Map<String, Float> termFrequencies = new HashMap<>(); Terms terms = reader.getTermVector(docID, field); TermsEnum itr = terms.iterator(); BytesRef term = null; while ((term = itr.next()) != null) { String termText = term.utf8ToString(); long termFreq = itr.totalTermFreq(); // term freq in doc with docID termFrequencies.put(termText, (float) termFreq); } return termFrequencies; }
@Override public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext<Integer> context) { Document[] documents = docs.documents; IndexReader reader = context.getIndexSearcher().getIndexReader(); int qid = context.getQueryId(); LOG.info("Beginning rerank"); for (int i =0; i < docs.documents.length; i++ ) { try { Terms terms = reader.getTermVector(docs.ids[i], LuceneDocumentGenerator.FIELD_BODY); float[] features = this.extractorChain.extractAll(documents[i], terms, context); String docId = documents[i].get(LuceneDocumentGenerator.FIELD_ID); // QREL 0 in this case, will be assigned if needed later //qid BaseFeatureExtractor.writeFeatureVector(out, qid, this.qrels.getRelevanceGrade(qid, docId), docId, features); LOG.info("Finished writing vectors"); } catch (IOException e) { LOG.error(String.format("IOExecption trying to retrieve feature vector for %d doc", docs.ids[i])); continue; } } // Does nothing to the actual docs, we just need to extract the feature vector return docs; }
/** * returns term freq for a given doc. * * @param reader * @param field * @return * @throws IOException */ public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException { Map<String, Float> termFrequencies = new HashMap<>(); Terms terms = reader.getTermVector(docID, field); TermsEnum itr = terms.iterator(); BytesRef term = null; while ((term = itr.next()) != null) { String termText = term.utf8ToString(); long termFreq = itr.totalTermFreq(); // term freq in doc with docID termFrequencies.put(termText, (float) termFreq); } return termFrequencies; }
/** * returns term freq for a given doc. * * @param reader * @param field * @param tfidfSIM * @return * @throws IOException */ public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException { Map<String, Float> termFrequencies = new HashMap<>(); Terms terms = reader.getTermVector(docID, field); TermsEnum itr = terms.iterator(null); BytesRef term = null; while ((term = itr.next()) != null) { String termText = term.utf8ToString(); long termFreq = itr.totalTermFreq(); // term freq in doc with docID termFrequencies.put(termText, (float) termFreq); } return termFrequencies; }
/**
 * Writes one RankLib-style training line per candidate document to {@code out}:
 * relevance grade, qid, the retrieval score as feature 1, then the extractor
 * chain's features numbered from 2, ending with a "# docid:" comment.
 * The ranking itself is returned unmodified.
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    for (int i = 0; i < docs.documents.length; i++) {
        Terms terms = null;
        try {
            terms = reader.getTermVector(docs.ids[i], TweetGenerator.FIELD_BODY);
        } catch (IOException e) {
            // Skip documents whose term vector cannot be read.
            continue;
        }
        // Strip the "MB" prefix and leading zeros from the topic id
        // (e.g. "MB001" -> "1") to match the qrels key format.
        String qid = ((String) context.getQueryId()).replaceFirst("^MB0*", "");
        String docid = docs.documents[i].getField(TweetGenerator.FIELD_ID).stringValue();
        out.print(qrels.getRelevanceGrade(qid, docid));
        out.print(" qid:" + qid);
        // Feature 1 is the original retrieval score.
        out.print(" 1:" + docs.scores[i]);
        float[] intFeatures = this.extractorChain.extractAll(docs.documents[i], terms, context);
        // Remaining features are numbered starting at 2.
        for (int j = 0; j < intFeatures.length; j++) {
            out.print(" " + (j + 2) + ":" + intFeatures[j]);
        }
        out.print(" # docid:" + docid);
        out.print("\n");
    }
    return docs;
}
/**
 * Converts the stored body-field term vector of {@code docid} into a map of
 * term text to within-document frequency.
 *
 * @param reader index reader providing stored term vectors
 * @param docid  external (collection) document id; translated to a Lucene id
 * @return term-to-frequency map; empty (or partial) on error or when no term
 *         vector is stored
 */
private Map<String, Long> convertDocVectorToMap(IndexReader reader, String docid) {
    Map<String, Long> m = new HashMap<>();
    try {
        Terms terms = reader.getTermVector(
            NewsBackgroundLinkingTopicReader.convertDocidToLuceneDocid(reader, docid), FIELD_BODY);
        if (terms == null) {
            // Explicit guard: the original relied on the catch-all below to
            // swallow the resulting NullPointerException (with a stack trace).
            return m;
        }
        TermsEnum it = terms.iterator();
        while (it.next() != null) {
            m.put(it.term().utf8ToString(), it.totalTermFreq());
        }
    } catch (Exception e) {
        // Best-effort: log and return whatever was collected so far.
        e.printStackTrace();
    }
    return m;
}
/**
 * Prints diagnostic length information for a document: the "title" field's
 * sum-doc-freq and term list, the "content" field's distinct-term count, and
 * the combined total. Requires stored term vectors for both fields.
 *
 * @param docid Lucene-internal document id
 * @throws IOException if a term vector cannot be read
 */
public void docLength(int docid) throws IOException {
    /* The direct index must be stored for this to work... how do we store it, though? */
    Terms t = reader.getTermVector(docid, "title");
    long tot = 0;
    if ((t != null) && (t.size() > 0)) {
        // BUG FIX: the original printed t.getSumDocFreq() BEFORE the null
        // check, so a missing title vector caused an NPE. Moved inside.
        System.out.println("title Length: " + t.getSumDocFreq());
        tot = tot + t.size();
        System.out.println("title: " + t.size());
        TermsEnum te = t.iterator();
        BytesRef term = null;
        System.out.println(t.size());
        while ((term = te.next()) != null) {
            System.out.println("terms: " + term.utf8ToString());
        }
    }
    t = reader.getTermVector(docid, "content");
    if ((t != null) && (t.size() > 0)) {
        tot = tot + t.size();
        System.out.println("content: " + t.size());
    }
    System.out.println("Doc Length: " + tot);
}
/**
 * Accumulates the weighted term frequencies of one document into the shared
 * {@code termCounts} map and adds the document's weighted length to
 * {@code doc_len}.
 *
 * @param doc_id Lucene-internal document id whose term vector is read
 * @param weight multiplier applied to every term frequency of this document
 */
private void updateTermCountMap(int doc_id, double weight) {
    try {
        Terms t = reader.getTermVector(doc_id, field);
        if ((t != null) && (t.size() > 0)) {
            TermsEnum te = t.iterator();
            BytesRef term;
            PostingsEnum p = null;
            while ((term = te.next()) != null) {
                String termText = term.utf8ToString();
                // Hoist the repeated totalTermFreq()*weight computation.
                double weighted = te.totalTermFreq() * weight;
                // Map.merge replaces the containsKey/get/put sequence.
                termCounts.merge(termText, weighted, Double::sum);
                doc_len = doc_len + weighted;
                p = te.postings(p, PostingsEnum.ALL); // reuse postings across terms
            }
        }
    } catch (IOException e) {
        // NOTE(review): terminating the JVM on an index read error is drastic;
        // consider propagating instead. Original behavior preserved.
        e.printStackTrace();
        System.exit(1);
    }
}
Terms termVector = luceneIndexReader.getTermVector(i, predicate.getAttribute());
/**
 * Prints the relevance grade and qid of every candidate and extracts its
 * feature vector; the model-based reranking step itself is still a TODO.
 * The incoming ranking is returned unchanged.
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext<Integer> context) {
    IndexReader reader = context.getIndexSearcher().getIndexReader();
    for (int i = 0; i < docs.documents.length; i++) {
        Terms terms = null;
        try {
            terms = reader.getTermVector(docs.ids[i], TweetGenerator.FIELD_BODY);
        } catch (IOException e) {
            // Skip documents whose term vector cannot be read.
            continue;
        }
        int qid = context.getQueryId();
        String docid = docs.documents[i].getField(
            TweetGenerator.FIELD_ID).stringValue();
        out.print(qrels.getRelevanceGrade(qid, docid));
        out.print(" qid:" + qid);
        // Features are computed but not yet consumed — see TODO below.
        float[] intFeatures = this.extractors.extractAll(docs.documents[i], terms, context);
        // TODO use model to rerank
    }
    return docs;
}
/**
 * Adjusts per-term weights using the term vector of one training document.
 * For every term in the document, the current weight stored in the FST is
 * shifted by {@code modifier * termFreq} (clamped at zero) and written into
 * {@code weights}; the FST is then optionally rebuilt from the updated map.
 *
 * @param indexReader   reader providing the document's stored term vector
 * @param docId         Lucene-internal id of the training document
 * @param assignedClass class label; when null, no weight updates are applied
 * @param weights       destination map of term -> updated weight
 * @param modifier      signed learning step applied per occurrence
 * @param updateFST     whether to rebuild the FST from {@code weights} at the end
 * @throws IOException if term vectors are missing or the index cannot be read
 */
private void updateWeights(IndexReader indexReader, int docId, Boolean assignedClass,
    SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
    TermsEnum cte = textTerms.iterator();
    // get the doc term vectors
    Terms terms = indexReader.getTermVector(docId, textFieldName);
    if (terms == null) {
        throw new IOException("term vectors must be stored for field " + textFieldName);
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        // Position the class-terms enum on this term (required before FST ops).
        cte.seekExact(term);
        if (assignedClass != null) {
            long termFreqLocal = termsEnum.totalTermFreq();
            // update weights
            Long previousValue = Util.get(fst, term);
            String termString = term.utf8ToString();
            // NOTE(review): a term absent from the FST gets weight 0 rather than
            // modifier * termFreqLocal — confirm this asymmetry is intended.
            weights.put(termString, previousValue == null ? 0
                : Math.max(0, previousValue + modifier * termFreqLocal));
        }
    }
    if (updateFST) {
        updateFST(weights);
    }
}
String luceneName, Map<String, Integer> freq) { try { org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName); if (terms == null) { throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
Terms t = reader.getTermVector(docid, "title");
@Test public void testNoTermVector() throws Exception { FullTextSession s = Search.getFullTextSession( openSession() ); Transaction tx = s.beginTransaction(); Employee e1 = new Employee( 1000, "Griffin", "ITech" ); s.save( e1 ); tx.commit(); s.clear(); tx = s.beginTransaction(); // Here's how to get a reader from a FullTextSession SearchFactory searchFactory = s.getSearchFactory(); IndexReader reader = searchFactory.getIndexReaderAccessor().open( Employee.class ); Terms termVector = reader.getTermVector( 0, "dept" ); assertNull( "should not find a term position vector", termVector ); // cleanup for ( Object element : s.createQuery( "from " + ElectricalProperties.class.getName() ).list() ) { s.delete( element ); } searchFactory.getIndexReaderAccessor().close( reader ); tx.commit(); s.close(); }
Terms termVector = reader.getTermVector( x, "content" ); assertNotNull( termVector ); TermsEnum iterator = termVector.iterator();