/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq; if (collectionStats.sumTotalTermFreq() == -1) { // frequencies are omitted (tf=1), its # of postings if (collectionStats.sumDocFreq() == -1) { // theoretical case only: remove! return 1f; } sumTotalTermFreq = collectionStats.sumDocFreq(); } else { sumTotalTermFreq = collectionStats.sumTotalTermFreq(); } final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); }
/**
 * Explains the idf factor for a single term.
 *
 * <p>Reads the term's document frequency and the collection's document count (using
 * <code>maxDoc()</code> when <code>docCount()</code> reports <code>-1</code>), passes both to
 * <code>idf</code>, and returns a matching explanation that nests the two inputs as
 * sub-explanations.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an explanation carrying the idf value and the inputs it was computed from
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();

  long numDocs = collectionStats.docCount();
  if (numDocs == -1) {
    numDocs = collectionStats.maxDoc();
  }

  final float idfValue = idf(docFreq, numDocs);
  final Explanation docFreqDetail = Explanation.match(docFreq, "docFreq");
  final Explanation docCountDetail = Explanation.match(numDocs, "docCount");
  return Explanation.match(
      idfValue,
      "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      docFreqDetail,
      docCountDetail);
}
/**
 * Computes the score factor for a simple term and wraps it in an explanation.
 *
 * <p>
 * The default implementation evaluates:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * {@link CollectionStatistics#docCount()} is preferred over
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
 * {@link TermStatistics#docFreq()} is used as well: when the latter is inaccurate, so is
 * {@link CollectionStatistics#docCount()}, and in the same direction. In addition,
 * {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 * When <code>docCount()</code> reports <code>-1</code>, <code>maxDoc()</code> is used instead.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
 *         and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();

  final long numDocs;
  if (collectionStats.docCount() == -1) {
    numDocs = collectionStats.maxDoc();
  } else {
    numDocs = collectionStats.docCount();
  }

  final float idfValue = idf(docFreq, numDocs);
  final String description = "idf(docFreq=" + docFreq + ", docCount=" + numDocs + ")";
  return Explanation.match(idfValue, description);
}
/**
 * Computes the score factor for a simple term and wraps it in an explanation.
 *
 * <p>
 * The default implementation evaluates:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * {@link CollectionStatistics#docCount()} is preferred over
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
 * {@link TermStatistics#docFreq()} is used as well: when the latter is inaccurate, so is
 * {@link CollectionStatistics#docCount()}, and in the same direction. In addition,
 * {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 * When <code>docCount()</code> reports <code>-1</code>, <code>maxDoc()</code> is used instead.
 *
 * <p>The returned explanation nests the document frequency and document count as
 * sub-explanations.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor
 *         and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();

  long numDocs = collectionStats.docCount();
  if (numDocs == -1) {
    numDocs = collectionStats.maxDoc();
  }

  final float idfValue = idf(docFreq, numDocs);
  final Explanation docFreqDetail = Explanation.match(docFreq, "docFreq");
  final Explanation docCountDetail = Explanation.match(numDocs, "docCount");
  return Explanation.match(
      idfValue,
      "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
      docFreqDetail,
      docCountDetail);
}
/**
 * Returns the document count for the given field.
 *
 * <p>When {@code dfs} is set, the count is taken from its field statistics entry for
 * {@code fieldName}; otherwise it is read from {@code topLevelTerms}.
 *
 * @param fieldName name of the field to look up in the dfs field statistics
 * @param topLevelTerms terms used as the source when {@code dfs} is null
 * @return the document count for the field
 * @throws IOException declared for the {@code topLevelTerms} lookup
 */
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
  if (dfs == null) {
    return topLevelTerms.getDocCount();
  }
  return dfs.fieldStatistics().get(fieldName).docCount();
}
/**
 * Writes the three field-level statistics, in this order: sum of total term frequencies,
 * sum of document frequencies, and document count.
 *
 * <p>Each value is asserted to be at least {@code -1} and then handed to the matching
 * {@code writePotentiallyNegative*} helper. The document count is narrowed to {@code int}
 * before it is written.
 *
 * @param fieldStats statistics of the field being written
 * @throws IOException if one of the write helpers fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
  assert sumTotalTermFreq >= -1;
  writePotentiallyNegativeVLong(sumTotalTermFreq);

  final long sumDocFreq = fieldStats.sumDocFreq();
  assert sumDocFreq >= -1;
  writePotentiallyNegativeVLong(sumDocFreq);

  final int docCount = (int) fieldStats.docCount();
  assert docCount >= -1;
  writePotentiallyNegativeVInt(docCount);
}
/**
 * Builds the weight object from the boost, the collection statistics and the per-term
 * statistics.
 *
 * <p>The document count falls back to {@code maxDoc()} when {@code docCount()} reports
 * {@code -1}. One {@code Term} is created per entry of {@code termStats}, in the same order.
 *
 * @param boost boost passed to the {@code Query} holder
 * @param collectionStats collection-level statistics for the field
 * @param termStats term-level statistics, one per term
 * @return a {@code Weight} wrapping the field name, query, field and term holders
 */
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final Query query = new Query(boost);

  final long reportedDocCount = collectionStats.docCount();
  final long docCount = reportedDocCount == -1 ? collectionStats.maxDoc() : reportedDocCount;
  final Field field = new Field(docCount, collectionStats.sumDocFreq(), collectionStats.sumTotalTermFreq());

  final Term[] terms = new Term[termStats.length];
  int next = 0;
  for (TermStatistics stats : termStats) {
    terms[next++] = new Term(stats.docFreq(), stats.totalTermFreq());
  }

  return new Weight(collectionStats.field(), query, field, terms);
}
/**
 * Serializes a map of per-field collection statistics to the given stream.
 *
 * <p>Layout: the number of entries as a VInt, then for each entry the field name followed by
 * four VLongs: {@code maxDoc}, and {@code docCount}, {@code sumTotalTermFreq} and
 * {@code sumDocFreq} each passed through {@code addOne} first.
 * NOTE(review): {@code addOne} presumably shifts a {@code -1} value to {@code 0} so it fits
 * the VLong encoding; confirm against its definition.
 *
 * @param out stream to write to
 * @param fieldStatistics statistics keyed by field name
 * @throws IOException if writing to the stream fails
 */
public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
  out.writeVInt(fieldStatistics.size());

  for (ObjectObjectCursor<String, CollectionStatistics> entry : fieldStatistics) {
    final CollectionStatistics stats = entry.value;
    out.writeString(entry.key);

    final long maxDoc = stats.maxDoc();
    assert maxDoc >= 0;
    out.writeVLong(maxDoc);

    final long shiftedDocCount = addOne(stats.docCount());
    out.writeVLong(shiftedDocCount);

    final long shiftedSumTotalTermFreq = addOne(stats.sumTotalTermFreq());
    out.writeVLong(shiftedSumTotalTermFreq);

    final long shiftedSumDocFreq = addOne(stats.sumDocFreq());
    out.writeVLong(shiftedSumDocFreq);
  }
}
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); } }
/**
 * Looks up how many documents are counted for {@code fieldName}.
 *
 * <p>The value comes from the {@code dfs} field statistics when {@code dfs} is non-null, and
 * from {@code topLevelTerms} otherwise.
 *
 * @param fieldName field whose statistics entry is read from {@code dfs}
 * @param topLevelTerms fallback source used when {@code dfs} is null
 * @return the document count for the field
 * @throws IOException declared for the {@code topLevelTerms} lookup
 */
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
  return dfs != null
      ? dfs.fieldStatistics().get(fieldName).docCount()
      : topLevelTerms.getDocCount();
}
/**
 * Produces the explanation of the idf factor for one term.
 *
 * <p>The document count is {@code collectionStats.docCount()}, or
 * {@code collectionStats.maxDoc()} when the former is {@code -1}. The idf value returned by
 * {@code idf(docFreq, docCount)} becomes the value of the explanation, and both inputs are
 * nested as sub-explanations.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return a matching explanation for the idf factor
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long termDocFreq = termStats.docFreq();

  final long totalDocs;
  if (collectionStats.docCount() == -1) {
    totalDocs = collectionStats.maxDoc();
  } else {
    totalDocs = collectionStats.docCount();
  }

  final float factor = idf(termDocFreq, totalDocs);
  return Explanation.match(
      factor,
      "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      Explanation.match(termDocFreq, "docFreq"),
      Explanation.match(totalDocs, "docCount"));
}
/**
 * Resolves the document count for a field.
 *
 * @param fieldName key into the {@code dfs} field statistics, used only when {@code dfs} is set
 * @param topLevelTerms terms consulted when {@code dfs} is null
 * @return the count from the {@code dfs} field statistics if {@code dfs} is set, otherwise
 *         the count reported by {@code topLevelTerms}
 * @throws IOException declared for the {@code topLevelTerms} lookup
 */
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
  final long count;
  if (dfs != null) {
    count = dfs.fieldStatistics().get(fieldName).docCount();
  } else {
    count = topLevelTerms.getDocCount();
  }
  return count;
}
/**
 * Returns the number of documents counted for {@code fieldName}.
 *
 * @param fieldName field to look up in the {@code dfs} field statistics
 * @param topLevelTerms source of the count when no {@code dfs} is available
 * @return the document count for the field
 * @throws IOException declared for the {@code topLevelTerms} lookup
 */
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
  // Without dfs, the terms themselves are the only source of the count.
  final boolean useTermsDirectly = dfs == null;
  if (useTermsDirectly) {
    return topLevelTerms.getDocCount();
  }
  return dfs.fieldStatistics().get(fieldName).docCount();
}
/**
 * Emits the statistics of one field: first the sum of total term frequencies, then the sum
 * of document frequencies, then the document count.
 *
 * <p>All three values are asserted to be {@code >= -1} before being written with the
 * {@code writePotentiallyNegative*} helpers. The document count is cast to {@code int} first.
 *
 * @param fieldStats statistics of the field being written
 * @throws IOException if one of the write helpers fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  // 1) sum of total term frequencies
  final long totalTermFreqSum = fieldStats.sumTotalTermFreq();
  assert totalTermFreqSum >= -1;
  writePotentiallyNegativeVLong(totalTermFreqSum);

  // 2) sum of document frequencies
  final long docFreqSum = fieldStats.sumDocFreq();
  assert docFreqSum >= -1;
  writePotentiallyNegativeVLong(docFreqSum);

  // 3) document count, narrowed to int for the VInt writer
  final int documents = (int) fieldStats.docCount();
  assert documents >= -1;
  writePotentiallyNegativeVInt(documents);
}
/**
 * Writes field-level statistics in a fixed order.
 *
 * <ol>
 *   <li>{@code sumTotalTermFreq} via {@code writePotentiallyNegativeVLong}</li>
 *   <li>{@code sumDocFreq} via {@code writePotentiallyNegativeVLong}</li>
 *   <li>{@code docCount}, cast to {@code int}, via {@code writePotentiallyNegativeVInt}</li>
 * </ol>
 *
 * Each value is asserted to be no smaller than {@code -1} before it is written.
 *
 * @param fieldStats statistics of the field being written
 * @throws IOException if one of the write helpers fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  final long termFreqTotal = fieldStats.sumTotalTermFreq();
  assert termFreqTotal >= -1;
  writePotentiallyNegativeVLong(termFreqTotal);

  final long postingsTotal = fieldStats.sumDocFreq();
  assert postingsTotal >= -1;
  writePotentiallyNegativeVLong(postingsTotal);

  final int docsWithField = (int) fieldStats.docCount();
  assert docsWithField >= -1;
  writePotentiallyNegativeVInt(docsWithField);
}
/**
 * Serializes the statistics of a single field.
 *
 * <p>The output order is: sum of total term frequencies, sum of document frequencies,
 * document count. Values are asserted to be at least {@code -1}. The document count is
 * narrowed from {@code long} to {@code int} before the assertion and the write.
 *
 * @param fieldStats statistics of the field being written
 * @throws IOException if one of the write helpers fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  final long sumOfTotalTermFreqs = fieldStats.sumTotalTermFreq();
  assert sumOfTotalTermFreqs >= -1;
  writePotentiallyNegativeVLong(sumOfTotalTermFreqs);

  final long sumOfDocFreqs = fieldStats.sumDocFreq();
  assert sumOfDocFreqs >= -1;
  writePotentiallyNegativeVLong(sumOfDocFreqs);

  final int narrowedDocCount = (int) fieldStats.docCount();
  assert narrowedDocCount >= -1;
  writePotentiallyNegativeVInt(narrowedDocCount);
}
/**
 * Writes per-field collection statistics to {@code out}.
 *
 * <p>The entry count is written first as a VInt. Each entry then contributes its field name
 * and four VLongs in this order: {@code maxDoc} (asserted non-negative), then
 * {@code docCount}, {@code sumTotalTermFreq} and {@code sumDocFreq}, each transformed by
 * {@code addOne} before being written.
 *
 * @param out stream to write to
 * @param fieldStatistics statistics keyed by field name
 * @throws IOException if writing to the stream fails
 */
public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
  final int fieldCount = fieldStatistics.size();
  out.writeVInt(fieldCount);

  for (ObjectObjectCursor<String, CollectionStatistics> cursor : fieldStatistics) {
    out.writeString(cursor.key);

    final CollectionStatistics fieldStats = cursor.value;
    assert fieldStats.maxDoc() >= 0;
    out.writeVLong(fieldStats.maxDoc());
    out.writeVLong(addOne(fieldStats.docCount()));
    out.writeVLong(addOne(fieldStats.sumTotalTermFreq()));
    out.writeVLong(addOne(fieldStats.sumDocFreq()));
  }
}
/**
 * Streams out the collection statistics of every field in the map.
 *
 * <p>Wire order: entry count (VInt), then per entry the field name (string), {@code maxDoc}
 * (VLong), and the {@code addOne}-adjusted {@code docCount}, {@code sumTotalTermFreq} and
 * {@code sumDocFreq} (VLong each). {@code maxDoc} is asserted to be non-negative.
 *
 * @param out stream to write to
 * @param fieldStatistics statistics keyed by field name
 * @throws IOException if writing to the stream fails
 */
public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
  out.writeVInt(fieldStatistics.size());

  for (ObjectObjectCursor<String, CollectionStatistics> cursor : fieldStatistics) {
    final String fieldName = cursor.key;
    final CollectionStatistics fieldStats = cursor.value;

    out.writeString(fieldName);

    final long maxDoc = fieldStats.maxDoc();
    assert maxDoc >= 0;
    out.writeVLong(maxDoc);

    // The remaining statistics go through addOne before being written as VLongs.
    out.writeVLong(addOne(fieldStats.docCount()));
    out.writeVLong(addOne(fieldStats.sumTotalTermFreq()));
    out.writeVLong(addOne(fieldStats.sumDocFreq()));
  }
}