/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq; if (collectionStats.sumTotalTermFreq() == -1) { // frequencies are omitted (tf=1), its # of postings if (collectionStats.sumDocFreq() == -1) { // theoretical case only: remove! return 1f; } sumTotalTermFreq = collectionStats.sumDocFreq(); } else { sumTotalTermFreq = collectionStats.sumTotalTermFreq(); } final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); }
/**
 * Builds the {@link CollectionStatistics} of a single field.
 *
 * <p>The statistics are read from the field's top-level {@code Terms}; when the
 * reader has no terms for the field, the doc count and both sums are reported as
 * zero. The max doc value always comes from the reader.
 *
 * <p>This can be overridden, for example, to return a field's statistics
 * across a distributed collection.
 *
 * @param field name of the field; must not be {@code null} (checked by assertion only)
 * @return the statistics of {@code field}
 * @throws IOException if reading the terms or their statistics fails
 * @lucene.experimental
 */
public CollectionStatistics collectionStatistics(String field) throws IOException {
  assert field != null;

  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  final boolean hasTerms = fieldTerms != null;

  final int docsWithField = hasTerms ? fieldTerms.getDocCount() : 0;
  final long totalTermFreq = hasTerms ? fieldTerms.getSumTotalTermFreq() : 0;
  final long totalDocFreq = hasTerms ? fieldTerms.getSumDocFreq() : 0;

  return new CollectionStatistics(field, reader.maxDoc(), docsWithField, totalTermFreq, totalDocFreq);
}
} // closes an enclosing scope whose header lies outside this excerpt
/**
 * Creates a weight that wraps the weight of the similarity selected for the
 * field named by {@code collectionStats}.
 *
 * @param boost boost passed through to the delegate
 * @param collectionStats collection-level statistics; its field name selects the delegate via {@code get}
 * @param termStats term-level statistics passed through to the delegate
 * @return a {@code PerFieldSimWeight} holding both the delegate and the delegate's weight
 */
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final String fieldName = collectionStats.field();

  final PerFieldSimWeight perField = new PerFieldSimWeight();
  perField.delegate = get(fieldName);
  perField.delegateWeight = perField.delegate.computeWeight(boost, collectionStats, termStats);
  return perField;
}
/**
 * Explains the idf factor of a single term.
 *
 * <p>The document count is taken from {@code collectionStats.docCount()}, or from
 * {@code collectionStats.maxDoc()} when the doc count is reported as {@code -1}.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an explanation whose value is {@code idf(docFreq, docCount)} and whose
 *         details list the doc frequency and the doc count that were used
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();

  long docCount = collectionStats.docCount();
  if (docCount == -1) {
    docCount = collectionStats.maxDoc();
  }

  final float idfValue = idf(docFreq, docCount);
  final Explanation docFreqDetail = Explanation.match(docFreq, "docFreq");
  final Explanation docCountDetail = Explanation.match(docCount, "docCount");
  return Explanation.match(
      idfValue,
      "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      docFreqDetail,
      docCountDetail);
}
/**
 * Writes the statistics of one field in a fixed order: sum of total term
 * frequencies, sum of document frequencies, then document count.
 *
 * <p>Each value is asserted to be {@code >= -1} immediately before it is written
 * with the matching "potentially negative" writer.
 *
 * @param fieldStats statistics of the field to write
 * @throws IOException if one of the underlying writes fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
  assert sumTotalTermFreq >= -1;
  writePotentiallyNegativeVLong(sumTotalTermFreq);

  final long sumDocFreq = fieldStats.sumDocFreq();
  assert sumDocFreq >= -1;
  writePotentiallyNegativeVLong(sumDocFreq);

  // NOTE(review): docCount() is narrowed to int here; presumably it always fits - confirm.
  final int docCount = (int) fieldStats.docCount();
  assert docCount >= -1;
  writePotentiallyNegativeVInt(docCount);
}
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); } }
/**
 * Builds a {@code TFIDFWeight} from the collection and term statistics.
 *
 * <p>With {@code N = maxDoc} and {@code n = docFreq}: for exactly one term the idf
 * is {@code log(N/n)}; otherwise it is {@code 1.0} plus the sum of {@code log(N/n)}
 * over all given terms. The average document length passed to the weight is
 * {@code sumTotalTermFreq / N}. All arithmetic is done in {@code float}.
 *
 * @param collectionStats collection-level statistics of the field
 * @param termStats term-level statistics, one entry per term
 * @return the weight carrying the field name, the idf and the average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  final float numDocs = collectionStats.maxDoc();
  final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

  float idf;
  if (termStats.length == 1) {
    final float docFreq = termStats[0].docFreq();
    idf = log(numDocs / docFreq);
  } else {
    // NOTE(review): the multi-term sum is seeded with 1.0 while the single-term
    // case is not - preserved as in the original; confirm this is intended.
    idf = 1.0f;
    for (int i = 0; i < termStats.length; i++) {
      final float docFreq = termStats[i].docFreq();
      idf += log(numDocs / docFreq);
    }
  }

  return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else { return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc()); } }
/**
 * Explains the idf score factor of a phrase.
 *
 * <p>
 * The factor is the sum of the idf values of the individual terms, each computed
 * as {@code idf(docFreq, maxDoc)}.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an explanation whose value is the summed idf and whose details hold
 *         one explanation per term
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
  final long maxDocs = collectionStats.maxDoc();
  final List<Explanation> perTerm = new ArrayList<>();
  float idfSum = 0.0f;

  for (int i = 0; i < termStats.length; i++) {
    final long docFreq = termStats[i].docFreq();
    final float termIdf = idf(docFreq, maxDocs);
    final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
    perTerm.add(Explanation.match(termIdf, description));
    idfSum += termIdf;
  }

  return Explanation.match(idfSum, "idf(), sum of:", perTerm);
}
/**
 * Returns the document count of a field.
 *
 * <p>When {@code dfs} is set, the count is looked up in its field statistics by
 * field name; otherwise it is read from the supplied terms.
 *
 * @param fieldName name used for the lookup in {@code dfs.fieldStatistics()}
 * @param topLevelTerms terms consulted only when {@code dfs} is {@code null}
 * @return the document count from whichever source was used
 * @throws IOException if reading the count from {@code topLevelTerms} fails
 */
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
  return (dfs == null)
      ? topLevelTerms.getDocCount()
      : dfs.fieldStatistics().get(fieldName).docCount();
}
/**
 * Returns the value reported by {@code fieldStats.sumTotalTermFreq()}, unchanged.
 *
 * @return the result of {@code fieldStats.sumTotalTermFreq()}
 * @throws IOException declared by the signature; NOTE(review): the visible body has
 *         no obvious source for it - confirm whether an outside contract requires it
 */
public long sumttf() throws IOException { return fieldStats.sumTotalTermFreq(); }
/**
 * Returns the value reported by {@code fieldStats.sumDocFreq()}, unchanged.
 *
 * @return the result of {@code fieldStats.sumDocFreq()}
 * @throws IOException declared by the signature; NOTE(review): the visible body has
 *         no obvious source for it - confirm whether an outside contract requires it
 */
public long sumdf() throws IOException { return fieldStats.sumDocFreq(); }
/**
 * Prints the collection statistics of the ALL, TITLE and CONTENT fields to
 * standard output, one line per field.
 *
 * <p>Each line reports the token count ({@code sumTotalTermFreq}), the document
 * count, the summed document frequency and the average document length
 * (integer division of token count by document count).
 *
 * @throws IOException if the statistics cannot be read from the index
 */
public void reportCollectionStatistics() throws IOException {
  IndexSearcher searcher = new IndexSearcher(reader);
  printFieldStatistics(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
  printFieldStatistics(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
  printFieldStatistics(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
}

/**
 * Prints one statistics line for {@code field}, prefixed with {@code label}.
 *
 * @param searcher searcher used to obtain the field's statistics
 * @param label prefix printed before the colon
 * @param field name of the field to report on
 * @throws IOException if the statistics cannot be read from the index
 */
private void printFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
  CollectionStatistics collectionStats = searcher.collectionStatistics(field);

  long tokenCount = 0;
  long docCount = 0;
  long sumDocCount = 0;
  // Some Lucene versions are documented to return null for a field without
  // indexed terms; report zeros in that case instead of failing.
  if (collectionStats != null) {
    tokenCount = collectionStats.sumTotalTermFreq();
    docCount = collectionStats.docCount();
    sumDocCount = collectionStats.sumDocFreq();
  }

  // A field with no documents reports a doc count of 0 (or -1 when the statistic
  // is unavailable); dividing by it would throw or yield a meaningless value.
  long avgDocLength = docCount > 0 ? tokenCount / docCount : 0;

  System.out.println(label + ": Token count: " + tokenCount + " Doc Count: " + docCount
      + " sum doc: " + sumDocCount + " avg doc len: " + avgDocLength);
}
/**
 * Explains the idf score factor of a simple term.
 *
 * <p>
 * The factor is computed as:
 *
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 *
 * {@link CollectionStatistics#docCount()} is used rather than
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
 * {@link TermStatistics#docFreq()} is used as well: when the latter is inaccurate,
 * so is {@link CollectionStatistics#docCount()}, and in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 * When the doc count is reported as {@code -1}, {@code maxDoc} is used in its place.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an explanation carrying the idf score factor and a description of its inputs
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();

  long docCount = collectionStats.docCount();
  if (docCount == -1) {
    docCount = collectionStats.maxDoc();
  }

  final float idfValue = idf(docFreq, docCount);
  final String description = "idf(docFreq=" + docFreq + ", docCount=" + docCount + ")";
  return Explanation.match(idfValue, description);
}
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). * * @param collectionStats collection-wide statistics * @return average document length of FIELD_BODY * */ float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); } }
/** The default implementation computes the average as <code>sumTotalTermFreq / maxDoc</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else { return (float) (sumTotalTermFreq / (double) collectionStats.maxDoc()); } }
/**
 * Explains the idf score factor of a phrase.
 *
 * <p>
 * The factor is the sum of the idf values of the individual terms, each computed
 * as {@code idf(docFreq, maxDoc)}.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an explanation whose value is the summed idf and whose details hold
 *         one explanation per term
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
  final long maxDocs = collectionStats.maxDoc();
  final List<Explanation> termExplanations = new ArrayList<>();
  float total = 0.0f;

  for (int i = 0; i < termStats.length; i++) {
    final long docFreq = termStats[i].docFreq();
    final float termIdf = idf(docFreq, maxDocs);
    final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
    termExplanations.add(Explanation.match(termIdf, description));
    total += termIdf;
  }

  return Explanation.match(total, "idf(), sum of:", termExplanations);
}
/**
 * Returns the value reported by {@code fieldStats.docCount()}, unchanged.
 *
 * @return the result of {@code fieldStats.docCount()}
 * @throws IOException declared by the signature; NOTE(review): the visible body has
 *         no obvious source for it - confirm whether an outside contract requires it
 */
public long docCount() throws IOException { return fieldStats.docCount(); }
/**
 * Returns the value reported by {@code fieldStats.sumTotalTermFreq()}, unchanged.
 *
 * @return the result of {@code fieldStats.sumTotalTermFreq()}
 * @throws IOException declared by the signature; NOTE(review): the visible body has
 *         no obvious source for it - confirm whether an outside contract requires it
 */
public long sumttf() throws IOException { return fieldStats.sumTotalTermFreq(); }
/**
 * Returns the value reported by {@code fieldStats.sumDocFreq()}, unchanged.
 *
 * @return the result of {@code fieldStats.sumDocFreq()}
 * @throws IOException declared by the signature; NOTE(review): the visible body has
 *         no obvious source for it - confirm whether an outside contract requires it
 */
public long sumdf() throws IOException { return fieldStats.sumDocFreq(); }