org.apache.lucene.search.CollectionStatistics java code examples

/**
 * Computes the average length of a field as <code>sumTotalTermFreq / docCount</code>.
 * <p>
 * When <code>sumTotalTermFreq</code> is reported as <code>-1</code>, <code>sumDocFreq</code>
 * is used as the numerator instead; if that is also <code>-1</code> the result is <code>1</code>.
 * When <code>docCount</code> is reported as <code>-1</code>, <code>maxDoc</code> is used as
 * the denominator.
 *
 * @param collectionStats collection-level statistics of the field
 * @return the average field length
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
 long totalTermFreq = collectionStats.sumTotalTermFreq();
 if (totalTermFreq == -1) {
  // frequencies are omitted (tf=1), so the number of postings stands in for the token count
  totalTermFreq = collectionStats.sumDocFreq();
  if (totalTermFreq == -1) {
   // theoretical case only: neither statistic is available
   return 1f;
  }
 }

 long numDocs = collectionStats.docCount();
 if (numDocs == -1) {
  numDocs = collectionStats.maxDoc();
 }
 // divide in double precision, then narrow to float
 return (float) (totalTermFreq / (double) numDocs);
}

 /**
  * Builds the {@link CollectionStatistics} of a field from the terms of the
  * underlying reader.
  * <p>
  * If the reader has no terms for the field, the document count and both
  * frequency sums are reported as <code>0</code>.
  * <p>
  * Subclasses may override this, for example to return a field's statistics
  * across a distributed collection.
  *
  * @param field name of the field; must not be null
  * @return statistics for the field
  * @throws IOException if reading the field's terms fails
  * @lucene.experimental
  */
 public CollectionStatistics collectionStatistics(String field) throws IOException {
  assert field != null;

  final Terms fieldTerms = MultiFields.getTerms(reader, field);
  final boolean hasTerms = fieldTerms != null;

  // a field without terms contributes zero to every statistic
  final int docCount = hasTerms ? fieldTerms.getDocCount() : 0;
  final long sumTotalTermFreq = hasTerms ? fieldTerms.getSumTotalTermFreq() : 0;
  final long sumDocFreq = hasTerms ? fieldTerms.getSumDocFreq() : 0;

  return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
 }
}

/**
 * Creates a {@code PerFieldSimWeight} that stores the similarity returned by
 * {@code get(field)} for the statistics' field, together with the weight that
 * similarity computes for the given arguments.
 *
 * @param boost boost passed through to the delegate
 * @param collectionStats collection-level statistics; its field selects the delegate
 * @param termStats term-level statistics passed through to the delegate
 * @return the wrapper weight holding the delegate and the delegate's weight
 */
@Override
public final SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
 final PerFieldSimWeight perFieldWeight = new PerFieldSimWeight();
 // pick the similarity configured for this field
 perFieldWeight.delegate = get(collectionStats.field());
 // let that similarity compute the actual weight
 perFieldWeight.delegateWeight =
   perFieldWeight.delegate.computeWeight(boost, collectionStats, termStats);
 return perFieldWeight;
}

/**
 * Explains the idf factor of a single term.
 * <p>
 * The value comes from {@code idf(docFreq, docCount)}; when the statistics report
 * a {@code docCount} of {@code -1}, {@code maxDoc} is used in its place. The two
 * inputs are attached as sub-explanations.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an explanation carrying the idf value and its inputs
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
 final long docFreq = termStats.docFreq();

 final long numDocs;
 if (collectionStats.docCount() == -1) {
  // docCount is not available: fall back to maxDoc
  numDocs = collectionStats.maxDoc();
 } else {
  numDocs = collectionStats.docCount();
 }

 final float idfValue = idf(docFreq, numDocs);
 final Explanation docFreqDetail = Explanation.match(docFreq, "docFreq");
 final Explanation docCountDetail = Explanation.match(numDocs, "docCount");
 return Explanation.match(idfValue, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
   docFreqDetail,
   docCountDetail);
}

/**
 * Writes a field's statistics in the order sumTotalTermFreq, sumDocFreq, docCount.
 * <p>
 * Each value is asserted to be at least {@code -1} and written with the
 * "potentially negative" variable-length writers.
 *
 * @param fieldStats statistics of the field to write
 * @throws IOException if one of the writers fails
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
  assert (sumTotalTermFreq >= -1);
  writePotentiallyNegativeVLong(sumTotalTermFreq);

  final long sumDocFreq = fieldStats.sumDocFreq();
  assert (sumDocFreq >= -1);
  writePotentiallyNegativeVLong(sumDocFreq);

  // narrowed to int before writing; NOTE(review): assumes docCount fits in an int - confirm
  final int docCount = (int) fieldStats.docCount();
  assert (docCount >= -1);
  writePotentiallyNegativeVInt(docCount);
}

/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>.
 * <p>
 * Returns <code>1</code> when <code>sumTotalTermFreq</code> is zero or negative
 * (the field does not exist, or the statistic is unsupported, e.g. for a field
 * that omits frequency information). When <code>docCount</code> is <code>-1</code>,
 * <code>maxDoc</code> is used as the divisor.
 *
 * @param collectionStats collection-level statistics of the field
 * @return the average field length, or <code>1</code> if it cannot be computed
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
 final long totalTermFreq = collectionStats.sumTotalTermFreq();
 if (totalTermFreq <= 0) {
  // field does not exist, or stat is unsupported
  return 1f;
 }

 long numDocs = collectionStats.docCount();
 if (numDocs == -1) {
  numDocs = collectionStats.maxDoc();
 }
 return (float) (totalTermFreq / (double) numDocs);
}

/**
 * Builds a {@code TFIDFWeight} from the field name, an idf value and the average
 * document length.
 * <p>
 * The average document length is <code>sumTotalTermFreq / maxDoc</code>. For exactly
 * one term the idf is <code>log(maxDoc / docFreq)</code>; otherwise it is
 * <code>1</code> plus the sum of <code>log(maxDoc / docFreq)</code> over all terms.
 * <p>
 * NOTE(review): the multi-term sum starts at 1 while the single-term case does not;
 * confirm this offset is intended.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics of the query terms
 * @return the weight holding field, idf and average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats,
         TermStatistics... termStats)
{
  // all arithmetic below is done in float, hence the float-typed locals
  final float numDocs = collectionStats.maxDoc();
  final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

  float idf;
  if (termStats.length == 1) {
    final float docFreq = termStats[0].docFreq();
    idf = log(numDocs / docFreq);
  } else {
    idf = 1.0f;
    for (int i = 0; i < termStats.length; i++) {
      final float docFreq = termStats[i].docFreq();
      idf += log(numDocs / docFreq);
    }
  }

  return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}

/**
 * Computes the average field length as <code>sumTotalTermFreq / maxDoc</code>.
 * <p>
 * Returns <code>1</code> when <code>sumTotalTermFreq</code> is zero or negative
 * (the field does not exist, or the statistic is unsupported, e.g. for a field
 * that omits frequency information).
 *
 * @param collectionStats collection-level statistics of the field
 * @return the average field length, or <code>1</code> if it cannot be computed
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
 final long totalTermFreq = collectionStats.sumTotalTermFreq();
 // a non-positive total means the field does not exist or the stat is unsupported
 return totalTermFreq <= 0
   ? 1f
   : (float) (totalTermFreq / (double) collectionStats.maxDoc());
}

/**
 * Explains the idf factor of a phrase.
 *
 * <p>
 * The phrase idf is the sum of <code>idf(docFreq, maxDoc)</code> over the terms of
 * the phrase; each term contributes one sub-explanation.
 *
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an explanation holding the summed idf and one detail per term
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
 final long maxDocs = collectionStats.maxDoc();
 final List<Explanation> perTerm = new ArrayList<>();
 float total = 0.0f;
 for (int i = 0; i < termStats.length; i++) {
  final long docFreq = termStats[i].docFreq();
  final float value = idf(docFreq, maxDocs);
  perTerm.add(Explanation.match(value, "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")"));
  total += value;
 }
 return Explanation.match(total, "idf(), sum of:", perTerm);
}

/**
 * Returns the document count of a field.
 * <p>
 * When {@code dfs} is set, the count is taken from its field statistics for
 * {@code fieldName}; otherwise it is read from the given terms.
 *
 * @param fieldName name of the field, used to look up the {@code dfs} statistics
 * @param topLevelTerms terms consulted when {@code dfs} is null
 * @return the document count of the field
 * @throws IOException if reading the count from the terms fails
 */
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
  if (dfs == null) {
    // no externally supplied statistics: ask the terms directly
    return topLevelTerms.getDocCount();
  }
  return dfs.fieldStatistics().get(fieldName).docCount();
}

/**
 * Returns the {@code sumTotalTermFreq} value of the field statistics.
 *
 * @return the sum of total term frequencies as reported by {@code fieldStats}
 * @throws IOException declared by the signature; not thrown by the visible code
 */
public long sumttf() throws IOException {
  final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
  return sumTotalTermFreq;
}

/**
 * Returns the {@code sumDocFreq} value of the field statistics.
 *
 * @return the sum of document frequencies as reported by {@code fieldStats}
 * @throws IOException declared by the signature; not thrown by the visible code
 */
public long sumdf() throws IOException {
  final long sumDocFreq = fieldStats.sumDocFreq();
  return sumDocFreq;
}

/**
 * Prints collection-level statistics for the ALL, TITLE and CONTENT fields to
 * standard output, one line per field.
 *
 * @throws IOException if the statistics cannot be read from the index
 */
public void reportCollectionStatistics()throws IOException {
  IndexSearcher searcher = new IndexSearcher(reader);
  printFieldStatistics(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
  printFieldStatistics(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
  printFieldStatistics(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
}

/**
 * Prints token count, document count, sum of document frequencies and the
 * (integer) average document length of one field.
 *
 * @param searcher searcher used to obtain the field's statistics
 * @param label prefix printed at the start of the line
 * @param field name of the field to report on
 * @throws IOException if the statistics cannot be read from the index
 */
private void printFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
  CollectionStatistics collectionStats = searcher.collectionStatistics(field);
  long tokenCount = collectionStats.sumTotalTermFreq();
  long docCount = collectionStats.docCount();
  long sumDocFreq = collectionStats.sumDocFreq();
  // long division by zero throws ArithmeticException; a field without documents reports 0
  long avgDocLength = docCount == 0 ? 0 : tokenCount / docCount;
  System.out.println(label + ": Token count: " + tokenCount + " Doc Count: " + docCount + " sum doc: " + sumDocFreq + " avg doc len: " + avgDocLength);
}

/**
 * Computes the idf score factor of a single term and wraps it in an explanation.
 * 
 * <p>
 * The value is obtained from:
 * 
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 * 
 * {@link CollectionStatistics#docCount()} is used rather than
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
 * {@link TermStatistics#docFreq()} is used as well: when the latter is inaccurate,
 * {@link CollectionStatistics#docCount()} is inaccurate in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 * When <code>docCount</code> is <code>-1</code>, {@link CollectionStatistics#maxDoc()}
 * is used instead.
 *   
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor 
 *         and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
 final long docFreq = termStats.docFreq();

 long numDocs = collectionStats.docCount();
 if (numDocs == -1) {
  // docCount is not available: fall back to maxDoc
  numDocs = collectionStats.maxDoc();
 }

 final float idfValue = idf(docFreq, numDocs);
 final String description = "idf(docFreq=" + docFreq + ", docCount=" + numDocs + ")";
 return Explanation.match(idfValue, description);
}

/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>,
 * substituting <code>maxDoc</code> for a <code>docCount</code> of <code>-1</code>.
 * <p>
 * Returns <code>1</code> when <code>sumTotalTermFreq</code> is zero or negative
 * (the field does not exist, or the statistic is unsupported, e.g. for a field
 * that omits frequency information).
 *
 * @param collectionStats collection-wide statistics
 * @return average length of the field described by <code>collectionStats</code>
 */
float avgFieldLength(CollectionStatistics collectionStats) {
 final long tokens = collectionStats.sumTotalTermFreq();
 if (tokens <= 0) {
  return 1f;       // field does not exist, or stat is unsupported
 }
 final boolean docCountMissing = collectionStats.docCount() == -1;
 final long documents = docCountMissing ? collectionStats.maxDoc() : collectionStats.docCount();
 final double average = tokens / (double) documents;
 return (float) average;
}

/**
 * Computes the average field length as <code>sumTotalTermFreq / maxDoc</code>.
 * <p>
 * Returns <code>1</code> when <code>sumTotalTermFreq</code> is zero or negative
 * (the field does not exist, or the statistic is unsupported, e.g. for a field
 * that omits frequency information).
 *
 * @param collectionStats collection-level statistics of the field
 * @return the average field length, or <code>1</code> if it cannot be computed
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
 final long tokens = collectionStats.sumTotalTermFreq();
 if (tokens <= 0) {
  // field does not exist, or stat is unsupported
  return 1f;
 }
 final double documents = (double) collectionStats.maxDoc();
 return (float) (tokens / documents);
}

/**
 * Computes the idf score factor of a phrase and explains it.
 * 
 * <p>
 * Each term's <code>idf(docFreq, maxDoc)</code> is added to a running total and
 * recorded as a sub-explanation of the result.
 * 
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both the summed idf score factor
 *         for the phrase and an explanation for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
 final long maxDocs = collectionStats.maxDoc();
 final List<Explanation> termDetails = new ArrayList<>();
 float phraseIdf = 0.0f;
 for (final TermStatistics term : termStats) {
  final long docFreq = term.docFreq();
  final float termIdf = idf(docFreq, maxDocs);
  final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
  termDetails.add(Explanation.match(termIdf, description));
  phraseIdf += termIdf;
 }
 return Explanation.match(phraseIdf, "idf(), sum of:", termDetails);
}

/**
 * Returns the {@code docCount} value of the field statistics.
 *
 * @return the document count as reported by {@code fieldStats}
 * @throws IOException declared by the signature; not thrown by the visible code
 */
public long docCount() throws IOException {
  final long count = fieldStats.docCount();
  return count;
}

/**
 * Returns the {@code sumTotalTermFreq} value of the field statistics.
 *
 * @return the sum of total term frequencies as reported by {@code fieldStats}
 * @throws IOException declared by the signature; not thrown by the visible code
 */
public long sumttf() throws IOException {
  final long total = fieldStats.sumTotalTermFreq();
  return total;
}

/**
 * Returns the {@code sumDocFreq} value of the field statistics.
 *
 * @return the sum of document frequencies as reported by {@code fieldStats}
 * @throws IOException declared by the signature; not thrown by the visible code
 */
public long sumdf() throws IOException {
  final long total = fieldStats.sumDocFreq();
  return total;
}

Javadoc

Contains statistics for a collection (field)

Most used methods

maxDoc
returns the total number of documents, regardless of whether they all contain values for this field.
sumTotalTermFreq
returns the total number of tokens for this field
<init>
docCount
returns the total number of documents that have at least one term for this field.
field
returns the field name
sumDocFreq
returns the total number of postings for this field

Popular in Java

Finding current android device location
getExternalFilesDir (Context)
scheduleAtFixedRate (Timer)
onCreateOptionsMenu (Activity)
BufferedWriter (java.io)
Wraps an existing Writer and buffers the output. Expensive interaction with the underlying reader is
PrintStream (java.io)
Fake signature of an existing Java class.
BigDecimal (java.math)
An immutable arbitrary-precision signed decimal. A value is represented by an arbitrary-precision "un
TreeMap (java.util)
Walk the nodes of the tree left-to-right or right-to-left. Note that in descending iterations, next
ReentrantLock (java.util.concurrent.locks)
A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
Window (java.awt)
A Window object is a top-level window with no borders and no menubar. The default layout for a windo
Top Vim plugins

How to use CollectionStatistics in org.apache.lucene.search

Best Java code snippets using org.apache.lucene.search.CollectionStatistics (Showing top 20 results out of 315)

How to use
CollectionStatistics
in
org.apache.lucene.search