/**
 * Static factory that merges the statistics of several constituent indices
 * into a single MultiStats. Document, token and pointer counts are summed
 * across the shards; the unique-term count is taken as the maximum observed,
 * since shard vocabularies overlap.
 * NOTE(review): field token counts are not aggregated here — a single-entry
 * zero array is passed through; confirm whether callers rely on fields.
 */
public static MultiStats factory(CollectionStatistics[] stats) {
	int totalDocs = 0;
	int maxTerms = 0;
	long totalTokens = 0;
	long totalPointers = 0;
	final long[] fieldTokens = new long[] { 0 };
	for (int i = 0; i < stats.length; i++) {
		final CollectionStatistics shard = stats[i];
		totalDocs += shard.getNumberOfDocuments();
		totalTokens += shard.getNumberOfTokens();
		totalPointers += shard.getNumberOfPointers();
		maxTerms = Math.max(maxTerms, shard.getNumberOfUniqueTerms());
	}
	return new MultiStats(totalDocs, maxTerms, totalTokens, totalPointers, fieldTokens);
}
/**
 * Installs the collection statistics, recomputing the token count and average
 * document length over only the active fields, and passes the adjusted
 * (field-less) statistics on to the basic model.
 * @throws IllegalStateException if the statistics carry no fields
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	if (_cs.getNumberOfFields() < 1)
		throw new IllegalStateException("Fields must be 1 or more");
	// sum tokens over the active fields only
	final long[] perField = _cs.getFieldTokens();
	long activeTokens = 0;
	for (int fieldId : activeFieldIds)
		activeTokens += perField[fieldId];
	super.numberOfTokens = activeTokens;
	super.averageDocumentLength = (double) activeTokens / (double) _cs.getNumberOfDocuments();
	// the basic model scores against the reduced token count, without fields
	basicModel.setCollectionStatistics(new CollectionStatistics(
		_cs.getNumberOfDocuments(), _cs.getNumberOfUniqueTerms(),
		activeTokens, _cs.getNumberOfPointers(), new long[0]));
}
/**
 * Constructs an instance of ExpansionTerms.
 * @param collStats statistics of the corpora used for expansion
 * @param _lexicon the lexicon used for retrieval
 * @param _directIndex direct index used for finding the terms of documents
 * @param _documentIndex document index used for finding document statistics
 */
public DFRBagExpansionTerms(CollectionStatistics collStats, Lexicon<String> _lexicon, PostingIndex<?> _directIndex, DocumentIndex _documentIndex) {
	// index structures used during expansion
	this.lexicon = _lexicon;
	this.directIndex = _directIndex;
	this.documentIndex = _documentIndex;
	// cached corpus-level statistics
	this.numberOfDocuments = collStats.getNumberOfDocuments();
	this.numberOfTokens = collStats.getNumberOfTokens();
	this.averageDocumentLength = collStats.getAverageDocumentLength();
	// per-query accumulators start empty
	this.totalDocumentLength = 0;
	this.terms = new TIntObjectHashMap<ExpansionTerm>();
}
/** Returns a human-readable summary of this index's statistics, one figure per line. */
public String toString() {
	final StringBuilder sb = new StringBuilder();
	sb.append("Number of documents: ").append(getNumberOfDocuments()).append("\n");
	sb.append("Number of terms: ").append(getNumberOfUniqueTerms()).append("\n");
	sb.append("Number of fields: ").append(getNumberOfFields()).append("\n");
	sb.append("Number of tokens: ").append(getNumberOfTokens()).append("\n");
	return sb.toString();
}
/**
 * Sets the collection statistics used to score documents (number of
 * documents in the collection, etc). The average document length is
 * computed in terms of ngrams: each document of length L contributes
 * L - (ngramLength - 1) ngrams, hence the correction subtracted below.
 */
public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
	numTokens = (double) cs.getNumberOfTokens();
	final long docCount = (long) (cs.getNumberOfDocuments());
	// subtract (ngramLength-1) per document before averaging
	avgDocLen = ((double) (numTokens - docCount * (ngramLength - 1))) / (double) docCount;
}
/** Calculate the score for a document (from the given posting for that document)*/
/**
 * Accumulates the given statistics into this object: document, pointer and
 * token counts are added, the unique-term count becomes the larger of the
 * two, and per-field token counts are summed element-wise. Average lengths
 * are recomputed afterwards.
 */
public void addStatistics(CollectionStatistics cs) {
	numberOfDocuments += cs.getNumberOfDocuments();
	numberOfPointers += cs.getNumberOfPointers();
	numberOfTokens += cs.getNumberOfTokens();
	// vocabularies overlap, so take the max rather than the sum
	numberOfUniqueTerms = Math.max(cs.getNumberOfUniqueTerms(), numberOfUniqueTerms);
	final long[] incomingFieldTokens = cs.getFieldTokens();
	for (int f = 0; f < numberOfFields; f++) {
		fieldTokens[f] += incomingFieldTokens[f];
	}
	// (sic) method name as declared elsewhere in this class
	relcaluateAverageLengths();
}
@Test public void testWritable() throws Exception { CollectionStatistics cs1 = new CollectionStatistics(5, 6, 7, 8, new long[]{2}); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); cs1.write(dos); dos.flush(); final byte[] bytes = baos.toByteArray(); assertTrue(bytes.length > 0); CollectionStatistics cs2 = new CollectionStatistics(); cs2.readFields(new DataInputStream(new ByteArrayInputStream(bytes))); assertEquals(cs1.getNumberOfDocuments(), cs2.getNumberOfDocuments()); assertEquals(cs1.getNumberOfUniqueTerms(), cs2.getNumberOfUniqueTerms()); assertEquals(cs1.getNumberOfPointers(), cs2.getNumberOfPointers()); assertEquals(cs1.getNumberOfTokens(), cs2.getNumberOfTokens()); assertEquals(cs1.getAverageDocumentLength(), cs2.getAverageDocumentLength(), 0.0d); //TODO: test fields }
/**
 * Configures this weighting model from the collection statistics: one
 * Normalisation instance is created and parameterised per field, and the
 * per-field prior p[fi] and field weight are derived from properties.
 * Property keys read per field fi: "c.fi" (normalisation parameter,
 * default 1.0) and "p.fi" (field weight, default 1.0).
 * @throws IllegalArgumentException wrapping any reflection or parsing failure
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	fieldCount = _cs.getNumberOfFields();
	p = new double[fieldCount];
	fieldWeights = new double[fieldCount];
	this.fieldNormalisations = new Normalisation[fieldCount];
	try{
		for(int fi=0;fi<fieldCount;fi++)
		{
			// one normalisation object per field, built by reflection
			final Normalisation nf = this.fieldNormalisations[fi] = normClass.newInstance();
			// normalisation parameter c for this field (default 1.0)
			final double param = Double.parseDouble(ApplicationSetup.getProperty("c."+ fi, ""+1.0));
			nf.setParameter(param);
			nf.setNumberOfDocuments(_cs.getNumberOfDocuments());
			final long tokensf = _cs.getFieldTokens()[fi];
			nf.setNumberOfTokens(tokensf);
			nf.setAverageDocumentLength(_cs.getAverageFieldLengths()[fi]);
			// uniform prior over (fieldCount x numDocs), then divided by the
			// field weight; note fieldWeights[fi] is assigned inside the expression
			p[fi] = 1.0d / ((double)fieldCount * (double) _cs.getNumberOfDocuments());
			p[fi] = p[fi] / (fieldWeights[fi] = Double.parseDouble( ApplicationSetup.getProperty("p." + fi, "1.0d")));
		}
	} catch (Exception e) {
		throw new IllegalArgumentException(e);
	}
}
@Override public void prepare() { super.prepare(); //these statistics are as used by Ivory system, of which Don Metzler was one of the authors defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d; defaultCf = defaultDf * 2; }
CollectionStatistics collStats = new CollectionStatistics(); collStats.readFields(in); frs.setCollectionStatistics(collStats); final boolean fields = collStats.getNumberOfFields() > 0; final int fieldCount = collStats.getNumberOfFields();
logger.info("Started building the inverted index..."); if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0) if (currentIndex.getCollectionStatistics().getNumberOfDocuments() == 0)
assertEquals("Number of documents is incorrect", cs.getNumberOfDocuments(), docid); assertEquals("Number of pointers is incorrect", cs.getNumberOfPointers(), pointers); assertEquals("Number of tokens is incorrect", cs.getNumberOfTokens(), tokens); if (numberOfTerms > 0)
assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments()); assertEquals(2l, index.getCollectionStatistics().getNumberOfTokens()); assertEquals(2l, index.getCollectionStatistics().getNumberOfUniqueTerms()); assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments()); assertEquals(5l, index.getCollectionStatistics().getNumberOfTokens()); assertEquals(4l, index.getCollectionStatistics().getNumberOfUniqueTerms());
/**
 * Builds a merged direct index over the constituent indices chosen by the
 * selective matching policy. For each selected index the termid offset is
 * its number of unique terms, so MultiDirect can remap termids per shard.
 * @return a MultiDirect spanning the selected indices
 */
@SuppressWarnings("unchecked")
public PostingIndex<?> getDirectIndex() {
	final int total = indices.size();
	PostingIndex<?>[] postings = new PostingIndex[total];
	int[] offsets = new int[total];
	int selected = 0;
	for (Index index : selectiveMatchingPolicy.getSelectedIndices(indices)) {
		postings[selected] = index.getDirectIndex();
		offsets[selected] = index.getCollectionStatistics().getNumberOfUniqueTerms();
		selected++;
	}
	// BUGFIX: the arrays were sized for ALL indices but only filled for the
	// selected ones; if the policy chose a subset, MultiDirect received
	// trailing null postings and zero offsets. Trim to the filled prefix.
	if (selected < total) {
		postings = java.util.Arrays.copyOf(postings, selected);
		offsets = java.util.Arrays.copyOf(offsets, selected);
	}
	return new MultiDirect((PostingIndex<Pointer>[]) postings, offsets);
}
valueFactoryClass); TerrierTimer tt = new TerrierTimer("Recompressing inverted index", index.getCollectionStatistics().getNumberOfPointers()); tt.start(); try{
System.err.println("_testSingleDocumentIndexMatchingFields: " + index.toString()); assertNotNull(index); assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments()); assertEquals(2, index.getCollectionStatistics().getNumberOfFields()); assertEquals(2, index.getCollectionStatistics().getFieldTokens()[0]); assertEquals(7, index.getCollectionStatistics().getFieldTokens()[1]); assertEquals(9, index.getDocumentIndex().getDocumentLength(0)); Matching matching = makeMatching(index); assertEquals( index.getCollectionStatistics().getNumberOfDocuments(), fatIndex.getCollectionStatistics().getNumberOfDocuments()); assertEquals(index.getCollectionStatistics().getNumberOfFields(), fatIndex.getCollectionStatistics().getNumberOfFields()); assertEquals(index.getCollectionStatistics().getFieldTokens()[0], fatIndex.getCollectionStatistics().getFieldTokens()[0]); assertEquals(index.getCollectionStatistics().getFieldTokens()[1], fatIndex.getCollectionStatistics().getFieldTokens()[1]);
collStats.write(out); final int fieldCount = collStats.getNumberOfFields(); final int queryTermCount = queryTerms.length; final boolean fields[] = new boolean[queryTermCount];
/**
 * Constructs a FatFull over the given index, caching the number of fields
 * from the parent's collection statistics.
 */
public FatFull(Index index) {
	super(index);
	this.fieldCount = super.collectionStatistics.getNumberOfFields();
}
assertEquals(123, index.getIntIndexProperty("num.field.0.Tokens", -1)); assertEquals(611, index.getIntIndexProperty("num.field.1.Tokens", -1)); assertEquals(2, index.getCollectionStatistics().getNumberOfFields()); assertEquals(123, index.getCollectionStatistics().getFieldTokens()[0]); assertEquals(611, index.getCollectionStatistics().getFieldTokens()[1]);
/** * for an immutable index, use a normal collection statistics, never changes */ protected void loadStatistics() { // calculate fields int fieldCount = 0; if (this.hasIndexStructure("inverted")) { fieldCount = Integer.parseInt(properties.getProperty( "index.inverted.fields.count", "0")); } else if (this.hasIndexStructure("direct")) { fieldCount = Integer.parseInt(properties.getProperty( "index.direct.fields.count", "0")); } final long[] tokensF = new long[fieldCount]; for (int fi = 0; fi < fieldCount; fi++) { tokensF[fi] = Long.parseLong(properties.getProperty("num.field." + fi + ".Tokens", "0")); } // create collection statistics structureCache.put( "collectionstatistics", new CollectionStatistics(Integer.parseInt(properties .getProperty("num.Documents", "0")), Integer .parseInt(properties.getProperty("num.Terms", "0")), Long.parseLong(properties .getProperty("num.Tokens", "0")), Long .parseLong(properties.getProperty( "num.Pointers", "0")), tokensF)); }