/** Sets the collection statistics used to score the documents (number of documents in the collection, etc). */
public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
	numTokens = (double) cs.getNumberOfTokens();
	final long docCount = cs.getNumberOfDocuments();
	// Each document yields (ngramLength - 1) fewer ngrams than it has tokens,
	// so that overhead is subtracted before computing the average length.
	avgDocLen = ((double) (numTokens - docCount * (ngramLength - 1))) / (double) docCount;
}
/** Calculate the score for a document (from the given posting for that document)*/
/** Sets the collection statistics used to score the documents (number of documents in the collection, etc). */
public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
	numTokens = (double) cs.getNumberOfTokens();
	final long docCount = cs.getNumberOfDocuments();
	// Each document yields (ngramLength - 1) fewer ngrams than it has tokens,
	// so that overhead is subtracted before computing the average length.
	avgDocLen = ((double) (numTokens - docCount * (ngramLength - 1))) / (double) docCount;
}
/** Calculate the score for a document (from the given posting for that document)*/
/** Returns a concrete representation of an index's statistics. */
@Override
public String toString() {
	final StringBuilder report = new StringBuilder();
	report.append("Number of documents: ").append(getNumberOfDocuments()).append("\n");
	report.append("Number of terms: ").append(getNumberOfUniqueTerms()).append("\n");
	report.append("Number of fields: ").append(getNumberOfFields()).append("\n");
	report.append("Number of tokens: ").append(getNumberOfTokens()).append("\n");
	return report.toString();
}
/** Returns a concrete representation of an index's statistics. */
@Override
public String toString() {
	final StringBuilder report = new StringBuilder();
	report.append("Number of documents: ").append(getNumberOfDocuments()).append("\n");
	report.append("Number of terms: ").append(getNumberOfUniqueTerms()).append("\n");
	report.append("Number of fields: ").append(getNumberOfFields()).append("\n");
	report.append("Number of tokens: ").append(getNumberOfTokens()).append("\n");
	return report.toString();
}
/** Increment the statistics by the specified amount. */
public void addStatistics(CollectionStatistics cs) {
	numberOfDocuments += cs.getNumberOfDocuments();
	numberOfPointers += cs.getNumberOfPointers();
	numberOfTokens += cs.getNumberOfTokens();
	// Vocabulary sizes are not additive across partitions (term sets overlap),
	// so the larger of the two estimates is kept.
	numberOfUniqueTerms = Math.max(cs.getNumberOfUniqueTerms(), numberOfUniqueTerms);
	final long[] incomingFieldTokens = cs.getFieldTokens();
	for (int field = 0; field < numberOfFields; field++) {
		fieldTokens[field] += incomingFieldTokens[field];
	}
	// (sic) helper name is misspelled where it is declared elsewhere in this class.
	relcaluateAverageLengths();
}
/**
 * Constructs an instance of ExpansionTerms.
 * @param collStats Statistics of the used corpora
 * @param _lexicon Lexicon The lexicon used for retrieval.
 * @param _directIndex DirectIndex to use for finding terms for documents
 * @param _documentIndex DocumentIndex to use for finding statistics about documents
 */
public DFRBagExpansionTerms(CollectionStatistics collStats, Lexicon<String> _lexicon, PostingIndex<?> _directIndex, DocumentIndex _documentIndex) {
	// Cache the collection-wide statistics used by the term weighting.
	this.numberOfDocuments = collStats.getNumberOfDocuments();
	this.numberOfTokens = collStats.getNumberOfTokens();
	this.averageDocumentLength = collStats.getAverageDocumentLength();
	// No documents have been added yet.
	this.terms = new TIntObjectHashMap<ExpansionTerm>();
	this.totalDocumentLength = 0;
	// Index structures used to look up terms and per-document statistics.
	this.lexicon = _lexicon;
	this.documentIndex = _documentIndex;
	this.directIndex = _directIndex;
}
/**
 * Constructs an instance of ExpansionTerms.
 * @param collStats Statistics of the used corpora
 * @param _lexicon Lexicon The lexicon used for retrieval.
 * @param _directIndex DirectIndex to use for finding terms for documents
 * @param _documentIndex DocumentIndex to use for finding statistics about documents
 */
public DFRBagExpansionTerms(CollectionStatistics collStats, Lexicon<String> _lexicon, PostingIndex<?> _directIndex, DocumentIndex _documentIndex) {
	// Cache the collection-wide statistics used by the term weighting.
	this.numberOfDocuments = collStats.getNumberOfDocuments();
	this.numberOfTokens = collStats.getNumberOfTokens();
	this.averageDocumentLength = collStats.getAverageDocumentLength();
	// No documents have been added yet.
	this.terms = new TIntObjectHashMap<ExpansionTerm>();
	this.totalDocumentLength = 0;
	// Index structures used to look up terms and per-document statistics.
	this.lexicon = _lexicon;
	this.documentIndex = _documentIndex;
	this.directIndex = _directIndex;
}
/** Increment the statistics by the specified amount. */
public void addStatistics(CollectionStatistics cs) {
	numberOfDocuments += cs.getNumberOfDocuments();
	numberOfPointers += cs.getNumberOfPointers();
	numberOfTokens += cs.getNumberOfTokens();
	// Vocabulary sizes are not additive across partitions (term sets overlap),
	// so the larger of the two estimates is kept.
	numberOfUniqueTerms = Math.max(cs.getNumberOfUniqueTerms(), numberOfUniqueTerms);
	final long[] incomingFieldTokens = cs.getFieldTokens();
	for (int field = 0; field < numberOfFields; field++) {
		fieldTokens[field] += incomingFieldTokens[field];
	}
	// (sic) helper name is misspelled where it is declared elsewhere in this class.
	relcaluateAverageLengths();
}
/**
 * Prints summary collection statistics (indexed documents, vocabulary size,
 * tokens and pointers) for the index at the configured Terrier path/prefix.
 * @param args command-line arguments (unused)
 * @return 0 always; a failure while closing the index is reported, not fatal
 */
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) {
	Index.setIndexLoadingProfileAsRetrieval(false);
	Index i = IndexFactory.of(IndexRef.of(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX));
	// Resolve the statistics object once instead of once per printed line.
	final CollectionStatistics stats = i.getCollectionStatistics();
	System.out.println("Collection statistics:");
	System.out.println("number of indexed documents: " + stats.getNumberOfDocuments());
	System.out.println("size of vocabulary: " + stats.getNumberOfUniqueTerms());
	System.out.println("number of tokens: " + stats.getNumberOfTokens());
	System.out.println("number of pointers: " + stats.getNumberOfPointers());
	try {
		i.close();
	} catch (IOException e) {
		// Previously swallowed silently; surface the failure so it is not invisible.
		System.err.println("Failed to close index: " + e);
	}
	return 0;
}
/**
 * Prints summary collection statistics (indexed documents, vocabulary size,
 * tokens and pointers) for the index at the configured Terrier path/prefix.
 * @param args command-line arguments (unused)
 * @return 0 always; a failure while closing the index is reported, not fatal
 */
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) {
	Index.setIndexLoadingProfileAsRetrieval(false);
	Index i = IndexFactory.of(IndexRef.of(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX));
	// Resolve the statistics object once instead of once per printed line.
	final CollectionStatistics stats = i.getCollectionStatistics();
	System.out.println("Collection statistics:");
	System.out.println("number of indexed documents: " + stats.getNumberOfDocuments());
	System.out.println("size of vocabulary: " + stats.getNumberOfUniqueTerms());
	System.out.println("number of tokens: " + stats.getNumberOfTokens());
	System.out.println("number of pointers: " + stats.getNumberOfPointers());
	try {
		i.close();
	} catch (IOException e) {
		// Previously swallowed silently; surface the failure so it is not invisible.
		System.err.println("Failed to close index: " + e);
	}
	return 0;
}
/**
 * Static factory that merges an array of per-index collection statistics into one:
 * document, token and pointer counts are summed, while the vocabulary size is the
 * maximum across the inputs (term sets overlap, so summing would over-count).
 * NOTE(review): fieldTokens is hard-coded to a single zero entry — per-field token
 * counts from the input statistics are not aggregated; confirm this is intended.
 * @param stats the per-index statistics to combine
 * @return the merged statistics
 */ public static MultiStats factory(CollectionStatistics[] stats) { int numDocs = 0, numTerms = 0; long numTokens = 0, numPointers = 0; long[] fieldTokens = new long[] { 0 }; for (CollectionStatistics stat : stats) { numDocs += stat.getNumberOfDocuments(); numTokens += stat.getNumberOfTokens(); numPointers += stat.getNumberOfPointers(); if (stat.getNumberOfUniqueTerms() > numTerms) numTerms = stat.getNumberOfUniqueTerms(); } return new MultiStats(numDocs, numTerms, numTokens, numPointers, fieldTokens); }
/**
 * Propagates the collection statistics to every component of this weighting
 * model: the basic model, the after effect and the normalisation.
 * @param _cs the collection statistics to apply
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	// Basic model needs collection-level counts.
	this.basicModel.setNumberOfDocuments(_cs.getNumberOfDocuments());
	this.basicModel.setNumberOfTokens(_cs.getNumberOfTokens());
	// After effect depends only on the average document length.
	this.afterEffect.setAverageDocumentLength(_cs.getAverageDocumentLength());
	// Normalisation needs all three.
	this.normalisation.setNumberOfDocuments(_cs.getNumberOfDocuments());
	this.normalisation.setNumberOfTokens(_cs.getNumberOfTokens());
	this.normalisation.setAverageDocumentLength(_cs.getAverageDocumentLength());
	this.i.setNumberOfDocuments(_cs.getNumberOfDocuments());
}
/**
 * Propagates the collection statistics to every component of this weighting
 * model: the basic model, the after effect and the normalisation.
 * @param _cs the collection statistics to apply
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	// Basic model needs collection-level counts.
	this.basicModel.setNumberOfDocuments(_cs.getNumberOfDocuments());
	this.basicModel.setNumberOfTokens(_cs.getNumberOfTokens());
	// After effect depends only on the average document length.
	this.afterEffect.setAverageDocumentLength(_cs.getAverageDocumentLength());
	// Normalisation needs all three.
	this.normalisation.setNumberOfDocuments(_cs.getNumberOfDocuments());
	this.normalisation.setNumberOfTokens(_cs.getNumberOfTokens());
	this.normalisation.setAverageDocumentLength(_cs.getAverageDocumentLength());
	this.i.setNumberOfDocuments(_cs.getNumberOfDocuments());
}
/**
 * Configures this model from the collection statistics: validates the field
 * count, initialises the basic model, and builds one Normalisation instance
 * per field, each parameterised by the properties w.&lt;fi&gt; (field weight)
 * and c.&lt;fi&gt; (normalisation parameter), both defaulting to 1.0.
 * @param _cs the collection statistics to apply
 * @throws IllegalStateException if the index has no fields, if a Normalisation
 *         cannot be instantiated, or if a property value cannot be parsed
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	fieldCount = _cs.getNumberOfFields();
	if (fieldCount < 1)
		throw new IllegalStateException("Fields must be 1 or more");
	basicModel.setNumberOfDocuments(_cs.getNumberOfDocuments());
	basicModel.setNumberOfTokens(_cs.getNumberOfTokens());
	fieldNormalisations = new Normalisation[fieldCount];
	fieldGlobalFrequencies = new double[fieldCount];
	fieldWeights = new double[fieldCount];
	// Hoist the per-field statistic arrays out of the loop — no need to
	// re-fetch them on every iteration.
	final long[] fieldTokenCounts = _cs.getFieldTokens();
	final double[] averageFieldLengths = _cs.getAverageFieldLengths();
	try {
		for (int fi = 0; fi < fieldCount; fi++) {
			fieldWeights[fi] = Double.parseDouble(ApplicationSetup.getProperty("w." + fi, "" + 1.0));
			// getDeclaredConstructor().newInstance() replaces the deprecated
			// Class.newInstance(), which propagates checked exceptions unchecked;
			// any reflective failure still lands in the catch below.
			Normalisation nf = this.fieldNormalisations[fi] = normClass.getDeclaredConstructor().newInstance();
			final double param = Double.parseDouble(ApplicationSetup.getProperty("c." + fi, "" + 1.0));
			nf.setParameter(param);
			nf.setNumberOfDocuments(_cs.getNumberOfDocuments());
			nf.setNumberOfTokens(fieldTokenCounts[fi]);
			nf.setAverageDocumentLength(averageFieldLengths[fi]);
		}
	} catch (Exception e) {
		throw new IllegalStateException(e);
	}
}
/**
 * Configures this model from the collection statistics: validates the field
 * count, initialises the basic model, and builds one Normalisation instance
 * per field, each parameterised by the properties w.&lt;fi&gt; (field weight)
 * and c.&lt;fi&gt; (normalisation parameter), both defaulting to 1.0.
 * @param _cs the collection statistics to apply
 * @throws IllegalStateException if the index has no fields, if a Normalisation
 *         cannot be instantiated, or if a property value cannot be parsed
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	fieldCount = _cs.getNumberOfFields();
	if (fieldCount < 1)
		throw new IllegalStateException("Fields must be 1 or more");
	basicModel.setNumberOfDocuments(_cs.getNumberOfDocuments());
	basicModel.setNumberOfTokens(_cs.getNumberOfTokens());
	fieldNormalisations = new Normalisation[fieldCount];
	fieldGlobalFrequencies = new double[fieldCount];
	fieldWeights = new double[fieldCount];
	// Hoist the per-field statistic arrays out of the loop — no need to
	// re-fetch them on every iteration.
	final long[] fieldTokenCounts = _cs.getFieldTokens();
	final double[] averageFieldLengths = _cs.getAverageFieldLengths();
	try {
		for (int fi = 0; fi < fieldCount; fi++) {
			fieldWeights[fi] = Double.parseDouble(ApplicationSetup.getProperty("w." + fi, "" + 1.0));
			// getDeclaredConstructor().newInstance() replaces the deprecated
			// Class.newInstance(), which propagates checked exceptions unchecked;
			// any reflective failure still lands in the catch below.
			Normalisation nf = this.fieldNormalisations[fi] = normClass.getDeclaredConstructor().newInstance();
			final double param = Double.parseDouble(ApplicationSetup.getProperty("c." + fi, "" + 1.0));
			nf.setParameter(param);
			nf.setNumberOfDocuments(_cs.getNumberOfDocuments());
			nf.setNumberOfTokens(fieldTokenCounts[fi]);
			nf.setAverageDocumentLength(averageFieldLengths[fi]);
		}
	} catch (Exception e) {
		throw new IllegalStateException(e);
	}
}
/**
 * Caches the collection-level and term-level statistics (as doubles) that
 * scoring will use, and pushes the document count into the idf component.
 */
public void prepare() {
	// Collection-wide statistics.
	averageDocumentLength = cs.getAverageDocumentLength();
	numberOfDocuments = (double) cs.getNumberOfDocuments();
	i.setNumberOfDocuments(numberOfDocuments);
	numberOfTokens = (double) cs.getNumberOfTokens();
	numberOfUniqueTerms = (double) cs.getNumberOfUniqueTerms();
	numberOfPointers = (double) cs.getNumberOfPointers();
	// Term statistics; getOverflowed presumably guards against counter
	// overflow — confirm its contract where it is defined.
	documentFrequency = (double) getOverflowed(es.getDocumentFrequency());
	termFrequency = (double) getOverflowed(es.getFrequency());
}
/**
 * Caches the collection-level and term-level statistics (as doubles) that
 * scoring will use, and pushes the document count into the idf component.
 */
public void prepare() {
	// Collection-wide statistics.
	averageDocumentLength = cs.getAverageDocumentLength();
	numberOfDocuments = (double) cs.getNumberOfDocuments();
	i.setNumberOfDocuments(numberOfDocuments);
	numberOfTokens = (double) cs.getNumberOfTokens();
	numberOfUniqueTerms = (double) cs.getNumberOfUniqueTerms();
	numberOfPointers = (double) cs.getNumberOfPointers();
	// Term statistics; getOverflowed presumably guards against counter
	// overflow — confirm its contract where it is defined.
	documentFrequency = (double) getOverflowed(es.getDocumentFrequency());
	termFrequency = (double) getOverflowed(es.getFrequency());
}
@Test public void testWritable() throws Exception { CollectionStatistics cs1 = new CollectionStatistics(5, 6, 7, 8, new long[]{2}); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); cs1.write(dos); dos.flush(); final byte[] bytes = baos.toByteArray(); assertTrue(bytes.length > 0); CollectionStatistics cs2 = new CollectionStatistics(); cs2.readFields(new DataInputStream(new ByteArrayInputStream(bytes))); assertEquals(cs1.getNumberOfDocuments(), cs2.getNumberOfDocuments()); assertEquals(cs1.getNumberOfUniqueTerms(), cs2.getNumberOfUniqueTerms()); assertEquals(cs1.getNumberOfPointers(), cs2.getNumberOfPointers()); assertEquals(cs1.getNumberOfTokens(), cs2.getNumberOfTokens()); assertEquals(cs1.getAverageDocumentLength(), cs2.getAverageDocumentLength(), 0.0d); //TODO: test fields }
/** Asserts that the index's collection statistics match the known fixture values. */
protected void checkCollectionStatistics(Index index) {
	final CollectionStatistics stats = index.getCollectionStatistics();
	System.err.println("num docs=" + stats.getNumberOfDocuments());
	assertEquals("Number of documents doesn't match", DOCUMENT_LENGTHS.length, stats.getNumberOfDocuments());
	assertEquals("Number of tokens doesn't match", StaTools.sum(DOCUMENT_LENGTHS), stats.getNumberOfTokens());
	assertEquals("Average document length doesn't match", StaTools.mean(DOCUMENT_LENGTHS), stats.getAverageDocumentLength(), 0.0d);
	assertEquals("Number of pointers doesnt match", NUMBER_POINTERS, stats.getNumberOfPointers());
	assertEquals("Number of unique terms doesn't match", NUMBER_UNIQUE_TERMS, stats.getNumberOfUniqueTerms());
}
// NOTE(review): expects frInput's index statistics to report exactly one token and one unique term.
assertEquals(1, frInput.getCollectionStatistics().getNumberOfTokens()); assertEquals(1, frInput.getCollectionStatistics().getNumberOfUniqueTerms());