@Override public void prepare() { super.prepare(); //these statistics are as used by Ivory system, of which Don Metzler was one of the authors defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d; defaultCf = defaultDf * 2; }
/** Create a new Matching instance based on the specified index */ public Full(Index index) { super(index); if (this.getClass() == Full.class) { logger.warn(this.getClass().getName() + " is not suitable for indices with large numbers of documents (> "+WARN_DOCS+") " +"- consider using org.terrier.matching.daat.Full"); } resultSet = new AccumulatorResultSet(collectionStatistics.getNumberOfDocuments()); }
/**
 * Returns the last docid in this index, computed as the number of
 * documents in the collection statistics minus one.
 *
 * @return the number of documents minus one
 */
public int getEnd() {
    final int documentCount = this.getCollectionStatistics().getNumberOfDocuments();
    return documentCount - 1;
}
}
/**
 * Returns the number of documents reported by the collection statistics
 * of the wrapped index.
 */
@Override
public int getNumberOfDocuments() {
    final int documentCount = index.getCollectionStatistics().getNumberOfDocuments();
    return documentCount;
}
/**
 * Returns the number of documents reported by the collection statistics
 * of the wrapped index.
 */
@Override
public int getNumberOfDocuments() {
    final int documentCount = index.getCollectionStatistics().getNumberOfDocuments();
    return documentCount;
}
/**
 * Returns the number of documents reported by the collection statistics
 * of the wrapped index.
 */
@Override
public int getNumberOfDocuments() {
    final int documentCount = index.getCollectionStatistics().getNumberOfDocuments();
    return documentCount;
}
/**
 * Returns the last docid in this index, computed as the number of
 * documents in the collection statistics minus one.
 *
 * @return the number of documents minus one
 */
public int getEnd() {
    final int documentCount = this.getCollectionStatistics().getNumberOfDocuments();
    return documentCount - 1;
}
}
/**
 * Sets the collection statistics used to score the documents (number of
 * documents in the collection, etc).
 * <p>
 * Stores the token count, then computes the average document length after
 * subtracting {@code (ngramLength - 1)} per document from that count.
 *
 * @param cs the statistics to read the token and document counts from
 * @param _index not referenced by this implementation
 */
public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
    numTokens = (double) cs.getNumberOfTokens();
    final long documentCount = cs.getNumberOfDocuments();
    final double adjustedTokens = numTokens - documentCount * (ngramLength - 1);
    avgDocLen = adjustedTokens / (double) documentCount;
}
/** Calculate the score for a document (from the given posting for that document)*/
/**
 * Sets the collection statistics used to score the documents (number of
 * documents in the collection, etc).
 * <p>
 * Stores the token count, then computes the average document length after
 * subtracting {@code (ngramLength - 1)} per document from that count.
 *
 * @param cs the statistics to read the token and document counts from
 * @param _index not referenced by this implementation
 */
public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
    numTokens = (double) cs.getNumberOfTokens();
    final long documentCount = cs.getNumberOfDocuments();
    final double adjustedTokens = numTokens - documentCount * (ngramLength - 1);
    avgDocLen = adjustedTokens / (double) documentCount;
}
/** Calculate the score for a document (from the given posting for that document)*/
/** * {@inheritDoc} */ @Override public void setCollectionStatistics(CollectionStatistics cs, Index _index) { super.setCollectionStatistics(cs, _index); w_o = Double.parseDouble(ApplicationSetup.getProperty("proximity."+super.ngramLength+".w_o", ApplicationSetup.getProperty("proximity.w_o", "1.0d"))); //these statistics are as used by Ivory system defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d; defaultCf = defaultDf * 2; }
/** * {@inheritDoc} */ @Override public void setCollectionStatistics(CollectionStatistics cs, Index _index) { super.setCollectionStatistics(cs, _index); w_o = Double.parseDouble(ApplicationSetup.getProperty("proximity."+super.ngramLength+".w_o", ApplicationSetup.getProperty("proximity.w_o", "1.0d"))); //these statistics are as used by Ivory system defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d; defaultCf = defaultDf * 2; }
/**
 * Returns a concrete representation of an index's statistics: one line each
 * for the number of documents, terms, fields and tokens, every line
 * terminated by a newline character.
 */
public String toString() {
    final StringBuilder summary = new StringBuilder();
    summary.append("Number of documents: ").append(getNumberOfDocuments()).append("\n");
    summary.append("Number of terms: ").append(getNumberOfUniqueTerms()).append("\n");
    summary.append("Number of fields: ").append(getNumberOfFields()).append("\n");
    summary.append("Number of tokens: ").append(getNumberOfTokens()).append("\n");
    return summary.toString();
}
/**
 * Increment the statistics by the specified amount.
 * <p>
 * Document, pointer, token and per-field token counts are summed. The
 * number of unique terms is not summed: the larger of the two values is
 * kept. Average lengths are recalculated afterwards.
 *
 * @param cs the statistics to fold into this instance
 */
public void addStatistics(CollectionStatistics cs) {
    numberOfDocuments += cs.getNumberOfDocuments();
    numberOfPointers += cs.getNumberOfPointers();
    numberOfTokens += cs.getNumberOfTokens();
    numberOfUniqueTerms = Math.max(numberOfUniqueTerms, cs.getNumberOfUniqueTerms());

    final long[] incomingFieldTokens = cs.getFieldTokens();
    int field = 0;
    while (field < numberOfFields) {
        fieldTokens[field] += incomingFieldTokens[field];
        field++;
    }

    relcaluateAverageLengths();
}
/**
 * Passes the collection statistics to the superclass and then copies the
 * relevant values into the basic model, the after effect, the normalisation
 * and the {@code i} component.
 *
 * @param _cs the collection statistics to propagate
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
    super.setCollectionStatistics(_cs);
    final int documentCount = _cs.getNumberOfDocuments();

    // basic model
    this.basicModel.setNumberOfDocuments(documentCount);
    this.basicModel.setNumberOfTokens(_cs.getNumberOfTokens());

    // after effect
    this.afterEffect.setAverageDocumentLength(_cs.getAverageDocumentLength());

    // normalisation
    this.normalisation.setNumberOfDocuments(documentCount);
    this.normalisation.setNumberOfTokens(_cs.getNumberOfTokens());
    this.normalisation.setAverageDocumentLength(_cs.getAverageDocumentLength());

    this.i.setNumberOfDocuments(documentCount);
}
/**
 * Passes the collection statistics to the superclass and then copies the
 * relevant values into the basic model, the after effect, the normalisation
 * and the {@code i} component.
 *
 * @param _cs the collection statistics to propagate
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
    super.setCollectionStatistics(_cs);
    final int documentCount = _cs.getNumberOfDocuments();

    // basic model
    this.basicModel.setNumberOfDocuments(documentCount);
    this.basicModel.setNumberOfTokens(_cs.getNumberOfTokens());

    // after effect
    this.afterEffect.setAverageDocumentLength(_cs.getAverageDocumentLength());

    // normalisation
    this.normalisation.setNumberOfDocuments(documentCount);
    this.normalisation.setNumberOfTokens(_cs.getNumberOfTokens());
    this.normalisation.setAverageDocumentLength(_cs.getAverageDocumentLength());

    this.i.setNumberOfDocuments(documentCount);
}
/**
 * Builds an index over three documents, the last of which has empty
 * content, and asserts that the index still reports three documents.
 *
 * @param clz the indexer class passed to {@code IndexTestUtils.makeIndex}
 * @throws Exception if building the index fails
 */
void _testIndexTrailingEmptyDocument(Class<? extends Indexer> clz) throws Exception {
    final String[] docnos = {"doc1", "doc2", "doc3"};
    final String[] contents = {"test document", "another test document", "" /* empty doc */};
    final Index index = IndexTestUtils.makeIndex(docnos, contents, clz);
    final int indexedDocuments = index.getCollectionStatistics().getNumberOfDocuments();
    assertEquals(3, indexedDocuments);
}
/**
 * Writes a collection specification listing the two Merchant of Venice TREC
 * files, runs the {@code batchindexing} command with {@code -b}, and checks
 * the resulting index: it must exist, contain 22 documents, and expose
 * {@link BlockPosting} instances in the direct index for the first document.
 *
 * @throws Exception if writing the specification, indexing or reading the index fails
 */
@Test
public void test() throws Exception {
    final String workingDir = System.getProperty("user.dir");
    // try-with-resources closes the spec file even if a statement in the body throws,
    // and guarantees it is closed before indexing starts
    try (PrintWriter p = new PrintWriter(Files.writeFileWriter(ApplicationSetup.COLLECTION_SPEC))) {
        p.println(workingDir + "/../../share/tests/shakespeare/shakespeare-merchant.trec.1");
        p.println(workingDir + "/../../share/tests/shakespeare/shakespeare-merchant.trec.2");
    }

    CLITool.main(new String[]{"batchindexing", "-b"});

    Index index = Index.createIndex();
    assertNotNull(index);
    assertEquals(22, index.getCollectionStatistics().getNumberOfDocuments());
    assertTrue(
        index.getDirectIndex().getPostings(index.getDocumentIndex().getDocumentEntry(0))
            instanceof BlockPosting);
}
/**
 * Writes a collection specification listing the two Merchant of Venice TREC
 * files, runs the {@code batchindexing} command with {@code -b -p}, and
 * checks the resulting index: it must exist, contain 22 documents, and
 * expose {@link BlockPosting} instances in the direct index for the first
 * document.
 *
 * @throws Exception if writing the specification, indexing or reading the index fails
 */
@Test
public void testParallel() throws Exception {
    final String workingDir = System.getProperty("user.dir");
    // try-with-resources closes the spec file even if a statement in the body throws,
    // and guarantees it is closed before indexing starts
    try (PrintWriter p = new PrintWriter(Files.writeFileWriter(ApplicationSetup.COLLECTION_SPEC))) {
        p.println(workingDir + "/../../share/tests/shakespeare/shakespeare-merchant.trec.1");
        p.println(workingDir + "/../../share/tests/shakespeare/shakespeare-merchant.trec.2");
    }

    CLITool.main(new String[]{"batchindexing", "-b", "-p"});

    Index index = Index.createIndex();
    assertNotNull(index);
    assertEquals(22, index.getCollectionStatistics().getNumberOfDocuments());
    assertTrue(
        index.getDirectIndex().getPostings(index.getDocumentIndex().getDocumentEntry(0))
            instanceof BlockPosting);
}
/**
 * Copies the values needed for scoring into this object's fields.
 * <p>
 * Collection-level values (average document length and the numbers of
 * documents, tokens, unique terms and pointers) are read from {@code cs};
 * the document count is also handed to {@code i}. The document frequency
 * and term frequency are read from {@code es} and passed through
 * {@code getOverflowed} before being stored.
 */
public void prepare() {
    // collection-level statistics
    averageDocumentLength = cs.getAverageDocumentLength();
    final double documentCount = (double) cs.getNumberOfDocuments();
    numberOfDocuments = documentCount;
    i.setNumberOfDocuments(documentCount);
    numberOfTokens = (double) cs.getNumberOfTokens();
    numberOfUniqueTerms = (double) cs.getNumberOfUniqueTerms();
    numberOfPointers = (double) cs.getNumberOfPointers();

    // term-level statistics
    documentFrequency = (double) getOverflowed(es.getDocumentFrequency());
    termFrequency = (double) getOverflowed(es.getFrequency());
}
/**
 * Asserts that the index's collection statistics match the expected test
 * fixture values: document count, token count, average document length,
 * pointer count and unique term count. The document count is also printed
 * to standard error.
 *
 * @param index the index whose collection statistics are checked
 */
protected void checkCollectionStatistics(Index index) {
    final CollectionStatistics stats = index.getCollectionStatistics();
    final int documentCount = stats.getNumberOfDocuments();
    System.err.println("num docs=" + documentCount);

    assertEquals("Number of documents doesn't match",
        DOCUMENT_LENGTHS.length, documentCount);
    assertEquals("Number of tokens doesn't match",
        StaTools.sum(DOCUMENT_LENGTHS), stats.getNumberOfTokens());
    assertEquals("Average document length doesn't match",
        StaTools.mean(DOCUMENT_LENGTHS), stats.getAverageDocumentLength(), 0.0d);
    assertEquals("Number of pointers doesnt match",
        NUMBER_POINTERS, stats.getNumberOfPointers());
    assertEquals("Number of unique terms doesn't match",
        NUMBER_UNIQUE_TERMS, stats.getNumberOfUniqueTerms());
}