/**
 * Constructs an instance of ExpansionTerms.
 * @param collStats Statistics of the corpus used
 * @param _lexicon The lexicon used for retrieval.
 * @param _directIndex DirectIndex to use for finding terms for documents
 * @param _documentIndex DocumentIndex to use for finding statistics about documents
 */
public DFRBagExpansionTerms(CollectionStatistics collStats, Lexicon<String> _lexicon,
		PostingIndex<?> _directIndex, DocumentIndex _documentIndex)
{
	this.numberOfDocuments = collStats.getNumberOfDocuments();
	this.numberOfTokens = collStats.getNumberOfTokens();
	this.averageDocumentLength = collStats.getAverageDocumentLength();
	this.terms = new TIntObjectHashMap<ExpansionTerm>();
	this.totalDocumentLength = 0;
	this.lexicon = _lexicon;
	this.documentIndex = _documentIndex;
	this.directIndex = _directIndex;
}
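// Usage sketch (not part of the original source): one way this constructor might be
// invoked from an open Terrier Index, assuming the index exposes the usual accessors
// for its structures; the variable names below are illustrative only.
Index index = Index.createIndex();
ExpansionTerms expTerms = new DFRBagExpansionTerms(
		index.getCollectionStatistics(),
		index.getLexicon(),
		index.getDirectIndex(),
		index.getDocumentIndex());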
@Override
public void setCollectionStatistics(CollectionStatistics _cs)
{
	super.setCollectionStatistics(_cs);
	this.basicModel.setNumberOfDocuments(_cs.getNumberOfDocuments());
	this.basicModel.setNumberOfTokens(_cs.getNumberOfTokens());
	this.afterEffect.setAverageDocumentLength(_cs.getAverageDocumentLength());
	this.normalisation.setNumberOfDocuments(_cs.getNumberOfDocuments());
	this.normalisation.setNumberOfTokens(_cs.getNumberOfTokens());
	this.normalisation.setAverageDocumentLength(_cs.getAverageDocumentLength());
	this.i.setNumberOfDocuments(_cs.getNumberOfDocuments());
}
/** Prepares the weighting model: caches the collection statistics and the
 *  term's document and collection frequencies before scoring. */
public void prepare()
{
	averageDocumentLength = cs.getAverageDocumentLength();
	numberOfDocuments = (double) cs.getNumberOfDocuments();
	i.setNumberOfDocuments(numberOfDocuments);
	numberOfTokens = (double) cs.getNumberOfTokens();
	numberOfUniqueTerms = (double) cs.getNumberOfUniqueTerms();
	numberOfPointers = (double) cs.getNumberOfPointers();
	documentFrequency = (double) getOverflowed(es.getDocumentFrequency());
	termFrequency = (double) getOverflowed(es.getFrequency());
}
@Test
public void testWritable() throws Exception
{
	CollectionStatistics cs1 = new CollectionStatistics(5, 6, 7, 8, new long[]{2});
	ByteArrayOutputStream baos = new ByteArrayOutputStream();
	DataOutputStream dos = new DataOutputStream(baos);
	cs1.write(dos);
	dos.flush();
	final byte[] bytes = baos.toByteArray();
	assertTrue(bytes.length > 0);
	CollectionStatistics cs2 = new CollectionStatistics();
	cs2.readFields(new DataInputStream(new ByteArrayInputStream(bytes)));
	assertEquals(cs1.getNumberOfDocuments(), cs2.getNumberOfDocuments());
	assertEquals(cs1.getNumberOfUniqueTerms(), cs2.getNumberOfUniqueTerms());
	assertEquals(cs1.getNumberOfPointers(), cs2.getNumberOfPointers());
	assertEquals(cs1.getNumberOfTokens(), cs2.getNumberOfTokens());
	assertEquals(cs1.getAverageDocumentLength(), cs2.getAverageDocumentLength(), 0.0d);
	//TODO: test fields
}
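// Sketch (an assumption, not from the original source): the same write/readFields
// round-trip exercised in the test above could persist CollectionStatistics to disk.
// The file name is illustrative and the usual java.io imports are assumed.
try (DataOutputStream out = new DataOutputStream(new FileOutputStream("collection.stats"))) {
	cs1.write(out);
}
CollectionStatistics restored = new CollectionStatistics();
try (DataInputStream in = new DataInputStream(new FileInputStream("collection.stats"))) {
	restored.readFields(in);
}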
protected void checkCollectionStatistics(Index index)
{
	final CollectionStatistics cs = index.getCollectionStatistics();
	System.err.println("num docs=" + cs.getNumberOfDocuments());
	assertEquals("Number of documents doesn't match", DOCUMENT_LENGTHS.length, cs.getNumberOfDocuments());
	assertEquals("Number of tokens doesn't match", StaTools.sum(DOCUMENT_LENGTHS), cs.getNumberOfTokens());
	assertEquals("Average document length doesn't match", StaTools.mean(DOCUMENT_LENGTHS), cs.getAverageDocumentLength(), 0.0d);
	assertEquals("Number of pointers doesn't match", NUMBER_POINTERS, cs.getNumberOfPointers());
	assertEquals("Number of unique terms doesn't match", NUMBER_UNIQUE_TERMS, cs.getNumberOfUniqueTerms());
}
assertEquals(4, cs.getNumberOfUniqueTerms());
assertEquals(6L, cs.getNumberOfPointers());
assertEquals(4.0d, cs.getAverageDocumentLength(), 0.0d);
assertEquals(stats1.getAverageDocumentLength(), stats2.getAverageDocumentLength(), 0.0d);