/**
 * Static factory that merges the statistics of several constituent indices
 * into a single MultiStats. Document, token and pointer counts are summed
 * across the shards; the unique-term count is taken as the maximum observed,
 * since shard vocabularies overlap.
 * NOTE(review): field token counts are not aggregated here — a single-entry
 * zero array is passed through; confirm whether callers rely on fields.
 */
public static MultiStats factory(CollectionStatistics[] stats) {
	int totalDocs = 0;
	int maxTerms = 0;
	long totalTokens = 0;
	long totalPointers = 0;
	final long[] fieldTokens = new long[] { 0 };
	for (int i = 0; i < stats.length; i++) {
		final CollectionStatistics shard = stats[i];
		totalDocs += shard.getNumberOfDocuments();
		totalTokens += shard.getNumberOfTokens();
		totalPointers += shard.getNumberOfPointers();
		maxTerms = Math.max(maxTerms, shard.getNumberOfUniqueTerms());
	}
	return new MultiStats(totalDocs, maxTerms, totalTokens, totalPointers, fieldTokens);
}
/**
 * Installs the collection statistics, recomputing the token count and average
 * document length over only the active fields, and passes the adjusted
 * (field-less) statistics on to the basic model.
 * @throws IllegalStateException if the statistics carry no fields
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	if (_cs.getNumberOfFields() < 1)
		throw new IllegalStateException("Fields must be 1 or more");
	// sum tokens over the active fields only
	final long[] perField = _cs.getFieldTokens();
	long activeTokens = 0;
	for (int fieldId : activeFieldIds)
		activeTokens += perField[fieldId];
	super.numberOfTokens = activeTokens;
	super.averageDocumentLength = (double) activeTokens / (double) _cs.getNumberOfDocuments();
	// the basic model scores against the reduced token count, without fields
	basicModel.setCollectionStatistics(new CollectionStatistics(
		_cs.getNumberOfDocuments(), _cs.getNumberOfUniqueTerms(),
		activeTokens, _cs.getNumberOfPointers(), new long[0]));
}
/**
 * Constructs an instance of ExpansionTerms.
 * @param collStats statistics of the corpora used for expansion
 * @param _lexicon the lexicon used for retrieval
 * @param _directIndex direct index used for finding the terms of documents
 * @param _documentIndex document index used for finding document statistics
 */
public DFRBagExpansionTerms(CollectionStatistics collStats, Lexicon<String> _lexicon, PostingIndex<?> _directIndex, DocumentIndex _documentIndex) {
	// index structures used during expansion
	this.lexicon = _lexicon;
	this.directIndex = _directIndex;
	this.documentIndex = _documentIndex;
	// cached corpus-level statistics
	this.numberOfDocuments = collStats.getNumberOfDocuments();
	this.numberOfTokens = collStats.getNumberOfTokens();
	this.averageDocumentLength = collStats.getAverageDocumentLength();
	// per-query accumulators start empty
	this.totalDocumentLength = 0;
	this.terms = new TIntObjectHashMap<ExpansionTerm>();
}
/** Returns a human-readable summary of this index's statistics, one figure per line. */
public String toString() {
	final StringBuilder sb = new StringBuilder();
	sb.append("Number of documents: ").append(getNumberOfDocuments()).append("\n");
	sb.append("Number of terms: ").append(getNumberOfUniqueTerms()).append("\n");
	sb.append("Number of fields: ").append(getNumberOfFields()).append("\n");
	sb.append("Number of tokens: ").append(getNumberOfTokens()).append("\n");
	return sb.toString();
}
/**
 * Sets the collection statistics used to score documents (number of
 * documents in the collection, etc). The average document length is
 * computed in terms of ngrams: each document of length L contributes
 * L - (ngramLength - 1) ngrams, hence the correction subtracted below.
 */
public void setCollectionStatistics(CollectionStatistics cs, Index _index) {
	numTokens = (double) cs.getNumberOfTokens();
	final long docCount = (long) (cs.getNumberOfDocuments());
	// subtract (ngramLength-1) per document before averaging
	avgDocLen = ((double) (numTokens - docCount * (ngramLength - 1))) / (double) docCount;
}
/** Calculate the score for a document (from the given posting for that document)*/
/**
 * Accumulates the given statistics into this object: document, pointer and
 * token counts are added, the unique-term count becomes the larger of the
 * two, and per-field token counts are summed element-wise. Average lengths
 * are recomputed afterwards.
 */
public void addStatistics(CollectionStatistics cs) {
	numberOfDocuments += cs.getNumberOfDocuments();
	numberOfPointers += cs.getNumberOfPointers();
	numberOfTokens += cs.getNumberOfTokens();
	// vocabularies overlap, so take the max rather than the sum
	numberOfUniqueTerms = Math.max(cs.getNumberOfUniqueTerms(), numberOfUniqueTerms);
	final long[] incomingFieldTokens = cs.getFieldTokens();
	for (int f = 0; f < numberOfFields; f++) {
		fieldTokens[f] += incomingFieldTokens[f];
	}
	// (sic) method name as declared elsewhere in this class
	relcaluateAverageLengths();
}
@Test public void testWritable() throws Exception { CollectionStatistics cs1 = new CollectionStatistics(5, 6, 7, 8, new long[]{2}); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); cs1.write(dos); dos.flush(); final byte[] bytes = baos.toByteArray(); assertTrue(bytes.length > 0); CollectionStatistics cs2 = new CollectionStatistics(); cs2.readFields(new DataInputStream(new ByteArrayInputStream(bytes))); assertEquals(cs1.getNumberOfDocuments(), cs2.getNumberOfDocuments()); assertEquals(cs1.getNumberOfUniqueTerms(), cs2.getNumberOfUniqueTerms()); assertEquals(cs1.getNumberOfPointers(), cs2.getNumberOfPointers()); assertEquals(cs1.getNumberOfTokens(), cs2.getNumberOfTokens()); assertEquals(cs1.getAverageDocumentLength(), cs2.getAverageDocumentLength(), 0.0d); //TODO: test fields }
/**
 * Configures this weighting model from the collection statistics: one
 * Normalisation instance is created and parameterised per field, and the
 * per-field prior p[fi] and field weight are derived from properties.
 * Property keys read per field fi: "c.fi" (normalisation parameter,
 * default 1.0) and "p.fi" (field weight, default 1.0).
 * @throws IllegalArgumentException wrapping any reflection or parsing failure
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
	super.setCollectionStatistics(_cs);
	fieldCount = _cs.getNumberOfFields();
	p = new double[fieldCount];
	fieldWeights = new double[fieldCount];
	this.fieldNormalisations = new Normalisation[fieldCount];
	try{
		for(int fi=0;fi<fieldCount;fi++)
		{
			// one normalisation object per field, built by reflection
			final Normalisation nf = this.fieldNormalisations[fi] = normClass.newInstance();
			// normalisation parameter c for this field (default 1.0)
			final double param = Double.parseDouble(ApplicationSetup.getProperty("c."+ fi, ""+1.0));
			nf.setParameter(param);
			nf.setNumberOfDocuments(_cs.getNumberOfDocuments());
			final long tokensf = _cs.getFieldTokens()[fi];
			nf.setNumberOfTokens(tokensf);
			nf.setAverageDocumentLength(_cs.getAverageFieldLengths()[fi]);
			// uniform prior over (fieldCount x numDocs), then divided by the
			// field weight; note fieldWeights[fi] is assigned inside the expression
			p[fi] = 1.0d / ((double)fieldCount * (double) _cs.getNumberOfDocuments());
			p[fi] = p[fi] / (fieldWeights[fi] = Double.parseDouble( ApplicationSetup.getProperty("p." + fi, "1.0d")));
		}
	} catch (Exception e) {
		throw new IllegalArgumentException(e);
	}
}
@Override public void prepare() { super.prepare(); //these statistics are as used by Ivory system, of which Don Metzler was one of the authors defaultDf = ((double) cs.getNumberOfDocuments()) / 100.0d; defaultCf = defaultDf * 2; }
CollectionStatistics collStats = new CollectionStatistics(); collStats.readFields(in); frs.setCollectionStatistics(collStats); final boolean fields = collStats.getNumberOfFields() > 0; final int fieldCount = collStats.getNumberOfFields();
logger.info("Started building the inverted index..."); if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0) if (currentIndex.getCollectionStatistics().getNumberOfDocuments() == 0)
assertEquals("Number of documents is incorrect", cs.getNumberOfDocuments(), docid); assertEquals("Number of pointers is incorrect", cs.getNumberOfPointers(), pointers); assertEquals("Number of tokens is incorrect", cs.getNumberOfTokens(), tokens); if (numberOfTerms > 0)
assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments()); assertEquals(2l, index.getCollectionStatistics().getNumberOfTokens()); assertEquals(2l, index.getCollectionStatistics().getNumberOfUniqueTerms()); assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments()); assertEquals(5l, index.getCollectionStatistics().getNumberOfTokens()); assertEquals(4l, index.getCollectionStatistics().getNumberOfUniqueTerms());
/**
 * Builds a merged direct index over the constituent indices chosen by the
 * selective matching policy. For each selected index the termid offset is
 * its number of unique terms, so MultiDirect can remap termids per shard.
 * @return a MultiDirect spanning the selected indices
 */
@SuppressWarnings("unchecked")
public PostingIndex<?> getDirectIndex() {
	final int total = indices.size();
	PostingIndex<?>[] postings = new PostingIndex[total];
	int[] offsets = new int[total];
	int selected = 0;
	for (Index index : selectiveMatchingPolicy.getSelectedIndices(indices)) {
		postings[selected] = index.getDirectIndex();
		offsets[selected] = index.getCollectionStatistics().getNumberOfUniqueTerms();
		selected++;
	}
	// BUGFIX: the arrays were sized for ALL indices but only filled for the
	// selected ones; if the policy chose a subset, MultiDirect received
	// trailing null postings and zero offsets. Trim to the filled prefix.
	if (selected < total) {
		postings = java.util.Arrays.copyOf(postings, selected);
		offsets = java.util.Arrays.copyOf(offsets, selected);
	}
	return new MultiDirect((PostingIndex<Pointer>[]) postings, offsets);
}
valueFactoryClass); TerrierTimer tt = new TerrierTimer("Recompressing inverted index", index.getCollectionStatistics().getNumberOfPointers()); tt.start(); try{
System.err.println("_testSingleDocumentIndexMatchingFields: " + index.toString()); assertNotNull(index); assertEquals(1, index.getCollectionStatistics().getNumberOfDocuments()); assertEquals(2, index.getCollectionStatistics().getNumberOfFields()); assertEquals(2, index.getCollectionStatistics().getFieldTokens()[0]); assertEquals(7, index.getCollectionStatistics().getFieldTokens()[1]); assertEquals(9, index.getDocumentIndex().getDocumentLength(0)); Matching matching = makeMatching(index); assertEquals( index.getCollectionStatistics().getNumberOfDocuments(), fatIndex.getCollectionStatistics().getNumberOfDocuments()); assertEquals(index.getCollectionStatistics().getNumberOfFields(), fatIndex.getCollectionStatistics().getNumberOfFields()); assertEquals(index.getCollectionStatistics().getFieldTokens()[0], fatIndex.getCollectionStatistics().getFieldTokens()[0]); assertEquals(index.getCollectionStatistics().getFieldTokens()[1], fatIndex.getCollectionStatistics().getFieldTokens()[1]);
collStats.write(out); final int fieldCount = collStats.getNumberOfFields(); final int queryTermCount = queryTerms.length; final boolean fields[] = new boolean[queryTermCount];
/**
 * Constructs a FatFull over the given index, caching the number of fields
 * from the parent's collection statistics.
 */
public FatFull(Index index) {
	super(index);
	this.fieldCount = super.collectionStatistics.getNumberOfFields();
}
assertEquals(123, index.getIntIndexProperty("num.field.0.Tokens", -1)); assertEquals(611, index.getIntIndexProperty("num.field.1.Tokens", -1)); assertEquals(2, index.getCollectionStatistics().getNumberOfFields()); assertEquals(123, index.getCollectionStatistics().getFieldTokens()[0]); assertEquals(611, index.getCollectionStatistics().getFieldTokens()[1]);
/** * for an immutable index, use a normal collection statistics, never changes */ protected void loadStatistics() { // calculate fields int fieldCount = 0; if (this.hasIndexStructure("inverted")) { fieldCount = Integer.parseInt(properties.getProperty( "index.inverted.fields.count", "0")); } else if (this.hasIndexStructure("direct")) { fieldCount = Integer.parseInt(properties.getProperty( "index.direct.fields.count", "0")); } final long[] tokensF = new long[fieldCount]; for (int fi = 0; fi < fieldCount; fi++) { tokensF[fi] = Long.parseLong(properties.getProperty("num.field." + fi + ".Tokens", "0")); } // create collection statistics structureCache.put( "collectionstatistics", new CollectionStatistics(Integer.parseInt(properties .getProperty("num.Documents", "0")), Integer .parseInt(properties.getProperty("num.Terms", "0")), Long.parseLong(properties .getProperty("num.Tokens", "0")), Long .parseLong(properties.getProperty( "num.Pointers", "0")), tokensF)); }