/**
 * Static factory that merges the statistics of several indices into one
 * MultiStats. Document, token and pointer counts are summed across the
 * supplied statistics, while the unique-term count is the maximum observed
 * in any single one (terms may be shared between indices, so a sum would
 * over-count).
 * NOTE(review): per-field token counts are not aggregated here — a single
 * zeroed slot is passed through; confirm this is intended for fielded indices.
 * @param stats the per-index collection statistics to combine
 * @return the combined statistics
 */
public static MultiStats factory(CollectionStatistics[] stats) {
    int docCount = 0;
    int maxUniqueTerms = 0;
    long tokenCount = 0;
    long pointerCount = 0;
    final long[] fieldTokens = new long[] { 0 };
    for (CollectionStatistics shard : stats) {
        docCount += shard.getNumberOfDocuments();
        tokenCount += shard.getNumberOfTokens();
        pointerCount += shard.getNumberOfPointers();
        maxUniqueTerms = Math.max(maxUniqueTerms, shard.getNumberOfUniqueTerms());
    }
    return new MultiStats(docCount, maxUniqueTerms, tokenCount, pointerCount, fieldTokens);
}
/** Returns a human-readable summary of this index's statistics. */
@Override
public String toString() {
    final StringBuilder summary = new StringBuilder();
    summary.append("Number of documents: ").append(getNumberOfDocuments()).append('\n');
    summary.append("Number of terms: ").append(getNumberOfUniqueTerms()).append('\n');
    summary.append("Number of fields: ").append(getNumberOfFields()).append('\n');
    summary.append("Number of tokens: ").append(getNumberOfTokens()).append('\n');
    return summary.toString();
}
/** Returns a human-readable summary of this index's statistics. */
@Override
public String toString() {
    return String.format(
        "Number of documents: %d\nNumber of terms: %d\nNumber of fields: %d\nNumber of tokens: %d\n",
        getNumberOfDocuments(), getNumberOfUniqueTerms(),
        getNumberOfFields(), getNumberOfTokens());
}
/**
 * Builds a merged direct index over the member indices selected by the
 * matching policy.
 * Fix: the backing arrays were sized by the total number of member indices
 * but only filled for the policy-selected ones, so a selective policy left
 * trailing null postings / zero offsets in what was handed to MultiDirect.
 * The arrays are now trimmed to the number actually selected.
 * NOTE(review): offsets here use each member's unique-term count, mirroring
 * getLexicon; for a document-keyed direct structure a document-count offset
 * may be intended — verify against MultiDirect's contract.
 * @return a merged direct index over the selected member indices
 */
@SuppressWarnings("unchecked")
public PostingIndex<?> getDirectIndex() {
    final int capacity = indices.size();
    int[] offsets = new int[capacity];
    PostingIndex<?>[] postings = new PostingIndex[capacity];
    int selected = 0;
    for (Index index : selectiveMatchingPolicy.getSelectedIndices(indices)) {
        postings[selected] = index.getDirectIndex();
        offsets[selected] = index.getCollectionStatistics().getNumberOfUniqueTerms();
        selected++;
    }
    if (selected < capacity) {
        // the policy chose a subset — drop the unused trailing slots
        postings = java.util.Arrays.copyOf(postings, selected);
        offsets = java.util.Arrays.copyOf(offsets, selected);
    }
    return new MultiDirect((PostingIndex<Pointer>[]) postings, offsets);
}
/**
 * {@inheritDoc}
 * Builds a merged lexicon over the member indices selected by the matching
 * policy; the offsets array records each member's unique-term count so
 * MultiLexicon can map a global term id onto the owning member.
 * Fix: the backing arrays were sized by the total number of member indices
 * but only filled for the policy-selected ones, so a selective policy left
 * trailing null lexicons / zero offsets in what was handed to MultiLexicon.
 * The arrays are now trimmed to the number actually selected.
 */
@SuppressWarnings("unchecked")
public Lexicon<String> getLexicon() {
    final int capacity = indices.size();
    int[] offsets = new int[capacity];
    Lexicon<String>[] lexicons = new Lexicon[capacity];
    int selected = 0;
    for (Index index : selectiveMatchingPolicy.getSelectedIndices(indices)) {
        lexicons[selected] = index.getLexicon();
        offsets[selected] = index.getCollectionStatistics().getNumberOfUniqueTerms();
        selected++;
    }
    if (selected < capacity) {
        // the policy chose a subset — drop the unused trailing slots
        lexicons = java.util.Arrays.copyOf(lexicons, selected);
        offsets = java.util.Arrays.copyOf(offsets, selected);
    }
    return new MultiLexicon(lexicons, offsets);
}
/**
 * Merges another CollectionStatistics into this one. Document, pointer and
 * token counts are summed; the unique-term count becomes the maximum of the
 * two (terms can overlap, so summing would over-count). Average lengths are
 * recomputed afterwards.
 * Assumes {@code cs} carries at least {@code numberOfFields} field-token
 * slots — TODO confirm callers guarantee matching field configurations.
 * @param cs the statistics to merge in
 */
public void addStatistics(CollectionStatistics cs) {
    numberOfDocuments += cs.getNumberOfDocuments();
    numberOfPointers += cs.getNumberOfPointers();
    numberOfTokens += cs.getNumberOfTokens();
    if (cs.getNumberOfUniqueTerms() > numberOfUniqueTerms)
        numberOfUniqueTerms = cs.getNumberOfUniqueTerms();
    final long[] incomingFieldTokens = cs.getFieldTokens();
    for (int field = 0; field < numberOfFields; field++)
        fieldTokens[field] += incomingFieldTokens[field];
    relcaluateAverageLengths();
}
/**
 * Accumulates the counts of {@code cs} into this object: documents, pointers
 * and tokens are added, the unique-term count is raised to the larger of the
 * two values, per-field token counts are added slot-wise, and the derived
 * average lengths are refreshed.
 * Assumes {@code cs} has at least as many field slots as this object —
 * TODO confirm callers guarantee matching field configurations.
 * @param cs the statistics to add
 */
public void addStatistics(CollectionStatistics cs) {
    numberOfDocuments += cs.getNumberOfDocuments();
    numberOfPointers += cs.getNumberOfPointers();
    numberOfTokens += cs.getNumberOfTokens();
    numberOfUniqueTerms = Math.max(numberOfUniqueTerms, cs.getNumberOfUniqueTerms());
    final long[] theirFieldTokens = cs.getFieldTokens();
    for (int f = 0; f < numberOfFields; f++) {
        fieldTokens[f] += theirFieldTokens[f];
    }
    relcaluateAverageLengths();
}
// Guard before inverting: an empty lexicon means there is nothing to invert.
// The branch body lies outside this view — presumably it skips/aborts the
// inverted-index build; TODO confirm.
logger.info("Started building the inverted index..."); if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0)
// Guard on an empty vocabulary; the branch body is outside this view —
// presumably it short-circuits the surrounding operation. TODO confirm.
if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0)
/**
 * Prints the configured index's collection statistics to standard output.
 * Fixes: the index handle is now null-checked (IndexFactory.of previously
 * caused an NPE on a missing/unopenable index) and a failure while closing
 * is reported instead of being silently swallowed.
 * @param args command-line arguments (unused)
 * @return 0 on success, 1 if the index could not be opened
 */
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) {
    Index.setIndexLoadingProfileAsRetrieval(false);
    Index i = IndexFactory.of(IndexRef.of(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX));
    if (i == null) {
        System.err.println("Could not open an index at " + ApplicationSetup.TERRIER_INDEX_PATH
            + "," + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 1;
    }
    System.out.println("Collection statistics:");
    System.out.println("number of indexed documents: " + i.getCollectionStatistics().getNumberOfDocuments());
    System.out.println("size of vocabulary: " + i.getCollectionStatistics().getNumberOfUniqueTerms());
    System.out.println("number of tokens: " + i.getCollectionStatistics().getNumberOfTokens());
    System.out.println("number of pointers: " + i.getCollectionStatistics().getNumberOfPointers());
    try {
        i.close();
    } catch (IOException e) {
        // non-fatal for a read-only stats dump, but don't hide it
        System.err.println("Warning: failed to close index: " + e);
    }
    return 0;
}
/**
 * Prints the configured index's collection statistics to standard output.
 * Fixes: the index handle is now null-checked (IndexFactory.of previously
 * caused an NPE on a missing/unopenable index) and a failure while closing
 * is reported instead of being silently swallowed.
 * @param args command-line arguments (unused)
 * @return 0 on success, 1 if the index could not be opened
 */
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) {
    Index.setIndexLoadingProfileAsRetrieval(false);
    Index i = IndexFactory.of(IndexRef.of(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX));
    if (i == null) {
        System.err.println("Could not open an index at " + ApplicationSetup.TERRIER_INDEX_PATH
            + "," + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 1;
    }
    System.out.println("Collection statistics:");
    System.out.println("number of indexed documents: " + i.getCollectionStatistics().getNumberOfDocuments());
    System.out.println("size of vocabulary: " + i.getCollectionStatistics().getNumberOfUniqueTerms());
    System.out.println("number of tokens: " + i.getCollectionStatistics().getNumberOfTokens());
    System.out.println("number of pointers: " + i.getCollectionStatistics().getNumberOfPointers());
    try {
        i.close();
    } catch (IOException e) {
        // non-fatal for a read-only stats dump, but don't hide it
        System.err.println("Warning: failed to close index: " + e);
    }
    return 0;
}
/**
 * Configures this model from the supplied statistics, restricting the token
 * count (and hence the average document length) to the active fields only.
 * The wrapped basic model receives a field-less copy of the statistics built
 * from the restricted token count.
 * NOTE(review): a zero document count would make the average infinite/NaN —
 * presumably callers never pass an empty index; confirm.
 * @param _cs the full, fielded collection statistics
 * @throws IllegalStateException if the statistics carry no fields
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
    super.setCollectionStatistics(_cs);
    if (_cs.getNumberOfFields() < 1)
        throw new IllegalStateException("Fields must be 1 or more");
    final long[] perFieldTokens = _cs.getFieldTokens();
    long activeTokens = 0;
    for (int fieldId : activeFieldIds)
        activeTokens += perFieldTokens[fieldId];
    super.numberOfTokens = activeTokens;
    super.averageDocumentLength = (double) activeTokens / (double) _cs.getNumberOfDocuments();
    basicModel.setCollectionStatistics(new CollectionStatistics(
        _cs.getNumberOfDocuments(), _cs.getNumberOfUniqueTerms(),
        activeTokens, _cs.getNumberOfPointers(), new long[0]));
}
/**
 * Applies the given statistics to this model, recomputing the token total
 * and average document length over the active fields alone, and forwards a
 * field-less variant of the statistics to the wrapped basic model.
 * NOTE(review): a zero document count would make the average infinite/NaN —
 * presumably callers never pass an empty index; confirm.
 * @param _cs the full, fielded collection statistics
 * @throws IllegalStateException if the statistics carry no fields
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
    super.setCollectionStatistics(_cs);
    if (_cs.getNumberOfFields() < 1)
        throw new IllegalStateException("Fields must be 1 or more");
    final long[] fieldTokenCounts = _cs.getFieldTokens();
    long restrictedTokens = 0;
    for (int fid : activeFieldIds) {
        restrictedTokens += fieldTokenCounts[fid];
    }
    super.numberOfTokens = restrictedTokens;
    super.averageDocumentLength = (double) restrictedTokens / (double) _cs.getNumberOfDocuments();
    final CollectionStatistics fieldless = new CollectionStatistics(
        _cs.getNumberOfDocuments(), _cs.getNumberOfUniqueTerms(),
        restrictedTokens, _cs.getNumberOfPointers(), new long[0]);
    basicModel.setCollectionStatistics(fieldless);
}
// Bail out early if a halt was requested before this scan starts. numTerms
// bounds the lexicon work that follows; the casts are needed because the
// field is declared with a more general type than Index.
if (halt) return; int numTerms = ((Index)index).getCollectionStatistics().getNumberOfUniqueTerms(); Lexicon<String> lex = ((Index)index).getLexicon();
/**
 * Caches the collection statistics and (overflow-checked) entry statistics
 * as doubles, ready for scoring. Must be called before the model is used.
 * NOTE(review): i.setNumberOfDocuments(...) is handed the double copy —
 * presumably the idf helper expects a double; confirm.
 */ public void prepare() { averageDocumentLength = cs.getAverageDocumentLength(); numberOfDocuments = (double)cs.getNumberOfDocuments(); i.setNumberOfDocuments(numberOfDocuments); numberOfTokens = (double)cs.getNumberOfTokens(); numberOfUniqueTerms = (double)cs.getNumberOfUniqueTerms(); numberOfPointers = (double)cs.getNumberOfPointers(); documentFrequency = (double)getOverflowed(es.getDocumentFrequency()); termFrequency = (double)getOverflowed(es.getFrequency()); }
/**
 * Copies the collection statistics and (overflow-checked) entry statistics
 * into this model's double-typed fields before scoring begins.
 * NOTE(review): i.setNumberOfDocuments(...) is handed the double copy —
 * presumably the idf helper expects a double; confirm.
 */ public void prepare() { averageDocumentLength = cs.getAverageDocumentLength(); numberOfDocuments = (double)cs.getNumberOfDocuments(); i.setNumberOfDocuments(numberOfDocuments); numberOfTokens = (double)cs.getNumberOfTokens(); numberOfUniqueTerms = (double)cs.getNumberOfUniqueTerms(); numberOfPointers = (double)cs.getNumberOfPointers(); documentFrequency = (double)getOverflowed(es.getDocumentFrequency()); termFrequency = (double)getOverflowed(es.getFrequency()); }
@Test public void testWritable() throws Exception { CollectionStatistics cs1 = new CollectionStatistics(5, 6, 7, 8, new long[]{2}); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); cs1.write(dos); dos.flush(); final byte[] bytes = baos.toByteArray(); assertTrue(bytes.length > 0); CollectionStatistics cs2 = new CollectionStatistics(); cs2.readFields(new DataInputStream(new ByteArrayInputStream(bytes))); assertEquals(cs1.getNumberOfDocuments(), cs2.getNumberOfDocuments()); assertEquals(cs1.getNumberOfUniqueTerms(), cs2.getNumberOfUniqueTerms()); assertEquals(cs1.getNumberOfPointers(), cs2.getNumberOfPointers()); assertEquals(cs1.getNumberOfTokens(), cs2.getNumberOfTokens()); assertEquals(cs1.getAverageDocumentLength(), cs2.getAverageDocumentLength(), 0.0d); //TODO: test fields }
/**
 * Asserts that the index's stored collection statistics agree with the known
 * ground truth for the test corpus (document count, token total, average
 * length, pointer count and vocabulary size).
 * @param index the index under test
 */
protected void checkCollectionStatistics(Index index) {
    final CollectionStatistics stats = index.getCollectionStatistics();
    System.err.println("num docs=" + stats.getNumberOfDocuments());
    assertEquals("Number of documents doesn't match", DOCUMENT_LENGTHS.length, stats.getNumberOfDocuments());
    assertEquals("Number of tokens doesn't match", StaTools.sum(DOCUMENT_LENGTHS), stats.getNumberOfTokens());
    assertEquals("Average document length doesn't match", StaTools.mean(DOCUMENT_LENGTHS), stats.getAverageDocumentLength(), 0.0d);
    assertEquals("Number of pointers doesnt match", NUMBER_POINTERS, stats.getNumberOfPointers());
    assertEquals("Number of unique terms doesn't match", NUMBER_UNIQUE_TERMS, stats.getNumberOfUniqueTerms());
}
// NOTE(review): getNumberOfUniqueTerms() is passed twice to checkDirectIndex —
// verify against its signature that the second argument isn't meant to be a
// different statistic.
checkInvertedIndexStream(index, DOCUMENT_LENGTHS, DOCUMENT_UNIQUE_TERMS); checkDirectIndex(index, index.getCollectionStatistics().getNumberOfUniqueTerms(), index.getCollectionStatistics().getNumberOfUniqueTerms(), DOCUMENT_LENGTHS, DOCUMENT_UNIQUE_TERMS, true);
// The fixture should yield exactly one pointer, one token and one unique term.
assertEquals(1, frInput.getCollectionStatistics().getNumberOfPointers()); assertEquals(1, frInput.getCollectionStatistics().getNumberOfTokens()); assertEquals(1, frInput.getCollectionStatistics().getNumberOfUniqueTerms());