/**
 * Hook method that instantiates the correct posting-list implementation for
 * the current configuration: a field-aware list when one or more fields are
 * configured, otherwise the plain {@link DocumentPostingList}.
 */
protected void createDocumentPostings() {
    termsInDocument = (FieldScore.FIELDS_COUNT > 0)
        ? new FieldDocumentPostingList(FieldScore.FIELDS_COUNT)
        : new DocumentPostingList();
}
public void processTerm(String term) { /* null means the term has been filtered out (eg stopwords) */ if (term != null) { //add term to thingy tree termsInDocument.insert(term); numOfTokensInDocument++; } }
/**
 * Builds a {@link DocumentIndexEntry} summarising this document:
 * its token count (document length) and its number of distinct
 * term pointers.
 * @return a freshly populated BasicDocumentIndexEntry
 */
public DocumentIndexEntry getDocumentStatistics() {
    final DocumentIndexEntry entry = new BasicDocumentIndexEntry();
    entry.setNumberOfEntries(this.getNumberOfPointers());
    entry.setDocumentLength(this.getDocumentLength());
    return entry;
}
/**
 * Adds every term of the given document's posting list to the in-memory
 * postings, associating each with the supplied document identifier.
 * @param docPostings DocumentPostingList holding the document's term frequencies.
 * @param docid identifier of the current document.
 * @throws IOException if an I/O error occurs.
 */
public void addTerms(DocumentPostingList docPostings, int docid) throws IOException {
    for (final String t : docPostings.termSet()) {
        add(t, docid, docPostings.getFrequency(t));
    }
}
/**
 * Serialises this posting list: first the pointer count as a vInt, then
 * each (term, frequency) pair. Because {@code forEachTerm}'s callback
 * cannot throw a checked exception, any IOException raised while writing
 * is tunnelled out wrapped in an {@link Error} and unwrapped here.
 * @param out destination stream
 * @throws IOException if writing to {@code out} fails
 */
public void write(final DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, getNumberOfPointers());
    try {
        this.forEachTerm(new TObjectIntProcedure<String>() {
            public boolean execute(String term, int freq) {
                try {
                    Text.writeString(out, term);
                    WritableUtils.writeVInt(out, freq);
                } catch (IOException e) {
                    // Tunnel the checked exception out of the callback.
                    throw new Error(e);
                }
                return true;
            }
        });
    } catch (Error e) {
        // Unwrap only our tunnelled IOException. A genuine Error (OOM,
        // StackOverflowError, AssertionError, ...) must propagate as-is:
        // the original blind cast of getCause() would have masked it with
        // a ClassCastException or NullPointerException.
        if (e.getCause() instanceof IOException)
            throw (IOException) e.getCause();
        throw e;
    }
}
document.addDocument(docContents.getDocumentLength()); for (String term : docContents.termSet()) { docContents.getFrequency(term))); stats.update(1, docContents.getDocumentLength(), docContents.termSet().length); stats.updateUniqueTerms(lexicon.numberOfEntries());
((MemoryDocumentIndexFields) document).addDocument(docContents.getDocumentLength(), ((FieldDocumentIndexEntry) docContents.getDocumentStatistics()).getFieldLengths()); for (String term : docContents.termSet()) { MemoryFieldsLexiconEntry le = new MemoryFieldsLexiconEntry(1, docContents.getFrequency(term), ((FieldDocumentPostingList)docContents).getFieldFrequencies(term)); ((MemoryFieldsInvertedIndex) inverted).add(termid, stats.getNumberOfDocuments(), docContents.getFrequency(term), ((FieldDocumentPostingList)docContents).getFieldFrequencies(term)); stats.update(1, docContents.getDocumentLength(), docContents.termSet().length); stats.updateUniqueTerms(lexicon.numberOfEntries()); stats.updateFields(fieldcounts);
/**
 * {@inheritDoc}.
 * This implementation only places content in the runs in memory, which will
 * eventually be flushed to disk. Documents whose docno has already been seen
 * are skipped (deduplication), as are empty documents, which receive no docid.
 * @param docProperties document metadata; the "docno" entry is the dedup key
 * @param termsInDocument postings of the document being indexed
 * @throws Exception if adding the terms or writing metadata fails
 */
@Override
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception {
    // Look the docno up once instead of twice (contains + add each re-queried the map).
    final String docno = docProperties.get("docno");
    if (seenDocnos.contains(docno))
        return;
    seenDocnos.add(docno);
    if (termsInDocument.getDocumentLength() > 0) {
        numberOfDocsSinceCheck++;
        numberOfDocsSinceFlush++;
        checkFlush();
        mp.addTerms(termsInDocument, currentId);
        final DocumentIndexEntry die = termsInDocument.getDocumentStatistics();
        // With fields configured the entry already carries per-field lengths;
        // otherwise store the cheaper SimpleDocumentIndexEntry form.
        docIndexBuilder.addEntryToBuffer(
            (FieldScore.FIELDS_COUNT > 0) ? die : new SimpleDocumentIndexEntry(die));
        metaBuilder.writeDocumentEntry(docProperties);
        currentId++;
        numberOfDocuments++;
    }
}
/**
 * Adds a document to the direct and document indexes, and its terms to
 * the lexicon.
 * @param docProperties Map&lt;String,String&gt; properties of the document
 * @param _termsInDocument DocumentPostingList the terms in the document
 */
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception {
    // 1. lexicon: register every term of this document
    lexiconBuilder.addDocumentTerms(_termsInDocument);
    // 2. direct index: persist the postings, remembering where they landed
    final BitIndexPointer directPointer =
        directIndexBuilder.writePostings(_termsInDocument.getPostings2(termCodes));
    // 3. document index: statistics plus the pointer into the direct index
    final DocumentIndexEntry entry = _termsInDocument.getDocumentStatistics();
    entry.setBitIndexPointer(directPointer);
    docIndexBuilder.addEntryToBuffer(entry);
    // 4. metadata (docno etc.)
    metaBuilder.writeDocumentEntry(docProperties);
}
if (termsInDocument.getDocumentLength() == 0) break; termsInDocument.clear();
if (termsInDocument.getDocumentLength() == 0) {
/** Resets this posting list for reuse, additionally zeroing the per-field length counters. */
@Override public void clear() { super.clear(); Arrays.fill(fieldLengths, 0); }
/** Inserts all the terms from a document posting list into the lexicon map,
 * accumulating per-term total frequency, document count and maximum
 * within-document frequency.
 * @param doc The posting list for that document
 */
public void insert(DocumentPostingList doc) {
    final TObjectIntProcedure<String> accumulate = new TObjectIntProcedure<String>() {
        public boolean execute(final String term, final int freq) {
            // total occurrences of term across all inserted documents
            tfs.adjustOrPutValue(term, freq, freq);
            // number of documents containing term
            nts.adjustOrPutValue(term, 1, 1);
            // highest within-document frequency observed for term
            if (freq > maxtfs.get(term))
                maxtfs.put(term, freq);
            return true;
        }
    };
    doc.forEachTerm(accumulate);
}
docContents.getDocumentLength() + document.getDocumentLength(docid)); for (String term : docContents.termSet()) { docContents.getFrequency(term))); docContents.getFrequency(term)); if (newPtr) pointers++; stats.update(0, docContents.getDocumentLength(), pointers); stats.updateUniqueTerms(lexicon.numberOfEntries());
/** {@inheritDoc}
 * Field-aware variant: each term is added together with its per-field
 * frequencies. {@code docPostings} must be a FieldDocumentPostingList.
 * @param docPostings posting list of the document (field-aware)
 * @param docid identifier of the current document
 * @throws IOException if an I/O error occurs
 */
public void addTerms(DocumentPostingList docPostings, int docid) throws IOException {
    // Hoist the loop-invariant downcast out of the per-term loop.
    final FieldDocumentPostingList fieldPostings = (FieldDocumentPostingList) docPostings;
    for (final String t : docPostings.termSet()) {
        add(t, docid, docPostings.getFrequency(t), fieldPostings.getFieldFrequencies(t));
    }
}
/**
 * {@inheritDoc}.
 * This implementation only places content in the runs in memory, which will
 * eventually be flushed to disk.
 * @param docProperties metadata of the document being indexed
 * @param termsInDocument postings of the document being indexed
 * @throws Exception if adding the terms or writing metadata fails
 */
@Override
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception {
    // Empty documents receive no docid and are skipped entirely.
    if (termsInDocument.getDocumentLength() == 0)
        return;
    numberOfDocsSinceCheck++;
    numberOfDocsSinceFlush++;
    checkFlush();
    mp.addTerms(termsInDocument, currentId);
    final DocumentIndexEntry stats = termsInDocument.getDocumentStatistics();
    // With fields configured the entry already carries per-field lengths;
    // otherwise the cheaper SimpleDocumentIndexEntry form suffices.
    docIndexBuilder.addEntryToBuffer(
        (FieldScore.FIELDS_COUNT > 0) ? stats : new SimpleDocumentIndexEntry(stats));
    metaBuilder.writeDocumentEntry(docProperties);
    currentId++;
    numberOfDocuments++;
}
/** * This adds a document to the direct and document indexes, as well * as it's terms to the lexicon. Handled internally by the methods * indexFieldDocument and indexNoFieldDocument. * @param docProperties Map<String,String> properties of the document * @param _termsInDocument DocumentPostingList the terms in the document. * */ protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception { /* add words to lexicontree */ lexiconBuilder.addDocumentTerms(_termsInDocument); /* add doc postings to the direct index */ BitIndexPointer dirIndexPost = directIndexBuilder.writePostings(_termsInDocument.getPostings2(termCodes)); //.addDocument(termsInDocument.getPostings()); /* add doc to documentindex */ DocumentIndexEntry die = _termsInDocument.getDocumentStatistics(); die.setBitIndexPointer(dirIndexPost); docIndexBuilder.addEntryToBuffer(die); /** add doc metadata to index */ metaBuilder.writeDocumentEntry(docProperties); }
if (termsInDocument.getDocumentLength() == 0) break; termsInDocument.clear();
if (termsInDocument.getDocumentLength() == 0)