/**
 * Hook method that instantiates the correct posting-list implementation for
 * the current configuration: a field-aware list when one or more fields are
 * configured, otherwise the plain {@link DocumentPostingList}.
 */
protected void createDocumentPostings() {
    termsInDocument = (FieldScore.FIELDS_COUNT > 0)
        ? new FieldDocumentPostingList(FieldScore.FIELDS_COUNT)
        : new DocumentPostingList();
}
public void processTerm(String term) { /* null means the term has been filtered out (eg stopwords) */ if (term != null) { //add term to thingy tree termsInDocument.insert(term); numOfTokensInDocument++; } }
/**
 * Builds a {@link DocumentIndexEntry} summarising this document:
 * its token count (document length) and its number of distinct
 * term pointers.
 * @return a freshly populated BasicDocumentIndexEntry
 */
public DocumentIndexEntry getDocumentStatistics() {
    final DocumentIndexEntry entry = new BasicDocumentIndexEntry();
    entry.setNumberOfEntries(this.getNumberOfPointers());
    entry.setDocumentLength(this.getDocumentLength());
    return entry;
}
/**
 * Adds every term of the given document's posting list to the in-memory
 * postings, associating each with the supplied document identifier.
 * @param docPostings DocumentPostingList holding the document's term frequencies.
 * @param docid identifier of the current document.
 * @throws IOException if an I/O error occurs.
 */
public void addTerms(DocumentPostingList docPostings, int docid) throws IOException {
    for (final String t : docPostings.termSet()) {
        add(t, docid, docPostings.getFrequency(t));
    }
}
/**
 * Serialises this posting list: first the pointer count as a vInt, then
 * each (term, frequency) pair. Because {@code forEachTerm}'s callback
 * cannot throw a checked exception, any IOException raised while writing
 * is tunnelled out wrapped in an {@link Error} and unwrapped here.
 * @param out destination stream
 * @throws IOException if writing to {@code out} fails
 */
public void write(final DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, getNumberOfPointers());
    try {
        this.forEachTerm(new TObjectIntProcedure<String>() {
            public boolean execute(String term, int freq) {
                try {
                    Text.writeString(out, term);
                    WritableUtils.writeVInt(out, freq);
                } catch (IOException e) {
                    // Tunnel the checked exception out of the callback.
                    throw new Error(e);
                }
                return true;
            }
        });
    } catch (Error e) {
        // Unwrap only our tunnelled IOException. A genuine Error (OOM,
        // StackOverflowError, AssertionError, ...) must propagate as-is:
        // the original blind cast of getCause() would have masked it with
        // a ClassCastException or NullPointerException.
        if (e.getCause() instanceof IOException)
            throw (IOException) e.getCause();
        throw e;
    }
}
document.addDocument(docContents.getDocumentLength()); for (String term : docContents.termSet()) { docContents.getFrequency(term))); stats.update(1, docContents.getDocumentLength(), docContents.termSet().length); stats.updateUniqueTerms(lexicon.numberOfEntries());
((MemoryDocumentIndexFields) document).addDocument(docContents.getDocumentLength(), ((FieldDocumentIndexEntry) docContents.getDocumentStatistics()).getFieldLengths()); for (String term : docContents.termSet()) { MemoryFieldsLexiconEntry le = new MemoryFieldsLexiconEntry(1, docContents.getFrequency(term), ((FieldDocumentPostingList)docContents).getFieldFrequencies(term)); ((MemoryFieldsInvertedIndex) inverted).add(termid, stats.getNumberOfDocuments(), docContents.getFrequency(term), ((FieldDocumentPostingList)docContents).getFieldFrequencies(term)); stats.update(1, docContents.getDocumentLength(), docContents.termSet().length); stats.updateUniqueTerms(lexicon.numberOfEntries()); stats.updateFields(fieldcounts);
/**
 * {@inheritDoc}.
 * This implementation only places content in the runs in memory, which will
 * eventually be flushed to disk. Documents whose docno has already been seen
 * are skipped (deduplication), as are empty documents, which receive no docid.
 * @param docProperties document metadata; the "docno" entry is the dedup key
 * @param termsInDocument postings of the document being indexed
 * @throws Exception if adding the terms or writing metadata fails
 */
@Override
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception {
    // Look the docno up once instead of twice (contains + add each re-queried the map).
    final String docno = docProperties.get("docno");
    if (seenDocnos.contains(docno))
        return;
    seenDocnos.add(docno);
    if (termsInDocument.getDocumentLength() > 0) {
        numberOfDocsSinceCheck++;
        numberOfDocsSinceFlush++;
        checkFlush();
        mp.addTerms(termsInDocument, currentId);
        final DocumentIndexEntry die = termsInDocument.getDocumentStatistics();
        // With fields configured the entry already carries per-field lengths;
        // otherwise store the cheaper SimpleDocumentIndexEntry form.
        docIndexBuilder.addEntryToBuffer(
            (FieldScore.FIELDS_COUNT > 0) ? die : new SimpleDocumentIndexEntry(die));
        metaBuilder.writeDocumentEntry(docProperties);
        currentId++;
        numberOfDocuments++;
    }
}
/**
 * Adds a document to the direct and document indexes, and its terms to
 * the lexicon.
 * @param docProperties Map&lt;String,String&gt; properties of the document
 * @param _termsInDocument DocumentPostingList the terms in the document
 */
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception {
    // 1. lexicon: register every term of this document
    lexiconBuilder.addDocumentTerms(_termsInDocument);
    // 2. direct index: persist the postings, remembering where they landed
    final BitIndexPointer directPointer =
        directIndexBuilder.writePostings(_termsInDocument.getPostings2(termCodes));
    // 3. document index: statistics plus the pointer into the direct index
    final DocumentIndexEntry entry = _termsInDocument.getDocumentStatistics();
    entry.setBitIndexPointer(directPointer);
    docIndexBuilder.addEntryToBuffer(entry);
    // 4. metadata (docno etc.)
    metaBuilder.writeDocumentEntry(docProperties);
}
if (termsInDocument.getDocumentLength() == 0) break; termsInDocument.clear();
if (termsInDocument.getDocumentLength() == 0) {
/** Resets this posting list for reuse, additionally zeroing the per-field length counters. */
@Override public void clear() { super.clear(); Arrays.fill(fieldLengths, 0); }
/** Inserts all the terms from a document posting list into the lexicon map,
 * accumulating per-term total frequency, document count and maximum
 * within-document frequency.
 * @param doc The posting list for that document
 */
public void insert(DocumentPostingList doc) {
    final TObjectIntProcedure<String> accumulate = new TObjectIntProcedure<String>() {
        public boolean execute(final String term, final int freq) {
            // total occurrences of term across all inserted documents
            tfs.adjustOrPutValue(term, freq, freq);
            // number of documents containing term
            nts.adjustOrPutValue(term, 1, 1);
            // highest within-document frequency observed for term
            if (freq > maxtfs.get(term))
                maxtfs.put(term, freq);
            return true;
        }
    };
    doc.forEachTerm(accumulate);
}
docContents.getDocumentLength() + document.getDocumentLength(docid)); for (String term : docContents.termSet()) { docContents.getFrequency(term))); docContents.getFrequency(term)); if (newPtr) pointers++; stats.update(0, docContents.getDocumentLength(), pointers); stats.updateUniqueTerms(lexicon.numberOfEntries());
/** {@inheritDoc}
 * Field-aware variant: each term is added together with its per-field
 * frequencies. {@code docPostings} must be a FieldDocumentPostingList.
 * @param docPostings posting list of the document (field-aware)
 * @param docid identifier of the current document
 * @throws IOException if an I/O error occurs
 */
public void addTerms(DocumentPostingList docPostings, int docid) throws IOException {
    // Hoist the loop-invariant downcast out of the per-term loop.
    final FieldDocumentPostingList fieldPostings = (FieldDocumentPostingList) docPostings;
    for (final String t : docPostings.termSet()) {
        add(t, docid, docPostings.getFrequency(t), fieldPostings.getFieldFrequencies(t));
    }
}
/**
 * {@inheritDoc}.
 * This implementation only places content in the runs in memory, which will
 * eventually be flushed to disk.
 * @param docProperties metadata of the document being indexed
 * @param termsInDocument postings of the document being indexed
 * @throws Exception if adding the terms or writing metadata fails
 */
@Override
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception {
    // Empty documents receive no docid and are skipped entirely.
    if (termsInDocument.getDocumentLength() == 0)
        return;
    numberOfDocsSinceCheck++;
    numberOfDocsSinceFlush++;
    checkFlush();
    mp.addTerms(termsInDocument, currentId);
    final DocumentIndexEntry stats = termsInDocument.getDocumentStatistics();
    // With fields configured the entry already carries per-field lengths;
    // otherwise the cheaper SimpleDocumentIndexEntry form suffices.
    docIndexBuilder.addEntryToBuffer(
        (FieldScore.FIELDS_COUNT > 0) ? stats : new SimpleDocumentIndexEntry(stats));
    metaBuilder.writeDocumentEntry(docProperties);
    currentId++;
    numberOfDocuments++;
}
/** * This adds a document to the direct and document indexes, as well * as it's terms to the lexicon. Handled internally by the methods * indexFieldDocument and indexNoFieldDocument. * @param docProperties Map<String,String> properties of the document * @param _termsInDocument DocumentPostingList the terms in the document. * */ protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception { /* add words to lexicontree */ lexiconBuilder.addDocumentTerms(_termsInDocument); /* add doc postings to the direct index */ BitIndexPointer dirIndexPost = directIndexBuilder.writePostings(_termsInDocument.getPostings2(termCodes)); //.addDocument(termsInDocument.getPostings()); /* add doc to documentindex */ DocumentIndexEntry die = _termsInDocument.getDocumentStatistics(); die.setBitIndexPointer(dirIndexPost); docIndexBuilder.addEntryToBuffer(die); /** add doc metadata to index */ metaBuilder.writeDocumentEntry(docProperties); }
if (termsInDocument.getDocumentLength() == 0) break; termsInDocument.clear();
if (termsInDocument.getDocumentLength() == 0)