/**
 * Performs post-construction initialisation of the indexer. This method must be
 * called by anything which directly extends Indexer, rather than doing this work
 * in the constructor — subclass fields are not yet initialised when the
 * superclass constructor runs.
 * See: http://benpryor.com/blog/2008/01/02/dont-call-subclass-methods-from-a-superclass-constructor/
 *
 * Order matters here: fields must be initialised before the pipeline is built,
 * and properties must be loaded before the pipeline so it can be configured
 * from terrier.properties.
 */
protected void init()
{
	FieldScore.init(); //init fields before constructing pipeline
	this.load_field_ids();
	//construct pipeline using list specified in terrier.properties
	//this object should be the last item in the pipeline
	this.load_indexer_properties();
	this.load_pipeline();
	//load the docnos of any documents that should force builder boundaries
	this.load_builder_boundary_documents();
}
/**
 * Utility command-line entry point for merging indices.
 * Expects exactly three arguments: {@code --merge lowid highid}, and merges
 * the numbered indices between those suffixes at the configured index
 * path/prefix. Prints a usage message on any other invocation.
 *
 * @param args command-line arguments: {@code --merge}, low suffix, high suffix
 * @throws Exception if the merge fails, or if the id arguments are not integers
 */
public static void main(String args[]) throws Exception
{
	// Check args.length BEFORE dereferencing args[0]: the original order threw
	// ArrayIndexOutOfBoundsException when invoked with no arguments, instead of
	// printing the usage message.
	if (args.length == 3 && args[0].equals("--merge"))
	{
		merge(
			ApplicationSetup.TERRIER_INDEX_PATH,
			ApplicationSetup.TERRIER_INDEX_PREFIX,
			Integer.parseInt(args[1]),
			Integer.parseInt(args[2]));
		return;
	}
	logger.error("Usage: org.terrier.indexing.Indexer --merge [lowid] [highid]");
}
/**
 * Builds the index data structures for the configured collection by delegating
 * to {@code index(Collection[])} on the indexer obtained from
 * {@code loadIndexer(path, prefix)}. Refuses to run (logging an error) when an
 * index already exists at the configured path/prefix, and closes the
 * collection afterwards on a best-effort basis.
 */
@Override
public void index()
{
	if (! Index.existsIndex(path, prefix))
	{
		loadIndexer(path, prefix).index(new Collection[] {collectionTREC});
		// best-effort close: a failure here should not abort indexing, only warn
		try {
			collectionTREC.close();
		} catch (Exception closeError) {
			logger.warn("problem closing collection", closeError);
		}
	}
	else
	{
		logger.error("Cannot index while an index exists at "+path + ","+ prefix);
	}
}
// Two-phase indexing: build the direct index for all collections first,
// then derive the inverted index from it.
// NOTE(review): this is an interior fragment of an enclosing method —
// `collections`, `oldIndexPrefix` and `counter` are defined outside this view;
// presumably `counter` is the number of index passes built so far. TODO confirm.
this.createDirectIndex(collections);
this.createInvertedIndex();
merge(path, oldIndexPrefix, 1, counter);
/**
 * Indexes the given documents with the supplied indexer and returns the
 * resulting in-memory index.
 *
 * @param indexer the indexer to drive; must actually be a {@code MemoryIndexer},
 *        since the result is obtained via {@code MemoryIndexer.getIndex()}
 * @param fieldsExpected unused here; kept for interface compatibility with
 *        other doIndexing implementations
 * @param sourceDocs the documents to index, keyed on their "filename" property
 * @return the index built over sourceDocs
 */
protected Index doIndexing(Indexer indexer, boolean fieldsExpected, Document[] sourceDocs)
{
	final Collection wrapped = new CollectionDocumentList(sourceDocs, "filename");
	final Collection[] batch = new Collection[] { wrapped };
	indexer.createDirectIndex(batch);
	indexer.createInvertedIndex();
	final MemoryIndexer memIndexer = (MemoryIndexer) indexer;
	return memIndexer.getIndex();
}
/**
 * Loads indexer configuration, additionally picking up the block-indexing
 * settings (block size and maximum blocks) from ApplicationSetup on top of
 * whatever the superclass loads.
 */
@Override
protected void load_indexer_properties()
{
	super.load_indexer_properties();
	// block-specific settings sourced from terrier.properties via ApplicationSetup
	BLOCK_SIZE = ApplicationSetup.BLOCK_SIZE;
	MAX_BLOCKS = ApplicationSetup.MAX_BLOCKS;
}
}
// Split the comma-separated pipeline class list, trimming whitespace around commas.
// NOTE(review): interior fragment — the string being split (presumably the
// terrier.properties pipeline setting) is outside this view.
	.split("\\s*,\\s*");
// The pipeline is assembled back-to-front, starting from its terminal element.
TermPipeline next = getEndOfPipeline();
final TermPipeline last = next;
TermPipeline tmp;
/**
 * Test utility: builds an on-disk index at the given path/prefix from parallel
 * arrays of docnos and document contents, treating each document as tagged text
 * with fields, and asserts that the expected number of documents was indexed.
 *
 * @param docnos document identifiers; used as both "filename" and "docno" properties
 * @param documents document contents, one per docno
 * @param indexer the indexer to use
 * @param path directory for the new index
 * @param prefix filename prefix for the new index
 * @return the newly created index
 * @throws Exception if indexing fails
 */
public static Index makeIndexFields(String[] docnos, String[] documents, Indexer indexer, String path, String prefix) throws Exception
{
	assertFalse("Index at "+ path + "," + prefix + " already exists!", IndexOnDisk.existsIndex(path, prefix));
	assertEquals(docnos.length, documents.length);
	Document[] sourceDocs = new Document[docnos.length];
	for(int i=0;i<docnos.length;i++)
	{
		Map<String,String> docProperties = new HashMap<String,String>();
		docProperties.put("filename", docnos[i]);
		docProperties.put("docno", docnos[i]);
		// Use an explicit charset: the no-arg getBytes() uses the platform
		// default, which makes this helper behave differently across machines.
		sourceDocs[i] = new TaggedDocument(
			new ByteArrayInputStream(documents[i].getBytes(java.nio.charset.StandardCharsets.UTF_8)),
			docProperties,
			new EnglishTokeniser());
	}
	Collection col = new CollectionDocumentList(sourceDocs, "filename");
	indexer.index(new Collection[]{col});
	Index index = Index.createIndex(path, prefix);
	assertNotNull(index);
	assertEquals(sourceDocs.length, index.getCollectionStatistics().getNumberOfDocuments());
	return index;
}
// Two-phase indexing: direct index first, then the inverted index derived from it.
// NOTE(review): interior fragment — `indexer` and `col` are defined outside this view.
indexer.createDirectIndex(new Collection[]{col});
indexer.createInvertedIndex();
/**
 * Test utility: builds an on-disk index at the given path/prefix from parallel
 * arrays of docnos and document contents, then asserts that the expected number
 * of documents was indexed.
 *
 * @param docnos document identifiers; used as both "filename" and "docno" properties
 * @param documents document contents, one per docno
 * @param indexer the indexer to use
 * @param path directory for the new index
 * @param prefix filename prefix for the new index
 * @return the newly created index
 * @throws Exception if indexing fails
 */
public static Index makeIndex(String[] docnos, String[] documents, Indexer indexer, String path, String prefix) throws Exception
{
	assertFalse("Index at "+ path + "," + prefix + " already exists!", IndexOnDisk.existsIndex(path, prefix));
	assertEquals(docnos.length, documents.length);
	final int numDocs = docnos.length;
	Document[] sourceDocs = new Document[numDocs];
	for (int d = 0; d < numDocs; d++)
	{
		Map<String,String> props = new HashMap<String,String>();
		props.put("filename", docnos[d]);
		props.put("docno", docnos[d]);
		sourceDocs[d] = makeDocumentFromText(documents[d], props);
	}
	Collection col = makeCollection(docnos, documents);
	indexer.index(new Collection[]{col});
	Index index = Index.createIndex(path, prefix);
	assertNotNull(index);
	assertEquals(sourceDocs.length, index.getCollectionStatistics().getNumberOfDocuments());
	return index;
}
/** Merge a series of numbered indices in the same path/prefix area. New merged index * will be stored at mpath/mprefix_highest+1. * @param mpath Path of all indices * @param mprefix Common prefix of all indices * @param lowest lowest subfix of prefix * @param highest highest subfix of prefix */ public static void merge(String mpath, String mprefix, int lowest, int highest) { //we define the counterMerged in order to //ensure that the merged data structures will //have different names LinkedList<String[]> llist = new LinkedList<String[]>(); for (int i=lowest; i<=highest; i++) { llist.add(new String[]{mpath,mprefix+ "_" + i}); } merge(mpath, mprefix, llist, highest+1); }
/**
 * Test utility: builds an index at the default location from parallel arrays of
 * docnos and document contents, wrapping each document as a {@code FileDocument}
 * keyed on its "filename" property, and asserts that the expected number of
 * documents was indexed.
 *
 * @param docnos document identifiers; used as the "filename" property
 * @param documents document contents, one per docno
 * @param indexer the indexer to use
 * @return the newly created index
 * @throws Exception if indexing fails
 */
public static Index makeIndex(String[] docnos, String[] documents, Indexer indexer) throws Exception
{
	assertEquals(docnos.length, documents.length);
	Document[] sourceDocs = new Document[docnos.length];
	for(int i=0;i<docnos.length;i++)
	{
		Map<String,String> docProperties = new HashMap<String,String>();
		docProperties.put("filename", docnos[i]);
		// Use an explicit charset: the no-arg getBytes() uses the platform
		// default, which makes this helper behave differently across machines.
		sourceDocs[i] = new FileDocument(
			new ByteArrayInputStream(documents[i].getBytes(java.nio.charset.StandardCharsets.UTF_8)),
			docProperties,
			new EnglishTokeniser());
	}
	Collection col = new CollectionDocumentList(sourceDocs, "filename");
	indexer.index(new Collection[]{col});
	Index index = Index.createIndex();
	// Fail fast with a clear assertion (as the other makeIndex overloads do)
	// rather than an opaque NullPointerException on the next line.
	assertNotNull(index);
	assertEquals(sourceDocs.length, index.getCollectionStatistics().getNumberOfDocuments());
	return index;
}