org.terrier.utility.TagSet java code examples

/**
 * Creates a TRECFullTokenizer without assigning a reader.
 * The tag sets are built from TagSet.TREC_DOC_TAGS and
 * TagSet.TREC_EXACT_DOC_TAGS, and every state flag starts as false.
 */
public TRECFullTokenizer() {
  // Tag sets that drive which tags are processed, skipped, or kept exact.
  this.tagSet = new TagSet(TagSet.TREC_DOC_TAGS);
  this.exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);

  // No tag has been entered yet.
  this.inTagToProcess = false;
  this.inTagToSkip = false;
  this.inDocnoTag = false;

  // Neither the end of a document nor the end of the input has been reached.
  this.EOD = false;
  this.EOF = false;
}
/**

/**
 * Reports whether the tag currently on top of the tag stack is one
 * that should be processed.
 * @return true if the stack is not empty and the tag set marks its
 *         top element as a tag to process, otherwise false.
 */
public boolean inTagToProcess() {
  // With no open tag there is nothing to process.
  if (stk.isEmpty()) {
    return false;
  }
  return tagSet.isTagToProcess(stk.peek());
}
/**

/**
 * Reports whether the tag currently on top of the tag stack is one
 * that should be skipped.
 * @return true if the stack is not empty and the tag set marks its
 *         top element as a tag to skip, otherwise false.
 */
public boolean inTagToSkip() {
  // With no open tag there is nothing to skip.
  if (stk.isEmpty()) {
    return false;
  }
  return tagSet.isTagToSkip(stk.peek());
}
/**

    if (tag_open) {
      if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) {
        stk.push(tagName.toUpperCase());
        if (tagSet.isTagToProcess(tagName)) {
          inTagToProcess = true;
          inTagToSkip = false;
      if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) {
        processEndOfTag(tagName.toUpperCase());
        String stackTop = null;
        if (!stk.isEmpty()) {
          stackTop = stk.peek();
          if (tagSet.isTagToProcess(stackTop)) {
            inTagToProcess = true;
            inTagToSkip = false;
  EOD = true;	
boolean hasWhitelist = tagSet.hasWhitelist();
if (!btag && 
    (!hasWhitelist || (hasWhitelist && inTagToProcess )) && 
    !inTagToSkip) 
  if (!stk.empty() && tagSet.isIdTag(stk.peek()))
    return s;
  if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek()))
    return lowercase ? s.toLowerCase() : s;

        final boolean tagToProcess = _tags.isTagToProcess(tagName);
        if (tagToProcess || _tags.isTagToSkip(tagName)) {
          stk.push(upperCaseTagName);
          if (tagToProcess) {
        if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
          htmlStk.add(upperCaseTagName);
          inHtmlTagToProcess = true;
        final boolean tagToProcess = _tags.isTagToProcess(tagName);
        if (tagToProcess || _tags.isTagToSkip(tagName)) {
          processEndOfTag(upperCaseTagName);
          String stackTop = null;
          if (!stk.isEmpty()) {
            stackTop = stk.peek();
            if (_tags.isTagToProcess(stackTop)) {
              inTagToProcess = true;
              inTagToSkip = false;
        if (_fields.isTagToProcess(tagName)) {
          htmlStk.remove(upperCaseTagName);
boolean hasWhitelist = _tags.hasWhitelist();
if (!btag && 
    (!hasWhitelist || (hasWhitelist && inTagToProcess )) && 
    !inTagToSkip) 
  if (!stk.empty() && _exact.isTagToProcess(stk.peek()))
    return lowercase ? s.toLowerCase() : s;

TagSet tagSet = new TagSet(TagSet);
tags_CaseSensitive = tagSet.isCaseSensitive();
docnotag = tagSet.getDocTag();
String tmpDocTag = "<" + tagSet.getDocTag() + ">";
String tmpEndDocTag = "</" + tagSet.getDocTag() + ">";
String tmpDocnoTag = "<" + tagSet.getIdTag() + ">";
String tmpEndDocnoTag = "</" + tagSet.getIdTag() + ">";
start_docTag = tmpDocTag.toCharArray();
start_docTagLength = start_docTag.length;

/**
 * Sets the TrecDocTags.process property to TEXT and checks that the
 * resulting tag set accepts "text" but not "abstract".
 */
@Test
public void testSimple() {
  ApplicationSetup.setProperty("TrecDocTags.process", "TEXT");
  final TagSet docTags = new TagSet(TagSet.TREC_DOC_TAGS);

  // The whitelist entry is upper case; the lookup is done in lower case.
  final boolean textAccepted = docTags.isTagToProcess("text");
  assertTrue(textAccepted);

  // A tag that is not on the whitelist must be rejected.
  final boolean abstractAccepted = docTags.isTagToProcess("abstract");
  assertFalse(abstractAccepted);
}

TagSet htmlTags = new TagSet(TagSet.FIELD_TAGS);
String toProcess = htmlTags.getTagsToProcess();
FIELD_NAMES = toProcess.equals("") ? new String[0] : toProcess.split("\\s*,\\s*");
FIELDS_COUNT = FIELD_NAMES.length;

    if (tag_open) {
      if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) {
        stk.push(tagName.toUpperCase());
        if (tagSet.isTagToProcess(tagName)) {
          inTagToProcess = true;
          inTagToSkip = false;
      if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) {
        processEndOfTag(tagName.toUpperCase());
        String stackTop = null;
        if (!stk.isEmpty()) {
          stackTop = stk.peek();
          if (tagSet.isTagToProcess(stackTop)) {
            inTagToProcess = true;
            inTagToSkip = false;
  EOD = true;	
boolean hasWhitelist = tagSet.hasWhitelist();
if (!btag && 
    (!hasWhitelist || (hasWhitelist && inTagToProcess )) && 
    !inTagToSkip) 
  if (!stk.empty() && tagSet.isIdTag(stk.peek()))
    return s;
  if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek()))
    return lowercase ? s.toLowerCase() : s;

        final boolean tagToProcess = _tags.isTagToProcess(tagName);
        if (tagToProcess || _tags.isTagToSkip(tagName)) {
          stk.push(upperCaseTagName);
          if (tagToProcess) {
        if (_fields.isTagToProcess(tagName) && !tagName.equals("")) {
          htmlStk.add(upperCaseTagName);
          inHtmlTagToProcess = true;
        final boolean tagToProcess = _tags.isTagToProcess(tagName);
        if (tagToProcess || _tags.isTagToSkip(tagName)) {
          processEndOfTag(upperCaseTagName);
          String stackTop = null;
          if (!stk.isEmpty()) {
            stackTop = stk.peek();
            if (_tags.isTagToProcess(stackTop)) {
              inTagToProcess = true;
              inTagToSkip = false;
        if (_fields.isTagToProcess(tagName)) {
          htmlStk.remove(upperCaseTagName);
boolean hasWhitelist = _tags.hasWhitelist();
if (!btag && 
    (!hasWhitelist || (hasWhitelist && inTagToProcess )) && 
    !inTagToSkip) 
  if (!stk.empty() && _exact.isTagToProcess(stk.peek()))
    return lowercase ? s.toLowerCase() : s;

TagSet htmlTags = new TagSet(TagSet.FIELD_TAGS);
String toProcess = htmlTags.getTagsToProcess();
FIELD_NAMES = toProcess.equals("") ? new String[0] : toProcess.split("\\s*,\\s*");
FIELDS_COUNT = FIELD_NAMES.length;

/**
 * Creates a TRECFullTokenizer without assigning a reader.
 * The tag sets are built from TagSet.TREC_DOC_TAGS and
 * TagSet.TREC_EXACT_DOC_TAGS, and every state flag starts as false.
 */
public TRECFullTokenizer() {
  // Tag sets that drive which tags are processed, skipped, or kept exact.
  this.tagSet = new TagSet(TagSet.TREC_DOC_TAGS);
  this.exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);

  // No tag has been entered yet.
  this.inTagToProcess = false;
  this.inTagToSkip = false;
  this.inDocnoTag = false;

  // Neither the end of a document nor the end of the input has been reached.
  this.EOD = false;
  this.EOF = false;
}
/**

/**
 * Reports whether the tag currently on top of the tag stack is one
 * that should be processed.
 * @return true if the stack is not empty and the tag set marks its
 *         top element as a tag to process, otherwise false.
 */
public boolean inTagToProcess() {
  // With no open tag there is nothing to process.
  if (stk.isEmpty()) {
    return false;
  }
  return tagSet.isTagToProcess(stk.peek());
}
/**

/**
 * Reports whether the tag currently on top of the tag stack is one
 * that should be skipped.
 * @return true if the stack is not empty and the tag set marks its
 *         top element as a tag to skip, otherwise false.
 */
public boolean inTagToSkip() {
  // With no open tag there is nothing to skip.
  if (stk.isEmpty()) {
    return false;
  }
  return tagSet.isTagToSkip(stk.peek());
}
/**

/**
 * Creates a TRECFullTokenizer that reads from the supplied
 * buffered reader.
 * The tag sets are built from TagSet.TREC_DOC_TAGS and
 * TagSet.TREC_EXACT_DOC_TAGS, and every state flag starts as false.
 * @param _br java.io.BufferedReader the input stream to tokenize
 */
public TRECFullTokenizer(BufferedReader _br) {
  // Remember the input source first.
  this.br = _br;

  // Tag sets that drive which tags are processed, skipped, or kept exact.
  this.tagSet = new TagSet(TagSet.TREC_DOC_TAGS);
  this.exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);

  // No tag has been entered yet.
  this.inTagToProcess = false;
  this.inTagToSkip = false;
  this.inDocnoTag = false;

  // Neither the end of a document nor the end of the input has been reached.
  this.EOD = false;
  this.EOF = false;
}
/**

if (length == 0 || length > tokenMaximumLength)
  return null;
if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek()))
  return s;
final StringReader sr = new StringReader(s);

/**
 * Creates a TRECFullTokenizer that reads from the supplied
 * buffered reader.
 * The tag sets are built from TagSet.TREC_DOC_TAGS and
 * TagSet.TREC_EXACT_DOC_TAGS, and every state flag starts as false.
 * @param _br java.io.BufferedReader the input stream to tokenize
 */
public TRECFullTokenizer(BufferedReader _br) {
  // Remember the input source first.
  this.br = _br;

  // Tag sets that drive which tags are processed, skipped, or kept exact.
  this.tagSet = new TagSet(TagSet.TREC_DOC_TAGS);
  this.exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);

  // No tag has been entered yet.
  this.inTagToProcess = false;
  this.inTagToSkip = false;
  this.inDocnoTag = false;

  // Neither the end of a document nor the end of the input has been reached.
  this.EOD = false;
  this.EOF = false;
}
/**

if (length == 0 || length > tokenMaximumLength)
  return null;
if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek()))
  return s;
final StringReader sr = new StringReader(s);

this._tags = new TagSet(TagSet.TREC_DOC_TAGS);
this._exact = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);
this._fields = new TagSet(TagSet.FIELD_TAGS);
this.tokeniser = _tokeniser;
this.currentTokenStream = Tokeniser.EMPTY_STREAM;

if (doctags!=null) this._tags = new TagSet(doctags);
else this._tags = new TagSet(TagSet.TREC_DOC_TAGS);
if (exactdoctags!=null) this._exact = new TagSet(exactdoctags);
else this._exact = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);
if (fieldtags!=null) this._fields = new TagSet(fieldtags);
else  this._fields = new TagSet(TagSet.FIELD_TAGS);
this.tokeniser = _tokeniser;
this.currentTokenStream = Tokeniser.EMPTY_STREAM;

Javadoc

A class that models a set of tags to process (white list), a set of tags to skip (black list), a tag that is used as a document delimiter, and a tag the contents of which are used as a unique identifier. The text within any tag encountered within the scope of a tag from the white list is processed by default, unless it is explicitly black listed.
For example, in order to index all the text within the DOC tag of a document from a typical TREC collection, without indexing the contents of the DOCHDR tag, we could define in the properties file the following properties:
TrecDocTags.doctag=DOC
TrecDocTags.idtag=DOCNO
TrecDocTags.process=
TrecDocTags.skip=DOCHDR
TrecDocTags.casesensitive=false

In the source code, we would create an instance of the class as follows:
TagSet TrecIndexToProcess = new TagSet("TrecDocTags");
All the tags are converted to uppercase, in order to check whether they belong to the specified set of tags.

Most used methods

<init>
Constructs the tag set for the given prefix, by reading the corresponding properties from the properties file.
isTagToProcess
Checks whether the tag should be processed.
hasWhitelist
Returns true if whiteListSize > 0.
isTagToSkip
Checks whether a tag should be skipped. You should use isTagToProcess as it checks the whitelist and
getDocTag
Return the document delimiter tag.
getIdTag
Return the id tag.
getTagsToProcess
Returns a comma separated list of tags to process
isCaseSensitive
Returns true if this tag set has been specified as case-sensitive
isIdTag
Checks whether the given tag is a unique identifier tag, that is the document number of a document,

Popular in Java

Finding current android device location
notifyDataSetChanged (ArrayAdapter)
getContentResolver (Context)
compareTo (BigDecimal)
FileNotFoundException (java.io)
Thrown when a file specified by a program cannot be found.
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
Thread (java.lang)
A thread is a thread of execution in a program. The Java Virtual Machine allows an application to ha
URLConnection (java.net)
A connection to a URL for reading or writing. For HTTP connections, see HttpURLConnection for docume
ExecutorService (java.util.concurrent)
An Executor that provides methods to manage termination and methods that can produce a Future for tr
Rectangle (java.awt)
A Rectangle specifies an area in a coordinate space that is enclosed by the Rectangle object's top-
Top 12 Jupyter Notebook extensions

How to use TagSet in org.terrier.utility

Best Java code snippets using org.terrier.utility.TagSet (Showing top 20 results out of 315)

How to use
TagSet
in
org.terrier.utility