/**
 * Constructs an instance of the TRECFullTokenizer.
 * All tag-state flags and the end-of-document / end-of-file flags start
 * out false. No input reader is assigned by this constructor.
 * The tag sets used are TagSet.TREC_DOC_TAGS and
 * TagSet.TREC_EXACT_DOC_TAGS.
 */
public TRECFullTokenizer() {
    // tag-state flags
    this.inTagToProcess = false;
    this.inTagToSkip = false;
    this.inDocnoTag = false;

    // tag sets: regular document tags, then the exact-match tags
    this.tagSet = new TagSet(TagSet.TREC_DOC_TAGS);
    this.exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);

    // stream-position flags
    this.EOD = false;
    this.EOF = false;
}
/**
/**
 * Reports whether the tag currently on top of the tag stack is one that
 * the tag set marks for processing.
 * @return true if the tag stack is non-empty and its top tag is to be
 *         processed, otherwise false.
 */
public boolean inTagToProcess() {
    if (stk.isEmpty()) {
        // not inside any tracked tag
        return false;
    }
    return tagSet.isTagToProcess(stk.peek());
}
/**
/**
 * Reports whether the tag currently on top of the tag stack is one that
 * the tag set marks to be skipped.
 * @return true if the tag stack is non-empty and its top tag is to be
 *         skipped, otherwise false.
 */
public boolean inTagToSkip() {
    if (stk.isEmpty()) {
        // not inside any tracked tag
        return false;
    }
    return tagSet.isTagToSkip(stk.peek());
}
/**
if (tag_open) { if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) { stk.push(tagName.toUpperCase()); if (tagSet.isTagToProcess(tagName)) { inTagToProcess = true; inTagToSkip = false; if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) { processEndOfTag(tagName.toUpperCase()); String stackTop = null; if (!stk.isEmpty()) { stackTop = stk.peek(); if (tagSet.isTagToProcess(stackTop)) { inTagToProcess = true; inTagToSkip = false; EOD = true; boolean hasWhitelist = tagSet.hasWhitelist(); if (!btag && (!hasWhitelist || (hasWhitelist && inTagToProcess )) && !inTagToSkip) if (!stk.empty() && tagSet.isIdTag(stk.peek())) return s; if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek())) return lowercase ? s.toLowerCase() : s;
final boolean tagToProcess = _tags.isTagToProcess(tagName); if (tagToProcess || _tags.isTagToSkip(tagName)) { stk.push(upperCaseTagName); if (tagToProcess) { if (_fields.isTagToProcess(tagName) && !tagName.equals("")) { htmlStk.add(upperCaseTagName); inHtmlTagToProcess = true; final boolean tagToProcess = _tags.isTagToProcess(tagName); if (tagToProcess || _tags.isTagToSkip(tagName)) { processEndOfTag(upperCaseTagName); String stackTop = null; if (!stk.isEmpty()) { stackTop = stk.peek(); if (_tags.isTagToProcess(stackTop)) { inTagToProcess = true; inTagToSkip = false; if (_fields.isTagToProcess(tagName)) { htmlStk.remove(upperCaseTagName); boolean hasWhitelist = _tags.hasWhitelist(); if (!btag && (!hasWhitelist || (hasWhitelist && inTagToProcess )) && !inTagToSkip) if (!stk.empty() && _exact.isTagToProcess(stk.peek())) return lowercase ? s.toLowerCase() : s;
// Builds a TagSet from the value named TagSet (declared outside this excerpt) and caches
// its case-sensitivity flag. The document tag and the id tag are then wrapped in
// "<...>" / "</...>" to form the opening and closing markup strings; the opening
// document tag is stored as a char array together with its length.
// NOTE(review): docnotag is assigned from getDocTag(), not getIdTag() - confirm this is intended.
TagSet tagSet = new TagSet(TagSet); tags_CaseSensitive = tagSet.isCaseSensitive(); docnotag = tagSet.getDocTag(); String tmpDocTag = "<" + tagSet.getDocTag() + ">"; String tmpEndDocTag = "</" + tagSet.getDocTag() + ">"; String tmpDocnoTag = "<" + tagSet.getIdTag() + ">"; String tmpEndDocnoTag = "</" + tagSet.getIdTag() + ">"; start_docTag = tmpDocTag.toCharArray(); start_docTagLength = start_docTag.length;
/**
 * Sets the property TrecDocTags.process to "TEXT" and then checks that a
 * TagSet built from TagSet.TREC_DOC_TAGS reports the lower-case tag "text"
 * as a tag to process, and "abstract" as not a tag to process.
 */
@Test
public void testSimple() {
    ApplicationSetup.setProperty("TrecDocTags.process", "TEXT");
    final TagSet docTags = new TagSet(TagSet.TREC_DOC_TAGS);

    // the configured tag is upper case, the lookup is lower case
    assertTrue(docTags.isTagToProcess("text"));
    // a tag that was never configured must not be processed
    assertFalse(docTags.isTagToProcess("abstract"));
}
// Reads the list of tags to process from the FIELD_TAGS tag set and splits it on commas
// (with optional surrounding whitespace) into FIELD_NAMES. The explicit empty-string check
// yields a zero-length array, because "".split(...) would return a one-element array
// holding "". FIELDS_COUNT is the resulting number of names.
TagSet htmlTags = new TagSet(TagSet.FIELD_TAGS); String toProcess = htmlTags.getTagsToProcess(); FIELD_NAMES = toProcess.equals("") ? new String[0] : toProcess.split("\\s*,\\s*"); FIELDS_COUNT = FIELD_NAMES.length;
if (tag_open) { if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) { stk.push(tagName.toUpperCase()); if (tagSet.isTagToProcess(tagName)) { inTagToProcess = true; inTagToSkip = false; if ((tagSet.isTagToProcess(tagName) || tagSet.isTagToSkip(tagName)) && !tagName.equals("")) { processEndOfTag(tagName.toUpperCase()); String stackTop = null; if (!stk.isEmpty()) { stackTop = stk.peek(); if (tagSet.isTagToProcess(stackTop)) { inTagToProcess = true; inTagToSkip = false; EOD = true; boolean hasWhitelist = tagSet.hasWhitelist(); if (!btag && (!hasWhitelist || (hasWhitelist && inTagToProcess )) && !inTagToSkip) if (!stk.empty() && tagSet.isIdTag(stk.peek())) return s; if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek())) return lowercase ? s.toLowerCase() : s;
final boolean tagToProcess = _tags.isTagToProcess(tagName); if (tagToProcess || _tags.isTagToSkip(tagName)) { stk.push(upperCaseTagName); if (tagToProcess) { if (_fields.isTagToProcess(tagName) && !tagName.equals("")) { htmlStk.add(upperCaseTagName); inHtmlTagToProcess = true; final boolean tagToProcess = _tags.isTagToProcess(tagName); if (tagToProcess || _tags.isTagToSkip(tagName)) { processEndOfTag(upperCaseTagName); String stackTop = null; if (!stk.isEmpty()) { stackTop = stk.peek(); if (_tags.isTagToProcess(stackTop)) { inTagToProcess = true; inTagToSkip = false; if (_fields.isTagToProcess(tagName)) { htmlStk.remove(upperCaseTagName); boolean hasWhitelist = _tags.hasWhitelist(); if (!btag && (!hasWhitelist || (hasWhitelist && inTagToProcess )) && !inTagToSkip) if (!stk.empty() && _exact.isTagToProcess(stk.peek())) return lowercase ? s.toLowerCase() : s;
// Reads the list of tags to process from the FIELD_TAGS tag set and splits it on commas
// (with optional surrounding whitespace) into FIELD_NAMES. The explicit empty-string check
// yields a zero-length array, because "".split(...) would return a one-element array
// holding "". FIELDS_COUNT is the resulting number of names.
TagSet htmlTags = new TagSet(TagSet.FIELD_TAGS); String toProcess = htmlTags.getTagsToProcess(); FIELD_NAMES = toProcess.equals("") ? new String[0] : toProcess.split("\\s*,\\s*"); FIELDS_COUNT = FIELD_NAMES.length;
/**
 * Constructs an instance of the TRECFullTokenizer.
 * All tag-state flags and the end-of-document / end-of-file flags start
 * out false. No input reader is assigned by this constructor.
 * The tag sets used are TagSet.TREC_DOC_TAGS and
 * TagSet.TREC_EXACT_DOC_TAGS.
 */
public TRECFullTokenizer() {
    // tag-state flags
    this.inTagToProcess = false;
    this.inTagToSkip = false;
    this.inDocnoTag = false;

    // tag sets: regular document tags, then the exact-match tags
    this.tagSet = new TagSet(TagSet.TREC_DOC_TAGS);
    this.exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS);

    // stream-position flags
    this.EOD = false;
    this.EOF = false;
}
/**
/**
 * Reports whether the tag currently on top of the tag stack is one that
 * the tag set marks for processing.
 * @return true if the tag stack is non-empty and its top tag is to be
 *         processed, otherwise false.
 */
public boolean inTagToProcess() {
    if (stk.isEmpty()) {
        // not inside any tracked tag
        return false;
    }
    return tagSet.isTagToProcess(stk.peek());
}
/**
/**
 * Reports whether the tag currently on top of the tag stack is one that
 * the tag set marks to be skipped.
 * @return true if the tag stack is non-empty and its top tag is to be
 *         skipped, otherwise false.
 */
public boolean inTagToSkip() {
    if (stk.isEmpty()) {
        // not inside any tracked tag
        return false;
    }
    return tagSet.isTagToSkip(stk.peek());
}
/**
/** * Constructs an instance of the TRECFullTokenizer, * given the buffered reader. * The used tags are TagSet.TREC_DOC_TAGS and * TagSet.TREC_EXACT_DOC_TAGS * @param _br java.io.BufferedReader the input stream to tokenize */ public TRECFullTokenizer(BufferedReader _br) { inTagToProcess = false; inTagToSkip = false; inDocnoTag = false; this.br = _br; tagSet = new TagSet(TagSet.TREC_DOC_TAGS); exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS); EOD = false; EOF = false; } /**
// Token filter: a token whose length is 0 or exceeds tokenMaximumLength is rejected by
// returning null. If the tag on top of the stack is listed in exactTagSet, the token is
// returned unchanged, so the StringReader created below is never reached for it.
// NOTE(review): length is presumably s.length() - its assignment is outside this excerpt.
if (length == 0 || length > tokenMaximumLength) return null; if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek())) return s; final StringReader sr = new StringReader(s);
/** * Constructs an instance of the TRECFullTokenizer, * given the buffered reader. * The used tags are TagSet.TREC_DOC_TAGS and * TagSet.TREC_EXACT_DOC_TAGS * @param _br java.io.BufferedReader the input stream to tokenize */ public TRECFullTokenizer(BufferedReader _br) { inTagToProcess = false; inTagToSkip = false; inDocnoTag = false; this.br = _br; tagSet = new TagSet(TagSet.TREC_DOC_TAGS); exactTagSet = new TagSet(TagSet.TREC_EXACT_DOC_TAGS); EOD = false; EOF = false; } /**
// Token filter: a token whose length is 0 or exceeds tokenMaximumLength is rejected by
// returning null. If the tag on top of the stack is listed in exactTagSet, the token is
// returned unchanged, so the StringReader created below is never reached for it.
// NOTE(review): length is presumably s.length() - its assignment is outside this excerpt.
if (length == 0 || length > tokenMaximumLength) return null; if (!stk.empty() && exactTagSet.isTagToProcess(stk.peek())) return s; final StringReader sr = new StringReader(s);
// Initialises the three tag sets. Each one is built from the supplied argument when it is
// non-null, and otherwise from the matching default constant: TagSet.TREC_DOC_TAGS for
// _tags, TagSet.TREC_EXACT_DOC_TAGS for _exact and TagSet.FIELD_TAGS for _fields.
// The tokeniser is stored as given and the current token stream starts out as
// Tokeniser.EMPTY_STREAM.
if (doctags!=null) this._tags = new TagSet(doctags); else this._tags = new TagSet(TagSet.TREC_DOC_TAGS); if (exactdoctags!=null) this._exact = new TagSet(exactdoctags); else this._exact = new TagSet(TagSet.TREC_EXACT_DOC_TAGS); if (fieldtags!=null) this._fields = new TagSet(fieldtags); else this._fields = new TagSet(TagSet.FIELD_TAGS); this.tokeniser = _tokeniser; this.currentTokenStream = Tokeniser.EMPTY_STREAM;