org.apache.solr.analysis.HTMLStripCharFilter java code examples

/**
 * Builds a new {@link HTMLStripCharFilter} around the supplied stream.
 *
 * @param input the character stream handed to the filter's constructor
 * @return a freshly constructed filter wrapping {@code input}
 */
public HTMLStripCharFilter create(CharStream input) {
 final HTMLStripCharFilter filter = new HTMLStripCharFilter(input);
 return filter;
}

/**
 * Consumes a single- or double-quoted string literal from the input.
 *
 * The first character read must be {@code '} or {@code "}; the method then
 * reads forward until the same quote character is seen again.
 *
 * @return MATCH when the closing quote is found; MISMATCH when the first
 *         character is not a quote, when a negative value is read before
 *         the closing quote, or when the read-ahead limit is reached
 * @throws IOException propagated from the underlying reads
 */
private int readScriptString() throws IOException {
 final int delimiter = next();
 final boolean startsWithQuote = (delimiter == '\'') || (delimiter == '"');
 if (!startsWithQuote) {
  return MISMATCH;
 }

 // Stop scanning once we have read as far past the mark as is allowed.
 while ((numRead - lastMark) < safeReadAheadLimit) {
  final int c = next();
  if (c == delimiter) {
   return MATCH;
  }
  if (c == '\\') {
   // Backslash: consume the following character without inspecting it,
   // so an escaped quote does not terminate the string.
   next();
   continue;
  }
  if (c < 0) {
   return MISMATCH;
  }
  if (c == '<') {
   eatSSI();
  }
 }
 return MISMATCH;
}

/*** From HTML 4.0
[4]     NameChar     ::=    Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
[5]     Name     ::=    (Letter | '_' | ':') (NameChar)*
[6]     Names    ::=    Name (#x20 Name)*
[7]     Nmtoken    ::=    (NameChar)+
[8]     Nmtokens     ::=    Nmtoken (#x20 Nmtoken)*
***/

 // should I include all id chars allowable by HTML/XML here?
 // including accented chars, ':', etc?
 /**
  * Tells whether {@code ch} is accepted as a name character.
  *
  * Accepted: anything isAlpha or isDigit accepts, the punctuation
  * characters '.', '-', '_' and ':', and anything Character.isLetter accepts.
  *
  * @param ch the character (as an int) to classify
  * @return true if {@code ch} may appear inside a name
  */
 private boolean isIdChar(int ch) {
  // Character.isUnicodeIdentifierPart(ch) was considered as an alternative,
  // but it does not include '-'; the explicit list below is used instead.
  if (isAlpha(ch) || isDigit(ch)) {
   return true;
  }
  switch (ch) {
   case '.':
   case '-':
   case '_':
   case ':':
    return true;
   default:
    return Character.isLetter(ch);
  }
 }

/**
 * Reads characters until one is found for which isSpace is false.
 *
 * @return the first value returned by next() that isSpace rejects
 * @throws IOException propagated from the underlying reads
 */
private int nextSkipWS() throws IOException {
 int c;
 do {
  c = next();
 } while (isSpace(c));
 return c;
}

addOffCorrectMap(numReturned, numEaten);
numWhitespace = 0;
int ch = next();
  saveState();
  ch = readEntity();
  if (ch>=0) return ch;
  if (ch==MISMATCH) {
   restoreState();
  saveState();
  ch = next();
  int ret = MISMATCH;
  if (ch=='!') {
   ret = readBang(false);
  } else if (ch=='/') {
   ret = readName(true);
   if (ret==MATCH) {
    ch=nextSkipWS();
    ret= ch=='>' ? MATCH : MISMATCH;
  } else if (isAlpha(ch)) {
   push(ch);
   ret = readTag();
  } else if (ch=='?') {
   ret = readProcessingInstruction();
  restoreState();

int ch = next();
if (!isAlpha(ch)) {
 push(ch);
 return MISMATCH;
 ch = next();
 if (isIdChar(ch)) {
  sb.append((char)ch);
 } else if (ch=='/') {
  return nextSkipWS()=='>' ? MATCH : MISMATCH;
 } else {
  break;
if ( !(ch=='>' || isSpace(ch)) ) {
 return MISMATCH;
  ch=next();
  if (isSpace(ch)) {
   continue;
  } else if (isFirstIdChar(ch)) {
   push(ch);
   int ret = readAttr2();
   if (ret==MISMATCH) return ret;
  } else if (ch=='/') {
   return nextSkipWS()=='>' ? MATCH : MISMATCH;
  } else if (ch=='>') {
   break;

int ch = next();
if (!isFirstIdChar(ch)) return MISMATCH;
ch = next();
while(isIdChar(ch) && ((numRead - lastMark) < safeReadAheadLimit)){
 ch=next();
if (isSpace(ch)) ch = nextSkipWS();
 push(ch);
 return MATCH;
int quoteChar = nextSkipWS();
  ch = next();
  if (ch<0) return MISMATCH;
  else if (ch=='<') {
   eatSSI();
  ch = next();
  if (ch<0) return MISMATCH;
  else if (isSpace(ch)) {
   push(ch);
   return MATCH;
  } else if (ch=='>') {
   push(ch);
   return MATCH;
  } else if (ch=='<') {
   eatSSI();

int ch = next();
int base=10;
boolean invalid=false;
sb.setLength(0);
if (isDigit(ch)) {
  ch = next();
  if (isDigit(ch)) {
   sb.append((char)ch);
  } else {
 sb.setLength(0);
 for (int i=0; i<10; i++) {
  ch = next();
  if (isHex(ch)) {
   sb.append((char)ch);
  } else {
 if (isSpace(ch)) {
  push(ch);
  numWhitespace = sb.length() + eaten;
  return Integer.parseInt(sb.toString(), base);

int ch = next();
 if (ch!='-') {
 push(ch);
 return MISMATCH;
ch = next();
 if (ch!='-') {
 push(ch);
 push('-');
 return MISMATCH;
 ch = next();
 if (ch<0) return MISMATCH;
 if (ch=='-') {
  ch = next();
  if (ch<0) return MISMATCH;
  if (ch!='-') {
   push(ch);
   continue;
  ch = next();
  if (ch<0) return MISMATCH;
  if (ch!='>') {
   push(ch);
   push('-');
   continue;

/**
 * Reads a name: one character accepted by isFirstIdChar followed by any
 * number of characters accepted by isIdChar. The character that ends the
 * name is pushed back unless it is -1.
 *
 * @param checkEscaped when true and escapedTags is non-null, the name that
 *        was read is looked up in escapedTags, and a hit yields MISMATCH
 * @return MISMATCH if the first character cannot start a name or the name
 *         is found in escapedTags (when checked); MATCH otherwise
 * @throws IOException propagated from the underlying reads
 */
private int readName(boolean checkEscaped) throws IOException {
 final boolean collectName = checkEscaped && escapedTags != null;
 final StringBuilder name = collectName ? new StringBuilder() : null;

 int c = next();
 if (!isFirstIdChar(c)) {
  return MISMATCH;
 }
 if (name != null) {
  name.append((char) c);
 }

 // Collect only the identifier characters; the terminating character is
 // never added, so nothing has to be trimmed off before the lookup.
 for (c = next(); isIdChar(c); c = next()) {
  if (name != null) {
   name.append((char) c);
  }
 }

 if (c != -1) {
  push(c);
 }

 if (name != null && escapedTags.contains(name.toString())) {
  return MISMATCH;
 }
 return MATCH;
}

/**
 * Command-line driver: passes standard input through an
 * {@link HTMLStripCharFilter} and echoes every character the filter
 * returns to standard output.
 *
 * @param args unused
 * @throws IOException if reading from the filter fails
 */
public static void main(String[] args) throws IOException {
 InputStreamReader stdin = new InputStreamReader(System.in);
 Reader in = new HTMLStripCharFilter(CharReader.get(stdin));
 // read() signals end of input with -1.
 for (int ch = in.read(); ch != -1; ch = in.read()) {
  System.out.print((char) ch);
 }
}

Javadoc

A CharFilter that wraps another Reader and attempts to strip out HTML constructs.

Most used methods

<init>
addOffCorrectMap
eatSSI
findEndTag
isAlpha
isDigit
isFirstIdChar
isHex
isIdChar
From HTML 4.0 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender [5]
isSpace
next
nextSkipWS

Popular in Java

Creating JSON documents from java classes using gson
getContentResolver (Context)
requestLocationUpdates (LocationManager)
setRequestProperty (URLConnection)
OutputStream (java.io)
A writable sink for bytes. Most clients will use output streams that write data to the file system (
InetAddress (java.net)
An Internet Protocol (IP) address. This can be either an IPv4 address or an IPv6 address, and in pra
MalformedURLException (java.net)
This exception is thrown when a program attempts to create an URL from an incorrect specification.
LinkedList (java.util)
Doubly-linked list implementation of the List and Deque interfaces. Implements all optional list oper
Cipher (javax.crypto)
This class provides access to implementations of cryptographic ciphers for encryption and decryption
Kernel (java.awt.image)
From CI to AI: The AI layer in your organization

How to use HTMLStripCharFilter in org.apache.solr.analysis

Best Java code snippets using org.apache.solr.analysis.HTMLStripCharFilter (Showing top 11 results out of 315)

How to use
HTMLStripCharFilter
in
org.apache.solr.analysis