/**
 * Builds a new {@link HTMLStripCharFilter} that wraps the supplied stream.
 *
 * @param input the character stream handed to the filter's constructor
 * @return a freshly constructed filter reading from {@code input}
 */
public HTMLStripCharFilter create(CharStream input) {
  final HTMLStripCharFilter filter = new HTMLStripCharFilter(input);
  return filter;
}
/**
 * Consumes a quoted string literal (single- or double-quoted).
 *
 * <p>The first character read must be the opening quote. Characters are then
 * consumed until the same quote character is seen again, input is exhausted,
 * or the read-ahead budget ({@code numRead - lastMark} reaching
 * {@code safeReadAheadLimit}) runs out.
 *
 * @return {@code MATCH} if a closing quote was found; {@code MISMATCH} if the
 *         first character is not a quote, a negative value is read before the
 *         closing quote, or the read-ahead limit is reached
 * @throws IOException if reading the underlying input fails
 */
private int readScriptString() throws IOException {
  final int quote = next();
  if (quote != '\'' && quote != '"') {
    return MISMATCH;
  }

  while ((numRead - lastMark) < safeReadAheadLimit) {
    final int c = next();

    if (c == quote) {
      return MATCH;
    }
    if (c == '\\') {
      // Backslash escape: swallow the following character unexamined so an
      // escaped quote does not terminate the string.
      next();
      continue;
    }
    if (c < 0) {
      return MISMATCH;
    }
    if (c == '<') {
      // A '<' inside the string is handed to eatSSI() before scanning resumes.
      eatSSI();
    }
  }

  // Read-ahead budget exhausted without seeing the closing quote.
  return MISMATCH;
}
/*** From HTML 4.0 [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender [5] Name ::= (Letter | '_' | ':') (NameChar)* [6] Names ::= Name (#x20 Name)* [7] Nmtoken ::= (NameChar)+ [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)* ***/ // should I include all id chars allowable by HTML/XML here? // including accented chars, ':', etc? private boolean isIdChar(int ch) { // return Character.isUnicodeIdentifierPart(ch); // isUnicodeIdentiferPart doesn't include '-'... shoudl I still // use it and add in '-',':',etc? return isAlpha(ch) || isDigit(ch) || ch=='.' || ch=='-' || ch=='_' || ch==':' || Character.isLetter(ch); }
addOffCorrectMap(numReturned, numEaten); numWhitespace = 0; int ch = next(); saveState(); ch = readEntity(); if (ch>=0) return ch; if (ch==MISMATCH) { restoreState(); saveState(); ch = next(); int ret = MISMATCH; if (ch=='!') { ret = readBang(false); } else if (ch=='/') { ret = readName(true); if (ret==MATCH) { ch=nextSkipWS(); ret= ch=='>' ? MATCH : MISMATCH; } else if (isAlpha(ch)) { push(ch); ret = readTag(); } else if (ch=='?') { ret = readProcessingInstruction(); restoreState();
int ch = next(); if (!isAlpha(ch)) { push(ch); return MISMATCH; ch = next(); if (isIdChar(ch)) { sb.append((char)ch); } else if (ch=='/') { return nextSkipWS()=='>' ? MATCH : MISMATCH; } else { break; if ( !(ch=='>' || isSpace(ch)) ) { return MISMATCH; ch=next(); if (isSpace(ch)) { continue; } else if (isFirstIdChar(ch)) { push(ch); int ret = readAttr2(); if (ret==MISMATCH) return ret; } else if (ch=='/') { return nextSkipWS()=='>' ? MATCH : MISMATCH; } else if (ch=='>') { break;
int ch = next(); if (!isFirstIdChar(ch)) return MISMATCH; ch = next(); while(isIdChar(ch) && ((numRead - lastMark) < safeReadAheadLimit)){ ch=next(); if (isSpace(ch)) ch = nextSkipWS(); push(ch); return MATCH; int quoteChar = nextSkipWS(); ch = next(); if (ch<0) return MISMATCH; else if (ch=='<') { eatSSI(); ch = next(); if (ch<0) return MISMATCH; else if (isSpace(ch)) { push(ch); return MATCH; } else if (ch=='>') { push(ch); return MATCH; } else if (ch=='<') { eatSSI();
int ch = next(); int base=10; boolean invalid=false; sb.setLength(0); if (isDigit(ch)) { ch = next(); if (isDigit(ch)) { sb.append((char)ch); } else { sb.setLength(0); for (int i=0; i<10; i++) { ch = next(); if (isHex(ch)) { sb.append((char)ch); } else { if (isSpace(ch)) { push(ch); numWhitespace = sb.length() + eaten; return Integer.parseInt(sb.toString(), base);
int ch = next(); if (ch!='-') { push(ch); return MISMATCH; ch = next(); if (ch!='-') { push(ch); push('-'); return MISMATCH; ch = next(); if (ch<0) return MISMATCH; if (ch=='-') { ch = next(); if (ch<0) return MISMATCH; if (ch!='-') { push(ch); continue; ch = next(); if (ch<0) return MISMATCH; if (ch!='>') { push(ch); push('-'); continue;
private int readName(boolean checkEscaped) throws IOException { StringBuilder builder = (checkEscaped && escapedTags!=null) ? new StringBuilder() : null; int ch = next(); if (builder!=null) builder.append((char)ch); if (!isFirstIdChar(ch)) return MISMATCH; ch = next(); if (builder!=null) builder.append((char)ch); while(isIdChar(ch)) { ch=next(); if (builder!=null) builder.append((char)ch); } if (ch!=-1) { push(ch); } //strip off the trailing > if (builder!=null && escapedTags.contains(builder.substring(0, builder.length() - 1))){ return MISMATCH; } return MATCH; }
/**
 * Command-line driver: reads text from standard input through an
 * {@link HTMLStripCharFilter} and writes the filtered characters to standard
 * output.
 *
 * <p>Standard input is decoded with the platform default charset, because
 * {@link InputStreamReader} is constructed without an explicit charset.
 *
 * @param args ignored
 * @throws IOException if reading from or closing the filter fails
 */
public static void main(String[] args) throws IOException {
  Reader in = new HTMLStripCharFilter(
      CharReader.get(new InputStreamReader(System.in)));
  try {
    int ch;
    while ((ch = in.read()) != -1) {
      System.out.print((char) ch);
    }
  } finally {
    // Release the reader even if read() fails part-way through. try/finally is
    // used rather than try-with-resources to stay compatible with older source
    // levels.
    in.close();
  }
}