int ch = this.read(); Token tok; switch (ch) { case T_CARET: return this.processCaret(); case T_DOLLAR: return this.processDollar(); case T_LOOKAHEAD: return this.processLookahead(); case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); case T_LOOKBEHIND: return this.processLookbehind(); case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); this.next(); return Token.createEmpty(); case 'A': return this.processBacksolidus_A(); case 'Z': return this.processBacksolidus_Z(); case 'z': return this.processBacksolidus_z(); case 'b': return this.processBacksolidus_b(); case 'B': return this.processBacksolidus_B(); case '<': return this.processBacksolidus_lt(); case '>': return this.processBacksolidus_gt(); tok = this.parseAtom(); ch = this.read(); switch (ch) { case T_STAR: return this.processStar(tok); case T_PLUS: return this.processPlus(tok); case T_QUESTION: return this.processQuestion(tok); case T_CHAR: if (this.chardata == '{' && this.offset < this.regexlen) {
ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { this.offset++; case '[': ret = T_LBRACKET; break; case '^': if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { ret = T_CHAR; if (this.isSet(RegularExpression.XMLSCHEMA_MODE)) { ret = T_CHAR; break; if (++this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-1); ch = this.regex.charAt(this.offset++); switch (ch) { case '<': if (this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-3); ch = this.regex.charAt(this.offset++); if (ch == '=') {
this.setContext(S_INBRACKETS); this.next(); // '[' boolean nrange = false; RangeToken base = null; RangeToken tok; if (this.read() == T_CHAR && this.chardata == '^') { nrange = true; this.next(); // '^' if (useNrange) { tok = Token.createNRange(); while ((type = this.read()) != T_EOF) { if (type == T_CHAR && this.chardata == ']' && !firstloop) break; case 'w': case 'W': case 's': case 'S': tok.mergeRanges(this.getTokenForShorthand(c)); end = true; break; c = this.processCIinCharacterClass(tok, c); if (c < 0) end = true; break; case 'P': int pstart = this.offset; RangeToken tok2 = this.processBacksolidus_pP(c); if (tok2 == null) throw this.ex("parser.atom.5", pstart);
ret = T_BACKSOLIDUS; if (this.offset >= this.regexlen) throw ex("parser.next.1", this.offset-1); this.chardata = this.regex.charAt(this.offset++); break; if (this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { this.offset++; if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { this.offset++; break; if (++this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-1); ch = this.regex.charAt(this.offset++); switch (ch) { case '<': if (this.offset >= this.regexlen) throw ex("parser.next.2", this.offset-3); ch = this.regex.charAt(this.offset++); if (ch == '=') { ret = T_NEGATIVELOOKBEHIND; } else throw ex("parser.next.3", this.offset-3);
this.setContext(S_INBRACKETS); this.next(); // '[' RangeToken base = null; RangeToken tok; if (this.read() == T_CHAR && this.chardata == '^') { this.next(); // '^' if (useNrange) { tok = Token.createNRange(); while ((type = this.read()) != T_EOF) { if (type == T_CHAR && this.chardata == ']' && !firstloop) break; case 'w': case 'W': case 's': case 'S': tok.mergeRanges(this.getTokenForShorthand(c)); end = true; break; c = this.processCIinCharacterClass(tok, c); if (c < 0) end = true; break; case 'P': int pstart = this.offset; RangeToken tok2 = this.processBacksolidus_pP(c); if (tok2 == null) throw this.ex("parser.atom.5", pstart); tok.mergeRanges(tok2); end = true;
if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); int c = this.chardata; switch (c) { this.next(); if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); if (this.chardata == '{') { int v1 = 0; int uv = 0; do { this.next(); if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); if ((v1 = hexChar(this.chardata)) < 0) break; if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); uv = uv*16+v1; } while (true); if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); c = uv; } else { int v1 = 0; if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); int uv = v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1;
int ch = this.read(); Token tok; switch (ch) { case T_CARET: return this.processCaret(); case T_DOLLAR: return this.processDollar(); case T_LOOKAHEAD: return this.processLookahead(); case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); case T_LOOKBEHIND: return this.processLookbehind(); case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); this.next(); return Token.createEmpty(); case 'A': return this.processBacksolidus_A(); case 'Z': return this.processBacksolidus_Z(); case 'z': return this.processBacksolidus_z(); case 'b': return this.processBacksolidus_b(); case 'B': return this.processBacksolidus_B(); case '<': return this.processBacksolidus_lt(); case '>': return this.processBacksolidus_gt(); tok = this.parseAtom(); ch = this.read(); switch (ch) { case T_STAR: return this.processStar(tok); case T_PLUS: return this.processPlus(tok); case T_QUESTION: return this.processQuestion(tok); case T_CHAR: if (this.chardata == '{' && this.offset < this.regexlen) {
int ch = this.read(); Token tok = null; switch (ch) { case T_LPAREN: return this.processParen(); case T_LPAREN2: return this.processParen2(); // '(?:' case T_CONDITION: return this.processCondition(); // '(?(' case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) case T_INDEPENDENT: return this.processIndependent(); case T_DOT: this.next(); // Skips '.' tok = Token.token_dot; break; case T_LBRACKET: return this.parseCharacterClass(true); case T_SET_OPERATIONS: return this.parseSetOperations(); case 'w': case 'W': case 's': case 'S': tok = this.getTokenForShorthand(this.chardata); this.next(); return tok; int ch2 = this.decodeEscaped(); if (ch2 < 0x10000) { tok = Token.createChar(ch2); case 'c': return this.processBacksolidus_c(); case 'C': return this.processBacksolidus_C(); case 'i': return this.processBacksolidus_i();
if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); int c = this.chardata; switch (c) { this.next(); if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); if (this.chardata == '{') { int v1 = 0; int uv = 0; do { this.next(); if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); if ((v1 = hexChar(this.chardata)) < 0) break; if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); uv = uv*16+v1; } while (true); if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); c = uv; } else { int v1 = 0; if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); int uv = v1; this.next(); if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) throw ex("parser.descape.1", this.offset-1); uv = uv*16+v1;
int ch = this.read(); Token tok = null; switch (ch) { case T_LPAREN: return this.processParen(); case T_LPAREN2: return this.processParen2(); // '(?:' case T_CONDITION: return this.processCondition(); // '(?(' case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) case T_INDEPENDENT: return this.processIndependent(); case T_DOT: this.next(); // Skips '.' tok = Token.token_dot; break; case T_LBRACKET: return this.parseCharacterClass(true); case T_SET_OPERATIONS: return this.parseSetOperations(); case 'w': case 'W': case 's': case 'S': tok = this.getTokenForShorthand(this.chardata); this.next(); return tok; int ch2 = this.decodeEscaped(); if (ch2 < 0x10000) { tok = Token.createChar(ch2); case 'c': return this.processBacksolidus_c(); case 'C': return this.processBacksolidus_C(); case 'i': return this.processBacksolidus_i();
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); throw ex("parser.parse.2", this.offset); this.references.addElement(new ReferencePosition(finalRefno, this.offset)); this.offset ++; if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); this.offset ++; } else { if (ch == '?') this.offset --; // Points '('. this.next(); condition = this.parseFactor(); switch (condition.type) { case Token.LOOKAHEAD: break; case Token.ANCHOR: if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); break; default: throw ex("parser.factor.5", this.offset); this.next(); Token yesPattern = this.parseRegex(); Token noPattern = null; if (yesPattern.type == Token.UNION) { if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); noPattern = yesPattern.getChild(1); yesPattern = yesPattern.getChild(0);
if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); this.references.addElement(new ReferencePosition(refno, this.offset)); this.offset ++; if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); this.offset ++; } else { if (ch == '?') this.offset --; // Points '('. this.next(); condition = this.parseFactor(); switch (condition.type) { case Token.LOOKAHEAD: break; case Token.ANCHOR: if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); break; default: throw ex("parser.factor.5", this.offset); this.next(); Token yesPattern = this.parseRegex(); Token noPattern = null; if (yesPattern.type == Token.UNION) { if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); noPattern = yesPattern.getChild(1); yesPattern = yesPattern.getChild(0); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
this.offset ++; if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); if (ch == '-') { this.offset ++; this.offset ++; if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); } else if (ch == ')') { // such as (?-i) this.offset ++; this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); } else throw ex("parser.factor.3", this.offset);
this.offset ++; if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); if (ch == '-') { this.offset ++; this.offset ++; if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); this.next(); } else if (ch == ')') { // such as (?-i) this.offset ++; this.next(); tok = Token.createModifierGroup(this.parseRegex(), add, mask); } else throw ex("parser.factor.3", this.offset);
switch (ch) { case 'd': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("Nd", true) : Token.token_0to9; break; case 'D': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("Nd", false) : Token.token_not_0to9; break; case 'w': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsWord", true) : Token.token_wordchars; break; case 'W': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsWord", false) : Token.token_not_wordchars; break; case 's': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsSpace", true) : Token.token_spaces; break; case 'S': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsSpace", false) : Token.token_not_spaces; break;
switch (ch) { case 'd': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("Nd", true) : Token.token_0to9; break; case 'D': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("Nd", false) : Token.token_not_0to9; break; case 'w': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsWord", true) : Token.token_wordchars; break; case 'W': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsWord", false) : Token.token_not_wordchars; break; case 's': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsSpace", true) : Token.token_spaces; break; case 'S': tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) ? Token.getRange("IsSpace", false) : Token.token_not_spaces; break;
/**
 * Parses a set-operation group:
 * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')* ')'
 *
 * A first character class is parsed, then every following operator/class
 * pair is folded into it until the closing ')' token is read:
 * '+' merges the ranges, '-' subtracts them, '&' intersects them.
 *
 * @return the range token produced by the first character class, after all
 *         operations have been applied to it
 * @throws ParseException "parser.ope.1" when an operator is not followed by
 *         '[', "parser.ope.2" when something other than an operator or ')'
 *         follows a character class
 */
protected RangeToken parseSetOperations() throws ParseException {
    RangeToken result = this.parseCharacterClass(false);
    for (int kind = this.read(); kind != T_RPAREN; kind = this.read()) {
        // Capture the operator character before next() replaces it.
        final int op = this.chardata;
        final boolean isUnion = kind == T_PLUS;
        final boolean isCharOperator = kind == T_CHAR && (op == '-' || op == '&');
        if (!isCharOperator && !isUnion) {
            throw ex("parser.ope.2", this.offset-1);
        }

        this.next();
        if (this.read() != T_LBRACKET) {
            throw ex("parser.ope.1", this.offset-1);
        }
        RangeToken operand = this.parseCharacterClass(false);

        if (isUnion) {
            result.mergeRanges(operand);
        } else if (op == '-') {
            result.subtractRanges(operand);
        } else if (op == '&') {
            result.intersectRanges(operand);
        } else {
            // Unreachable given the operator check above.
            throw new RuntimeException("ASSERT");
        }
    }
    this.next();
    return result;
}
/**
 * Builds a back-reference token from the digit held in {@code chardata}
 * plus any digits that directly follow it in the pattern.
 *
 * Following digits are consumed only while the accumulated number stays
 * below {@code parennumber} (presumably the number of groups opened so
 * far -- confirm against the field's declaration); the first digit that
 * would reach or exceed it is left unread.
 *
 * Side effects: sets {@code hasBackReferences}, creates {@code references}
 * on first use, appends a {@code ReferencePosition} for the accepted
 * number, and advances the tokenizer with {@code next()}.
 *
 * @return the back-reference token for the accepted group number
 * @throws ParseException "parser.parse.2" when the first digit alone is
 *         not below {@code parennumber}
 */
Token processBackreference() throws ParseException {
    int candidate = this.chardata-'0';
    if (candidate >= this.parennumber) {
        throw ex("parser.parse.2", this.offset-2);
    }

    // Last value known to be below parennumber.
    int accepted = candidate;
    while (this.offset < this.regexlen) {
        final int digit = this.regex.charAt(this.offset);
        if (digit < '0' || digit > '9') {
            break;
        }
        candidate = (candidate * 10) + (digit - '0');
        if (candidate >= this.parennumber) {
            // This digit is not part of the reference; leave it unread.
            break;
        }
        ++this.offset;
        accepted = candidate;
        this.chardata = digit;
    }

    Token backref = Token.createBackReference(accepted);
    this.hasBackReferences = true;
    if (this.references == null) {
        this.references = new Vector();
    }
    this.references.addElement(new ReferencePosition(accepted, this.offset-2));
    this.next();
    return backref;
}
/**
 * Set operations on character classes:
 * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')* ')'
 *
 * Reads one character class, then repeatedly reads an operator followed
 * by another character class and applies it to the first one, stopping
 * at the ')' token. The T_PLUS token merges, '-' subtracts and '&'
 * intersects.
 *
 * @return the first character class's range token with every operation
 *         applied
 * @throws ParseException "parser.ope.2" if the token after a class is
 *         neither an operator nor ')'; "parser.ope.1" if an operator is
 *         not followed by '['
 */
protected RangeToken parseSetOperations() throws ParseException {
    final RangeToken accumulated = this.parseCharacterClass(false);
    int tokenType = this.read();
    while (tokenType != T_RPAREN) {
        // Read the operator character now; next() will overwrite chardata.
        final int symbol = this.chardata;
        if (tokenType != T_PLUS
                && !(tokenType == T_CHAR && (symbol == '-' || symbol == '&'))) {
            throw ex("parser.ope.2", this.offset-1);
        }

        this.next();
        if (this.read() != T_LBRACKET) {
            throw ex("parser.ope.1", this.offset-1);
        }
        final RangeToken rhs = this.parseCharacterClass(false);

        if (tokenType == T_PLUS) {
            accumulated.mergeRanges(rhs);
        } else {
            switch (symbol) {
            case '-':
                accumulated.subtractRanges(rhs);
                break;
            case '&':
                accumulated.intersectRanges(rhs);
                break;
            default:
                // Cannot happen: the operator was validated above.
                throw new RuntimeException("ASSERT");
            }
        }
        tokenType = this.read();
    }
    this.next();
    return accumulated;
}
/** * regex ::= term (`|` term)* * term ::= factor+ * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' * | atom (('*' | '+' | '?' | minmax ) '?'? )?) * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block */ Token parseRegex() throws ParseException { Token tok = this.parseTerm(); Token parent = null; while (this.read() == T_OR) { this.next(); // '|' if (parent == null) { parent = Token.createUnion(); parent.addChild(tok); tok = parent; } tok.addChild(this.parseTerm()); } return tok; }