private Matcher group() { if (peek() == '?') { advance(); char c = advance(); switch (c) { case '<': // Named capturing group advance(v -> v != '>'); if (isAtEnd()) { throw error("unclosed name for capturing group"); } return groupExpr(); case ':': // Non-capturing group return groupExpr(); case '=': return new PositiveLookaheadMatcher(expr()); case '!': return new NegativeLookaheadMatcher(expr()); default: throw unsupported("inline flags"); } } else { return groupExpr(); } }
private Matcher term() { List<Matcher> matchers = new ArrayList<>(); while (!isAtEnd() && peek() != ')' && peek() != '|') { char c = advance(); switch (c) { case '\\': matchers.add(escape()); break; case '^': break; case '[': matchers.add(new CharClassMatcher(charClass())); break; case '(': matchers.add(group()); break; case '{': matchers.add(repeat(pop(matchers))); break; case '.': case '+': if (matchers.isEmpty()) { throw error("dangling modifier"); matchers.add(meta(pop(matchers))); break; default:
AsciiSet set = AsciiSet.none(); boolean rangeStart = false; boolean invert = peek() == '^'; if (invert) { advance(); set = AsciiSet.all(); if (peek() == ']' || peek() == '-') { char c = advance(); set = update(set, AsciiSet.fromPattern(Character.toString(c)), invert); while (!isAtEnd() && peek() != ']') { char c = advance(); switch (c) { case '[': set = set.union(charClass()); rangeStart = false; break; case '&': if (peek() == '&') { advance(); if (peek() == '[') { advance(); set = set.intersection(charClass()); } else if (peek() != ']') { set = set.intersection(charClass()); --current; set = update(set, AsciiSet.fromPattern(Character.toString(c)), invert);
/**
 * Parses the construct that follows a backslash.
 *
 * <p>{@code \Q} is handed to {@code quotation()}. Control characters ({@code \c}) and back
 * references (a digit or {@code \k}) are rejected as unsupported. Otherwise the character is
 * tried as a named character class; if {@code namedCharClass()} returns {@code null} the
 * character is consumed and matched literally.
 *
 * @return the matcher for the escaped construct
 */
private Matcher escape() {
  final char next = peek();
  if (next == 'Q') {
    return quotation();
  }
  if (next == 'c') {
    throw unsupported("control character");
  }
  if (Constants.DIGIT.contains(next) || next == 'k') {
    throw unsupported("back references");
  }

  final AsciiSet named = namedCharClass();
  if (named != null) {
    return new CharClassMatcher(named);
  }
  // Not a known class: consume the character and treat it as a literal.
  advance();
  return new CharSeqMatcher(String.valueOf(next));
}
/**
 * Consumes one character and maps it to a predefined character class.
 *
 * <p>Lower case {@code d}, {@code s} and {@code w} return the digit, space and word sets;
 * the upper case forms return the inverted set. {@code p} / {@code P} read a property name
 * via {@code name()} and look it up, inverted for {@code P}. The horizontal and vertical
 * whitespace classes ({@code h}, {@code H}, {@code v}, {@code V}) are rejected as unsupported.
 *
 * @return the character set, or {@code null} if the character does not name a class; in
 *     that case the position is moved back so the character is left unconsumed
 */
@SuppressWarnings("PMD.MissingBreakInSwitch")
private AsciiSet namedCharClass() {
  final char code = advance();
  switch (code) {
    case 'd':
      return Constants.DIGIT;
    case 'D':
      return Constants.DIGIT.invert();
    case 's':
      return Constants.SPACE;
    case 'S':
      return Constants.SPACE.invert();
    case 'w':
      return Constants.WORD_CHARS;
    case 'W':
      return Constants.WORD_CHARS.invert();
    case 'p':
      return newNamedCharSet(name(), false);
    case 'P':
      return newNamedCharSet(name(), true);
    case 'h':
    case 'H':
      throw unsupported("horizontal whitespace class");
    case 'v':
    case 'V':
      throw unsupported("vertical whitespace class");
    default:
      // Undo the advance so the caller sees the character again.
      --current;
      return null;
  }
}
/**
 * Compile a pattern string and return a matcher that can be used to check if string values
 * match the pattern. Pattern matchers can be reused many times and are thread safe.
 *
 * <p>A leading {@code (?i)} is stripped and the resulting matcher is made case insensitive.
 * A non-empty pattern is wrapped as {@code ^.*(pattern).*$} before it is parsed.
 *
 * @param pattern the pattern string to compile
 * @return the optimized matcher for the pattern
 */
public static PatternMatcher compile(String pattern) {
  final boolean ignoreCase = pattern.startsWith("(?i)");
  String expression = ignoreCase ? pattern.substring(4) : pattern;
  if (!expression.isEmpty()) {
    expression = "^.*(" + expression + ").*$";
  }

  final String expanded = PatternUtils.expandEscapedChars(expression);
  final Matcher compiled = Optimizer.optimize(new Parser(expanded).parse());
  if (ignoreCase) {
    return compiled.ignoreCase();
  }
  return compiled;
}
private Matcher meta(Matcher matcher) { int start = current - 1; advance(c -> META.contains((char) c)); --current; String quantifier = tokens.subSequence(start, current).toString(); switch (quantifier) { case "?": // Makes repeat reluctant if (matcher instanceof RepeatMatcher) { return matcher; } case "??": case "?+": return OrMatcher.create(matcher, TrueMatcher.INSTANCE); case "*": case "*?": return new ZeroOrMoreMatcher(matcher, term()); case "*+": return new RepeatMatcher(matcher, 0, Integer.MAX_VALUE); case "+": case "+?": return SeqMatcher.create(matcher, new ZeroOrMoreMatcher(matcher, term())); case "++": return SeqMatcher.create(matcher, new RepeatMatcher(matcher, 1, Integer.MAX_VALUE)); default: throw new IllegalArgumentException("unknown quantifier: " + quantifier); } }
/**
 * Looks up a named character class in {@code Constants.NAMED_CHAR_CLASSES}.
 *
 * @param name the character property name to look up
 * @param invert whether to return the inverted set
 * @return the set registered under {@code name}, inverted if requested
 */
private AsciiSet newNamedCharSet(String name, boolean invert) {
  final AsciiSet named = Constants.NAMED_CHAR_CLASSES.get(name);
  if (named == null) {
    throw error("unknown character property name: " + name);
  }
  if (invert) {
    return named.invert();
  }
  return named;
}
/**
 * Parses a counted repetition that follows an opening brace and applies it to the
 * preceding matcher.
 *
 * <p>Supported forms:
 * <ul>
 *   <li>{@code {n}}: exactly {@code n} times.</li>
 *   <li>{@code {n,m}}: between {@code n} and {@code m} times.</li>
 *   <li>{@code {n,}}: at least {@code n} times; the upper bound is
 *       {@code Integer.MAX_VALUE}, the same value used for the unbounded possessive
 *       quantifiers.</li>
 * </ul>
 *
 * @param matcher the matcher to repeat
 * @return a repeat matcher with the parsed bounds
 * @throws NumberFormatException if a bound is not a valid integer
 */
private Matcher repeat(Matcher matcher) {
  int start = current;
  advance(c -> c != '}');
  String bounds = tokens.subSequence(start, current - 1).toString();

  // A limit of -1 keeps trailing empty strings, so "n," yields ["n", ""] rather than
  // ["n"]. Without it the open-ended form is indistinguishable from the exact form.
  String[] numbers = bounds.split(",", -1);
  int min = Integer.parseInt(numbers[0]);

  int max;
  if (numbers.length == 1) {
    max = min;
  } else if (numbers[1].isEmpty()) {
    // Open-ended range: no upper bound was given.
    max = Integer.MAX_VALUE;
  } else {
    max = Integer.parseInt(numbers[1]);
  }
  return new RepeatMatcher(matcher, min, max);
}
private Matcher term() { List<Matcher> matchers = new ArrayList<>(); while (!isAtEnd() && peek() != ')' && peek() != '|') { char c = advance(); switch (c) { case '\\': matchers.add(escape()); break; case '^': break; case '[': matchers.add(new CharClassMatcher(charClass())); break; case '(': matchers.add(group()); break; case '{': matchers.add(repeat(pop(matchers))); break; case '.': case '+': if (matchers.isEmpty()) { throw error("dangling modifier"); matchers.add(meta(pop(matchers))); break; default:
AsciiSet set = AsciiSet.none(); boolean rangeStart = false; boolean invert = peek() == '^'; if (invert) { advance(); set = AsciiSet.all(); if (peek() == ']' || peek() == '-') { char c = advance(); set = update(set, AsciiSet.fromPattern(Character.toString(c)), invert); while (!isAtEnd() && peek() != ']') { char c = advance(); switch (c) { case '[': set = set.union(charClass()); rangeStart = false; break; case '&': if (peek() == '&') { advance(); if (peek() == '[') { advance(); set = set.intersection(charClass()); } else if (peek() != ']') { set = set.intersection(charClass()); --current; set = update(set, AsciiSet.fromPattern(Character.toString(c)), invert);
/**
 * Parses the construct that follows a backslash.
 *
 * <p>{@code \Q} is handed to {@code quotation()}. Control characters ({@code \c}) and back
 * references (a digit or {@code \k}) are rejected as unsupported. Otherwise the character is
 * tried as a named character class; if {@code namedCharClass()} returns {@code null} the
 * character is consumed and matched literally.
 *
 * @return the matcher for the escaped construct
 */
private Matcher escape() {
  final char next = peek();
  if (next == 'Q') {
    return quotation();
  }
  if (next == 'c') {
    throw unsupported("control character");
  }
  if (Constants.DIGIT.contains(next) || next == 'k') {
    throw unsupported("back references");
  }

  final AsciiSet named = namedCharClass();
  if (named != null) {
    return new CharClassMatcher(named);
  }
  // Not a known class: consume the character and treat it as a literal.
  advance();
  return new CharSeqMatcher(String.valueOf(next));
}
/**
 * Consumes one character and maps it to a predefined character class.
 *
 * <p>Lower case {@code d}, {@code s} and {@code w} return the digit, space and word sets;
 * the upper case forms return the inverted set. {@code p} / {@code P} read a property name
 * via {@code name()} and look it up, inverted for {@code P}. The horizontal and vertical
 * whitespace classes ({@code h}, {@code H}, {@code v}, {@code V}) are rejected as unsupported.
 *
 * @return the character set, or {@code null} if the character does not name a class; in
 *     that case the position is moved back so the character is left unconsumed
 */
@SuppressWarnings("PMD.MissingBreakInSwitch")
private AsciiSet namedCharClass() {
  final char code = advance();
  switch (code) {
    case 'd':
      return Constants.DIGIT;
    case 'D':
      return Constants.DIGIT.invert();
    case 's':
      return Constants.SPACE;
    case 'S':
      return Constants.SPACE.invert();
    case 'w':
      return Constants.WORD_CHARS;
    case 'W':
      return Constants.WORD_CHARS.invert();
    case 'p':
      return newNamedCharSet(name(), false);
    case 'P':
      return newNamedCharSet(name(), true);
    case 'h':
    case 'H':
      throw unsupported("horizontal whitespace class");
    case 'v':
    case 'V':
      throw unsupported("vertical whitespace class");
    default:
      // Undo the advance so the caller sees the character again.
      --current;
      return null;
  }
}