/**
 * Creates a range set spanning {@code start} through {@code end}, with both
 * endpoints included.
 *
 * @param start the first value of the range
 * @param end the last value of the range (inclusive)
 * @return a {@code CharRanges} built from the boundary pair
 *     {@code {start, end + 1}}
 * @throws IndexOutOfBoundsException if {@code start} is greater than
 *     {@code end}
 */
public static CharRanges inclusive(int start, int end) {
  if (end < start) {
    throw new IndexOutOfBoundsException(start + " > " + end);
  }
  // The constructor receives an exclusive upper bound, hence the + 1.
  int exclusiveEnd = end + 1;
  int[] bounds = { start, exclusiveEnd };
  return new CharRanges(bounds);
}
/** * Returns the case canonical version of the given code-unit. ECMAScript 5 * explicitly says that code-units are to be treated as their code-point * equivalent, even surrogates. */ public static char caseCanonicalize(char ch) { if (ch < 0x80) { // Normal case. return ('A' <= ch && ch <= 'Z') ? (char) (ch | 32) : ch; } // Non-ASCII case. if (CASE_SENSITIVE.contains(ch)) { for (DeltaSet ds : CANON_DELTA_SETS) { if (ds.codeUnits.contains(ch)) { return (char) (ch - ds.delta); } } } return ch; }
/** * Given a character range that may include case sensitive code-units, * such as {@code [0-9B-M]}, returns the character range that includes * the minimal set of code units such that for every code unit in the * input there is a case-sensitively equivalent canonical code unit in the * output. */ public static CharRanges reduceToMinimum(CharRanges ranges) { CharRanges caseSensitive = ranges.intersection(CASE_SENSITIVE); if (caseSensitive.isEmpty()) { return ranges; } CharRanges expanded = CharRanges.EMPTY; for (DeltaSet ds : CANON_DELTA_SETS) { expanded = expanded.union( caseSensitive.intersection(ds.codeUnits).shift(-ds.delta)); } // Letters a-z gzip better than uppercase A-Z since JavaScript keywords // are lower-case, so, even though the definition of Canonicalize is // based on String.prototype.toUpperCase, we use lowercase ASCII characters // in the minimal form. expanded = expanded.difference(UCASE_ASCII_LETTERS).union( expanded.intersection(UCASE_ASCII_LETTERS).shift(32)); return ranges.difference(caseSensitive).union(expanded); }
/**
 * Returns an instance containing all and only the given members.
 *
 * @param members the individual values the result should contain
 * @return a {@code CharRanges} built from the boundary array that
 *     {@code intArrayToRanges} derives from {@code members}
 */
public static CharRanges withMembers(int... members) {
  int[] rangeBounds = intArrayToRanges(members);
  return new CharRanges(rangeBounds);
}
/**
 * Computes a cost estimate for the given ranges by summing a per-endpoint
 * cost over every range.
 *
 * <p>An endpoint in {@code [0x20, 0x7f)} costs 1, any other endpoint below
 * {@code 0x100} costs 4, and an endpoint at or above {@code 0x100} costs 6
 * (presumably the length of the literal or escape needed to write it; verify
 * against the code that renders the charset). A single-member range is
 * charged for its start only; a two-member range for both endpoints; a
 * longer range for both endpoints plus 1 for the separator between them.
 *
 * @param ranges the ranges to measure
 * @return the accumulated cost; 0 when {@code ranges} holds no ranges
 */
static int complexity(CharRanges ranges) {
  int total = 0;
  final int rangeCount = ranges.getNumRanges();
  for (int idx = 0; idx < rangeCount; idx++) {
    int first = ranges.start(idx);
    // end(idx) is exclusive, so step back to the last member.
    int last = ranges.end(idx) - 1;

    boolean firstIsPlain = first >= 0x20 && first < 0x7f;
    if (firstIsPlain) {
      total += 1;
    } else if (first >= 0x100) {
      total += 6;
    } else {
      total += 4;
    }

    int span = last - first;
    if (span == 0) {
      // Single member: nothing beyond the start endpoint is charged.
      continue;
    }
    if (span != 1) {
      // More than two members: charge for the separator as well.
      total += 1;
    }

    boolean lastIsPlain = last >= 0x20 && last < 0x7f;
    if (lastIsPlain) {
      total += 1;
    } else if (last >= 0x100) {
      total += 6;
    } else {
      total += 4;
    }
  }
  return total;
}
@Override public RegExpTree simplify(String flags) { if (ranges.isEmpty()) { return NEVER_MATCHES; options.add(CaseCanonicalize.reduceToMinimum(ranges)); CharRanges lcaseLetters = ranges.intersection(LCASE_LETTERS); CharRanges ucaseLetters = ranges.intersection(UCASE_LETTERS); CharRanges lcaseLettersToUpper = lcaseLetters.shift(-32); CharRanges ucaseLettersToLower = ucaseLetters.shift(32); options.add(ranges.union(ucaseLettersToLower)); options.add(ranges.union(lcaseLettersToUpper)); options.add(ranges.union(lcaseLettersToUpper) .union(ucaseLettersToLower)); options.add(ranges.union(ucaseLettersToLower).difference(ucaseLetters)); options.add(ranges.union(lcaseLettersToUpper).difference(lcaseLetters)); if (best.getNumRanges() == 1 && best.end(0) - best.start(0) == 1) { return new Text(Character.toString((char) best.start(0))); if (!best.equals(ranges)) { return new Charset(best, ieExplicits);
/**
 * Greedily factors named character groups out of {@code ranges}.
 *
 * <p>On each pass, every entry of {@code NAMED_CHAR_GROUPS} that is fully
 * contained in the remaining ranges is tried: the group is removed, the
 * members of the original {@code ranges} that are also in
 * {@code ieExplicits} are added back, and the result is scored with
 * {@link DecomposedCharset#complexity(CharRanges)}. The candidate with the
 * lowest score that is strictly below the current score is adopted and its
 * escape ({@code \} followed by the group's key) is recorded. Passes repeat
 * until no group lowers the score.
 *
 * @param ranges the code units to decompose
 * @param inverted passed through unchanged to the resulting
 *     {@code DecomposedCharset}
 * @return a {@code DecomposedCharset} holding the leftover ranges and the
 *     concatenated escapes of the groups that were factored out
 */
private DecomposedCharset decompose(CharRanges ranges, boolean inverted) {
  StringBuilder groupEscapes = new StringBuilder();
  // Computed once from the input; these members are re-added after every
  // group removal so they always stay in the explicit ranges.
  CharRanges pinned = ranges.intersection(ieExplicits);
  CharRanges remaining = ranges;

  boolean improved;
  do {
    improved = false;
    char bestName = 0;
    CharRanges bestRemainder = null;
    int bestCost = DecomposedCharset.complexity(remaining);

    for (Map.Entry<Character, CharRanges> entry
         : NAMED_CHAR_GROUPS.entrySet()) {
      CharRanges groupMembers = entry.getValue();
      if (!remaining.containsAll(groupMembers)) {
        continue;
      }
      CharRanges candidate = remaining.difference(groupMembers).union(pinned);
      int candidateCost = DecomposedCharset.complexity(candidate);
      if (candidateCost >= bestCost) {
        continue;
      }
      bestRemainder = candidate;
      bestName = entry.getKey().charValue();
      bestCost = candidateCost;
    }

    if (bestRemainder != null) {
      groupEscapes.append('\\').append(bestName);
      remaining = bestRemainder;
      improved = true;
    }
  } while (improved);

  return new DecomposedCharset(inverted, remaining, groupEscapes.toString());
}
void appendSourceCode(StringBuilder sb) { if (ranges.isEmpty()) { if (!inverted && namedGroups.length() == 2) { sb.append(namedGroups); return; } else if (ranges.isEmpty() && namedGroups.isEmpty()) { sb.append(inverted ? "[\\S\\s]" : "(?!)"); return; boolean rangesStartCharset = !inverted && namedGroups.isEmpty(); boolean emitDashAtEnd = false; for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) { char start = (char) ranges.start(i); char end = (char) (ranges.end(i) - 1); switch (end - start) { case 0:
char ch = ((Text) charAlternative).text.charAt(0); members[memberIdx++] = ch; if (IE_SPEC_ERRORS.contains(ch)) { ieExplicits = ieExplicits.union(CharRanges.inclusive(ch, ch)); chars = chars.union(cs.ranges); ieExplicits = ieExplicits.union(cs.ieExplicits); chars = chars.union(CharRanges.withMembers(members)); charAlternatives.clear(); charAlternatives.add(
/**
 * Decomposes this charset in either positive or inverted form.
 *
 * <p>When {@code ieExplicits} is non-empty and lies entirely inside
 * {@code ranges}, the positive form is used; when it lies entirely outside
 * {@code ranges}, the inverted form is used. In every other situation both
 * forms are built and the one with the lower {@code complexity()} is
 * returned, with ties going to the positive form.
 *
 * @return the chosen decomposition
 */
DecomposedCharset decompose() {
  CharRanges complement = CharRanges.ALL_CODE_UNITS.difference(ranges);

  if (!ieExplicits.isEmpty()) {
    boolean explicitsAllInRanges =
        complement.intersection(ieExplicits).isEmpty();
    if (explicitsAllInRanges) {
      return decompose(ranges, false);
    }
    boolean explicitsAllInComplement =
        ranges.intersection(ieExplicits).isEmpty();
    if (explicitsAllInComplement) {
      return decompose(complement, true);
    }
  }

  DecomposedCharset asWritten = decompose(ranges, false);
  DecomposedCharset negated = decompose(complement, true);
  int asWrittenCost = asWritten.complexity();
  int negatedCost = negated.complexity();
  if (negatedCost < asWrittenCost) {
    return negated;
  }
  return asWritten;
}
/**
 * Compares this charset with another object.
 *
 * @param o the object to compare against
 * @return {@code true} only when {@code o} is a {@code Charset} whose
 *     {@code ranges} equal this instance's {@code ranges}; no other field
 *     takes part in the comparison
 */
@Override
public boolean equals(Object o) {
  if (!(o instanceof Charset)) {
    return false;
  }
  Charset other = (Charset) o;
  return ranges.equals(other.ranges);
}
/**
 * Scores {@code ranges}, allowing {@code WORD_CHARS} or
 * {@code INVERSE_WORD_CHARS} to be factored out when fully contained.
 *
 * <p>The baseline is {@link DecomposedCharset#complexity(CharRanges)} of the
 * ranges as given. For each of the two groups that {@code ranges} fully
 * contains, the alternative score is 1 plus the complexity of the ranges
 * with that group removed (the 1 presumably stands for the cost of naming
 * the group; confirm against the renderer). The smallest score wins.
 *
 * @param ranges the ranges to score
 * @return the lowest score among the baseline and the applicable
 *     alternatives
 */
private static int complexityWordFoldedHelper(CharRanges ranges) {
  int lowest = DecomposedCharset.complexity(ranges);

  if (ranges.containsAll(WORD_CHARS)) {
    CharRanges withoutWordChars = ranges.difference(WORD_CHARS);
    int folded = 1 + DecomposedCharset.complexity(withoutWordChars);
    if (folded < lowest) {
      lowest = folded;
    }
  }

  if (ranges.containsAll(INVERSE_WORD_CHARS)) {
    CharRanges withoutInverseWordChars = ranges.difference(INVERSE_WORD_CHARS);
    int folded = 1 + DecomposedCharset.complexity(withoutInverseWordChars);
    if (folded < lowest) {
      lowest = folded;
    }
  }

  return lowest;
}
/**
 * Computes a hash derived solely from {@code ranges}.
 *
 * @return the hash code of {@code ranges} XORed with a fixed constant
 */
@Override
public int hashCode() {
  final int salt = 0xdede2246;
  int rangesHash = ranges.hashCode();
  return rangesHash ^ salt;
}
}
/**
 * Scores {@code ranges} both directly and via its complement, returning the
 * smaller result.
 *
 * <p>The complement is taken against {@code CharRanges.ALL_CODE_UNITS} and
 * its score carries an extra 1 (presumably the cost of marking the set as
 * negated; confirm against the renderer).
 *
 * @param ranges the ranges to score
 * @return the lesser of the direct score and 1 plus the complement's score
 */
private static int complexityWordFolded(CharRanges ranges) {
  int direct = complexityWordFoldedHelper(ranges);
  CharRanges complement = CharRanges.ALL_CODE_UNITS.difference(ranges);
  int viaComplement = 1 + complexityWordFoldedHelper(complement);
  return direct <= viaComplement ? direct : viaComplement;
}
@Override public RegExpTree simplify(String flags) { if (ranges.isEmpty()) { return NEVER_MATCHES; options.add(CaseCanonicalize.reduceToMinimum(ranges)); CharRanges lcaseLetters = ranges.intersection(LCASE_LETTERS); CharRanges ucaseLetters = ranges.intersection(UCASE_LETTERS); CharRanges lcaseLettersToUpper = lcaseLetters.shift(-32); CharRanges ucaseLettersToLower = ucaseLetters.shift(32); options.add(ranges.union(ucaseLettersToLower)); options.add(ranges.union(lcaseLettersToUpper)); options.add(ranges.union(lcaseLettersToUpper) .union(ucaseLettersToLower)); options.add(ranges.union(ucaseLettersToLower).difference(ucaseLetters)); options.add(ranges.union(lcaseLettersToUpper).difference(lcaseLetters)); if (best.getNumRanges() == 1 && best.end(0) - best.start(0) == 1) { return new Text(Character.toString((char) best.start(0))); if (!best.equals(ranges)) { return new Charset(best, ieExplicits);
/**
 * Greedily factors named character groups out of {@code ranges}.
 *
 * <p>On each pass, every entry of {@code NAMED_CHAR_GROUPS} that is fully
 * contained in the remaining ranges is tried: the group is removed, the
 * members of the original {@code ranges} that are also in
 * {@code ieExplicits} are added back, and the result is scored with
 * {@link DecomposedCharset#complexity(CharRanges)}. The candidate with the
 * lowest score that is strictly below the current score is adopted and its
 * escape ({@code \} followed by the group's key) is recorded. Passes repeat
 * until no group lowers the score.
 *
 * @param ranges the code units to decompose
 * @param inverted passed through unchanged to the resulting
 *     {@code DecomposedCharset}
 * @return a {@code DecomposedCharset} holding the leftover ranges and the
 *     concatenated escapes of the groups that were factored out
 */
private DecomposedCharset decompose(CharRanges ranges, boolean inverted) {
  StringBuilder groupEscapes = new StringBuilder();
  // Computed once from the input; these members are re-added after every
  // group removal so they always stay in the explicit ranges.
  CharRanges pinned = ranges.intersection(ieExplicits);
  CharRanges remaining = ranges;

  boolean improved;
  do {
    improved = false;
    char bestName = 0;
    CharRanges bestRemainder = null;
    int bestCost = DecomposedCharset.complexity(remaining);

    for (Map.Entry<Character, CharRanges> entry
         : NAMED_CHAR_GROUPS.entrySet()) {
      CharRanges groupMembers = entry.getValue();
      if (!remaining.containsAll(groupMembers)) {
        continue;
      }
      CharRanges candidate = remaining.difference(groupMembers).union(pinned);
      int candidateCost = DecomposedCharset.complexity(candidate);
      if (candidateCost >= bestCost) {
        continue;
      }
      bestRemainder = candidate;
      bestName = entry.getKey().charValue();
      bestCost = candidateCost;
    }

    if (bestRemainder != null) {
      groupEscapes.append('\\').append(bestName);
      remaining = bestRemainder;
      improved = true;
    }
  } while (improved);

  return new DecomposedCharset(inverted, remaining, groupEscapes.toString());
}
void appendSourceCode(StringBuilder sb) { if (ranges.isEmpty()) { if (!inverted && namedGroups.length() == 2) { sb.append(namedGroups); return; } else if (ranges.isEmpty() && namedGroups.isEmpty()) { sb.append(inverted ? "[\\S\\s]" : "(?!)"); return; boolean rangesStartCharset = !inverted && namedGroups.isEmpty(); boolean emitDashAtEnd = false; for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) { char start = (char) ranges.start(i); char end = (char) (ranges.end(i) - 1); switch (end - start) { case 0:
char ch = ((Text) charAlternative).text.charAt(0); members[memberIdx++] = ch; if (IE_SPEC_ERRORS.contains(ch)) { ieExplicits = ieExplicits.union(CharRanges.inclusive(ch, ch)); chars = chars.union(cs.ranges); ieExplicits = ieExplicits.union(cs.ieExplicits); chars = chars.union(CharRanges.withMembers(members)); charAlternatives.clear(); charAlternatives.add(
/**
 * Decomposes this charset in either positive or inverted form.
 *
 * <p>When {@code ieExplicits} is non-empty and lies entirely inside
 * {@code ranges}, the positive form is used; when it lies entirely outside
 * {@code ranges}, the inverted form is used. In every other situation both
 * forms are built and the one with the lower {@code complexity()} is
 * returned, with ties going to the positive form.
 *
 * @return the chosen decomposition
 */
DecomposedCharset decompose() {
  CharRanges complement = CharRanges.ALL_CODE_UNITS.difference(ranges);

  if (!ieExplicits.isEmpty()) {
    boolean explicitsAllInRanges =
        complement.intersection(ieExplicits).isEmpty();
    if (explicitsAllInRanges) {
      return decompose(ranges, false);
    }
    boolean explicitsAllInComplement =
        ranges.intersection(ieExplicits).isEmpty();
    if (explicitsAllInComplement) {
      return decompose(complement, true);
    }
  }

  DecomposedCharset asWritten = decompose(ranges, false);
  DecomposedCharset negated = decompose(complement, true);
  int asWrittenCost = asWritten.complexity();
  int negatedCost = negated.complexity();
  if (negatedCost < asWrittenCost) {
    return negated;
  }
  return asWritten;
}
/**
 * Computes a cost estimate for the given ranges by summing a per-endpoint
 * cost over every range.
 *
 * <p>An endpoint in {@code [0x20, 0x7f)} costs 1, any other endpoint below
 * {@code 0x100} costs 4, and an endpoint at or above {@code 0x100} costs 6
 * (presumably the length of the literal or escape needed to write it; verify
 * against the code that renders the charset). A single-member range is
 * charged for its start only; a two-member range for both endpoints; a
 * longer range for both endpoints plus 1 for the separator between them.
 *
 * @param ranges the ranges to measure
 * @return the accumulated cost; 0 when {@code ranges} holds no ranges
 */
static int complexity(CharRanges ranges) {
  int total = 0;
  final int rangeCount = ranges.getNumRanges();
  for (int idx = 0; idx < rangeCount; idx++) {
    int first = ranges.start(idx);
    // end(idx) is exclusive, so step back to the last member.
    int last = ranges.end(idx) - 1;

    boolean firstIsPlain = first >= 0x20 && first < 0x7f;
    if (firstIsPlain) {
      total += 1;
    } else if (first >= 0x100) {
      total += 6;
    } else {
      total += 4;
    }

    int span = last - first;
    if (span == 0) {
      // Single member: nothing beyond the start endpoint is charged.
      continue;
    }
    if (span != 1) {
      // More than two members: charge for the separator as well.
      total += 1;
    }

    boolean lastIsPlain = last >= 0x20 && last < 0x7f;
    if (lastIsPlain) {
      total += 1;
    } else if (last >= 0x100) {
      total += 6;
    } else {
      total += 4;
    }
  }
  return total;
}