/**
 * Chooses between decomposing {@code ranges} directly and decomposing the
 * complement of {@code ranges} within {@code CharRanges.ALL_CODE_UNITS}.
 *
 * <p>When {@code ieExplicits} is non-empty and shares no code unit with the
 * complement, the direct form is returned. When it instead shares no code
 * unit with {@code ranges}, the complemented form is returned. In every
 * other case both forms are built and the one reporting the smaller
 * {@code complexity()} wins, with ties going to the direct form.
 *
 * @return the decomposition selected by the rules above
 */
DecomposedCharset decompose() {
  CharRanges complement = CharRanges.ALL_CODE_UNITS.difference(ranges);

  if (!ieExplicits.isEmpty()) {
    // The explicit set lies wholly on one side: that side is used as-is.
    if (complement.intersection(ieExplicits).isEmpty()) {
      return decompose(ranges, false);
    }
    if (ranges.intersection(ieExplicits).isEmpty()) {
      return decompose(complement, true);
    }
  }

  // No forced choice, so build both candidates and keep the simpler one.
  DecomposedCharset direct = decompose(ranges, false);
  DecomposedCharset inverted = decompose(complement, true);
  if (direct.complexity() <= inverted.complexity()) {
    return direct;
  }
  return inverted;
}
/**
 * Chooses between decomposing {@code ranges} directly and decomposing the
 * complement of {@code ranges} within {@code CharRanges.ALL_CODE_UNITS}.
 *
 * <p>When {@code ieExplicits} is non-empty and shares no code unit with the
 * complement, the direct form is returned. When it instead shares no code
 * unit with {@code ranges}, the complemented form is returned. In every
 * other case both forms are built and the one reporting the smaller
 * {@code complexity()} wins, with ties going to the direct form.
 *
 * @return the decomposition selected by the rules above
 */
DecomposedCharset decompose() {
  CharRanges complement = CharRanges.ALL_CODE_UNITS.difference(ranges);

  if (!ieExplicits.isEmpty()) {
    // The explicit set lies wholly on one side: that side is used as-is.
    if (complement.intersection(ieExplicits).isEmpty()) {
      return decompose(ranges, false);
    }
    if (ranges.intersection(ieExplicits).isEmpty()) {
      return decompose(complement, true);
    }
  }

  // No forced choice, so build both candidates and keep the simpler one.
  DecomposedCharset direct = decompose(ranges, false);
  DecomposedCharset inverted = decompose(complement, true);
  if (direct.complexity() <= inverted.complexity()) {
    return direct;
  }
  return inverted;
}
/**
 * Widens a character range so that it also contains every code unit that is
 * case-insensitively equivalent to one already present. For example, an
 * input such as {@code [0-9B-M]} mixes case-insensitive digits with
 * case-sensitive letters.
 *
 * @param ranges the range to widen
 * @return {@code ranges} itself when it shares nothing with
 *     {@code CASE_SENSITIVE}; otherwise the union of {@code ranges} with
 *     the shifted counterparts collected from every entry of
 *     {@code DELTA_SETS}
 */
public static CharRanges expandToAllMatched(CharRanges ranges) {
  CharRanges cased = ranges.intersection(CASE_SENSITIVE);
  if (!cased.isEmpty()) {
    CharRanges counterparts = CharRanges.EMPTY;
    for (DeltaSet deltaSet : DELTA_SETS) {
      // Take the members covered by this delta set and move them by the
      // negated delta to reach their counterparts.
      CharRanges shifted =
          cased.intersection(deltaSet.codeUnits).shift(-deltaSet.delta);
      counterparts = counterparts.union(shifted);
    }
    return ranges.union(counterparts);
  }
  // Nothing case sensitive in the input, so there is nothing to add.
  return ranges;
}
/**
 * Widens a character range so that it also contains every code unit that is
 * case-insensitively equivalent to one already present. For example, an
 * input such as {@code [0-9B-M]} mixes case-insensitive digits with
 * case-sensitive letters.
 *
 * @param ranges the range to widen
 * @return {@code ranges} itself when it shares nothing with
 *     {@code CASE_SENSITIVE}; otherwise the union of {@code ranges} with
 *     the shifted counterparts collected from every entry of
 *     {@code DELTA_SETS}
 */
public static CharRanges expandToAllMatched(CharRanges ranges) {
  CharRanges cased = ranges.intersection(CASE_SENSITIVE);
  if (!cased.isEmpty()) {
    CharRanges counterparts = CharRanges.EMPTY;
    for (DeltaSet deltaSet : DELTA_SETS) {
      // Take the members covered by this delta set and move them by the
      // negated delta to reach their counterparts.
      CharRanges shifted =
          cased.intersection(deltaSet.codeUnits).shift(-deltaSet.delta);
      counterparts = counterparts.union(shifted);
    }
    return ranges.union(counterparts);
  }
  // Nothing case sensitive in the input, so there is nothing to add.
  return ranges;
}
@Override public RegExpTree simplify(String flags) { if (ranges.isEmpty()) { return NEVER_MATCHES;
@Override public RegExpTree simplify(String flags) { if (ranges.isEmpty()) { return NEVER_MATCHES;
/** * Given a character range that may include case sensitive code-units, * such as {@code [0-9B-M]}, returns the character range that includes * the minimal set of code units such that for every code unit in the * input there is a case-sensitively equivalent canonical code unit in the * output. */ public static CharRanges reduceToMinimum(CharRanges ranges) { CharRanges caseSensitive = ranges.intersection(CASE_SENSITIVE); if (caseSensitive.isEmpty()) { return ranges; } CharRanges expanded = CharRanges.EMPTY; for (DeltaSet ds : CANON_DELTA_SETS) { expanded = expanded.union( caseSensitive.intersection(ds.codeUnits).shift(-ds.delta)); } // Letters a-z gzip better than uppercase A-Z since JavaScript keywords // are lower-case, so, even though the definition of Canonicalize is // based on String.prototype.toUpperCase, we use lowercase ASCII characters // in the minimal form. expanded = expanded.difference(UCASE_ASCII_LETTERS).union( expanded.intersection(UCASE_ASCII_LETTERS).shift(32)); return ranges.difference(caseSensitive).union(expanded); }
/** * Given a character range that may include case sensitive code-units, * such as {@code [0-9B-M]}, returns the character range that includes * the minimal set of code units such that for every code unit in the * input there is a case-sensitively equivalent canonical code unit in the * output. */ public static CharRanges reduceToMinimum(CharRanges ranges) { CharRanges caseSensitive = ranges.intersection(CASE_SENSITIVE); if (caseSensitive.isEmpty()) { return ranges; } CharRanges expanded = CharRanges.EMPTY; for (DeltaSet ds : CANON_DELTA_SETS) { expanded = expanded.union( caseSensitive.intersection(ds.codeUnits).shift(-ds.delta)); } // Letters a-z gzip better than uppercase A-Z since JavaScript keywords // are lower-case, so, even though the definition of Canonicalize is // based on String.prototype.toUpperCase, we use lowercase ASCII characters // in the minimal form. expanded = expanded.difference(UCASE_ASCII_LETTERS).union( expanded.intersection(UCASE_ASCII_LETTERS).shift(32)); return ranges.difference(caseSensitive).union(expanded); }