/**
 * Widens a set of code units so that it also covers every code unit that is
 * case-insensitively equivalent to one of its members.
 *
 * <p>The input may mix case sensitive and case insensitive code units, as in
 * {@code [0-9B-M]}.  Only the members that fall inside {@code CASE_SENSITIVE}
 * contribute additional code units; everything in the input is retained.
 *
 * @param ranges the code units to widen.
 * @return {@code ranges} itself when it contains no case sensitive code
 *     units, otherwise the union of {@code ranges} with the shifted
 *     counterparts produced from each entry of {@code DELTA_SETS}.
 */
public static CharRanges expandToAllMatched(CharRanges ranges) {
  CharRanges cased = ranges.intersection(CASE_SENSITIVE);
  if (!cased.isEmpty()) {
    CharRanges variants = CharRanges.EMPTY;
    for (DeltaSet deltaSet : DELTA_SETS) {
      // Take the members covered by this delta set and move them by the
      // negated delta to reach their counterparts.
      CharRanges members = cased.intersection(deltaSet.codeUnits);
      CharRanges counterparts = members.shift(-deltaSet.delta);
      variants = variants.union(counterparts);
    }
    return ranges.union(variants);
  }
  // Nothing case sensitive in the input, so there is nothing to add.
  return ranges;
}
/**
 * Widens a set of code units so that it also covers every code unit that is
 * case-insensitively equivalent to one of its members.
 *
 * <p>The input may mix case sensitive and case insensitive code units, as in
 * {@code [0-9B-M]}.  Only the members that fall inside {@code CASE_SENSITIVE}
 * contribute additional code units; everything in the input is retained.
 *
 * @param ranges the code units to widen.
 * @return {@code ranges} itself when it contains no case sensitive code
 *     units, otherwise the union of {@code ranges} with the shifted
 *     counterparts produced from each entry of {@code DELTA_SETS}.
 */
public static CharRanges expandToAllMatched(CharRanges ranges) {
  CharRanges cased = ranges.intersection(CASE_SENSITIVE);
  if (!cased.isEmpty()) {
    CharRanges variants = CharRanges.EMPTY;
    for (DeltaSet deltaSet : DELTA_SETS) {
      // Take the members covered by this delta set and move them by the
      // negated delta to reach their counterparts.
      CharRanges members = cased.intersection(deltaSet.codeUnits);
      CharRanges counterparts = members.shift(-deltaSet.delta);
      variants = variants.union(counterparts);
    }
    return ranges.union(variants);
  }
  // Nothing case sensitive in the input, so there is nothing to add.
  return ranges;
}
/**
 * Greedily factors named character groups out of {@code ranges}.
 *
 * <p>On every pass, each entry of {@code NAMED_CHAR_GROUPS} that is wholly
 * contained in the remaining ranges is tried: the group is subtracted, the
 * code units the original input shared with {@code ieExplicits} are unioned
 * back in, and the result is scored with
 * {@link DecomposedCharset#complexity}.  The candidate with the strictly
 * lowest score below the current one is adopted and its name recorded as a
 * backslash escape.  The process stops once no group lowers the score.
 *
 * @param ranges the code units to decompose.
 * @param inverted passed through unchanged to the resulting
 *     {@code DecomposedCharset}.
 * @return a decomposition made of the remaining explicit ranges and the
 *     concatenated escapes of the groups that were factored out.
 */
private DecomposedCharset decompose(CharRanges ranges, boolean inverted) {
  final StringBuilder groupEscapes = new StringBuilder();
  // Computed once from the original input; re-added to every candidate so
  // these code units always stay in the explicit ranges.
  final CharRanges pinnedExplicits = ranges.intersection(ieExplicits);
  CharRanges remaining = ranges;

  for (;;) {
    int bestCost = DecomposedCharset.complexity(remaining);
    CharRanges bestRemainder = null;
    char bestName = 0;

    for (Map.Entry<Character, CharRanges> entry
         : NAMED_CHAR_GROUPS.entrySet()) {
      CharRanges candidateGroup = entry.getValue();
      if (!remaining.containsAll(candidateGroup)) {
        // A group can only be factored out when it is fully covered.
        continue;
      }
      CharRanges remainder =
          remaining.difference(candidateGroup).union(pinnedExplicits);
      int cost = DecomposedCharset.complexity(remainder);
      if (cost < bestCost) {
        bestCost = cost;
        bestRemainder = remainder;
        bestName = entry.getKey().charValue();
      }
    }

    if (bestRemainder == null) {
      // No group made things simpler: the decomposition is final.
      return new DecomposedCharset(
          inverted, remaining, groupEscapes.toString());
    }
    groupEscapes.append('\\').append(bestName);
    remaining = bestRemainder;
  }
}
/**
 * Greedily factors named character groups out of {@code ranges}.
 *
 * <p>On every pass, each entry of {@code NAMED_CHAR_GROUPS} that is wholly
 * contained in the remaining ranges is tried: the group is subtracted, the
 * code units the original input shared with {@code ieExplicits} are unioned
 * back in, and the result is scored with
 * {@link DecomposedCharset#complexity}.  The candidate with the strictly
 * lowest score below the current one is adopted and its name recorded as a
 * backslash escape.  The process stops once no group lowers the score.
 *
 * @param ranges the code units to decompose.
 * @param inverted passed through unchanged to the resulting
 *     {@code DecomposedCharset}.
 * @return a decomposition made of the remaining explicit ranges and the
 *     concatenated escapes of the groups that were factored out.
 */
private DecomposedCharset decompose(CharRanges ranges, boolean inverted) {
  final StringBuilder groupEscapes = new StringBuilder();
  // Computed once from the original input; re-added to every candidate so
  // these code units always stay in the explicit ranges.
  final CharRanges pinnedExplicits = ranges.intersection(ieExplicits);
  CharRanges remaining = ranges;

  for (;;) {
    int bestCost = DecomposedCharset.complexity(remaining);
    CharRanges bestRemainder = null;
    char bestName = 0;

    for (Map.Entry<Character, CharRanges> entry
         : NAMED_CHAR_GROUPS.entrySet()) {
      CharRanges candidateGroup = entry.getValue();
      if (!remaining.containsAll(candidateGroup)) {
        // A group can only be factored out when it is fully covered.
        continue;
      }
      CharRanges remainder =
          remaining.difference(candidateGroup).union(pinnedExplicits);
      int cost = DecomposedCharset.complexity(remainder);
      if (cost < bestCost) {
        bestCost = cost;
        bestRemainder = remainder;
        bestName = entry.getKey().charValue();
      }
    }

    if (bestRemainder == null) {
      // No group made things simpler: the decomposition is final.
      return new DecomposedCharset(
          inverted, remaining, groupEscapes.toString());
    }
    groupEscapes.append('\\').append(bestName);
    remaining = bestRemainder;
  }
}
/** * Given a character range that may include case sensitive code-units, * such as {@code [0-9B-M]}, returns the character range that includes * the minimal set of code units such that for every code unit in the * input there is a case-sensitively equivalent canonical code unit in the * output. */ public static CharRanges reduceToMinimum(CharRanges ranges) { CharRanges caseSensitive = ranges.intersection(CASE_SENSITIVE); if (caseSensitive.isEmpty()) { return ranges; } CharRanges expanded = CharRanges.EMPTY; for (DeltaSet ds : CANON_DELTA_SETS) { expanded = expanded.union( caseSensitive.intersection(ds.codeUnits).shift(-ds.delta)); } // Letters a-z gzip better than uppercase A-Z since JavaScript keywords // are lower-case, so, even though the definition of Canonicalize is // based on String.prototype.toUpperCase, we use lowercase ASCII characters // in the minimal form. expanded = expanded.difference(UCASE_ASCII_LETTERS).union( expanded.intersection(UCASE_ASCII_LETTERS).shift(32)); return ranges.difference(caseSensitive).union(expanded); }
/** * Given a character range that may include case sensitive code-units, * such as {@code [0-9B-M]}, returns the character range that includes * the minimal set of code units such that for every code unit in the * input there is a case-sensitively equivalent canonical code unit in the * output. */ public static CharRanges reduceToMinimum(CharRanges ranges) { CharRanges caseSensitive = ranges.intersection(CASE_SENSITIVE); if (caseSensitive.isEmpty()) { return ranges; } CharRanges expanded = CharRanges.EMPTY; for (DeltaSet ds : CANON_DELTA_SETS) { expanded = expanded.union( caseSensitive.intersection(ds.codeUnits).shift(-ds.delta)); } // Letters a-z gzip better than uppercase A-Z since JavaScript keywords // are lower-case, so, even though the definition of Canonicalize is // based on String.prototype.toUpperCase, we use lowercase ASCII characters // in the minimal form. expanded = expanded.difference(UCASE_ASCII_LETTERS).union( expanded.intersection(UCASE_ASCII_LETTERS).shift(32)); return ranges.difference(caseSensitive).union(expanded); }
/**
 * Chooses between decomposing this charset's ranges directly and
 * decomposing their complement as an inverted charset.
 *
 * <p>When {@code ieExplicits} is non-empty and lies entirely inside
 * {@code ranges}, the positive form is used; when it lies entirely outside,
 * the inverted form is used.  In every other case both forms are built and
 * the one with the lower complexity wins, with ties going to the positive
 * form.
 *
 * @return the selected decomposition.
 */
DecomposedCharset decompose() {
  final CharRanges complement = CharRanges.ALL_CODE_UNITS.difference(ranges);

  if (!ieExplicits.isEmpty()) {
    if (complement.intersection(ieExplicits).isEmpty()) {
      // Every explicit code unit is matched by the positive ranges.
      return decompose(ranges, false);
    }
    if (ranges.intersection(ieExplicits).isEmpty()) {
      // Every explicit code unit is matched by the complement.
      return decompose(complement, true);
    }
  }

  final DecomposedCharset asPositive = decompose(ranges, false);
  final DecomposedCharset asInverted = decompose(complement, true);
  final int positiveCost = asPositive.complexity();
  final int invertedCost = asInverted.complexity();
  if (positiveCost <= invertedCost) {
    return asPositive;
  }
  return asInverted;
}
/**
 * Chooses between decomposing this charset's ranges directly and
 * decomposing their complement as an inverted charset.
 *
 * <p>When {@code ieExplicits} is non-empty and lies entirely inside
 * {@code ranges}, the positive form is used; when it lies entirely outside,
 * the inverted form is used.  In every other case both forms are built and
 * the one with the lower complexity wins, with ties going to the positive
 * form.
 *
 * @return the selected decomposition.
 */
DecomposedCharset decompose() {
  final CharRanges complement = CharRanges.ALL_CODE_UNITS.difference(ranges);

  if (!ieExplicits.isEmpty()) {
    if (complement.intersection(ieExplicits).isEmpty()) {
      // Every explicit code unit is matched by the positive ranges.
      return decompose(ranges, false);
    }
    if (ranges.intersection(ieExplicits).isEmpty()) {
      // Every explicit code unit is matched by the complement.
      return decompose(complement, true);
    }
  }

  final DecomposedCharset asPositive = decompose(ranges, false);
  final DecomposedCharset asInverted = decompose(complement, true);
  final int positiveCost = asPositive.complexity();
  final int invertedCost = asInverted.complexity();
  if (positiveCost <= invertedCost) {
    return asPositive;
  }
  return asInverted;
}
options.add(CaseCanonicalize.reduceToMinimum(ranges)); CharRanges lcaseLetters = ranges.intersection(LCASE_LETTERS); CharRanges ucaseLetters = ranges.intersection(UCASE_LETTERS);
options.add(CaseCanonicalize.reduceToMinimum(ranges)); CharRanges lcaseLetters = ranges.intersection(LCASE_LETTERS); CharRanges ucaseLetters = ranges.intersection(UCASE_LETTERS);