com.ibm.icu.text.UTF16 java code examples

/**
 * Advances past any Pattern_White_Space code points that begin at {@code pos}.
 *
 * @param text the string being scanned
 * @param pos  the UTF-16 index at which scanning starts
 * @return the index of the first code point at or after {@code pos} that is not
 *         Pattern_White_Space, or an index not less than {@code text.length()}
 *         if the run reaches the end of the string
 */
private static int skipPatternWhiteSpace(String text, int pos) {
  final int end = text.length();
  int cursor = pos;
  while (cursor < end) {
    final int codePoint = UTF16.charAt(text, cursor);
    if (PatternProps.isWhiteSpace(codePoint)) {
      // Step by one or two code units, depending on the code point just read.
      cursor += UTF16.getCharCount(codePoint);
    } else {
      return cursor;
    }
  }
  return cursor;
}

if (isSurrogate(ch)) {
  if (isLeadSurrogate(ch)) {
    ++offset16;
    if (offset16 < limit && isTrailSurrogate(source[offset16])) {
      return LEAD_SURROGATE_BOUNDARY;
    if (offset16 >= start && isLeadSurrogate(source[offset16])) {
      return TRAIL_SURROGATE_BOUNDARY;

/**
 * Low-level fetch of the next code point from {@code fRB.fRules}, keeping the
 * line and column counters ({@code fLineNum}, {@code fCharNum}) up to date.
 *
 * @return the code point read at {@code fNextIndex}, or -1 when
 *         {@code fNextIndex} is at or beyond the end of the rules
 */
int nextCharLL() {
  if (fNextIndex >= fRB.fRules.length()) {
    return -1;
  }

  final int current = UTF16.charAt(fRB.fRules, fNextIndex);
  fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);

  // A LF directly after a CR belongs to the line break the CR already started.
  final boolean lineFeedAfterReturn = (current == '\n') && (fLastChar == '\r');
  // chNEL / chLS are presumably U+0085 and U+2028 -- confirm against their declarations.
  final boolean isLineTerminator =
      current == '\r' || current == chNEL || current == chLS || current == '\n';

  if (isLineTerminator && !lineFeedAfterReturn) {
    // A new line starts here: bump the line number and reset the column.
    fLineNum++;
    fCharNum = 0;
    if (fQuoteMode) {
      error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
      fQuoteMode = false;
    }
  } else if (current != '\n') {
    // Ordinary character: move one column to the right. The only '\n' that
    // reaches this branch is the LF of a CR LF pair, which is not counted.
    fCharNum++;
  }

  fLastChar = current;
  return current;
}

/**
 * Replaces the code point at a UTF-16 position with {@code char32}. The buffer
 * is adjusted accordingly when a non-supplementary code point is replaced by a
 * supplementary one, or vice versa. If {@code offset16} addresses either half
 * of a surrogate pair, the whole pair is replaced.
 *
 * @param target the StringBuffer to modify
 * @param offset16 UTF-16 position of the code point to replace
 * @param char32 the replacement code point
 * @stable ICU 2.1
 */
public static void setCharAt(StringBuffer target, int offset16, int char32) {
  int replaceFrom = offset16;
  int unitCount = 1;
  final char unit = target.charAt(replaceFrom);

  if (isSurrogate(unit)) {
    // offset16 sits on the lead half of a pair?
    final boolean leadOfPair = isLeadSurrogate(unit)
        && (target.length() > replaceFrom + 1)
        && isTrailSurrogate(target.charAt(replaceFrom + 1));
    if (leadOfPair) {
      unitCount = 2;
    } else {
      // offset16 sits on the trail half of a pair?
      final boolean trailOfPair = isTrailSurrogate(unit)
          && (replaceFrom > 0)
          && isLeadSurrogate(target.charAt(replaceFrom - 1));
      if (trailOfPair) {
        // Back up so the replacement starts at the lead surrogate.
        replaceFrom--;
        unitCount = 2;
      }
    }
  }

  target.replace(replaceFrom, replaceFrom + unitCount, valueOf(char32));
}

int cp;
main:
  for (int i = start; i < limit; i += UTF16.getCharCount(cp)) {
    cp = UTF16.charAt(pattern, i);
      default:
        if (usingSlash) {
          UTF16.append(buffer, cp);
          quoteStatus = NONE;
          continue main;
      if (hexCount == 0) {
        quoteStatus = NONE;
        UTF16.append(buffer, hexValue);
        UTF16.append(buffer, cp);
        quoteStatus = NORMAL_QUOTE;
        continue main;
        UTF16.append(buffer, cp);
      UTF16.append(buffer, cp);
      quoteStatus = NORMAL_QUOTE;
      continue main;
      UTF16.append(buffer, cp);
      continue main;
        UTF16.append(buffer, cp);

/**
 * Returns the code point at {@code pos}, reading from {@code buffer} within
 * the range {@code [0, length)}.
 *
 * @param pos the UTF-16 offset to read at
 * @return the value produced by {@code UTF16.charAt} for that offset
 */
@Override
public int char32At(int pos) {
  final int rangeStart = 0;
  final int codePoint = UTF16.charAt(buffer, rangeStart, length, pos);
  return codePoint;
}

/**
 * Applies character mirroring in place, without reordering. When this method
 * is called, <code>{@link #text}</code> should be in a Logical form.
 * Does nothing unless {@code Bidi.DO_MIRRORING} is set in
 * {@code reorderingOptions}; the flag is cleared once mirroring is done.
 */
private void mirror() {
  final boolean mirroringRequested = (reorderingOptions & Bidi.DO_MIRRORING) != 0;
  if (!mirroringRequested) {
    return;
  }

  final StringBuffer mirrored = new StringBuffer(text);
  final byte[] levels = bidi.getLevels();
  int index = 0;
  while (index < levels.length) {
    final int codePoint = UTF16.charAt(mirrored, index);
    // Only code points whose level has the low bit set (odd level) are mirrored.
    final boolean oddLevel = (levels[index] & 1) != 0;
    if (oddLevel) {
      UTF16.setCharAt(mirrored, index, UCharacter.getMirror(codePoint));
    }
    // Advance by the width of the code point that was read before any replacement.
    index += UTF16.getCharCount(codePoint);
  }

  text = mirrored.toString();
  reorderingOptions &= ~Bidi.DO_MIRRORING;
}

if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) {
  output.add(source);
  return;
for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) {
  cp = UTF16.charAt(source, i);
    + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute);
  String chStr = UTF16.valueOf(source, i);
  for (String s : subpermute) {
    String piece = chStr + s;

if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
  + ", " + Utility.hex(segment.substring(segmentPos)));
  decomp = UTF16.valueOf(comp);
int cp;
int decompPos = 0;
int decompCp = UTF16.charAt(decomp,0);
decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char
for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) {
  cp = UTF16.charAt(segment, i);
  if (cp == decompCp) { // if equal, eat another cp from decomp
    if (PROGRESS) System.out.println("  matches: " + Utility.hex(UTF16.valueOf(cp)));
    if (decompPos == decomp.length()) { // done, have all decomp characters!
      buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars
      ok = true;
      break;
    decompCp = UTF16.charAt(decomp, decompPos);
    decompPos += UTF16.getCharCount(decompCp);
    if (PROGRESS) System.out.println("  buffer: " + Utility.hex(UTF16.valueOf(cp)));
    UTF16.append(buf, cp);
if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;

if (isLeadSurrogate(ch) && ((result + 1) < limit)
    && isTrailSurrogate(source[result + 1])) {
  result++;

/**
 * Computes a pending-unit count from the {@code preFromULength},
 * {@code preFromUFirstCP} and {@code fromUChar32} state, checked in that order
 * of precedence.
 *
 * @return the count for the first matching condition, or 0 if none matches
 */
/*public*/int fromUCountPending() {
  if (preFromULength > 0) {
    // Width of the first code point plus the positive length.
    return UTF16.getCharCount(preFromUFirstCP) + preFromULength;
  }
  if (preFromULength < 0) {
    // A negative length is reported as its magnitude.
    return -preFromULength;
  }
  if (fromUChar32 > 0) {
    return 1;
  }
  return (preFromUFirstCP > 0) ? UTF16.getCharCount(preFromUFirstCP) : 0;
}

} else if (!UTF16.isSurrogate((char) c)) {
} else if (UTF16.isLeadSurrogate((char) c)) {
length = UTF16.getCharCount(c);

/**
 * Decides how to report a character that could not be encoded directly.
 *
 * @param source the input buffer, passed through to {@code encodeTrail}
 * @param ch the offending character; only its low 16 bits are examined
 * @param flush passed through to {@code encodeTrail}
 * @return the result of {@code encodeTrail} when {@code ch} is a surrogate,
 *         otherwise an unmappable-character result of length 1
 */
protected final CoderResult encodeMalformedOrUnmappable(CharBuffer source, int ch, boolean flush) {
  final char unit = (char) ch;
  if (UTF16.isSurrogate(unit)) {
    // Any surrogate is handed to encodeTrail, which attempts to match it up
    // with a trail surrogate from the source.
    return encodeTrail(source, unit, flush);
  }
  // Not a surrogate, so the character is simply unmappable.
  return CoderResult.unmappableForLength(1);
}

      String str = UTF16.valueOf(c);
      text.replace(openPos, cursor, str);
    UTF16.append(name, c);
cursor += UTF16.getCharCount(c);

/**
 * Tries to complete the lead surrogate held in {@code c} with the next code
 * unit of {@code source}.
 *
 * <p>If the next unit is a trail surrogate, it is consumed and {@code c}
 * becomes the combined code point. If the source is exhausted, {@code c} is
 * negated and {@code checkNegative} is set. In every case
 * {@code LoopAfterTrail} is set to {@code true}.
 *
 * @param source the input buffer positioned just after the lead surrogate
 * @param target not used by this method
 * @param offsets not used by this method
 * @return {@code regularLoop}
 */
private int getTrail(CharBuffer source, ByteBuffer target, IntBuffer offsets){
  if (!source.hasRemaining()) {
    /* no more input: negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    c = -c;
    checkNegative = true;
  } else {
    /* peek at the following code unit without consuming it */
    final int here = source.position();
    final char next = source.get(here);
    if (UTF16.isTrailSurrogate(next)) {
      source.position(here + 1);
      ++nextSourceIndex;
      c = UCharacter.getCodePoint((char) c, next);
    }
  }
  LoopAfterTrail = true;
  return regularLoop;
}

/**
 * Writes code point {@code c} to {@code target} as one or two UTF-16 code
 * units. Whatever does not fit is stored in {@code charErrorBufferArray}.
 *
 * @param c the code point to write
 * @param target the output buffer
 * @param offsets optional offsets buffer; when non-null, {@code sourceIndex} is
 *        recorded once per code unit written to {@code target}
 * @param sourceIndex the source index to record in {@code offsets}
 * @return {@code CoderResult.OVERFLOW} if any part of {@code c} had to be
 *         stored in the error buffer, otherwise {@code CoderResult.UNDERFLOW}
 */
private CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) {
  final int startPosition = target.position();
  // Holds what is still unwritten; set to U_SENTINEL once everything is out.
  int pending = c;

  if (target.hasRemaining()) {
    if (pending > 0xffff) {
      /* supplementary code point: lead surrogate first, trail only if it fits */
      target.put(UTF16.getLeadSurrogate(pending));
      pending = UTF16.getTrailSurrogate(pending);
      if (target.hasRemaining()) {
        target.put((char) pending);
        pending = UConverterConstants.U_SENTINEL;
      }
    } else {
      target.put((char) pending);
      pending = UConverterConstants.U_SENTINEL;
    }

    /* write offsets */
    if (offsets != null) {
      offsets.put(sourceIndex);
      final boolean wroteTwoUnits = (startPosition + 1) < target.position();
      if (wroteTwoUnits) {
        offsets.put(sourceIndex);
      }
    }
  }

  if (pending < 0) {
    // Nothing left over (assumes U_SENTINEL is negative -- confirm in UConverterConstants).
    return CoderResult.UNDERFLOW;
  }

  /* write overflow from pending */
  charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, pending);
  return CoderResult.OVERFLOW;
}

  /**
   * Encodes one UTF-16 code unit (or, for a surrogate, the pair it belongs to)
   * as 2 or 4 bytes in {@code temp} and writes them via {@code fromUWriteBytes}.
   * Byte positions within {@code temp} are selected by XOR-ing the index with
   * {@code endianXOR}.
   *
   * @param source the input buffer; {@code ch} is taken to be the unit just
   *        before its current position
   * @param target the output byte buffer
   * @param offsets offsets buffer passed through to {@code fromUWriteBytes}
   * @param ch the code unit to encode
   * @return {@code null} when the write reports underflow; otherwise the
   *         non-underflow result, or the non-null result of
   *         {@code handleSurrogates}
   */
  private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
    final int sourceIndex = source.position() - 1;
    final boolean surrogate = UTF16.isSurrogate(ch);

    int byteCount = 2;
    char trail = 0;
    if (surrogate) {
      final CoderResult early = handleSurrogates(source, ch);
      if (early != null) {
        return early;
      }
      // The trail half is taken from fromUChar32, which is then cleared.
      trail = UTF16.getTrailSurrogate(fromUChar32);
      fromUChar32 = 0;
      byteCount = 4;
    }

    // First two bytes: the code unit itself (the lead surrogate for a pair).
    temp[0 ^ endianXOR] = (byte) (ch >>> 8);
    temp[1 ^ endianXOR] = (byte) (ch);
    if (surrogate) {
      // Next two bytes: the trail surrogate.
      temp[2 ^ endianXOR] = (byte) (trail >>> 8);
      temp[3 ^ endianXOR] = (byte) (trail);
    }

    final CoderResult written = fromUWriteBytes(this, temp, 0, byteCount, target, offsets, sourceIndex);
    if (written.isUnderflow()) {
      return null;
    }
    return written;
  }
}

boolean doread = true;
if (c != 0 && target.hasRemaining()) {
  if (UTF16.isLeadSurrogate((char) c)) {
    SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
    doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
        c = source.get(sourceArrayIndex++);
        ++nextSourceIndex;
        if (UTF16.isSurrogate((char) c)) {
          if (UTF16.isLeadSurrogate((char) c)) {

/**
 * Cover for the JDK 1.5 API of the same name: appends a code point to the
 * buffer and hands the buffer back as a convenience. Delegates to
 * {@code append(StringBuffer, int)}.
 *
 * @param target The buffer to append to
 * @param cp The code point to append
 * @return the updated StringBuffer
 * @throws IllegalArgumentException If cp is not a valid code point
 * @stable ICU 3.0
 */
public static StringBuffer appendCodePoint(StringBuffer target, int cp) {
  final StringBuffer updated = append(target, cp);
  return updated;
}

/**
 * Moves {@code offset} within {@code text} by {@code shift} code points,
 * delegating to {@code UTF16.moveCodePointOffset}.
 *
 * @param text the string the offset refers to
 * @param offset the starting UTF-16 offset
 * @param shift the number of code points to move by
 * @return the offset returned by {@code UTF16.moveCodePointOffset}
 */
@Override
int shiftCodePointOffsetBy0(String text, int offset, int shift) {
  final int shifted = UTF16.moveCodePointOffset(text, offset, shift);
  return shifted;
}

Javadoc

Standalone utility class providing UTF16 character conversions and indexing conversions.

Code that uses strings alone rarely needs modification. By design, UTF-16 does not allow overlap, so searching for strings is a safe operation. Similarly, concatenation is always safe. Substringing is safe if the start and end are both on UTF-32 boundaries. In normal code, the values for start and end are on those boundaries, since they arose from operations like searching. If not, the nearest UTF-32 boundaries can be determined using bounds().

Examples:

The following examples illustrate use of some of these methods.

 
// iteration forwards: Original 
for (int i = 0; i < s.length(); ++i) { 
char ch = s.charAt(i); 
doSomethingWith(ch); 
} 
// iteration forwards: Changes for UTF-32 
int ch; 
for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) { 
ch = UTF16.charAt(s, i); 
doSomethingWith(ch); 
} 
// iteration backwards: Original 
for (int i = s.length() - 1; i >= 0; --i) { 
char ch = s.charAt(i); 
doSomethingWith(ch); 
} 
// iteration backwards: Changes for UTF-32 
int ch; 
for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
ch = UTF16.charAt(s, i); 
doSomethingWith(ch); 
}

Notes:

Naming: For clarity, High and Low surrogates are called Lead and Trail in the API, which gives a better sense of their ordering in a string. offset16 and offset32 are used to distinguish offsets to UTF-16 boundaries vs offsets to UTF-32 boundaries. int char32 is used to contain UTF-32 characters, as opposed to char16, which is a UTF-16 code unit.
Roundtripping Offsets: You can always roundtrip from a UTF-32 offset to a UTF-16 offset and back. Because of the difference in structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and back if and only if bounds(string, offset16) != TRAIL.
Exceptions: The error checking will throw an exception if indices are out of bounds. Other than that, all methods will behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32 values are present. UCharacter.isLegal() can be used to check for validity if desired.
Unmatched Surrogates: If the string contains unmatched surrogates, then these are counted as one UTF-32 value. This matches their iteration behavior, which is vital. It also matches common display practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
Optimization: The method implementations may need optimization if the compiler doesn't fold static final methods. Since surrogate pairs will form an exceedingly small percentage of all the text in the world, the singleton case should always be optimized for.

Most used methods

charAt
Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards (with UTF1
getCharCount
Determines how many chars this char32 requires. If a validity check is required, use isLegal() [../l
isSurrogate
Determines whether the code value is a surrogate.
isTrailSurrogate
Determines whether the character is a trail surrogate.
moveCodePointOffset
Shifts offset16 by the argument number of codepoints within a subarray.
setCharAt
Set a code point into a UTF16 position in a char array. Adjusts target according if we are replacing
append
Adds a codepoint to offset16 position of the argument char array.
countCodePoint
Number of codepoints in a UTF16 char array substring
getLeadSurrogate
Returns the lead surrogate. If a validity check is required, use isLegal() [../lang/UCharacter.html#i
getTrailSurrogate
Returns the trail surrogate. If a validity check is required, use isLegal() [../lang/UCharacter.html#
isLeadSurrogate
Determines whether the character is a lead surrogate.
_charAt

Popular in Java

Creating JSON documents from java classes using gson
setRequestProperty (URLConnection)
addToBackStack (FragmentTransaction)
getResourceAsStream (ClassLoader)
PrintStream (java.io)
Fake signature of an existing Java class.
Selector (java.nio.channels)
A controller for the selection of SelectableChannel objects. Selectable channels can be registered w
ZipFile (java.util.zip)
This class provides random read access to a zip file. You pay more to read the zip file's central di
DateTimeFormat (org.joda.time.format)
Factory that creates instances of DateTimeFormatter from patterns and styles. Datetime formatting i
Color (java.awt)
The Color class is used to encapsulate colors in the default sRGB color space or colors in arbitrary
Component (java.awt)
A component is an object having a graphical representation that can be displayed on the screen and t
Top Vim plugins

How to use UTF16 in com.ibm.icu.text

Best Java code snippets using com.ibm.icu.text.UTF16 (Showing top 20 results out of 315)

How to use
UTF16
in
com.ibm.icu.text