org.apache.hyracks.util.string.UTF8StringUtil.charSize java code examples

/**
 * Delegates to {@code UTF8StringUtil.charSize} for the character located {@code offset} bytes
 * after {@code start} in the backing {@code bytes} array.
 *
 * @param offset position relative to {@code start}
 * @return whatever {@code UTF8StringUtil.charSize} reports for that position
 */
public int charSize(int offset) {
  final int absolutePosition = start + offset;
  return UTF8StringUtil.charSize(bytes, absolutePosition);
}

/**
 * Moves {@code pos} forward by the size {@code UTF8StringUtil.charSize} reports for the
 * character currently under {@code pos}.
 */
@Override
public void next() throws HyracksDataException {
  final int step = UTF8StringUtil.charSize(data, pos);
  pos += step;
}

/**
 * Writes a supplementary char consisting of high and low surrogates.
 * <p>
 * The low surrogate is decoded from {@code src} at the position immediately following the high
 * surrogate. The pair is then written to {@code os} encoded explicitly as UTF-8, so the output
 * does not depend on the platform default charset.
 *
 * @param os                 the stream that receives the encoded pair
 * @param src                the buffer holding the encoded string
 * @param limit              exclusive upper bound for positions that may be read from {@code src}
 * @param highSurrogatePos   position of the high surrogate in {@code src}
 * @param highSurrogate      the already decoded high surrogate
 * @param highSurrogateSize  number of bytes the high surrogate occupies in {@code src}
 * @return The length of the surrogates, i.e. the number of source bytes consumed by both halves
 * @throws IOException           if writing to {@code os} fails
 * @throws IllegalStateException if no low surrogate can follow the high one before {@code limit}
 */
private static int writeSupplementaryChar(OutputStream os, byte[] src, int limit, int highSurrogatePos,
    char highSurrogate, int highSurrogateSize) throws IOException {
  final int lowSurrogatePos = highSurrogatePos + highSurrogateSize;
  if (lowSurrogatePos >= limit) {
    throw new IllegalStateException("malformed utf8 input");
  }
  final char lowSurrogate = UTF8StringUtil.charAt(src, lowSurrogatePos);
  final int lowSurrogateSize = UTF8StringUtil.charSize(src, lowSurrogatePos);
  // Encode with an explicit charset: the no-arg getBytes() uses the platform default, which
  // would corrupt the code point on any platform whose default is not UTF-8.
  final String pair = new String(new char[] { highSurrogate, lowSurrogate });
  os.write(pair.getBytes(java.nio.charset.StandardCharsets.UTF_8));
  return highSurrogateSize + lowSurrogateSize;
}

/**
 * Computes a polynomial rolling hash over the string stored in {@code bytes}.
 * <p>
 * Each step folds one value into the hash as {@code (coefficient * h + value) % r}. The value is
 * either the raw byte at the current position (when {@code useRawByte} is set) or the decoded
 * character, optionally lower-cased. In both modes the cursor advances by {@code charSize}.
 *
 * @param bytes        buffer holding the string
 * @param start        position of the first byte to hash
 * @param length       number of bytes to cover
 * @param useLowerCase lower-case each decoded character before hashing (ignored in raw mode)
 * @param useRawByte   hash the byte at the cursor instead of the decoded character
 * @param coefficient  multiplier of the rolling hash
 * @param r            modulus of the rolling hash
 * @return the resulting hash value
 */
private static int hash(byte[] bytes, int start, int length, boolean useLowerCase, boolean useRawByte,
    int coefficient, int r) {
  int result = 0;
  for (int consumed = 0; consumed < length; consumed += charSize(bytes, start + consumed)) {
    final int at = start + consumed;
    final char value;
    if (useRawByte) {
      value = (char) bytes[at];
    } else {
      final char decoded = charAt(bytes, at);
      value = useLowerCase ? Character.toLowerCase(decoded) : decoded;
    }
    result = (coefficient * result + value) % r;
  }
  return result;
}

/**
 * Resets the tokenizer for a new sentence and pre-computes how many grams it will produce.
 * <p>
 * The characters between {@code byteIndex} and {@code byteIndex + sentenceUtf8Length} are counted
 * first. With {@code usePrePost} the total is {@code numChars + gramLength - 1}; otherwise it is
 * {@code numChars - gramLength + 1}, or zero when the sentence is shorter than one gram.
 */
@Override
public void reset(byte[] sentenceData, int start, int length) {
  super.reset(sentenceData, start, length);
  gramNum = 0;

  // Count characters by hopping from one character start to the next.
  final int limit = byteIndex + sentenceUtf8Length;
  int charCount = 0;
  for (int cursor = byteIndex; cursor < limit; cursor += UTF8StringUtil.charSize(sentenceData, cursor)) {
    charCount++;
  }

  if (usePrePost) {
    totalGrams = charCount + gramLength - 1;
    return;
  }
  if (charCount < gramLength) {
    totalGrams = 0;
  } else {
    totalGrams = charCount - gramLength + 1;
  }
}

/**
 * Sums {@code UTF8StringUtil.getModifiedUTF8Len} over the lower-cased form of the first
 * {@code limit} characters, starting at {@code startOffset} in {@code data}.
 *
 * @param limit number of characters to inspect
 * @return the accumulated length of the lower-cased characters
 */
public int getLowerCaseUTF8Len(int limit) {
  int total = 0;
  int cursor = startOffset;
  for (int remaining = limit; remaining > 0; remaining--) {
    final char lowered = Character.toLowerCase(UTF8StringUtil.charAt(data, cursor));
    total += UTF8StringUtil.getModifiedUTF8Len(lowered);
    cursor += UTF8StringUtil.charSize(data, cursor);
  }
  return total;
}

/**
 * Compares two encoded strings position by position.
 * <p>
 * At each step either the raw bytes at the two cursors (when {@code useRawByte} is set) or the
 * decoded characters, optionally lower-cased, are compared. The first mismatch decides the result
 * as the difference of the two values. If one string runs out first, the difference of the two
 * lengths is returned.
 *
 * @return a negative, zero or positive number following the usual comparison convention
 */
private static int compareTo(byte[] thisBytes, int thisActualStart, int thisLength, byte[] thatBytes,
    int thatActualStart, int thatLength, boolean useLowerCase, boolean useRawByte) {
  int thisConsumed = 0;
  int thatConsumed = 0;
  while (thisConsumed < thisLength && thatConsumed < thatLength) {
    final int thisPos = thisActualStart + thisConsumed;
    final int thatPos = thatActualStart + thatConsumed;

    final char left;
    final char right;
    if (useRawByte) {
      left = (char) thisBytes[thisPos];
      right = (char) thatBytes[thatPos];
    } else {
      final char decodedLeft = charAt(thisBytes, thisPos);
      final char decodedRight = charAt(thatBytes, thatPos);
      left = useLowerCase ? Character.toLowerCase(decodedLeft) : decodedLeft;
      right = useLowerCase ? Character.toLowerCase(decodedRight) : decodedRight;
    }

    final int difference = left - right;
    if (difference != 0) {
      return difference;
    }
    thisConsumed += charSize(thisBytes, thisPos);
    thatConsumed += charSize(thatBytes, thatPos);
  }
  return thisLength - thatLength;
}

/**
 * Resets the token and pre-computes its hash.
 * <p>
 * The hash starts at {@code GOLDEN_RATIO_32}. Every one of the {@code tokenLength} characters is
 * lower-cased, XOR-ed in and followed by a multiplication with {@code GOLDEN_RATIO_32}. Finally
 * {@code tokenCount} is added.
 */
@Override
public void reset(byte[] data, int startOffset, int endOffset, int tokenLength, int tokenCount) {
  super.reset(data, startOffset, endOffset, tokenLength, tokenCount);
  // pre-compute hash value using JAQL-like string hashing
  hash = GOLDEN_RATIO_32;
  int cursor = startOffset;
  int remaining = tokenLength;
  while (remaining > 0) {
    final char lowered = Character.toLowerCase(UTF8StringUtil.charAt(data, cursor));
    hash ^= lowered;
    hash *= GOLDEN_RATIO_32;
    cursor += UTF8StringUtil.charSize(data, cursor);
    remaining--;
  }
  hash += tokenCount;
}

/**
 * Counts the characters of the length-prefixed string that starts at {@code s} in {@code b}.
 * <p>
 * The byte length is read with {@code getUTFLength}, the length prefix is skipped, and the
 * payload is walked one character at a time using {@code charSize}.
 *
 * @param b buffer holding the string
 * @param s position of the string's length prefix
 * @return the number of characters in the string
 */
public static int getStringLength(byte[] b, int s) {
  final int byteLength = getUTFLength(b, s);
  final int payloadStart = s + getNumBytesToStoreLength(byteLength);
  final int payloadEnd = payloadStart + byteLength;
  int count = 0;
  for (int cursor = payloadStart; cursor < payloadEnd; cursor += charSize(b, cursor)) {
    count++;
  }
  return count;
}

/**
 * Skips separator characters starting at {@code byteIndex} and reports whether any non-separator
 * character remains before {@code sentenceEndOffset}.
 *
 * @return {@code true} if {@code byteIndex} stopped on a non-separator character
 */
@Override
public boolean hasNext() {
  while (byteIndex < sentenceEndOffset) {
    if (!isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) {
      // Found the start of the next token.
      return true;
    }
    // skip delimiters
    byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex);
  }
  return false;
}

/**
 * Writes the length-prefixed string starting at {@code s} in {@code b} as a double-quoted CSV
 * field. Embedded double quotes are escaped by doubling them.
 * <p>
 * Every character is copied to the stream as its original encoded bytes.
 * {@code OutputStream.write(int)} emits only the low eight bits of its argument, so passing a
 * decoded {@code char} would corrupt any character encoded in more than one byte.
 *
 * @param b  buffer holding the string
 * @param s  position of the string's length prefix
 * @param l  not used by this method
 * @param os the stream that receives the CSV field
 * @throws IOException if writing to {@code os} fails
 */
public static void writeUTF8StringAsCSV(byte[] b, int s, int l, OutputStream os) throws IOException {
  int stringLength = UTF8StringUtil.getUTFLength(b, s);
  int position = s + UTF8StringUtil.getNumBytesToStoreLength(stringLength);
  int maxPosition = position + stringLength;
  os.write('"');
  while (position < maxPosition) {
    char c = UTF8StringUtil.charAt(b, position);
    int sz = UTF8StringUtil.charSize(b, position);
    if (c == '"') {
      // CSV escapes a quote inside a quoted field by doubling it.
      os.write('"');
    }
    // Copy the encoded bytes verbatim so multi-byte characters survive intact.
    os.write(b, position, sz);
    position += sz;
  }
  os.write('"');
}

/**
 * Returns the number of tokens in the sentence, computing it on first use.
 * <p>
 * The count is obtained by scanning from {@code originalIndex} to {@code sentenceEndOffset} and
 * counting every transition from a separator (or the start) to a non-separator character. The
 * scan consumes {@code originalIndex}, so the result is remembered via
 * {@code tokenCountCalculated}. Without that, a second call would reset the count to zero, find
 * nothing left to scan and return 0.
 * <p>
 * NOTE(review): this assumes {@code reset()} clears {@code tokenCountCalculated} and rewinds
 * {@code originalIndex} for each new sentence. Confirm against the rest of the class.
 *
 * @return the number of tokens in the current sentence
 */
@Override
public short getTokensCount() {
  if (!tokenCountCalculated) {
    tokenCount = 0;
    boolean previousCharIsSeparator = true;
    while (originalIndex < sentenceEndOffset) {
      if (isSeparator(UTF8StringUtil.charAt(sentenceBytes, originalIndex))) {
        previousCharIsSeparator = true;
      } else {
        if (previousCharIsSeparator) {
          // First non-separator after a separator marks the start of a new token.
          tokenCount++;
          previousCharIsSeparator = false;
        }
      }
      originalIndex += UTF8StringUtil.charSize(sentenceBytes, originalIndex);
    }
    // Remember the result; originalIndex is now exhausted and cannot be scanned again.
    tokenCountCalculated = true;
  }
  return tokenCount;
}

/**
 * Writes the length-prefixed string starting at {@code s} in {@code b} to {@code os}, optionally
 * wrapped in double quotes. A backslash is emitted before every backslash and double quote. Each
 * character is copied as its original encoded bytes.
 *
 * @param b         buffer holding the string
 * @param s         position of the string's length prefix
 * @param l         not used by this method
 * @param os        the stream that receives the output
 * @param useQuotes whether to surround the output with double quotes
 * @throws IOException if writing to {@code os} fails
 */
private static void printUTF8String(byte[] b, int s, int l, OutputStream os, boolean useQuotes) throws IOException {
  final int byteLength = getUTFLength(b, s);
  int cursor = s + getNumBytesToStoreLength(byteLength);
  final int limit = cursor + byteLength;
  if (useQuotes) {
    os.write('\"');
  }
  while (cursor < limit) {
    final char current = charAt(b, cursor);
    // escape
    if (current == '\\' || current == '"') {
      os.write('\\');
    }
    // Copy every encoded byte of the current character, advancing the cursor as we go.
    for (int remaining = charSize(b, cursor); remaining > 0; remaining--) {
      os.write(b[cursor]);
      cursor++;
    }
  }
  if (useQuotes) {
    os.write('\"');
  }
}

// One hashing step: XOR the lower-cased character at pos into hash, multiply by
// GOLDEN_RATIO_32, then move pos forward by the size charSize reports for that character.
hash ^= Character.toLowerCase(UTF8StringUtil.charAt(data, pos));
hash *= GOLDEN_RATIO_32;
pos += UTF8StringUtil.charSize(data, pos);

/**
 * Compute the normalized key of the UTF8 string.
 * The normalized key in Hyracks is mainly used to speedup the comparison between pointable data.
 * In the UTF8StringPTR case, we compute the integer value by using the first 2 chars.
 * The comparator will first use this integer to get the result ( <,>, or =), it will check
 * the actual bytes only if the normalized key is equal. Thus this normalized key must be
 * consistent with the comparison result.
 * <p>
 * A character slot contributes zero when the string has no character left for it. Availability is
 * decided by the byte offset against the end of the string, not by comparing the character index
 * with the byte length. A string holding a single multi-byte character therefore does not read a
 * second "character" from beyond its own bytes.
 *
 * @param bytes buffer holding the length-prefixed string
 * @param start position of the string's length prefix
 * @return a non-negative key built from the first two characters
 */
public static int normalize(byte[] bytes, int start) {
  int len = getUTFLength(bytes, start);
  long nk = 0;
  int offset = start + getNumBytesToStoreLength(len);
  final int end = offset + len; // len is a byte count, so this is the exclusive end of the payload
  for (int i = 0; i < 2; ++i) {
    nk <<= 16;
    if (offset < end) {
      nk += (charAt(bytes, offset)) & 0xffff;
      offset += charSize(bytes, offset);
    }
  }
  return (int) (nk >> 1); // make it always positive.
}

int currentTokenStart = byteIndex;
while (byteIndex < sentenceEndOffset && !isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) {
  byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex);
  tokenLength++;
          break;
        offset += UTF8StringUtil.charSize(sentenceBytes, currentTokenStart + offset);

/**
 * Compares this token with another {@code IToken} character by character.
 * <p>
 * Two tokens are equal when they have the same token length and every character decoded from the
 * other token matches the character decoded from this token at the same relative byte offset.
 *
 * @param o the object to compare with
 * @return {@code true} if {@code o} is an {@code IToken} with identical characters
 */
@Override
public boolean equals(Object o) {
  // instanceof is false for null, so this single check also rejects a null argument.
  if (!(o instanceof IToken)) {
    return false;
  }
  final IToken other = (IToken) o;
  if (other.getTokenLength() != tokenLength) {
    return false;
  }
  int consumed = 0;
  int remaining = tokenLength;
  while (remaining > 0) {
    final char theirs = UTF8StringUtil.charAt(other.getData(), other.getStartOffset() + consumed);
    final char ours = UTF8StringUtil.charAt(data, startOffset + consumed);
    if (theirs != ours) {
      return false;
    }
    consumed += UTF8StringUtil.charSize(data, startOffset + consumed);
    remaining--;
  }
  return true;
}

/**
 * Walks the encoded form of {@code STRING_UTF8_MIX} and checks, for every character of the
 * original string, that {@code charAt} decodes it and that {@code charSize} agrees with
 * {@code getModifiedUTF8Len}.
 */
@Test
public void testCharAtCharSizeGetLen() throws Exception {
  final char[] expectedChars = STRING_UTF8_MIX.toCharArray();
  final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);
  // Start right after the length prefix.
  int cursor = getNumBytesToStoreLength(getUTFLength(encoded, 0));
  for (int i = 0; i < expectedChars.length; i++) {
    final char expected = expectedChars[i];
    assertEquals(expected, charAt(encoded, cursor));
    assertEquals(getModifiedUTF8Len(expected), charSize(encoded, cursor));
    cursor += charSize(encoded, cursor);
  }
}

/**
 * Serializes this token into {@code out} as a string made of {@code numPreChars} copies of
 * {@code preChar}, the lower-cased regular characters of the token, and {@code numPostChars}
 * copies of {@code postChar}.
 * <p>
 * The type tag is written first via {@code handleTokenTypeTag}. The builder is sized with the
 * token's byte span plus one byte per padding character; the assertion checks that both padding
 * characters have a modified UTF-8 length of one.
 *
 * @param builder      string builder used to produce the output
 * @param out          storage that receives the serialized token
 * @param numPreChars  number of leading padding characters
 * @param numPostChars number of trailing padding characters
 * @param preChar      the leading padding character
 * @param postChar     the trailing padding character
 * @throws IOException if writing to the output fails
 */
protected void serializeToken(UTF8StringBuilder builder, GrowableArray out, int numPreChars, int numPostChars,
    char preChar, char postChar) throws IOException {
  handleTokenTypeTag(out.getDataOutput());
  assert UTF8StringUtil.getModifiedUTF8Len(preChar) == 1 && UTF8StringUtil.getModifiedUTF8Len(postChar) == 1;
  final int payloadBytes = endOffset - startOffset;
  builder.reset(out, payloadBytes + numPreChars + numPostChars);

  // pre chars
  int emitted = 0;
  while (emitted < numPreChars) {
    builder.appendChar(preChar);
    emitted++;
  }

  // regular chars
  final int regularChars = tokenLength - numPreChars - numPostChars;
  int cursor = startOffset;
  for (int remaining = regularChars; remaining > 0; remaining--) {
    final char lowered = Character.toLowerCase(UTF8StringUtil.charAt(data, cursor));
    builder.appendChar(lowered);
    cursor += UTF8StringUtil.charSize(data, cursor);
  }

  // post chars
  emitted = 0;
  while (emitted < numPostChars) {
    builder.appendChar(postChar);
    emitted++;
  }
  builder.finish();
}

/**
 * Builds the character-wise reverse of a source string.
 * <p>
 * The source bytes are scanned from the last byte back to the first character byte. Whenever the
 * cursor sits on the first byte of a character (per {@code UTF8StringUtil.isCharStart}), that
 * whole character is appended to the builder, so each character's bytes keep their order.
 *
 * @param srcPtr
 *            the input source string
 * @param builder
 *            a builder for the resulting string
 * @param out
 *            the storage for the resulting string
 * @throws IOException
 *             if appending to the builder fails
 */
public static void reverse(UTF8StringPointable srcPtr, UTF8StringBuilder builder, GrowableArray out)
    throws IOException {
  builder.reset(out, srcPtr.getUTF8Length());
  final int firstCharByte = srcPtr.getCharStartOffset();
  int cursor = srcPtr.getStartOffset() + srcPtr.getLength() - 1;
  while (cursor >= firstCharByte) {
    if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursor)) {
      final int size = UTF8StringUtil.charSize(srcPtr.bytes, cursor);
      builder.appendUtf8StringPointable(srcPtr, cursor, size);
    }
    cursor--;
  }
  builder.finish();
}

How to use the charSize method in org.apache.hyracks.util.string.UTF8StringUtil

Best Java code snippets using org.apache.hyracks.util.string.UTF8StringUtil.charSize (Showing top 20 results out of 315)

How to use the charSize method in org.apache.hyracks.util.string.UTF8StringUtil