/**
 * Decodes the character that begins at the given byte offset, measured from the start of this
 * string. No validation happens here: the caller must guarantee that the offset is within bounds
 * and lands on the first byte of a valid UTF8 character.
 *
 * @param offset
 *            - Byte offset
 * @return Character decoded at the given offset.
 */
public char charAt(int offset) {
    final int absoluteOffset = start + offset;
    return UTF8StringUtil.charAt(bytes, absoluteOffset);
}
/**
 * Matches, ignoring case, the character at this iterator's current position against the
 * character at the current position of another iterator.
 *
 * @param cmpIter
 *            iterator that supplies the other byte array and position
 * @return 0 when both lower-cased characters are the same, -1 otherwise
 */
@Override
public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
    final char mine = Character.toLowerCase(UTF8StringUtil.charAt(data, pos));
    final char theirs = Character.toLowerCase(UTF8StringUtil.charAt(cmpIter.getData(), cmpIter.getPos()));
    return mine == theirs ? 0 : -1;
}
/**
 * Writes a supplementary char consisting of high and low surrogates.
 * The low surrogate is read from {@code src} immediately after the high surrogate; the pair is
 * then combined into a single string and written to the stream encoded as UTF-8.
 *
 * @param os
 *            stream that receives the encoded character
 * @param src
 *            bytes holding the encoded surrogates
 * @param limit
 *            exclusive upper bound for positions that may be read from {@code src}
 * @param highSurrogatePos
 *            position of the high surrogate within {@code src}
 * @param highSurrogate
 *            the already decoded high surrogate
 * @param highSurrogateSize
 *            number of bytes the high surrogate occupies in {@code src}
 * @return The length of the surrogates
 * @throws IOException
 *             if writing to the stream fails
 * @throws IllegalStateException
 *             if the low surrogate would start at or beyond {@code limit}
 */
private static int writeSupplementaryChar(OutputStream os, byte[] src, int limit, int highSurrogatePos,
        char highSurrogate, int highSurrogateSize) throws IOException {
    final int lowSurrogatePos = highSurrogatePos + highSurrogateSize;
    if (lowSurrogatePos >= limit) {
        throw new IllegalStateException("malformed utf8 input");
    }
    final char lowSurrogate = UTF8StringUtil.charAt(src, lowSurrogatePos);
    final int lowSurrogateSize = UTF8StringUtil.charSize(src, lowSurrogatePos);
    final String supplementary = new String(new char[] { highSurrogate, lowSurrogate });
    // Encode explicitly as UTF-8: the no-arg getBytes() uses the platform default charset, which
    // would make the produced bytes depend on the JVM's environment.
    os.write(supplementary.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    return highSurrogateSize + lowSurrogateSize;
}
/**
 * Folds the characters (or raw bytes) of a byte range into an integer by repeatedly computing
 * {@code (coefficient * h + ch) % r}.
 *
 * @param bytes
 *            array holding the data
 * @param start
 *            position of the first byte to hash
 * @param length
 *            number of bytes covered by the hash
 * @param useLowerCase
 *            lower-case each decoded character before folding it in (ignored in raw-byte mode)
 * @param useRawByte
 *            use the byte at the current position, cast to char, instead of a decoded character
 * @param coefficient
 *            multiplier applied to the running hash
 * @param r
 *            modulus applied after each step
 * @return the resulting hash value
 */
private static int hash(byte[] bytes, int start, int length, boolean useLowerCase, boolean useRawByte,
        int coefficient, int r) {
    int result = 0;
    // The step size is always the encoded size of the character at the current position.
    for (int consumed = 0; consumed < length; consumed += charSize(bytes, start + consumed)) {
        final int position = start + consumed;
        final char current;
        if (useRawByte) {
            current = (char) bytes[position];
        } else {
            final char decoded = charAt(bytes, position);
            current = useLowerCase ? Character.toLowerCase(decoded) : decoded;
        }
        result = (coefficient * result + current) % r;
    }
    return result;
}
/**
 * Adds up the modified-UTF8 lengths of the lower-cased forms of the first {@code limit}
 * characters, beginning at {@code startOffset}.
 *
 * @param limit
 *            number of characters to visit
 * @return the summed length of the lower-cased characters
 */
public int getLowerCaseUTF8Len(int limit) {
    int total = 0;
    int cursor = startOffset;
    int remaining = limit;
    while (remaining > 0) {
        final char lowered = Character.toLowerCase(UTF8StringUtil.charAt(data, cursor));
        total += UTF8StringUtil.getModifiedUTF8Len(lowered);
        cursor += UTF8StringUtil.charSize(data, cursor);
        remaining--;
    }
    return total;
}
/**
 * Compares two byte ranges character by character (or byte by byte in raw mode).
 *
 * @param thisBytes
 *            array holding the left operand
 * @param thisActualStart
 *            position of the left operand's first content byte
 * @param thisLength
 *            length in bytes of the left operand
 * @param thatBytes
 *            array holding the right operand
 * @param thatActualStart
 *            position of the right operand's first content byte
 * @param thatLength
 *            length in bytes of the right operand
 * @param useLowerCase
 *            lower-case decoded characters before comparing (ignored in raw-byte mode)
 * @param useRawByte
 *            compare the bytes at the current positions, cast to char, instead of decoded characters
 * @return the difference of the first pair of differing characters, or, if one range is exhausted
 *         first, the difference of the two lengths
 */
private static int compareTo(byte[] thisBytes, int thisActualStart, int thisLength, byte[] thatBytes,
        int thatActualStart, int thatLength, boolean useLowerCase, boolean useRawByte) {
    int thisConsumed = 0;
    int thatConsumed = 0;
    while (thisConsumed < thisLength && thatConsumed < thatLength) {
        final int thisPos = thisActualStart + thisConsumed;
        final int thatPos = thatActualStart + thatConsumed;
        char left;
        char right;
        if (useRawByte) {
            left = (char) thisBytes[thisPos];
            right = (char) thatBytes[thatPos];
        } else {
            left = charAt(thisBytes, thisPos);
            right = charAt(thatBytes, thatPos);
            if (useLowerCase) {
                left = Character.toLowerCase(left);
                right = Character.toLowerCase(right);
            }
        }
        final int difference = left - right;
        if (difference != 0) {
            return difference;
        }
        thisConsumed += charSize(thisBytes, thisPos);
        thatConsumed += charSize(thatBytes, thatPos);
    }
    return thisLength - thatLength;
}
@Override public void reset(byte[] data, int startOffset, int endOffset, int tokenLength, int tokenCount) { super.reset(data, startOffset, endOffset, tokenLength, tokenCount); // pre-compute hash value using JAQL-like string hashing int pos = startOffset; hash = GOLDEN_RATIO_32; for (int i = 0; i < tokenLength; i++) { hash ^= Character.toLowerCase(UTF8StringUtil.charAt(data, pos)); hash *= GOLDEN_RATIO_32; pos += UTF8StringUtil.charSize(data, pos); } hash += tokenCount; }
/**
 * Writes a length-prefixed UTF8 string to the stream as a CSV field: the content is wrapped in
 * double quotes and every double quote inside the content is doubled.
 *
 * @param b
 *            array holding the length-prefixed string
 * @param s
 *            position where the length prefix starts
 * @param l
 *            not used by this method
 * @param os
 *            stream that receives the CSV field
 * @throws IOException
 *             if writing to the stream fails
 */
public static void writeUTF8StringAsCSV(byte[] b, int s, int l, OutputStream os) throws IOException {
    int stringLength = UTF8StringUtil.getUTFLength(b, s);
    int position = s + UTF8StringUtil.getNumBytesToStoreLength(stringLength);
    int maxPosition = position + stringLength;
    os.write('"');
    while (position < maxPosition) {
        char c = UTF8StringUtil.charAt(b, position);
        int sz = UTF8StringUtil.charSize(b, position);
        if (c == '"') {
            // CSV escapes a double quote by doubling it
            os.write('"');
        }
        // Copy the character's encoded bytes. OutputStream.write(int) keeps only the low 8 bits,
        // so writing the decoded char directly would truncate every multi-byte character.
        os.write(b, position, sz);
        position += sz;
    }
    os.write('"');
}
@Override public boolean hasNext() { // skip delimiters while (byteIndex < sentenceEndOffset && isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) { byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex); } return byteIndex < sentenceEndOffset; }
/**
 * Decodes a length-prefixed UTF8 string and appends its characters to a builder.
 *
 * @param builder
 *            builder that receives the decoded characters
 * @param bytes
 *            array holding the length-prefixed string
 * @param start
 *            position where the length prefix starts
 * @return the builder that was passed in
 */
public static StringBuilder toString(StringBuilder builder, byte[] bytes, int start) {
    final int utfLen = getUTFLength(bytes, start);
    int cursor = start + getNumBytesToStoreLength(utfLen);
    // Count down the content bytes by the modified-UTF8 length of each decoded character.
    for (int remaining = utfLen; remaining > 0;) {
        final char decoded = charAt(bytes, cursor);
        builder.append(decoded);
        final int width = getModifiedUTF8Len(decoded);
        cursor += width;
        remaining -= width;
    }
    return builder;
}
/**
 * Counts the tokens between {@code originalIndex} and the end of the sentence. A token starts
 * at every non-separator character that follows a separator or the start of the scan.
 * The count is computed once and then served from {@code tokenCount}.
 *
 * @return the number of tokens
 */
@Override
public short getTokensCount() {
    if (!tokenCountCalculated) {
        tokenCount = 0;
        boolean previousCharIsSeparator = true;
        while (originalIndex < sentenceEndOffset) {
            if (isSeparator(UTF8StringUtil.charAt(sentenceBytes, originalIndex))) {
                previousCharIsSeparator = true;
            } else {
                if (previousCharIsSeparator) {
                    tokenCount++;
                    previousCharIsSeparator = false;
                }
            }
            originalIndex += UTF8StringUtil.charSize(sentenceBytes, originalIndex);
        }
        // Remember that the scan happened. Without this, a second call would zero tokenCount and
        // skip the loop (originalIndex already sits at the end), returning 0.
        // NOTE(review): relies on the tokenizer's reset clearing tokenCountCalculated - confirm.
        tokenCountCalculated = true;
    }
    return tokenCount;
}
private static void printUTF8String(byte[] b, int s, int l, OutputStream os, boolean useQuotes) throws IOException { int stringLength = getUTFLength(b, s); int position = s + getNumBytesToStoreLength(stringLength); int maxPosition = position + stringLength; if (useQuotes) { os.write('\"'); } while (position < maxPosition) { char c = charAt(b, position); switch (c) { // escape case '\\': case '"': os.write('\\'); break; } int sz = charSize(b, position); while (sz > 0) { os.write(b[position]); position++; sz--; } } if (useQuotes) { os.write('\"'); } }
/**
 * Two tokens are equal when they have the same token length and the same character at every
 * character position. Positions are advanced using the encoded character sizes of this token's
 * data.
 *
 * @param o
 *            object to compare with
 * @return true if {@code o} is an {@link IToken} with identical length and characters
 */
@Override
public boolean equals(Object o) {
    // instanceof is false for null, so this also rejects a null argument
    if (!(o instanceof IToken)) {
        return false;
    }
    final IToken other = (IToken) o;
    if (other.getTokenLength() != tokenLength) {
        return false;
    }
    int offset = 0;
    for (int remaining = tokenLength; remaining > 0; remaining--) {
        final char theirs = UTF8StringUtil.charAt(other.getData(), other.getStartOffset() + offset);
        final char mine = UTF8StringUtil.charAt(data, startOffset + offset);
        if (theirs != mine) {
            return false;
        }
        offset += UTF8StringUtil.charSize(data, startOffset + offset);
    }
    return true;
}
int pos = startOffset; for (int i = 0; i < numRegGrams; i++) { hash ^= Character.toLowerCase(UTF8StringUtil.charAt(data, pos)); hash *= GOLDEN_RATIO_32; pos += UTF8StringUtil.charSize(data, pos);
/** * Compute the normalized key of the UTF8 string. * The normalized key in Hyracks is mainly used to speedup the comparison between pointable data. * In the UTF8StringPTR case, we compute the integer value by using the first 2 chars. * The comparator will first use this integer to get the result ( <,>, or =), it will check * the actual bytes only if the normalized key is equal. Thus this normalized key must be * consistent with the comparison result. */ public static int normalize(byte[] bytes, int start) { int len = getUTFLength(bytes, start); long nk = 0; int offset = start + getNumBytesToStoreLength(len); for (int i = 0; i < 2; ++i) { nk <<= 16; if (i < len) { nk += (charAt(bytes, offset)) & 0xffff; offset += charSize(bytes, offset); } } return (int) (nk >> 1); // make it always positive. }
os.write('"'); while (position < maxPosition) { char c = UTF8StringUtil.charAt(b, position); int sz = UTF8StringUtil.charSize(b, position); switch (c) {
int tokenLength = 0; int currentTokenStart = byteIndex; while (byteIndex < sentenceEndOffset && !isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) { byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex); tokenLength++; UTF8StringUtil.charAt(sentenceBytes, currentTokenStart + offset)) != Character .toLowerCase(UTF8StringUtil.charAt(sentenceBytes, tokenStart + offset))) { curTokenCount--; break;
protected void serializeToken(UTF8StringBuilder builder, GrowableArray out, int numPreChars, int numPostChars, char preChar, char postChar) throws IOException { handleTokenTypeTag(out.getDataOutput()); assert UTF8StringUtil.getModifiedUTF8Len(preChar) == 1 && UTF8StringUtil.getModifiedUTF8Len(postChar) == 1; int actualUtfLen = endOffset - startOffset; builder.reset(out, actualUtfLen + numPreChars + numPostChars); // pre chars for (int i = 0; i < numPreChars; i++) { builder.appendChar(preChar); } /// regular chars int numRegChars = tokenLength - numPreChars - numPostChars; int pos = startOffset; for (int i = 0; i < numRegChars; i++) { char c = Character.toLowerCase(UTF8StringUtil.charAt(data, pos)); builder.appendChar(c); pos += UTF8StringUtil.charSize(data, pos); } // post chars for (int i = 0; i < numPostChars; i++) { builder.appendChar(postChar); } builder.finish(); }
/**
 * Walks the encoded form of {@code STRING_UTF8_MIX} and checks, for every character of the
 * original string, that {@code charAt} decodes it and that {@code charSize} matches
 * {@code getModifiedUTF8Len}.
 */
@Test
public void testCharAtCharSizeGetLen() throws Exception {
    final char[] expectedChars = STRING_UTF8_MIX.toCharArray();
    final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);

    // start right after the length prefix
    int cursor = getNumBytesToStoreLength(getUTFLength(encoded, 0));
    for (int i = 0; i < expectedChars.length; i++) {
        final char expected = expectedChars[i];
        assertEquals(expected, charAt(encoded, cursor));
        assertEquals(getModifiedUTF8Len(expected), charSize(encoded, cursor));
        cursor += charSize(encoded, cursor);
    }
}