/**
 * Reports how many bytes the encoded character at the given offset occupies.
 *
 * @param offset
 *            position of the character, relative to {@code start}
 * @return the value computed by {@link UTF8StringUtil#charSize(byte[], int)} for that position
 */
public int charSize(int offset) {
    final int absolutePosition = start + offset;
    return UTF8StringUtil.charSize(bytes, absolutePosition);
}
/**
 * Moves {@code pos} forward by the encoded size of the character it currently points at.
 *
 * @throws HyracksDataException
 *             declared by the overridden method; nothing in this body throws it directly
 */
@Override
public void next() throws HyracksDataException {
    final int currentCharBytes = UTF8StringUtil.charSize(data, pos);
    pos += currentCharBytes;
}
/**
 * Writes a supplementary character, given as a high surrogate that has already been decoded and a low
 * surrogate that is decoded here from {@code src}, to the output stream as UTF-8.
 *
 * @param os
 *            the stream that receives the encoded character
 * @param src
 *            the buffer holding the encoded input
 * @param limit
 *            exclusive upper bound of the readable region of {@code src}
 * @param highSurrogatePos
 *            position in {@code src} where the high surrogate starts
 * @param highSurrogate
 *            the already decoded high surrogate
 * @param highSurrogateSize
 *            number of bytes the high surrogate occupies in {@code src}
 * @return the combined number of bytes both surrogates occupy in {@code src}
 * @throws IOException
 *             if writing to {@code os} fails
 * @throws IllegalStateException
 *             if the position right after the high surrogate is not below {@code limit}
 */
private static int writeSupplementaryChar(OutputStream os, byte[] src, int limit, int highSurrogatePos,
        char highSurrogate, int highSurrogateSize) throws IOException {
    final int lowSurrogatePos = highSurrogatePos + highSurrogateSize;
    if (lowSurrogatePos >= limit) {
        throw new IllegalStateException("malformed utf8 input");
    }
    final char lowSurrogate = UTF8StringUtil.charAt(src, lowSurrogatePos);
    final int lowSurrogateSize = UTF8StringUtil.charSize(src, lowSurrogatePos);
    // Encode explicitly as UTF-8: the no-argument getBytes() uses the platform default charset, which
    // would produce different (possibly lossy) output depending on where the JVM runs.
    final String pair = new String(new char[] { highSurrogate, lowSurrogate });
    os.write(pair.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    return highSurrogateSize + lowSurrogateSize;
}
/**
 * Computes a polynomial rolling hash over the characters stored in a byte range.
 *
 * @param bytes
 *            buffer holding the encoded characters
 * @param start
 *            position of the first character
 * @param length
 *            number of bytes to cover
 * @param useLowerCase
 *            whether each decoded character is lower-cased before being hashed (ignored in raw-byte mode)
 * @param useRawByte
 *            whether the byte at each character position is hashed instead of the decoded character
 * @param coefficient
 *            multiplier applied to the running hash before each character is added
 * @param r
 *            modulus applied after each step
 * @return the resulting hash value
 */
private static int hash(byte[] bytes, int start, int length, boolean useLowerCase, boolean useRawByte,
        int coefficient, int r) {
    int result = 0;
    // The cursor always advances by the encoded character size, in raw-byte mode as well.
    for (int consumed = 0; consumed < length; consumed += charSize(bytes, start + consumed)) {
        final int cursor = start + consumed;
        char current;
        if (useRawByte) {
            current = (char) bytes[cursor];
        } else {
            final char decoded = charAt(bytes, cursor);
            current = useLowerCase ? Character.toLowerCase(decoded) : decoded;
        }
        result = (coefficient * result + current) % r;
    }
    return result;
}
/**
 * Re-targets the tokenizer at a new sentence and pre-computes how many grams it will produce.
 *
 * The characters are counted by walking {@code sentenceUtf8Length} bytes starting at {@code byteIndex}
 * (both presumably initialised by {@code super.reset} - verify against the superclass).
 *
 * @param sentenceData
 *            buffer holding the sentence
 * @param start
 *            start position handed to the superclass
 * @param length
 *            length handed to the superclass
 */
@Override
public void reset(byte[] sentenceData, int start, int length) {
    super.reset(sentenceData, start, length);
    gramNum = 0;

    // Count characters (not bytes) in the sentence.
    final int sentenceEnd = byteIndex + sentenceUtf8Length;
    int numChars = 0;
    for (int cursor = byteIndex; cursor < sentenceEnd; cursor += UTF8StringUtil.charSize(sentenceData, cursor)) {
        numChars++;
    }

    if (usePrePost) {
        totalGrams = numChars + gramLength - 1;
    } else if (numChars < gramLength) {
        // Sentence is too short to hold even one gram.
        totalGrams = 0;
    } else {
        totalGrams = numChars - gramLength + 1;
    }
}
/**
 * Sums the modified-UTF-8 lengths of the lower-cased forms of the first {@code limit} characters,
 * starting at {@code startOffset}.
 *
 * @param limit
 *            number of characters to examine
 * @return the total encoded length of the lower-cased characters
 */
public int getLowerCaseUTF8Len(int limit) {
    int total = 0;
    int cursor = startOffset;
    int remaining = limit;
    while (remaining > 0) {
        final char lowered = Character.toLowerCase(UTF8StringUtil.charAt(data, cursor));
        total += UTF8StringUtil.getModifiedUTF8Len(lowered);
        cursor += UTF8StringUtil.charSize(data, cursor);
        remaining--;
    }
    return total;
}
/**
 * Compares two encoded character sequences character by character.
 *
 * @param thisBytes
 *            buffer of the left-hand sequence
 * @param thisActualStart
 *            position of the first character of the left-hand sequence
 * @param thisLength
 *            byte length of the left-hand sequence
 * @param thatBytes
 *            buffer of the right-hand sequence
 * @param thatActualStart
 *            position of the first character of the right-hand sequence
 * @param thatLength
 *            byte length of the right-hand sequence
 * @param useLowerCase
 *            whether decoded characters are lower-cased before comparison (ignored in raw-byte mode)
 * @param useRawByte
 *            whether the byte at each character position is compared instead of the decoded character
 * @return the difference of the first pair of differing characters, or, when one range is exhausted
 *         without a difference, {@code thisLength - thatLength}
 */
private static int compareTo(byte[] thisBytes, int thisActualStart, int thisLength, byte[] thatBytes,
        int thatActualStart, int thatLength, boolean useLowerCase, boolean useRawByte) {
    int thisConsumed = 0;
    int thatConsumed = 0;
    while (thisConsumed < thisLength && thatConsumed < thatLength) {
        final int thisPos = thisActualStart + thisConsumed;
        final int thatPos = thatActualStart + thatConsumed;

        final char left;
        final char right;
        if (useRawByte) {
            left = (char) thisBytes[thisPos];
            right = (char) thatBytes[thatPos];
        } else if (useLowerCase) {
            left = Character.toLowerCase(charAt(thisBytes, thisPos));
            right = Character.toLowerCase(charAt(thatBytes, thatPos));
        } else {
            left = charAt(thisBytes, thisPos);
            right = charAt(thatBytes, thatPos);
        }

        if (left != right) {
            return left - right;
        }
        thisConsumed += charSize(thisBytes, thisPos);
        thatConsumed += charSize(thatBytes, thatPos);
    }
    return thisLength - thatLength;
}
@Override public void reset(byte[] data, int startOffset, int endOffset, int tokenLength, int tokenCount) { super.reset(data, startOffset, endOffset, tokenLength, tokenCount); // pre-compute hash value using JAQL-like string hashing int pos = startOffset; hash = GOLDEN_RATIO_32; for (int i = 0; i < tokenLength; i++) { hash ^= Character.toLowerCase(UTF8StringUtil.charAt(data, pos)); hash *= GOLDEN_RATIO_32; pos += UTF8StringUtil.charSize(data, pos); } hash += tokenCount; }
/**
 * Counts the characters of a length-prefixed encoded string.
 *
 * @param b
 *            buffer holding the string
 * @param s
 *            position of the length prefix
 * @return the number of characters (as opposed to bytes) in the string
 */
public static int getStringLength(byte[] b, int s) {
    final int byteLength = getUTFLength(b, s);
    final int firstChar = s + getNumBytesToStoreLength(byteLength);
    final int limit = firstChar + byteLength;

    int count = 0;
    for (int cursor = firstChar; cursor < limit; cursor += charSize(b, cursor)) {
        count++;
    }
    return count;
}
@Override public boolean hasNext() { // skip delimiters while (byteIndex < sentenceEndOffset && isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) { byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex); } return byteIndex < sentenceEndOffset; }
/**
 * Writes a length-prefixed encoded string as a double-quoted CSV field.
 *
 * Every double-quote character inside the string is escaped by doubling it. Each character is emitted
 * by copying its encoded bytes from the source buffer, so multi-byte characters are preserved.
 *
 * @param b
 *            buffer holding the string
 * @param s
 *            position of the length prefix
 * @param l
 *            unused; the length is read from the prefix at {@code s}
 * @param os
 *            the stream that receives the CSV field
 * @throws IOException
 *             if writing to {@code os} fails
 */
public static void writeUTF8StringAsCSV(byte[] b, int s, int l, OutputStream os) throws IOException {
    int stringLength = UTF8StringUtil.getUTFLength(b, s);
    int position = s + UTF8StringUtil.getNumBytesToStoreLength(stringLength);
    int maxPosition = position + stringLength;
    os.write('"');
    while (position < maxPosition) {
        char c = UTF8StringUtil.charAt(b, position);
        int sz = UTF8StringUtil.charSize(b, position);
        if (c == '"') {
            // CSV escapes a quote by doubling it.
            os.write('"');
        }
        // Copy the encoded bytes rather than calling os.write(c): OutputStream.write(int) keeps only
        // the low 8 bits, which would mangle every character outside the single-byte range.
        os.write(b, position, sz);
        position += sz;
    }
    os.write('"');
}
/**
 * Returns the number of tokens, computing it on first use.
 *
 * A token is counted at every transition from a separator (or the start of the scanned range) to a
 * non-separator character. The scan runs from {@code originalIndex} up to {@code sentenceEndOffset}
 * and advances {@code originalIndex} as it goes.
 *
 * @return the number of tokens found
 */
@Override
public short getTokensCount() {
    if (!tokenCountCalculated) {
        tokenCount = 0;
        boolean previousCharIsSeparator = true;
        while (originalIndex < sentenceEndOffset) {
            if (isSeparator(UTF8StringUtil.charAt(sentenceBytes, originalIndex))) {
                previousCharIsSeparator = true;
            } else if (previousCharIsSeparator) {
                // First non-separator character after a separator starts a new token.
                tokenCount++;
                previousCharIsSeparator = false;
            }
            originalIndex += UTF8StringUtil.charSize(sentenceBytes, originalIndex);
        }
        // Remember that the count is valid. Without this, a second call would zero tokenCount and,
        // because originalIndex has already been consumed, return 0.
        // NOTE(review): assumes reset(...) clears tokenCountCalculated and rewinds originalIndex - confirm.
        tokenCountCalculated = true;
    }
    return tokenCount;
}
private static void printUTF8String(byte[] b, int s, int l, OutputStream os, boolean useQuotes) throws IOException { int stringLength = getUTFLength(b, s); int position = s + getNumBytesToStoreLength(stringLength); int maxPosition = position + stringLength; if (useQuotes) { os.write('\"'); } while (position < maxPosition) { char c = charAt(b, position); switch (c) { // escape case '\\': case '"': os.write('\\'); break; } int sz = charSize(b, position); while (sz > 0) { os.write(b[position]); position++; sz--; } } if (useQuotes) { os.write('\"'); } }
// Fold the lower-cased character at pos into the running hash, scale the hash by GOLDEN_RATIO_32,
// then advance pos by that character's encoded size.
hash ^= Character.toLowerCase(UTF8StringUtil.charAt(data, pos)); hash *= GOLDEN_RATIO_32; pos += UTF8StringUtil.charSize(data, pos);
/** * Compute the normalized key of the UTF8 string. * The normalized key in Hyracks is mainly used to speedup the comparison between pointable data. * In the UTF8StringPTR case, we compute the integer value by using the first 2 chars. * The comparator will first use this integer to get the result ( <,>, or =), it will check * the actual bytes only if the normalized key is equal. Thus this normalized key must be * consistent with the comparison result. */ public static int normalize(byte[] bytes, int start) { int len = getUTFLength(bytes, start); long nk = 0; int offset = start + getNumBytesToStoreLength(len); for (int i = 0; i < 2; ++i) { nk <<= 16; if (i < len) { nk += (charAt(bytes, offset)) & 0xffff; offset += charSize(bytes, offset); } } return (int) (nk >> 1); // make it always positive. }
int currentTokenStart = byteIndex; while (byteIndex < sentenceEndOffset && !isSeparator(UTF8StringUtil.charAt(sentenceBytes, byteIndex))) { byteIndex += UTF8StringUtil.charSize(sentenceBytes, byteIndex); tokenLength++; break; offset += UTF8StringUtil.charSize(sentenceBytes, currentTokenStart + offset);
/**
 * Tests whether another object is an {@code IToken} with the same token length and the same
 * characters as this token.
 *
 * Characters are compared as stored (no case folding), and both sequences are stepped through using
 * the encoded character sizes read from this token's data.
 *
 * @param o
 *            the object to compare with
 * @return {@code true} if {@code o} is an {@code IToken} of equal length with equal characters
 */
@Override
public boolean equals(Object o) {
    // instanceof is false for null, so this also rejects a null argument.
    if (!(o instanceof IToken)) {
        return false;
    }
    final IToken other = (IToken) o;
    if (other.getTokenLength() != tokenLength) {
        return false;
    }

    int consumed = 0;
    for (int compared = 0; compared < tokenLength; compared++) {
        final char theirs = UTF8StringUtil.charAt(other.getData(), other.getStartOffset() + consumed);
        final char mine = UTF8StringUtil.charAt(data, startOffset + consumed);
        if (theirs != mine) {
            return false;
        }
        consumed += UTF8StringUtil.charSize(data, startOffset + consumed);
    }
    return true;
}
/**
 * Walks the serialized form of {@code STRING_UTF8_MIX} and checks, for every character of the
 * original string, that {@code charAt} decodes that character and that {@code charSize} agrees
 * with {@code getModifiedUTF8Len}.
 */
@Test
public void testCharAtCharSizeGetLen() throws Exception {
    final byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
    final char[] expectedChars = STRING_UTF8_MIX.toCharArray();

    // Start right after the length prefix.
    int cursor = getNumBytesToStoreLength(getUTFLength(buffer, 0));
    for (int i = 0; i < expectedChars.length; i++) {
        final char c = expectedChars[i];
        assertEquals(c, charAt(buffer, cursor));
        assertEquals(getModifiedUTF8Len(c), charSize(buffer, cursor));
        cursor += charSize(buffer, cursor);
    }
}
protected void serializeToken(UTF8StringBuilder builder, GrowableArray out, int numPreChars, int numPostChars, char preChar, char postChar) throws IOException { handleTokenTypeTag(out.getDataOutput()); assert UTF8StringUtil.getModifiedUTF8Len(preChar) == 1 && UTF8StringUtil.getModifiedUTF8Len(postChar) == 1; int actualUtfLen = endOffset - startOffset; builder.reset(out, actualUtfLen + numPreChars + numPostChars); // pre chars for (int i = 0; i < numPreChars; i++) { builder.appendChar(preChar); } /// regular chars int numRegChars = tokenLength - numPreChars - numPostChars; int pos = startOffset; for (int i = 0; i < numRegChars; i++) { char c = Character.toLowerCase(UTF8StringUtil.charAt(data, pos)); builder.appendChar(c); pos += UTF8StringUtil.charSize(data, pos); } // post chars for (int i = 0; i < numPostChars; i++) { builder.appendChar(postChar); } builder.finish(); }
/**
 * Builds the character-wise reversal of a source string.
 *
 * The source bytes are scanned from the last byte back to the first character; every position that
 * starts a character has that whole character appended to the builder.
 *
 * @param srcPtr
 *            the input source string
 * @param builder
 *            the builder for the resulting string
 * @param out
 *            the storage for the resulting string
 * @throws IOException
 *             if writing to {@code out} fails
 */
public static void reverse(UTF8StringPointable srcPtr, UTF8StringBuilder builder, GrowableArray out)
        throws IOException {
    builder.reset(out, srcPtr.getUTF8Length());

    final int firstCharOffset = srcPtr.getCharStartOffset();
    int cursor = srcPtr.getStartOffset() + srcPtr.getLength() - 1;
    while (cursor >= firstCharOffset) {
        if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursor)) {
            final int encodedSize = UTF8StringUtil.charSize(srcPtr.bytes, cursor);
            builder.appendUtf8StringPointable(srcPtr, cursor, encodedSize);
        }
        cursor--;
    }
    builder.finish();
}