/** * Compute the normalized key of the UTF8 string. * The normalized key in Hyracks is mainly used to speedup the comparison between pointable data. * In the UTF8StringPTR case, we compute the integer value by using the first 2 chars. * The comparator will first use this integer to get the result ( <,>, or =), it will check * the actual bytes only if the normalized key is equal. Thus this normalized key must be * consistent with the comparison result. */ public static int normalize(byte[] bytes, int start) { int len = getUTFLength(bytes, start); long nk = 0; int offset = start + getNumBytesToStoreLength(len); for (int i = 0; i < 2; ++i) { nk <<= 16; if (i < len) { nk += (charAt(bytes, offset)) & 0xffff; offset += charSize(bytes, offset); } } return (int) (nk >> 1); // make it always positive. }
/**
 * Points this object at a new encoded string and caches its length information.
 *
 * @param data
 *            the array holding the encoded string
 * @param startOff
 *            the offset in {@code data} where the string (including its length prefix) begins
 * @throws HyracksDataException
 *             declared by the overridden method
 */
@Override
public void reset(byte[] data, int startOff) throws HyracksDataException {
    this.data = data;
    this.startOffset = startOff;

    // Number of characters in the string.
    this.length = UTF8StringUtil.getStringLength(data, startOff);

    // Number of encoded bytes, and the number of bytes used to store that count.
    final int encodedByteCount = UTF8StringUtil.getUTFLength(data, startOff);
    this.utfByteLength = encodedByteCount;
    this.metaLength = UTF8StringUtil.getNumBytesToStoreLength(encodedByteCount);

    reset();
}
/**
 * Appends the characters of this string to the given builder by delegating to
 * {@code UTF8StringUtil.toString}.
 *
 * @param buffer
 *            the builder that receives the characters
 */
public void toString(StringBuilder buffer) {
    UTF8StringUtil.toString(buffer, bytes, start);
}
/**
 * Hashes a length-prefixed UTF8 string: reads the encoded byte count from the prefix, skips the
 * prefix, and delegates to the overload that works on the raw character bytes.
 *
 * @param bytes
 *            the array holding the encoded string
 * @param start
 *            the offset of the length prefix within {@code bytes}
 * @param useLowerCase
 *            forwarded unchanged to the delegate
 * @param useRawByte
 *            forwarded unchanged to the delegate
 * @param coefficient
 *            forwarded unchanged to the delegate
 * @param r
 *            forwarded unchanged to the delegate
 * @return the hash value computed by the delegate
 */
private static int hash(byte[] bytes, int start, boolean useLowerCase, boolean useRawByte, int coefficient,
        int r) {
    final int encodedByteCount = getUTFLength(bytes, start);
    final int prefixSize = getNumBytesToStoreLength(encodedByteCount);
    return hash(bytes, start + prefixSize, encodedByteCount, useLowerCase, useRawByte, coefficient, r);
}
/**
 * Counts the characters of a length-prefixed UTF8 string by walking its encoded bytes.
 *
 * @param b
 *            the array holding the encoded string
 * @param s
 *            the offset of the length prefix within {@code b}
 * @return the number of characters in the string
 */
public static int getStringLength(byte[] b, int s) {
    final int encodedByteCount = getUTFLength(b, s);
    final int firstCharOffset = s + getNumBytesToStoreLength(encodedByteCount);
    final int limit = firstCharOffset + encodedByteCount;

    int characters = 0;
    // Each step jumps over one complete encoded character.
    for (int cursor = firstCharOffset; cursor < limit; cursor += charSize(b, cursor)) {
        characters++;
    }
    return characters;
}
/**
 * Compares two length-prefixed UTF8 strings: decodes both prefixes, skips them, and delegates
 * to the overload that compares the raw character bytes.
 *
 * @param thisBytes
 *            the array holding the first string
 * @param thisStart
 *            the offset of the first string's length prefix
 * @param thatBytes
 *            the array holding the second string
 * @param thatStart
 *            the offset of the second string's length prefix
 * @param useLowerCase
 *            forwarded unchanged to the delegate
 * @param useRawByte
 *            forwarded unchanged to the delegate
 * @return the comparison result produced by the delegate
 */
private static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart,
        boolean useLowerCase, boolean useRawByte) {
    final int leftByteCount = getUTFLength(thisBytes, thisStart);
    final int rightByteCount = getUTFLength(thatBytes, thatStart);

    final int leftDataStart = thisStart + getNumBytesToStoreLength(leftByteCount);
    final int rightDataStart = thatStart + getNumBytesToStoreLength(rightByteCount);

    return compareTo(thisBytes, leftDataStart, leftByteCount, thatBytes, rightDataStart, rightByteCount,
            useLowerCase, useRawByte);
}
/**
 * Walks the encoded form of {@code STRING_UTF8_MIX} character by character and checks that
 * {@code charAt} returns the expected character and that {@code charSize} agrees with
 * {@code getModifiedUTF8Len} at every position.
 */
@Test
public void testCharAtCharSizeGetLen() throws Exception {
    final char[] expectedChars = STRING_UTF8_MIX.toCharArray();
    final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);

    // Start right after the length prefix.
    int cursor = getNumBytesToStoreLength(getUTFLength(encoded, 0));
    for (int i = 0; i < expectedChars.length; i++) {
        final char expected = expectedChars[i];
        assertEquals(expected, charAt(encoded, cursor));
        assertEquals(getModifiedUTF8Len(expected), charSize(encoded, cursor));
        cursor += charSize(encoded, cursor);
    }
}
/**
 * Refreshes the cached length information after the underlying bytes have changed.
 * The {@code utf8Length} and the {@code metaLength} are used often, so both are computed eagerly.
 * The {@code stringLength} and the {@code hashValue} are lazily initialized on first use and are
 * only cleared here.
 */
@Override
protected void afterReset() {
    // Clear the lazily computed values.
    stringLength = -1;
    hashValue = 0;

    // Eagerly recompute the frequently used lengths.
    utf8Length = UTF8StringUtil.getUTFLength(bytes, start);
    metaLength = UTF8StringUtil.getNumBytesToStoreLength(getUTF8Length());
}
/**
 * Computes a polynomial hash over the characters stored in {@code bytes[start, start + length)}.
 * For every step the running value becomes {@code (coefficient * h + ch) % r}.
 *
 * @param bytes
 *            the array holding the character bytes (no length prefix)
 * @param start
 *            the offset of the first character byte
 * @param length
 *            the number of encoded bytes to hash
 * @param useLowerCase
 *            when decoding characters, lower-case each one before hashing; ignored in raw-byte mode
 * @param useRawByte
 *            hash the byte found at each step (cast to {@code char}) instead of the decoded character
 * @param coefficient
 *            the multiplier applied to the running hash
 * @param r
 *            the modulus applied after every step
 * @return the resulting hash value
 */
private static int hash(byte[] bytes, int start, int length, boolean useLowerCase, boolean useRawByte,
        int coefficient, int r) {
    int result = 0;
    // NOTE(review): the step is always charSize(...), also in raw-byte mode, so in that mode only
    // the first byte of every encoded character contributes to the hash.
    for (int consumed = 0; consumed < length; consumed += charSize(bytes, start + consumed)) {
        final int position = start + consumed;
        final char unit;
        if (useRawByte) {
            unit = (char) bytes[position];
        } else {
            final char decoded = charAt(bytes, position);
            unit = useLowerCase ? Character.toLowerCase(decoded) : decoded;
        }
        result = (coefficient * result + unit) % r;
    }
    return result;
}
/**
 * Decodes a length-prefixed UTF8 string and appends its characters to {@code builder}.
 *
 * @param builder
 *            the builder that receives the decoded characters
 * @param bytes
 *            the array holding the encoded string
 * @param start
 *            the offset of the length prefix within {@code bytes}
 * @return the same {@code builder} instance, to allow chaining
 */
public static StringBuilder toString(StringBuilder builder, byte[] bytes, int start) {
    final int encodedByteCount = getUTFLength(bytes, start);
    int cursor = start + getNumBytesToStoreLength(encodedByteCount);

    for (int remaining = encodedByteCount; remaining > 0;) {
        final char decoded = charAt(bytes, cursor);
        builder.append(decoded);
        // The step width is derived from the decoded character itself.
        final int width = getModifiedUTF8Len(decoded);
        cursor += width;
        remaining -= width;
    }
    return builder;
}
/**
 * Computes how many bytes the first {@code limit} characters would occupy once each of them is
 * lower-cased and encoded.
 *
 * @param limit
 *            the number of characters to examine, starting at {@code startOffset}
 * @return the summed encoded length of the lower-cased characters
 */
public int getLowerCaseUTF8Len(int limit) {
    int totalBytes = 0;
    int cursor = startOffset;
    for (int seen = 0; seen < limit; seen++) {
        final char original = UTF8StringUtil.charAt(data, cursor);
        totalBytes += UTF8StringUtil.getModifiedUTF8Len(Character.toLowerCase(original));
        // Advance by the width of the character as it is stored, not by its lower-cased width.
        cursor += UTF8StringUtil.charSize(data, cursor);
    }
    return totalBytes;
}
/**
 * Encodes both strings and checks that the byte-level comparison selected by {@code option}
 * agrees with the corresponding {@link String} comparison.
 *
 * @param str1
 *            the left operand
 * @param str2
 *            the right operand
 * @param option
 *            selects the comparison flavor under test
 * @throws IOException
 *             if encoding a string fails
 */
public void testCompare(String str1, String str2, OPTION option) throws IOException {
    final byte[] left = writeStringToBytes(str1);
    final byte[] right = writeStringToBytes(str2);

    switch (option) {
        case STANDARD: {
            final int expected = str1.compareTo(str2);
            assertEquals(expected, compareTo(left, 0, right, 0));
            // The normalized keys must order the strings the same way as the full comparison.
            final int leftKey = normalize(left, 0);
            final int rightKey = normalize(right, 0);
            assertTrue(isSameSign(expected, leftKey - rightKey));
            break;
        }
        case RAW_BYTE: {
            final int expected = str1.compareTo(str2);
            assertEquals(expected, rawByteCompareTo(left, 0, right, 0));
            break;
        }
        case LOWERCASE: {
            final int expected = str1.compareToIgnoreCase(str2);
            assertEquals(expected, lowerCaseCompareTo(left, 0, right, 0));
            break;
        }
    }
}
/**
 * Checks that {@code hash} and {@code lowerCaseHash} agree on a lower-case input, and that the
 * four-argument {@code hash} yields different values for 7 and 8 as its third argument.
 */
@Test
public void testHash() throws IOException {
    byte[] encoded = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
    final int plainHash = hash(encoded, 0);

    // NOTE(review): this buffer is built from the same lower-case constant as above, so
    // lowerCaseHash never sees an upper-case character here. Confirm whether a mixed-case
    // input was intended.
    encoded = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
    final int caseInsensitiveHash = lowerCaseHash(encoded, 0);
    assertEquals(plainHash, caseInsensitiveHash);

    final int firstFamilyHash = hash(encoded, 0, 7, 297);
    final int secondFamilyHash = hash(encoded, 0, 8, 297);
    assertTrue(firstFamilyHash != secondFamilyHash);
}
/**
 * Advances {@code pos} past the encoded character that starts at the current position.
 *
 * @throws HyracksDataException
 *             declared by the overridden method
 */
@Override
public void next() throws HyracksDataException {
    final int currentCharWidth = UTF8StringUtil.charSize(data, pos);
    pos += currentCharWidth;
}
/**
 * Gets the length of the string in characters.
 * The first call has to walk the entire string; subsequent calls return the cached result.
 *
 * @return length of string in characters
 */
public int getStringLength() {
    if (stringLength >= 0) {
        // Already computed.
        return stringLength;
    }
    stringLength = UTF8StringUtil.getStringLength(bytes, start);
    return stringLength;
}
/**
 * Hashes the string stored at {@code offset} in {@code bytes} by delegating to
 * {@code UTF8StringUtil.hash} with this object's {@code coefficient} and {@code r}.
 * The {@code length} argument is not forwarded to the delegate.
 */
@Override public int hash(byte[] bytes, int offset, int length) { return UTF8StringUtil.hash(bytes, offset, coefficient, r); } };
/**
 * Checks that {@code getStringLength} on the encoded bytes reports the same number of
 * characters as {@link String#length()} on the source string.
 */
@Test
public void testGetStringLength() throws Exception {
    final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);
    final int expectedChars = STRING_UTF8_MIX.length();
    final int actualChars = getStringLength(encoded, 0);
    assertEquals(expectedChars, actualChars);
}
/**
 * Checks that encoding {@code STRING_UTF8_MIX} and decoding it again through
 * {@code UTF8StringUtil.toString} yields the original string.
 */
@Test
public void testToString() throws Exception {
    final StringBuilder target = new StringBuilder();
    final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);
    final String decoded = UTF8StringUtil.toString(target, encoded, 0).toString();
    assertEquals(STRING_UTF8_MIX, decoded);
}
/**
 * Returns the character that starts at the given byte offset. No validation is performed: the
 * caller must ensure the offset is within bounds and points to the first byte of a valid UTF8
 * character.
 *
 * @param offset
 *            - Byte offset, relative to {@code start}
 * @return Character at the given offset.
 */
public char charAt(int offset) {
    final int absoluteOffset = start + offset;
    return UTF8StringUtil.charAt(bytes, absoluteOffset);
}
/**
 * Encodes {@code length} characters of {@code buffer}, beginning at {@code start}, and writes
 * them to {@code out} preceded by the variable-length encoded byte count.
 *
 * @param buffer
 *            the source characters
 * @param start
 *            the index of the first character to write
 * @param length
 *            the number of characters to write
 * @param out
 *            the destination of the encoded bytes
 * @param writer
 *            passed to {@code getTempBytes} to obtain the scratch array
 * @throws IOException
 *             if writing to {@code out} fails
 */
public static void writeUTF8(char[] buffer, int start, int length, DataOutput out, UTF8StringWriter writer)
        throws IOException {
    // Pass 1: work out how many bytes the encoded characters will need.
    int encodedByteCount = 0;
    for (int i = 0; i < length; i++) {
        encodedByteCount += UTF8StringUtil.getModifiedUTF8Len(buffer[start + i]);
    }

    final byte[] scratch = getTempBytes(writer, encodedByteCount);
    int written = VarLenIntEncoderDecoder.encode(encodedByteCount, scratch, 0);

    // Pass 2a: copy the leading run of characters in [0x0001, 0x007F] as single bytes.
    int next = 0;
    while (next < length) {
        final char ch = buffer[start + next];
        if (ch < 0x0001 || ch > 0x007F) {
            break;
        }
        scratch[written++] = (byte) ch;
        next++;
    }

    // Pass 2b: encode everything from the first character outside that range onwards.
    while (next < length) {
        written += writeToBytes(scratch, written, buffer[start + next]);
        next++;
    }

    out.write(scratch, 0, written);
}