/*
 * Hashes the given bytes by delegating to UTF8StringUtil.hash, passing the
 * `coefficient` and `r` values that are in scope here. The `length` argument
 * is accepted but not forwarded to the delegate.
 * NOTE(review): the trailing "};" closes a scope that opens outside this view.
 */
@Override public int hash(byte[] bytes, int offset, int length) { return UTF8StringUtil.hash(bytes, offset, coefficient, r); } };
/**
 * Compares two serialized UTF8 strings using the raw-byte mode of the shared comparison routine.
 * Lower-casing is disabled and raw-byte comparison is enabled.
 * Note that this comparison may not deliver the correct ordering for languages that include
 * 2- or 3-byte characters, but it works for single-byte character languages.
 *
 * @param thisBytes
 *            buffer holding the first string
 * @param thisStart
 *            start offset of the first string in {@code thisBytes}
 * @param thatBytes
 *            buffer holding the second string
 * @param thatStart
 *            start offset of the second string in {@code thatBytes}
 * @return the result of the underlying comparison
 */
public static int rawByteCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
    final boolean useLowerCase = false;
    final boolean useRawByte = true;
    return compareTo(thisBytes, thisStart, thatBytes, thatStart, useLowerCase, useRawByte);
}
/**
 * Writes the string <code>str</code> as UTF8 into the DataOutput <code>out</code>.
 * Delegates to the three-argument overload, passing {@code null} as the third argument.
 *
 * @param str
 *            a Unicode string
 * @param out
 *            a data output stream
 * @throws IOException
 *             declared because the delegate may throw it
 */
public static void writeUTF8(CharSequence str, DataOutput out) throws IOException {
    writeUTF8(str, out, null);
}
/**
 * Counts the characters of the serialized UTF8 string that begins at offset {@code s}.
 * The byte length is read via {@code getUTFLength}; the scan then starts after the bytes
 * used to store that length and advances one character ({@code charSize}) at a time.
 *
 * @param b
 *            buffer holding the serialized string
 * @param s
 *            offset at which the serialized string starts
 * @return number of characters visited between the start and end of the string data
 */
public static int getStringLength(byte[] b, int s) {
    final int utfLen = getUTFLength(b, s);
    final int dataStart = s + getNumBytesToStoreLength(utfLen);
    final int dataEnd = dataStart + utfLen;
    int chars = 0;
    // Each iteration accounts for one character and then skips over its bytes.
    for (int cursor = dataStart; cursor < dataEnd; cursor += charSize(b, cursor)) {
        chars++;
    }
    return chars;
}
/**
 * Hashes a serialized UTF8 string that begins at {@code start}.
 * Reads the byte length with {@code getUTFLength}, skips the bytes used to store that
 * length, and hands the remaining range to the range-based {@code hash} overload.
 *
 * @param bytes
 *            buffer holding the serialized string
 * @param start
 *            offset at which the serialized string starts
 * @param useLowerCase
 *            forwarded unchanged to the range-based overload
 * @param useRawByte
 *            forwarded unchanged to the range-based overload
 * @param coefficient
 *            forwarded unchanged to the range-based overload
 * @param r
 *            forwarded unchanged to the range-based overload
 * @return the hash produced by the range-based overload
 */
private static int hash(byte[] bytes, int start, boolean useLowerCase, boolean useRawByte, int coefficient,
        int r) {
    final int byteLength = getUTFLength(bytes, start);
    final int headerSize = getNumBytesToStoreLength(byteLength);
    final int dataStart = start + headerSize;
    return hash(bytes, dataStart, byteLength, useLowerCase, useRawByte, coefficient, r);
}
/**
 * Compares two serialized UTF8 strings.
 * For each side, reads the byte length with {@code getUTFLength} and skips the bytes used to
 * store that length, then delegates to the range-based {@code compareTo} overload.
 *
 * @param thisBytes
 *            buffer holding the first string
 * @param thisStart
 *            start offset of the first serialized string
 * @param thatBytes
 *            buffer holding the second string
 * @param thatStart
 *            start offset of the second serialized string
 * @param useLowerCase
 *            forwarded unchanged to the range-based overload
 * @param useRawByte
 *            forwarded unchanged to the range-based overload
 * @return the result of the range-based comparison
 */
private static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart,
        boolean useLowerCase, boolean useRawByte) {
    // Both lengths are read before either data offset is derived, as in the original ordering.
    final int leftLen = getUTFLength(thisBytes, thisStart);
    final int rightLen = getUTFLength(thatBytes, thatStart);

    final int leftData = thisStart + getNumBytesToStoreLength(leftLen);
    final int rightData = thatStart + getNumBytesToStoreLength(rightLen);

    return compareTo(thisBytes, leftData, leftLen, thatBytes, rightData, rightLen, useLowerCase, useRawByte);
}
/**
 * Reads the character located at the given byte offset. The caller must make sure that the
 * offset is within bounds and points to the first byte of a valid UTF8 character.
 *
 * @param offset
 *            byte offset; it is added to {@code start} before the lookup
 * @return the character at that position
 */
public char charAt(int offset) {
    final int absoluteOffset = start + offset;
    return UTF8StringUtil.charAt(bytes, absoluteOffset);
}
/**
 * Advances {@code pos} past the character that currently starts at {@code pos} in {@code data},
 * using {@code UTF8StringUtil.charSize} to determine how many bytes to skip.
 */
@Override
public void next() throws HyracksDataException {
    final int charBytes = UTF8StringUtil.charSize(data, pos);
    pos += charBytes;
}
/**
 * Gets the length of the string in characters.
 * While the cached value is negative, the length is computed through
 * {@code UTF8StringUtil.getStringLength} and stored; later calls return the pre-calculated result.
 *
 * @return length of string in characters
 */
public int getStringLength() {
    if (stringLength >= 0) {
        // Already computed on an earlier call.
        return stringLength;
    }
    stringLength = UTF8StringUtil.getStringLength(bytes, start);
    return stringLength;
}
/**
 * Computes the hash by delegating to {@code UTF8StringUtil.lowerCaseHash} with this object's
 * {@code bytes}, {@code start} and {@code length}.
 *
 * @return the value produced by the delegate
 */
@Override
public int hash() {
    final int hashCode = UTF8StringUtil.lowerCaseHash(bytes, start, length);
    return hashCode;
}
/**
 * Compares this object's string with the one in the supplied buffer by delegating to
 * {@code UTF8StringUtil.rawByteCompareTo}. The {@code length} argument is not forwarded.
 *
 * @param bytes
 *            buffer holding the other string
 * @param start
 *            start offset of the other string in {@code bytes}
 * @param length
 *            accepted but not used by this implementation
 * @return the result returned by the delegate
 */
@Override
public int compareTo(byte[] bytes, int start, int length) {
    final int order = UTF8StringUtil.rawByteCompareTo(this.bytes, this.start, bytes, start);
    return order;
}
/**
 * Stores the value of {@code UTF8StringUtil.normalize(bytes, start)} into
 * {@code normalizedKeys[keyStart]}. The {@code length} argument is not forwarded.
 *
 * @param bytes
 *            buffer holding the string
 * @param start
 *            start offset of the string in {@code bytes}
 * @param length
 *            accepted but not used by this implementation
 * @param normalizedKeys
 *            destination array for the key
 * @param keyStart
 *            index in {@code normalizedKeys} that receives the key
 */
@Override
public void normalize(byte[] bytes, int start, int length, int[] normalizedKeys, int keyStart) {
    final int key = UTF8StringUtil.normalize(bytes, start);
    normalizedKeys[keyStart] = key;
}
/**
 * Prints the string described by {@code b}, {@code s} and {@code l} to {@code os} by delegating
 * to {@code printUTF8String} with its final flag set to {@code true}.
 * NOTE(review): per this method's name the flag presumably enables quoting; confirm against
 * {@code printUTF8String}.
 *
 * @param b
 *            buffer passed through to the delegate
 * @param s
 *            offset passed through to the delegate
 * @param l
 *            length passed through to the delegate
 * @param os
 *            destination stream
 * @throws IOException
 *             declared because the delegate may throw it
 */
public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
    final boolean quoted = true;
    printUTF8String(b, s, l, os, quoted);
}
/**
 * Computes the hash by delegating to {@code UTF8StringUtil.rawBytehash} with this object's
 * {@code bytes} and {@code start}.
 *
 * @return the value produced by the delegate
 */
@Override
public int hash() {
    final int hashCode = UTF8StringUtil.rawBytehash(this.bytes, this.start);
    return hashCode;
}
/**
 * Returns the hash of this string, caching it in {@code hashValue}.
 * A cached value of 0 is treated as "not yet computed", so a string whose hash is 0 is
 * re-hashed on every call.
 *
 * @return the cached or freshly computed hash
 */
@Override
public int hash() {
    if (hashValue != 0) {
        // Cached by an earlier call.
        return hashValue;
    }
    hashValue = UTF8StringUtil.hash(this.bytes, this.start);
    return hashValue;
}
/**
 * Compares this object's string with the one in the supplied buffer by delegating to
 * {@code UTF8StringUtil.compareTo}. The {@code length} argument is not forwarded.
 *
 * @param bytes
 *            buffer holding the other string
 * @param start
 *            start offset of the other string in {@code bytes}
 * @param length
 *            accepted but not used by this implementation
 * @return the result returned by the delegate
 */
@Override
public int compareTo(byte[] bytes, int start, int length) {
    final int order = UTF8StringUtil.compareTo(this.bytes, this.start, bytes, start);
    return order;
}
/**
 * Writes {@code str} to {@code out} by delegating to
 * {@code UTF8StringUtil.writeUTF8(str, out, this)}, supplying this instance as the third argument.
 *
 * @param str
 *            the character sequence to write
 * @param out
 *            the destination data output
 * @throws IOException
 *             declared because the delegate may throw it
 */
public final void writeUTF8(CharSequence str, DataOutput out) throws IOException {
    UTF8StringUtil.writeUTF8(str, out, this);
}
/**
 * Computes the hash by delegating to {@code UTF8StringUtil.lowerCaseHash} with this object's
 * {@code bytes} and {@code start}.
 *
 * @return the value produced by the delegate
 */
@Override
public int hash() {
    final int hashCode = UTF8StringUtil.lowerCaseHash(bytes, start);
    return hashCode;
}
/**
 * Prints {@code str} to {@code os} by delegating to {@code printUTF8String} with its final flag
 * set to {@code true}.
 * NOTE(review): per this method's name the flag presumably enables quoting; confirm against
 * {@code printUTF8String}.
 *
 * @param str
 *            the string to print
 * @param os
 *            destination stream
 * @throws IOException
 *             declared because the delegate may throw it
 */
public static void printUTF8StringWithQuotes(String str, OutputStream os) throws IOException {
    final boolean quoted = true;
    printUTF8String(str, os, quoted);
}
/**
 * Provides the raw bytes-based hash function for UTF8 strings.
 * Delegates to the shared {@code hash} routine with lower-casing disabled, raw-byte mode enabled,
 * a coefficient of 31 and {@code Integer.MAX_VALUE} as {@code r}.
 * Note that, as with the raw-byte comparison, the result may not behave as expected for languages
 * that include 2- or 3-byte characters, but it works for single-byte character languages.
 *
 * @param bytes
 *            buffer holding the serialized string
 * @param start
 *            offset at which the serialized string starts
 * @return the hash produced by the shared routine
 */
public static int rawBytehash(byte[] bytes, int start) {
    final boolean useLowerCase = false;
    final boolean useRawByte = true;
    final int coefficient = 31;
    final int r = Integer.MAX_VALUE;
    return hash(bytes, start, useLowerCase, useRawByte, coefficient, r);
}