org.apache.hyracks.util.string.UTF8StringUtil java code examples

/**
 * Computes the normalized key of a length-prefixed UTF8 string.
 * The normalized key in Hyracks is mainly used to speed up the comparison between pointable data.
 * In the UTF8StringPTR case, the key is an integer built from the first 2 chars of the string.
 * The comparator first uses this integer to get the result (<, >, or =) and checks the actual
 * bytes only if the normalized keys are equal. Thus the normalized key must be consistent with
 * the comparison result.
 *
 * @param bytes
 *            - array holding the encoded string
 * @param start
 *            - offset of the length prefix of the string within {@code bytes}
 * @return a non-negative key; chars missing from strings shorter than 2 chars contribute zero bits
 */
public static int normalize(byte[] bytes, int start) {
  int len = getUTFLength(bytes, start);
  long nk = 0;
  int offset = start + getNumBytesToStoreLength(len);
  // len counts encoded bytes, not chars, so the reads are bounded by the end of the payload:
  // comparing the char index against len would decode past the end of a string that consists
  // of a single multi-byte char.
  int end = offset + len;
  for (int i = 0; i < 2; ++i) {
    nk <<= 16;
    if (offset < end) {
      nk += (charAt(bytes, offset)) & 0xffff;
      offset += charSize(bytes, offset);
    }
  }
  return (int) (nk >> 1); // make it always positive.
}

/**
 * Points this object at the encoded string found at {@code startOff} in {@code data} and caches
 * its character count, its encoded byte length and the size of its length prefix before
 * delegating to the no-argument {@code reset()}.
 *
 * @param data
 *            - array holding the encoded string
 * @param startOff
 *            - offset of the string within {@code data}
 * @throws HyracksDataException
 *             declared by the overridden method
 */
@Override
public void reset(byte[] data, int startOff) throws HyracksDataException {
  this.data = data;
  startOffset = startOff;
  length = UTF8StringUtil.getStringLength(data, startOff);
  final int encodedByteCount = UTF8StringUtil.getUTFLength(data, startOff);
  utfByteLength = encodedByteCount;
  metaLength = UTF8StringUtil.getNumBytesToStoreLength(encodedByteCount);
  reset();
}

/**
 * Writes this string into {@code buffer} by delegating to {@code UTF8StringUtil.toString} with
 * this object's {@code bytes} and {@code start}.
 *
 * @param buffer
 *            - builder that receives the output
 */
public void toString(StringBuilder buffer) {
  UTF8StringUtil.toString(buffer, bytes, start);
}

/**
 * Hashes a length-prefixed string: reads the encoded byte length stored at {@code start}, skips
 * the bytes that store that length and hands the payload to the overload that takes an explicit
 * length.
 */
private static int hash(byte[] bytes, int start, boolean useLowerCase, boolean useRawByte, int coefficient, int r) {
  final int payloadLength = getUTFLength(bytes, start);
  final int prefixSize = getNumBytesToStoreLength(payloadLength);
  return hash(bytes, start + prefixSize, payloadLength, useLowerCase, useRawByte, coefficient, r);
}

/**
 * Counts the characters of the length-prefixed string that starts at {@code s} in {@code b} by
 * stepping over the payload one encoded character at a time.
 *
 * @param b
 *            - array holding the encoded string
 * @param s
 *            - offset of the length prefix within {@code b}
 * @return number of characters in the string
 */
public static int getStringLength(byte[] b, int s) {
  final int byteLength = getUTFLength(b, s);
  final int first = s + getNumBytesToStoreLength(byteLength);
  final int limit = first + byteLength;
  int chars = 0;
  for (int cursor = first; cursor < limit; cursor += charSize(b, cursor)) {
    chars++;
  }
  return chars;
}

/**
 * Compares two length-prefixed strings: resolves the encoded byte length and the payload start
 * of each side, then delegates to the overload that works on explicit payload offsets and
 * lengths.
 */
private static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart, boolean useLowerCase,
    boolean useRawByte) {
  final int leftLength = getUTFLength(thisBytes, thisStart);
  final int rightLength = getUTFLength(thatBytes, thatStart);
  final int leftPayload = thisStart + getNumBytesToStoreLength(leftLength);
  final int rightPayload = thatStart + getNumBytesToStoreLength(rightLength);
  return compareTo(thisBytes, leftPayload, leftLength, thatBytes, rightPayload, rightLength,
      useLowerCase, useRawByte);
}

/**
 * Walks the encoded form of {@code STRING_UTF8_MIX} and checks, for every expected char, that
 * {@code charAt} decodes it and that {@code charSize} agrees with {@code getModifiedUTF8Len}.
 */
@Test
public void testCharAtCharSizeGetLen() throws Exception {
  final char[] expectedChars = STRING_UTF8_MIX.toCharArray();
  final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);
  // Start right after the length prefix.
  int cursor = getNumBytesToStoreLength(getUTFLength(encoded, 0));
  for (int i = 0; i < expectedChars.length; i++) {
    final char expected = expectedChars[i];
    assertEquals(expected, charAt(encoded, cursor));
    final int width = charSize(encoded, cursor);
    assertEquals(getModifiedUTF8Len(expected), width);
    cursor += width;
  }
}

/**
 * Resets the cached metadata of the string.
 * Since {@code utf8Length} and {@code metaLength} are used often, these two values are computed
 * eagerly here. {@code stringLength} and {@code hashValue} are only cleared; they are lazily
 * initialized on first use.
 */
@Override
protected void afterReset() {
  utf8Length = UTF8StringUtil.getUTFLength(bytes, start);
  metaLength = UTF8StringUtil.getNumBytesToStoreLength(getUTF8Length());
  // Clear the lazily computed values so they are recomputed for the new string.
  hashValue = 0;
  stringLength = -1;
}

/**
 * Polynomial hash over a string payload: for each step, {@code h = (coefficient * h + ch) % r}.
 *
 * @param bytes
 *            - array holding the payload
 * @param start
 *            - offset of the first payload byte (after any length prefix)
 * @param length
 *            - payload length in bytes
 * @param useLowerCase
 *            - lower-case each decoded char before hashing; ignored when {@code useRawByte} is set
 * @param useRawByte
 *            - hash the byte at the current position instead of the decoded char
 * @param coefficient
 *            - multiplier of the polynomial
 * @param r
 *            - modulus applied after every step
 * @return the hash value
 */
private static int hash(byte[] bytes, int start, int length, boolean useLowerCase, boolean useRawByte,
    int coefficient, int r) {
  int hashValue = 0;
  // NOTE(review): the position advances by charSize in both modes, so raw-byte mode only sees the
  // first byte of each encoded char -- confirm this is intended before relying on it.
  for (int consumed = 0; consumed < length; consumed += charSize(bytes, start + consumed)) {
    final int position = start + consumed;
    final char current;
    if (!useRawByte) {
      final char decoded = charAt(bytes, position);
      current = useLowerCase ? Character.toLowerCase(decoded) : decoded;
    } else {
      current = (char) bytes[position];
    }
    hashValue = (coefficient * hashValue + current) % r;
  }
  return hashValue;
}

/**
 * Decodes the length-prefixed string that starts at {@code start} in {@code bytes} and appends
 * its chars to {@code builder}.
 *
 * @param builder
 *            - builder that receives the decoded chars
 * @param bytes
 *            - array holding the encoded string
 * @param start
 *            - offset of the length prefix within {@code bytes}
 * @return the same {@code builder}, to allow chaining
 */
public static StringBuilder toString(StringBuilder builder, byte[] bytes, int start) {
  final int byteLength = getUTFLength(bytes, start);
  int cursor = start + getNumBytesToStoreLength(byteLength);
  for (int remaining = byteLength; remaining > 0;) {
    final char decoded = charAt(bytes, cursor);
    builder.append(decoded);
    // Advance by the encoded width of the char that was just decoded.
    final int width = getModifiedUTF8Len(decoded);
    cursor += width;
    remaining -= width;
  }
  return builder;
}

/**
 * Sums the modified-UTF8 byte lengths of the lower-cased forms of the first {@code limit} chars,
 * reading from {@code startOffset} in {@code data}.
 *
 * @param limit
 *            - number of chars to examine
 * @return total encoded length of those chars after lower-casing
 */
public int getLowerCaseUTF8Len(int limit) {
  int total = 0;
  int cursor = startOffset;
  int seen = 0;
  while (seen < limit) {
    final char lowered = Character.toLowerCase(UTF8StringUtil.charAt(data, cursor));
    total += UTF8StringUtil.getModifiedUTF8Len(lowered);
    // Step by the width of the original char, which may differ from the lower-cased one.
    cursor += UTF8StringUtil.charSize(data, cursor);
    seen++;
  }
  return total;
}

/**
 * Encodes both strings and checks the comparison variant selected by {@code option} against the
 * matching {@code String} comparison. For {@code STANDARD} it also checks that the difference of
 * the normalized keys has the same sign as the string comparison.
 */
public void testCompare(String str1, String str2, OPTION option) throws IOException {
  final byte[] left = writeStringToBytes(str1);
  final byte[] right = writeStringToBytes(str2);
  switch (option) {
    case STANDARD: {
      assertEquals(str1.compareTo(str2), compareTo(left, 0, right, 0));
      final int leftKey = normalize(left, 0);
      final int rightKey = normalize(right, 0);
      final int keyDelta = leftKey - rightKey;
      assertTrue(isSameSign(str1.compareTo(str2), keyDelta));
      break;
    }
    case RAW_BYTE: {
      assertEquals(str1.compareTo(str2), rawByteCompareTo(left, 0, right, 0));
      break;
    }
    case LOWERCASE: {
      assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(left, 0, right, 0));
      break;
    }
  }
}

/**
 * Checks that the plain hash and the lower-case hash agree on a lower-case string, and that two
 * different coefficients produce different hash values.
 */
@Test
public void testHash() throws IOException {
  final byte[] plainInput = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
  final int lowerHash = hash(plainInput, 0);
  // NOTE(review): this buffer is built from the same lower-case constant as the one above, so the
  // equality below never exercises case folding -- confirm whether a mixed-case input was meant.
  final byte[] buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
  final int upperHash = lowerCaseHash(buffer, 0);
  assertEquals(lowerHash, upperHash);
  final int familyOne = hash(buffer, 0, 7, 297);
  final int familyTwo = hash(buffer, 0, 8, 297);
  assertTrue(familyOne != familyTwo);
}

/**
 * Advances {@code pos} past the character that starts at the current position, using the
 * encoded size reported by {@code UTF8StringUtil.charSize}.
 */
@Override
public void next() throws HyracksDataException {
  pos += UTF8StringUtil.charSize(data, pos);
}

/**
 * Gets the length of the string in characters.
 * The first call has to walk the entire string; later calls return the cached result.
 *
 * @return length of the string in characters
 */
public int getStringLength() {
  // A non-negative value means the length has already been computed.
  if (stringLength >= 0) {
    return stringLength;
  }
  stringLength = UTF8StringUtil.getStringLength(bytes, start);
  return stringLength;
}

  /**
   * Hashes the string at {@code offset} in {@code bytes} with this instance's {@code coefficient}
   * and {@code r}. The {@code length} argument is not used here; only {@code bytes} and
   * {@code offset} are passed on to {@code UTF8StringUtil.hash}.
   */
  @Override
  public int hash(byte[] bytes, int offset, int length) {
    return UTF8StringUtil.hash(bytes, offset, coefficient, r);
  }
};

/**
 * Checks that {@code getStringLength} on the encoded form of {@code STRING_UTF8_MIX} matches
 * {@code String.length()}.
 */
@Test
public void testGetStringLength() throws Exception {
  final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);
  final int expectedChars = STRING_UTF8_MIX.length();
  final int actualChars = getStringLength(encoded, 0);
  assertEquals(expectedChars, actualChars);
}

/**
 * Checks that encoding {@code STRING_UTF8_MIX} and decoding it with
 * {@code UTF8StringUtil.toString} yields the original string.
 */
@Test
public void testToString() throws Exception {
  final StringBuilder sink = new StringBuilder();
  final byte[] encoded = writeStringToBytes(STRING_UTF8_MIX);
  final String decoded = UTF8StringUtil.toString(sink, encoded, 0).toString();
  assertEquals(STRING_UTF8_MIX, decoded);
}

/**
 * Returns the character at the given byte offset. The caller is responsible for making sure that
 * the provided offset is within bounds and points to the beginning of a valid UTF8 character.
 *
 * @param offset
 *            - Byte offset, relative to {@code start} (it is added to {@code start} before the
 *            lookup in {@code bytes})
 * @return Character at the given offset.
 */
public char charAt(int offset) {
  return UTF8StringUtil.charAt(bytes, start + offset);
}

/**
 * Encodes {@code length} chars of {@code buffer}, beginning at {@code start}, as a
 * length-prefixed modified-UTF8 string and writes the result to {@code out} in a single call.
 *
 * @param buffer
 *            - source chars
 * @param start
 *            - index of the first char to encode
 * @param length
 *            - number of chars to encode
 * @param out
 *            - destination of the encoded bytes
 * @param writer
 *            - passed to {@code getTempBytes} to obtain the scratch array
 * @throws IOException
 *             if writing to {@code out} fails
 */
public static void writeUTF8(char[] buffer, int start, int length, DataOutput out, UTF8StringWriter writer)
    throws IOException {
  // Pass 1: total encoded size of the payload.
  int encodedLength = 0;
  for (int i = 0; i < length; i++) {
    encodedLength += UTF8StringUtil.getModifiedUTF8Len(buffer[start + i]);
  }

  final byte[] scratch = getTempBytes(writer, encodedLength);
  // The length prefix goes first; the encoder's return value advances the write position.
  int written = VarLenIntEncoderDecoder.encode(encodedLength, scratch, 0);

  // Pass 2a: chars in 0x0001..0x007F are copied as single bytes until the first other char.
  int index = 0;
  while (index < length) {
    final char ch = buffer[start + index];
    if (ch < 0x0001 || ch > 0x007F) {
      break;
    }
    scratch[written++] = (byte) ch;
    index++;
  }

  // Pass 2b: everything from the first such char onwards goes through the general encoder.
  while (index < length) {
    written += writeToBytes(scratch, written, buffer[start + index]);
    index++;
  }

  out.write(scratch, 0, written);
}

Javadoc

A helper package for operating on UTF8 strings in Hyracks. Most of the code was migrated from asterix-fuzzyjoin and hyracks-storage-am-invertedindex.

Most used methods

Popular in Java

Parsing JSON documents to java classes using gson
getExternalFilesDir (Context)
findViewById (Activity)
scheduleAtFixedRate (ScheduledExecutorService)
PrintStream (java.io)
Fake signature of an existing Java class.
Collections (java.util)
This class consists exclusively of static methods that operate on or return collections. It contains
Set (java.util)
A Set is a data structure which does not allow duplicate elements.
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
FlowLayout (java.awt)
A flow layout arranges components in a left-to-right flow, much like lines of text in a paragraph. F
Project (org.apache.tools.ant)
Central representation of an Ant project. This class defines an Ant project with all of its targets,
Top plugins for WebStorm

How to use UTF8StringUtil in org.apache.hyracks.util.string

Best Java code snippets using org.apache.hyracks.util.string.UTF8StringUtil (Showing top 20 results out of 315)

How to use
UTF8StringUtil
in
org.apache.hyracks.util.string