/**
 * Collects the code points of {@code slice} into an immutable list.
 * <p>
 * Starting at byte offset 0, each code point is read with {@code getCodePointAt}
 * and the offset is advanced by {@code lengthOfCodePoint} until the end of the
 * slice is reached.
 *
 * @param slice the bytes to read code points from
 * @return the code points in the order they occur in {@code slice}
 */
private static List<Integer> toCodePoints(Slice slice)
{
    ImmutableList.Builder<Integer> decoded = ImmutableList.builder();
    int cursor = 0;
    while (cursor < slice.length()) {
        decoded.add(getCodePointAt(slice, cursor));
        // Step over the bytes of the code point that was just read.
        cursor += lengthOfCodePoint(slice, cursor);
    }
    return decoded.build();
}
/**
 * Writes the given code points, in order, into a newly allocated slice.
 *
 * @param codePoints the code points to write
 * @return a slice whose length is the sum of {@code lengthOfCodePoint} over all inputs
 */
private static Slice codePointsToSliceUtf8(List<Integer> codePoints)
{
    // First pass: add up the encoded width of every code point so the buffer is sized exactly.
    int totalBytes = 0;
    for (int codePoint : codePoints) {
        totalBytes += lengthOfCodePoint(codePoint);
    }

    byte[] buffer = new byte[totalBytes];
    Slice encoded = Slices.wrappedBuffer(buffer);

    // Second pass: write each code point directly after the previous one.
    int writePosition = 0;
    for (int codePoint : codePoints) {
        setCodePointAt(codePoint, encoded, writePosition);
        writePosition += lengthOfCodePoint(codePoint);
    }
    return encoded;
}
}
/**
 * Reads the code points of {@code slice} into an {@code int} array.
 * <p>
 * The array is sized by {@code safeCountCodePoints(slice)}; the slice is then
 * walked once more to fill it.
 *
 * @param slice the bytes to read code points from
 * @return one array element per code point, in slice order
 */
private static int[] castToCodePoints(Slice slice)
{
    int count = safeCountCodePoints(slice);
    int[] result = new int[count];

    int byteOffset = 0;
    int filled = 0;
    while (filled < result.length) {
        result[filled] = getCodePointAt(slice, byteOffset);
        byteOffset += lengthOfCodePoint(slice, byteOffset);
        filled++;
    }
    return result;
}
position += lengthOfCodePoint(codePoint);
/**
 * Counts the code points in {@code slice}, rejecting the input as soon as
 * {@code tryGetCodePointAt} reports a negative value.
 *
 * @param slice the bytes to count code points in
 * @return the number of code points found
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} if a negative
 *         value is returned by {@code tryGetCodePointAt} at any position
 */
private static int safeCountCodePoints(Slice slice)
{
    int count = 0;
    int offset = 0;
    while (offset < slice.length()) {
        int decoded = tryGetCodePointAt(slice, offset);
        if (decoded < 0) {
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding in characters: " + slice.toStringUtf8());
        }
        count++;
        offset += lengthOfCodePoint(decoded);
    }
    return count;
}
@Description("computes Hamming distance between two strings") @ScalarFunction @LiteralParameters({"x", "y"}) @SqlType(StandardTypes.BIGINT) public static long hammingDistance(@SqlType("varchar(x)") Slice left, @SqlType("varchar(y)") Slice right) { int distance = 0; int leftPosition = 0; int rightPosition = 0; while (leftPosition < left.length() && rightPosition < right.length()) { int codePointLeft = tryGetCodePointAt(left, leftPosition); int codePointRight = tryGetCodePointAt(right, rightPosition); // if both code points are invalid, we do not care if they are equal // the following code treats them as equal if they happen to be of the same length if (codePointLeft != codePointRight) { distance++; } leftPosition += codePointLeft > 0 ? lengthOfCodePoint(codePointLeft) : -codePointLeft; rightPosition += codePointRight > 0 ? lengthOfCodePoint(codePointRight) : -codePointRight; } checkCondition( leftPosition == left.length() && rightPosition == right.length(), INVALID_FUNCTION_ARGUMENT, "The input strings to hamming_distance function must have the same length"); return distance; }
int length = lengthOfCodePoint(string, indexStart); if (indexStart + length > string.length()) { throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding");
/**
 * Builds an immutable list holding each code point found in {@code slice}.
 * <p>
 * The byte position starts at 0 and moves forward by {@code lengthOfCodePoint}
 * after every read, stopping once it reaches {@code slice.length()}.
 *
 * @param slice the bytes to read code points from
 * @return the code points, in slice order
 */
private static List<Integer> toCodePoints(Slice slice)
{
    ImmutableList.Builder<Integer> builder = ImmutableList.builder();
    for (int position = 0; position < slice.length(); position += lengthOfCodePoint(slice, position)) {
        builder.add(getCodePointAt(slice, position));
    }
    return builder.build();
}
/**
 * Reads {@code slice} one code point at a time and returns the values as an
 * immutable list.
 *
 * @param slice the bytes to read code points from
 * @return every code point obtained via {@code getCodePointAt}, first to last
 */
private static List<Integer> toCodePoints(Slice slice)
{
    ImmutableList.Builder<Integer> result = ImmutableList.builder();
    int index = 0;
    while (index < slice.length()) {
        int current = getCodePointAt(slice, index);
        result.add(current);
        // Move to the first byte of the next code point.
        index += lengthOfCodePoint(slice, index);
    }
    return result.build();
}
/**
 * Expects {@code lengthOfCodePoint} to reject {@code -1} with an
 * {@code InvalidCodePointException} whose message shows the value as 0xFFFFFFFF.
 */
@Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFFF")
public void testLengthOfNegativeCodePoint()
{
    int negativeCodePoint = -1;
    lengthOfCodePoint(negativeCodePoint);
}
/**
 * Expects {@code lengthOfCodePoint} to reject the value one past
 * {@code MAX_CODE_POINT} with an {@code InvalidCodePointException} whose
 * message shows the value as 0x110000.
 */
@Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0x110000")
public void testLengthOfOutOfRangeCodePoint()
{
    int firstBeyondMax = MAX_CODE_POINT + 1;
    lengthOfCodePoint(firstBeyondMax);
}
/**
 * Passing a negative code point to {@code lengthOfCodePoint} must raise
 * {@code InvalidCodePointException}; the expected message renders -1 as 0xFFFFFFFF.
 */
@Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFFF")
public void testLengthOfNegativeCodePoint()
{
    int belowZero = -1;
    lengthOfCodePoint(belowZero);
}
/**
 * Passing {@code MAX_CODE_POINT + 1} to {@code lengthOfCodePoint} must raise
 * {@code InvalidCodePointException}; the expected message renders it as 0x110000.
 */
@Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0x110000")
public void testLengthOfOutOfRangeCodePoint()
{
    int aboveMax = MAX_CODE_POINT + 1;
    lengthOfCodePoint(aboveMax);
}
/**
 * Encodes a single code point as UTF-8 in a newly allocated slice.
 *
 * @param codePoint the code point to encode
 * @return a slice of {@code lengthOfCodePoint(codePoint)} bytes holding the encoded code point
 * @throws InvalidCodePointException if code point is not within a valid range
 */
public static Slice codePointToUtf8(int codePoint)
{
    int encodedLength = lengthOfCodePoint(codePoint);
    Slice encoded = Slices.allocate(encodedLength);
    setCodePointAt(codePoint, encoded, 0);
    return encoded;
}
/**
 * Converts {@code slice} into an array of code points.
 * <p>
 * {@code safeCountCodePoints(slice)} determines the array length; each slot is
 * then filled from {@code getCodePointAt} while a byte cursor advances by
 * {@code lengthOfCodePoint}.
 *
 * @param slice the bytes to read code points from
 * @return the code points, in slice order
 */
private static int[] castToCodePoints(Slice slice)
{
    int[] decoded = new int[safeCountCodePoints(slice)];
    int cursor = 0;
    int slot = 0;
    while (slot < decoded.length) {
        decoded[slot] = getCodePointAt(slice, cursor);
        cursor += lengthOfCodePoint(slice, cursor);
        slot++;
    }
    return decoded;
}
/**
 * Returns a freshly allocated slice containing the UTF-8 form of {@code codePoint}.
 * <p>
 * The slice is allocated with exactly {@code lengthOfCodePoint(codePoint)} bytes
 * and the code point is written at position 0.
 *
 * @param codePoint the code point to convert
 * @return the slice the code point was written into
 * @throws InvalidCodePointException if code point is not within a valid range
 */
public static Slice codePointToUtf8(int codePoint)
{
    int byteCount = lengthOfCodePoint(codePoint);
    Slice target = Slices.allocate(byteCount);
    setCodePointAt(codePoint, target, 0);
    return target;
}
/**
 * Produces an {@code int[]} with one entry per code point of {@code slice}.
 *
 * @param slice the bytes to read code points from
 * @return an array of length {@code safeCountCodePoints(slice)} filled in slice order
 */
private static int[] castToCodePoints(Slice slice)
{
    int total = safeCountCodePoints(slice);
    int[] values = new int[total];

    // bytePosition tracks where the next code point starts; i tracks where it is stored.
    int bytePosition = 0;
    for (int i = 0; i < values.length; i++) {
        int current = getCodePointAt(slice, bytePosition);
        values[i] = current;
        bytePosition += lengthOfCodePoint(slice, bytePosition);
    }
    return values;
}
/**
 * Returns how many code points {@code slice} contains, failing on the first
 * position where {@code tryGetCodePointAt} yields a negative value.
 *
 * @param slice the bytes to count code points in
 * @return the code point count
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} when
 *         {@code tryGetCodePointAt} returns a negative value
 */
private static int safeCountCodePoints(Slice slice)
{
    int total = 0;
    int cursor = 0;
    while (cursor < slice.length()) {
        int current = tryGetCodePointAt(slice, cursor);
        if (current < 0) {
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding in characters: " + slice.toStringUtf8());
        }
        cursor += lengthOfCodePoint(current);
        total += 1;
    }
    return total;
}
/**
 * Benchmarks {@code offsetOfCodePoint} by looking up the code point at index
 * {@code data.getLength() - 1}.
 * <p>
 * As a sanity check, the returned offset plus the length of the code point at
 * that offset must equal the slice length; otherwise an {@code AssertionError}
 * is thrown.
 *
 * @param data supplies the slice and the length used to pick the index
 * @return the offset returned by {@code offsetOfCodePoint}
 */
@Benchmark
public int benchmarkOffsetByCodePoints(BenchmarkData data)
{
    Slice input = data.getSlice();
    int lastIndex = data.getLength() - 1;
    int lastOffset = offsetOfCodePoint(input, lastIndex);

    int endOfLast = lastOffset + lengthOfCodePoint(input, lastOffset);
    if (endOfLast != input.length()) {
        throw new AssertionError();
    }
    return lastOffset;
}
/**
 * Measures {@code offsetOfCodePoint} for the code point index
 * {@code data.getLength() - 1} of the benchmark slice.
 * <p>
 * Throws {@code AssertionError} unless the code point found at the returned
 * offset ends exactly at {@code slice.length()}.
 *
 * @param data supplies the slice and the length used to pick the index
 * @return the offset returned by {@code offsetOfCodePoint}
 */
@Benchmark
public int benchmarkOffsetByCodePoints(BenchmarkData data)
{
    Slice text = data.getSlice();
    int targetIndex = data.getLength() - 1;
    int found = offsetOfCodePoint(text, targetIndex);

    boolean endsAtSliceEnd = found + lengthOfCodePoint(text, found) == text.length();
    if (!endsAtSliceEnd) {
        throw new AssertionError();
    }
    return found;
}