/**
 * Computes how many bytes of the region starting at {@code offset} and spanning
 * {@code length} bytes should be kept so that it holds at most
 * {@code maxCharacterCount} code points.
 *
 * @param maxCharacterCount maximum number of code points to keep; must not be negative
 * @param slice slice that contains the region
 * @param offset byte index in {@code slice} at which the region starts
 * @param length byte length of the region
 * @return the byte length after truncation, never larger than {@code length}
 * @throws NullPointerException if {@code slice} is null
 * @throws IllegalArgumentException if {@code maxCharacterCount} is negative
 */
private static int calculateTruncationLength(int maxCharacterCount, Slice slice, int offset, int length)
{
    requireNonNull(slice, "slice is null");
    if (maxCharacterCount < 0) {
        throw new IllegalArgumentException("Max length must be greater or equal than zero");
    }
    // A UTF-8 code point takes at least one byte, so a region of at most
    // maxCharacterCount bytes cannot hold more than maxCharacterCount code points.
    if (length <= maxCharacterCount) {
        return length;
    }

    int indexEnd = offsetOfCodePoint(slice, offset, maxCharacterCount);
    if (indexEnd < 0) {
        // No code point was found at that position, so nothing has to be cut off.
        return length;
    }
    // offsetOfCodePoint is not told where the region ends, so when the slice
    // continues past offset + length the index it finds may lie beyond the region.
    // Clamp so the result can never be larger than the region itself.
    return Math.min(indexEnd - offset, length);
}
}
int endIndex = offsetOfCodePoint(slice, offset, codePointCount); if (endIndex < 0) {
int indexStart = offsetOfCodePoint(utf8, startCodePoint - 1); if (indexStart < 0) { int indexEnd = offsetOfCodePoint(utf8, indexStart, lengthCodePoints); if (indexEnd < 0) { int indexStart = offsetOfCodePoint(utf8, startCodePoint); int indexEnd; if (startCodePoint + lengthCodePoints < codePoints) { indexEnd = offsetOfCodePoint(utf8, indexStart, lengthCodePoints);
@Description("suffix starting at given index") @ScalarFunction @LiteralParameters("x") @SqlType("varchar(x)") public static Slice substr(@SqlType("varchar(x)") Slice utf8, @SqlType(StandardTypes.BIGINT) long start) { if ((start == 0) || utf8.length() == 0) { return Slices.EMPTY_SLICE; } int startCodePoint = Ints.saturatedCast(start); if (startCodePoint > 0) { int indexStart = offsetOfCodePoint(utf8, startCodePoint - 1); if (indexStart < 0) { // before beginning of string return Slices.EMPTY_SLICE; } int indexEnd = utf8.length(); return utf8.slice(indexStart, indexEnd - indexStart); } // negative start is relative to end of string int codePoints = countCodePoints(utf8); startCodePoint += codePoints; // before beginning of string if (startCodePoint < 0) { return Slices.EMPTY_SLICE; } int indexStart = offsetOfCodePoint(utf8, startCodePoint); int indexEnd = utf8.length(); return utf8.slice(indexStart, indexEnd - indexStart); }
int startCodePoint = toIntExact(index); int indexStart = offsetOfCodePoint(string, startCodePoint - 1); if (indexStart < 0) {
int[] padStringCounts = new int[padStringLength]; for (int i = 0; i < padStringLength; ++i) { padStringCounts[i] = lengthOfCodePointSafe(padString, offsetOfCodePoint(padString, i));
/**
 * Locates the byte index at which the code point at the given position starts.
 * Yields {@code -1} when that position does not fall within the slice.
 * <p>
 * Note: The input is not explicitly validated as UTF-8; for invalid UTF-8 this
 * method may return incorrect results or throw an exception.
 */
public static int offsetOfCodePoint(Slice utf8, int codePointCount)
{
    // Delegate to the positional overload, counting from the first byte of the slice.
    int startPosition = 0;
    return offsetOfCodePoint(utf8, startPosition, codePointCount);
}
/**
 * Locates the byte index at which the code point at the given position starts.
 * Yields {@code -1} when that position does not fall within the slice.
 * <p>
 * Note: The input is not explicitly validated as UTF-8; for invalid UTF-8 this
 * method may return incorrect results or throw an exception.
 */
public static int offsetOfCodePoint(Slice utf8, int codePointCount)
{
    // Delegate to the positional overload, counting from the first byte of the slice.
    int startPosition = 0;
    return offsetOfCodePoint(utf8, startPosition, codePointCount);
}
/**
 * Computes how many bytes of the region starting at {@code offset} and spanning
 * {@code length} bytes should be kept so that it holds at most
 * {@code maxCharacterCount} code points.
 *
 * @param maxCharacterCount maximum number of code points to keep; must not be negative
 * @param slice slice that contains the region
 * @param offset byte index in {@code slice} at which the region starts
 * @param length byte length of the region
 * @return the byte length after truncation, never larger than {@code length}
 * @throws NullPointerException if {@code slice} is null
 * @throws IllegalArgumentException if {@code maxCharacterCount} is negative
 */
private static int calculateTruncationLength(int maxCharacterCount, Slice slice, int offset, int length)
{
    requireNonNull(slice, "slice is null");
    if (maxCharacterCount < 0) {
        throw new IllegalArgumentException("Max length must be greater or equal than zero");
    }
    // A UTF-8 code point takes at least one byte, so a region of at most
    // maxCharacterCount bytes cannot hold more than maxCharacterCount code points.
    if (length <= maxCharacterCount) {
        return length;
    }

    int indexEnd = offsetOfCodePoint(slice, offset, maxCharacterCount);
    if (indexEnd < 0) {
        // No code point was found at that position, so nothing has to be cut off.
        return length;
    }
    // offsetOfCodePoint is not told where the region ends, so when the slice
    // continues past offset + length the index it finds may lie beyond the region.
    // Clamp so the result can never be larger than the region itself.
    return Math.min(indexEnd - offset, length);
}
}
/**
 * Computes how many bytes of the region starting at {@code offset} and spanning
 * {@code length} bytes should be kept so that it holds at most
 * {@code maxCharacterCount} code points.
 *
 * @param maxCharacterCount maximum number of code points to keep; must not be negative
 * @param slice slice that contains the region
 * @param offset byte index in {@code slice} at which the region starts
 * @param length byte length of the region
 * @return the byte length after truncation, never larger than {@code length}
 * @throws NullPointerException if {@code slice} is null
 * @throws IllegalArgumentException if {@code maxCharacterCount} is negative
 */
private static int calculateTruncationLength(int maxCharacterCount, Slice slice, int offset, int length)
{
    requireNonNull(slice, "slice is null");
    if (maxCharacterCount < 0) {
        throw new IllegalArgumentException("Max length must be greater or equal than zero");
    }
    // A UTF-8 code point takes at least one byte, so a region of at most
    // maxCharacterCount bytes cannot hold more than maxCharacterCount code points.
    if (length <= maxCharacterCount) {
        return length;
    }

    int indexEnd = offsetOfCodePoint(slice, offset, maxCharacterCount);
    if (indexEnd < 0) {
        // No code point was found at that position, so nothing has to be cut off.
        return length;
    }
    // offsetOfCodePoint is not told where the region ends, so when the slice
    // continues past offset + length the index it finds may lie beyond the region.
    // Clamp so the result can never be larger than the region itself.
    return Math.min(indexEnd - offset, length);
}
}
/**
 * Shortens {@code slice} to at most {@code length} code points.
 *
 * @param slice UTF-8 encoded input
 * @param length maximum number of code points to keep; must not be negative
 * @return an empty slice when {@code length} is zero, the input itself when no
 *         cut position is found, otherwise the prefix ending at the cut position
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} if {@code length} is negative
 */
public static Slice truncate(Slice slice, int length)
{
    if (length < 0) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Length smaller then zero");
    }
    if (length == 0) {
        return Slices.EMPTY_SLICE;
    }

    // byte index of the first code point that must be dropped, negative if there is none
    int cutPosition = offsetOfCodePoint(slice, length);
    return (cutPosition < 0) ? slice : slice.slice(0, cutPosition);
}
}
/**
 * Extracts {@code codePointLength} code points of {@code utf8}, beginning with
 * the code point at position {@code codePointStart}.
 * <p>
 * Note: The input is not explicitly validated as UTF-8; for invalid UTF-8 this
 * method may return incorrect results or throw an exception.
 */
public static Slice substring(Slice utf8, int codePointStart, int codePointLength)
{
    checkArgument(codePointStart >= 0, "codePointStart is negative");
    checkArgument(codePointLength >= 0, "codePointLength is negative");

    int firstByte = offsetOfCodePoint(utf8, codePointStart);
    if (firstByte < 0) {
        throw new IllegalArgumentException("UTF-8 does not contain " + codePointStart + " code points");
    }
    if (codePointLength == 0) {
        return Slices.EMPTY_SLICE;
    }

    // Find where the last requested code point starts, then step over it to get
    // the exclusive end of the substring.
    int lastCodePointStart = offsetOfCodePoint(utf8, firstByte, codePointLength - 1);
    if (lastCodePointStart < 0) {
        throw new IllegalArgumentException("UTF-8 does not contain " + (codePointStart + codePointLength) + " code points");
    }
    int endExclusive = lastCodePointStart + lengthOfCodePoint(utf8, lastCodePointStart);
    if (endExclusive > utf8.length()) {
        // the final code point claims more bytes than the slice has left
        throw new InvalidUtf8Exception("UTF-8 is not well formed");
    }

    return utf8.slice(firstByte, endExclusive - firstByte);
}
/**
 * Exercises {@code countCodePoints} and {@code offsetOfCodePoint} on invalid UTF-8
 * input. The results are not expected to be 'correct', only harmless.
 */
@Test
public void testInvalidUtf8()
{
    // code point counts reported for the two invalid inputs
    assertEquals(countCodePoints(wrappedBuffer(INVALID_UTF8_1)), 0);
    assertEquals(countCodePoints(wrappedBuffer(INVALID_UTF8_2)), 3);

    // INVALID_UTF8_1: position 0 maps to offset 0, position 1 is reported as -1
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_1), 0), 0);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_1), 1), -1);

    // INVALID_UTF8_2: positions 0..2 map to offsets 0, 2 and 3; position 3 is reported as -1
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 0), 0);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 1), 2);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 2), 3);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 3), -1);
}
/**
 * Exercises {@code countCodePoints} and {@code offsetOfCodePoint} on invalid UTF-8
 * input. The results are not expected to be 'correct', only harmless.
 */
@Test
public void testInvalidUtf8()
{
    // code point counts reported for the two invalid inputs
    assertEquals(countCodePoints(wrappedBuffer(INVALID_UTF8_1)), 0);
    assertEquals(countCodePoints(wrappedBuffer(INVALID_UTF8_2)), 3);

    // INVALID_UTF8_1: position 0 maps to offset 0, position 1 is reported as -1
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_1), 0), 0);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_1), 1), -1);

    // INVALID_UTF8_2: positions 0..2 map to offsets 0, 2 and 3; position 3 is reported as -1
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 0), 0);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 1), 2);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 2), 3);
    assertEquals(offsetOfCodePoint(wrappedBuffer(INVALID_UTF8_2), 3), -1);
}
private static void assertOffsetByCodePoints(String string) { Slice utf8 = utf8Slice(string); int codePoints = (int) string.codePoints().count(); int lastIndex = 0; int characterIndex = 0; for (int codePointIndex = 0; codePointIndex < codePoints; codePointIndex++) { int expectedIndex = 0; // calculate the expected index by searching forward from the last index if (codePointIndex > 0) { expectedIndex = lastIndex + lengthOfCodePoint(string.codePointAt(characterIndex)); characterIndex = string.offsetByCodePoints(characterIndex, 1); } // avoid n^2 performance for large test string if (codePointIndex < 10000) { assertEquals(offsetOfCodePoint(utf8, codePointIndex), expectedIndex); } if (codePointIndex > 0) { assertEquals(offsetOfCodePoint(utf8, lastIndex, 1), expectedIndex); } lastIndex = expectedIndex; } assertEquals(offsetOfCodePoint(utf8Slice(string), codePoints), -1); }
private static void assertOffsetByCodePoints(String string) { Slice utf8 = utf8Slice(string); int codePoints = (int) string.codePoints().count(); int lastIndex = 0; int characterIndex = 0; for (int codePointIndex = 0; codePointIndex < codePoints; codePointIndex++) { int expectedIndex = 0; // calculate the expected index by searching forward from the last index if (codePointIndex > 0) { expectedIndex = lastIndex + lengthOfCodePoint(string.codePointAt(characterIndex)); characterIndex = string.offsetByCodePoints(characterIndex, 1); } // avoid n^2 performance for large test string if (codePointIndex < 10000) { assertEquals(offsetOfCodePoint(utf8, codePointIndex), expectedIndex); } if (codePointIndex > 0) { assertEquals(offsetOfCodePoint(utf8, lastIndex, 1), expectedIndex); } lastIndex = expectedIndex; } assertEquals(offsetOfCodePoint(utf8Slice(string), codePoints), -1); }
/**
 * Benchmarks {@code offsetOfCodePoint} by looking up position
 * {@code data.getLength() - 1} and sanity-checks that the code point found
 * there ends exactly at the end of the slice.
 *
 * @return the byte offset that was found, returned so the lookup is not optimized away
 * @throws AssertionError if the code point at that offset does not end at the slice end
 */
@Benchmark
public int benchmarkOffsetByCodePoints(BenchmarkData data)
{
    Slice slice = data.getSlice();
    int lastPosition = data.getLength() - 1;

    int offset = offsetOfCodePoint(slice, lastPosition);
    int endOfCodePoint = offset + lengthOfCodePoint(slice, offset);
    if (endOfCodePoint == slice.length()) {
        return offset;
    }
    throw new AssertionError();
}
/**
 * Benchmarks {@code offsetOfCodePoint} by looking up position
 * {@code data.getLength() - 1} and sanity-checks that the code point found
 * there ends exactly at the end of the slice.
 *
 * @return the byte offset that was found, returned so the lookup is not optimized away
 * @throws AssertionError if the code point at that offset does not end at the slice end
 */
@Benchmark
public int benchmarkOffsetByCodePoints(BenchmarkData data)
{
    Slice slice = data.getSlice();
    int lastPosition = data.getLength() - 1;

    int offset = offsetOfCodePoint(slice, lastPosition);
    int endOfCodePoint = offset + lengthOfCodePoint(slice, offset);
    if (endOfCodePoint == slice.length()) {
        return offset;
    }
    throw new AssertionError();
}
/**
 * Verifies that an empty slice reports {@code -1} for position 0, then runs
 * the offset consistency check over each sample string.
 */
@Test
public void testOffsetByCodePoints()
{
    assertEquals(offsetOfCodePoint(EMPTY_SLICE, 0), -1);

    // checked in this exact order
    String[] samples = {
            STRING_HELLO,
            STRING_QUADRATICALLY,
            STRING_OESTERREICH,
            STRING_DULIOE_DULIOE,
            STRING_FAITH_HOPE_LOVE,
            STRING_NAIVE,
            STRING_OO,
            STRING_ASCII_CODE_POINTS,
            STRING_ALL_CODE_POINTS,
            STRING_ALL_CODE_POINTS_RANDOM,
    };
    for (String sample : samples) {
        assertOffsetByCodePoints(sample);
    }
}
/**
 * Verifies that an empty slice reports {@code -1} for position 0, then runs
 * the offset consistency check over each sample string.
 */
@Test
public void testOffsetByCodePoints()
{
    assertEquals(offsetOfCodePoint(EMPTY_SLICE, 0), -1);

    // checked in this exact order
    String[] samples = {
            STRING_HELLO,
            STRING_QUADRATICALLY,
            STRING_OESTERREICH,
            STRING_DULIOE_DULIOE,
            STRING_FAITH_HOPE_LOVE,
            STRING_NAIVE,
            STRING_OO,
            STRING_ASCII_CODE_POINTS,
            STRING_ALL_CODE_POINTS,
            STRING_ALL_CODE_POINTS_RANDOM,
    };
    for (String sample : samples) {
        assertOffsetByCodePoints(sample);
    }
}