Refine search
INVALID_FUNCTION_ARGUMENT, "Target length must be in the range [0.." + Integer.MAX_VALUE + "]"); checkCondition(padString.length() > 0, INVALID_FUNCTION_ARGUMENT, "Padding string must not be empty"); int textLength = countCodePoints(text); int resultLength = (int) targetLength; return SliceUtf8.substring(text, 0, resultLength); int padStringLength = countCodePoints(padString); int[] padStringCounts = new int[padStringLength]; for (int i = 0; i < padStringLength; ++i) { padStringCounts[i] = lengthOfCodePointSafe(padString, offsetOfCodePoint(padString, i)); int bufferSize = text.length(); for (int i = 0; i < resultLength - textLength; ++i) { bufferSize += padStringCounts[i % padStringLength]; Slice buffer = Slices.allocate(bufferSize); int countBytes = bufferSize - text.length(); int startPointOfExistingText = (paddingOffset + countBytes) % bufferSize; buffer.setBytes(startPointOfExistingText, text);
/**
 * Counts the code points in {@code slice}, walking it one code point at a time.
 *
 * @param slice the bytes to scan
 * @return the number of code points found
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} as soon as
 *         {@code tryGetCodePointAt} reports a negative value for the current position
 */
private static int safeCountCodePoints(Slice slice)
{
    int count = 0;
    int offset = 0;
    while (offset < slice.length()) {
        int decoded = tryGetCodePointAt(slice, offset);
        if (decoded < 0) {
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding in characters: " + slice.toStringUtf8());
        }
        // Advance by the encoded width of the code point that was just decoded.
        offset += lengthOfCodePoint(decoded);
        count++;
    }
    return count;
}
@Test public void testGetMinSlice() Slice minSlice = utf8Slice(""); continue; Slice value = codePointToUtf8(codePoint); if (findStringStatisticTruncationPositionForOriginalOrcWriter(value) == value.length()) { assertEquals(minStringTruncateToValidRange(value, ORIGINAL), value); assertEquals(minStringTruncateToValidRange(value, ORIGINAL), minSlice); Slice prefix = utf8Slice("apple"); for (int codePoint = startCodePoint; codePoint < endCodePoint; codePoint++) { if (MIN_SURROGATE <= codePoint && codePoint <= MAX_SURROGATE) { continue; Slice value = concatSlice(prefix, codePointToUtf8(codePoint)); if (findStringStatisticTruncationPositionForOriginalOrcWriter(value) == value.length()) { assertEquals(minStringTruncateToValidRange(value, ORIGINAL), value);
/**
 * Decodes {@code slice} from start to end and returns its code points, in order,
 * as an immutable list.
 *
 * @param slice the bytes to decode
 * @return the decoded code points in slice order
 */
private static List<Integer> toCodePoints(Slice slice)
{
    ImmutableList.Builder<Integer> result = ImmutableList.builder();
    int position = 0;
    while (position < slice.length()) {
        int decoded = getCodePointAt(slice, position);
        // Step over the sequence that starts at the current position.
        position += lengthOfCodePoint(slice, position);
        result.add(decoded);
    }
    return result.build();
}
/**
 * Decodes {@code slice} into an array of code points. The array is sized up front
 * with {@code safeCountCodePoints}, then filled by decoding the slice left to right.
 *
 * @param slice the bytes to decode
 * @return one array element per code point, in slice order
 */
private static int[] castToCodePoints(Slice slice)
{
    int[] result = new int[safeCountCodePoints(slice)];
    int offset = 0;
    int next = 0;
    while (next < result.length) {
        result[next] = getCodePointAt(slice, offset);
        offset += lengthOfCodePoint(slice, offset);
        next++;
    }
    return result;
}
@Test public void testToStringStatistics() assertNull(DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORIGINAL, DwrfProto.StringStatistics.newBuilder() assertEquals( DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, assertEquals( DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORIGINAL, .build(), true), new StringStatistics(null, Slices.utf8Slice("cat"), 0)); .build(), true), new StringStatistics(Slices.utf8Slice("ant"), Slices.utf8Slice("cat"), 79)); Slice codePoint = codePointToUtf8(testCodePoint); for (Slice suffix : ALL_UTF8_SEQUENCES) { Slice testValue = concatSlice(prefix, codePoint, suffix);
/**
 * Reading the code point at offset 1 of {@code 'x', START_3_BYTE, CONTINUATION_BYTE}
 * must fail with "UTF-8 sequence truncated".
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
public void testCodePointAtTruncated3()
{
    Slice truncated = wrappedBuffer((byte) 'x', START_3_BYTE, CONTINUATION_BYTE);
    getCodePointAt(truncated, 1);
}
/**
 * Checks the code point length helpers against the JDK's UTF-8 encoder for every
 * entry of {@code ALL_CODE_POINTS}, and checks that {@code lengthOfCodePointSafe}
 * reports the full sequence length for every entry of {@code INVALID_SEQUENCES}.
 */
@Test
public void testLengthOfCodePoint()
{
    // Lengths derived from the start byte alone.
    assertEquals(lengthOfCodePointFromStartByte(START_1_BYTE), 1);
    assertEquals(lengthOfCodePointFromStartByte(START_2_BYTE), 2);
    assertEquals(lengthOfCodePointFromStartByte(START_3_BYTE), 3);
    assertEquals(lengthOfCodePointFromStartByte(START_4_BYTE), 4);

    for (int codePoint : ALL_CODE_POINTS) {
        String text = new String(new int[] {codePoint}, 0, 1);
        assertEquals(text.codePoints().count(), 1);

        // The JDK encoding of the single code point is the reference value.
        Slice utf8 = wrappedBuffer(text.getBytes(UTF_8));
        int encodedLength = utf8.length();

        assertEquals(lengthOfCodePoint(codePoint), encodedLength);
        assertEquals(lengthOfCodePoint(utf8, 0), encodedLength);
        assertEquals(lengthOfCodePointSafe(utf8, 0), encodedLength);
        assertEquals(lengthOfCodePointFromStartByte(utf8.getByte(0)), encodedLength);

        assertEquals(getCodePointAt(utf8, 0), codePoint);
        assertEquals(getCodePointBefore(utf8, encodedLength), codePoint);

        assertEquals(codePointToUtf8(codePoint), utf8);
    }

    for (byte[] invalid : INVALID_SEQUENCES) {
        // The invalid sequence on its own, preceded by 'x', and followed by 'x'.
        Slice alone = wrappedBuffer(invalid);
        assertEquals(lengthOfCodePointSafe(alone, 0), invalid.length);

        Slice afterX = wrappedBuffer(concat(new byte[] {'x'}, invalid));
        assertEquals(lengthOfCodePointSafe(afterX, 1), invalid.length);

        Slice beforeX = wrappedBuffer(concat(invalid, new byte[] {'x'}));
        assertEquals(lengthOfCodePointSafe(beforeX, 0), invalid.length);
    }
}
/**
 * Exercises code point counting and offset lookup on invalid UTF-8 input.
 * The asserted values are not meant to be "correct" answers; they only pin
 * down that the functions produce a harmless result.
 */
@Test
public void testInvalidUtf8()
{
    Slice first = wrappedBuffer(INVALID_UTF8_1);
    Slice second = wrappedBuffer(INVALID_UTF8_2);

    assertEquals(countCodePoints(first), 0);
    assertEquals(countCodePoints(second), 3);

    // Expected offsets, indexed by the code point index passed to offsetOfCodePoint.
    int[] firstOffsets = {0, -1};
    for (int index = 0; index < firstOffsets.length; index++) {
        assertEquals(offsetOfCodePoint(first, index), firstOffsets[index]);
    }

    int[] secondOffsets = {0, 2, 3, -1};
    for (int index = 0; index < secondOffsets.length; index++) {
        assertEquals(offsetOfCodePoint(second, index), secondOffsets[index]);
    }
}
private static void testMaxStringTruncateAtFirstReplacementCharacter(Slice prefix, Slice suffix) { for (int testCodePoint : TEST_CODE_POINTS) { Slice codePoint = codePointToUtf8(testCodePoint); Slice value = concatSlice(prefix, codePoint, suffix); assertEquals(maxStringTruncateToValidRange(value, ORC_HIVE_8732), value); // For ORIGINAL, skip prefixes that truncate if (prefix.equals(maxStringTruncateToValidRange(prefix, ORIGINAL))) { if (testCodePoint == REPLACEMENT_CHARACTER_CODE_POINT || testCodePoint >= MIN_SUPPLEMENTARY_CODE_POINT) { // truncate at test code point assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), concatSlice(prefix, wrappedBuffer((byte) 0xFF))); } else { // truncate in suffix (if at all) assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), concatSlice(prefix, codePoint, maxStringTruncateToValidRange(suffix, ORIGINAL))); } } } }
/**
 * Asking for the encoded length of code point {@code -1} must fail with
 * "Invalid code point 0xFFFFFFFF".
 */
@Test(expectedExceptions = InvalidCodePointException.class, expectedExceptionsMessageRegExp = "Invalid code point 0xFFFFFFFF")
public void testLengthOfNegativeCodePoint()
{
    int negativeCodePoint = -1;
    lengthOfCodePoint(negativeCodePoint);
}
/**
 * Asserts that {@code reverse} applied to the UTF-8 slice of {@code string} equals
 * the UTF-8 encoding of the string's code points in reverse order.
 */
private static void assertReverse(String string)
{
    Slice actual = reverse(utf8Slice(string));

    // Build the expectation by reversing code points (not chars) with the JDK and Guava.
    int[] forward = string.codePoints().toArray();
    int[] backward = Ints.toArray(Lists.reverse(Ints.asList(forward)));
    String reversedText = new String(backward, 0, backward.length);
    Slice expected = wrappedBuffer(reversedText.getBytes(UTF_8));

    assertEquals(actual, expected);
}
/**
 * Asserts that {@code reverse} keeps {@code invalidSequence} byte-for-byte intact
 * while reversing the ASCII text around it. Four layouts are checked: the sequence
 * alone, after "abc", before "xyz", and between "abc" and "xyz".
 */
private static void assertReverseWithInvalidSequence(byte[] invalidSequence)
{
    byte[] abc = {'a', 'b', 'c'};
    byte[] cba = {'c', 'b', 'a'};
    byte[] xyz = {'x', 'y', 'z'};
    byte[] zyx = {'z', 'y', 'x'};

    // Sequence alone: unchanged.
    Slice alone = wrappedBuffer(invalidSequence);
    assertEquals(reverse(alone), wrappedBuffer(invalidSequence));

    // Text before the sequence.
    Slice trailing = wrappedBuffer(concat(abc, invalidSequence));
    assertEquals(reverse(trailing), wrappedBuffer(concat(invalidSequence, cba)));

    // Text after the sequence.
    Slice leading = wrappedBuffer(concat(invalidSequence, xyz));
    assertEquals(reverse(leading), wrappedBuffer(concat(zyx, invalidSequence)));

    // Text on both sides of the sequence.
    Slice surrounded = wrappedBuffer(concat(abc, invalidSequence, xyz));
    assertEquals(reverse(surrounded), wrappedBuffer(concat(zyx, invalidSequence, cba)));
}
/**
 * Asserts that {@code fixInvalidUtf8(testSlice)} equals {@code expectedSlice}.
 */
private static void assertFixInvalidUtf8(Slice testSlice, Slice expectedSlice)
{
    Slice fixed = fixInvalidUtf8(testSlice);
    assertEquals(fixed, expectedSlice);
}
/**
 * Asserts that {@code countCodePoints} on the UTF-8 slice of {@code string} agrees
 * with the JDK's own code point count for the same string.
 */
private static void assertCodePointCount(String string)
{
    int actual = countCodePoints(utf8Slice(string));
    long expected = string.codePoints().count();
    assertEquals(actual, expected);
}
private static void testMinStringTruncateAtFirstReplacementCharacter(Slice prefix, Slice suffix) { for (int testCodePoint : TEST_CODE_POINTS) { Slice codePoint = codePointToUtf8(testCodePoint); Slice value = concatSlice(prefix, codePoint, suffix); assertEquals(minStringTruncateToValidRange(value, ORC_HIVE_8732), value); // For ORIGINAL, skip prefixes that truncate if (prefix.equals(minStringTruncateToValidRange(prefix, ORIGINAL))) { if (testCodePoint == REPLACEMENT_CHARACTER_CODE_POINT || testCodePoint >= MIN_SUPPLEMENTARY_CODE_POINT) { // truncate at test code point assertEquals(minStringTruncateToValidRange(value, ORIGINAL), prefix); } else { // truncate in suffix (if at all) assertEquals(minStringTruncateToValidRange(value, ORIGINAL), concatSlice(prefix, codePoint, minStringTruncateToValidRange(suffix, ORIGINAL))); } } } }
/**
 * Converts a partition value to a UTF-8 slice after checking it against the
 * length of the varchar column type.
 *
 * @param value the partition value
 * @param name the partition key name, used only in the error message
 * @param columnType the column type; it is cast to {@code VarcharType}
 * @return {@code value} encoded as a UTF-8 slice
 * @throws PrestoException with {@code HIVE_INVALID_PARTITION_VALUE} when the value
 *         has more code points than {@code VarcharType.getLength()} allows
 */
public static Slice varcharPartitionKey(String value, String name, Type columnType)
{
    Slice encoded = Slices.utf8Slice(value);
    VarcharType declaredType = (VarcharType) columnType;

    // The limit is compared against code points, not bytes.
    boolean fits = SliceUtf8.countCodePoints(encoded) <= declaredType.getLength();
    if (fits) {
        return encoded;
    }

    String message = format("Invalid partition value '%s' for %s partition key: %s", value, columnType.toString(), name);
    throw new PrestoException(HIVE_INVALID_PARTITION_VALUE, message);
}
if (search.length() == 0) { Slice buffer = Slices.allocate((countCodePoints(str) + 1) * replace.length() + str.length()); buffer.setBytes(0, replace); while (index < str.length()) { int codePointLength = lengthOfCodePointSafe(str, index); buffer.setBytes(indexBuffer, str, index, codePointLength); Slice buffer = Slices.allocate(str.length()); int bytesToCopy = str.length() - index; buffer = Slices.ensureSize(buffer, indexBuffer + bytesToCopy); buffer.setBytes(indexBuffer, str, index, bytesToCopy); indexBuffer += bytesToCopy;
if (delimiter.length() == 0) { int startCodePoint = toIntExact(index); int indexStart = offsetOfCodePoint(string, startCodePoint - 1); if (indexStart < 0) { int length = lengthOfCodePoint(string, indexStart); if (indexStart + length > string.length()) { throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding"); return string.slice(indexStart, length);
public static Slice padSpaces(Slice slice, int length) { int textLength = countCodePoints(slice); // if our string is bigger than requested then truncate if (textLength > length) { throw new IllegalArgumentException("pad length is smaller than slice length"); } // if our target length is the same as our string then return our string if (textLength == length) { return slice; } // preallocate the result int bufferSize = slice.length() + length - textLength; Slice buffer = Slices.allocate(bufferSize); // fill in the existing string buffer.setBytes(0, slice); // fill padding spaces for (int i = slice.length(); i < bufferSize; ++i) { buffer.setByte(i, ' '); } return buffer; }