/**
 * Collects the code points of {@code slice} into an immutable list, in the order they appear.
 *
 * <p>Each code point is read with {@code getCodePointAt} and the cursor is then moved forward by
 * {@code lengthOfCodePoint} for the same position, until the end of the slice is reached.
 *
 * @param slice the bytes to walk; presumably UTF-8 encoded text (confirm against callers)
 * @return an immutable list holding one boxed entry per code point read
 */
private static List<Integer> toCodePoints(Slice slice)
{
    ImmutableList.Builder<Integer> result = ImmutableList.builder();
    int position = 0;
    while (position < slice.length()) {
        int decoded = getCodePointAt(slice, position);
        // Advance by the encoded width of the code point just read.
        position += lengthOfCodePoint(slice, position);
        result.add(decoded);
    }
    return result.build();
}
/**
 * Reads the code points of {@code slice} into a primitive array.
 *
 * <p>The array is sized by {@code safeCountCodePoints(slice)}; exactly that many code points are
 * then read from the start of the slice.
 *
 * @param slice the bytes to read from
 * @return an array with one entry per counted code point
 */
private static int[] castToCodePoints(Slice slice)
{
    int count = safeCountCodePoints(slice);
    int[] result = new int[count];
    int offset = 0;
    int filled = 0;
    while (filled < result.length) {
        result[filled] = getCodePointAt(slice, offset);
        // Step over the bytes of the code point that was just stored.
        offset += lengthOfCodePoint(slice, offset);
        filled++;
    }
    return result;
}
/**
 * SQL scalar function {@code codepoint}: returns the code point of a one-character string.
 *
 * @param slice the input string; it must contain exactly one code point
 * @return the code point found at offset 0
 */
@Description("returns Unicode code point of a single character string")
@ScalarFunction("codepoint")
@SqlType(StandardTypes.INTEGER)
public static long codepoint(@SqlType("varchar(1)") Slice slice)
{
    // Empty input and multi-character input are both rejected by this check.
    boolean singleCharacter = countCodePoints(slice) == 1;
    checkCondition(singleCharacter, INVALID_FUNCTION_ARGUMENT, "Input string must be a single character string");
    return getCodePointAt(slice, 0);
}
/**
 * SQL scalar function that decodes UTF-8 bytes using a caller-supplied replacement character.
 *
 * <p>The replacement string may be empty or hold exactly one code point. When it is empty, an
 * empty {@link OptionalInt} is handed to {@code SliceUtf8.fixInvalidUtf8}; presumably invalid
 * sequences are then dropped rather than replaced (confirm against the SliceUtf8 documentation).
 *
 * @param slice the raw bytes to decode
 * @param replacementCharacter empty, or the single character to use for invalid sequences
 * @return the result of {@code SliceUtf8.fixInvalidUtf8} for the given bytes and replacement
 * @throws PrestoException with {@code INVALID_FUNCTION_ARGUMENT} if the replacement string has
 *         more than one code point or its code point cannot be decoded
 */
@Description("decodes the UTF-8 encoded string")
@ScalarFunction
@LiteralParameters("x")
@SqlType(StandardTypes.VARCHAR)
public static Slice fromUtf8(@SqlType(StandardTypes.VARBINARY) Slice slice, @SqlType("varchar(x)") Slice replacementCharacter)
{
    int count = countCodePoints(replacementCharacter);
    if (count > 1) {
        throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Replacement character string must be empty or a single character");
    }

    OptionalInt replacementCodePoint;
    if (count == 1) {
        try {
            replacementCodePoint = OptionalInt.of(getCodePointAt(replacementCharacter, 0));
        }
        catch (InvalidUtf8Exception e) {
            // Keep the decoding failure as the cause so the root problem stays visible in stack traces.
            throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid replacement character", e);
        }
    }
    else {
        replacementCodePoint = OptionalInt.empty();
    }
    return SliceUtf8.fixInvalidUtf8(slice, replacementCodePoint);
}
/**
 * Builds an immutable list of the code points found in {@code slice}, front to back.
 *
 * <p>The loop reads one code point with {@code getCodePointAt}, moves the cursor forward by
 * {@code lengthOfCodePoint} at that cursor, and stops once the cursor reaches the slice length.
 *
 * @param slice the bytes to walk; presumably UTF-8 encoded text (confirm against callers)
 * @return an immutable list with one boxed entry per code point read
 */
private static List<Integer> toCodePoints(Slice slice)
{
    ImmutableList.Builder<Integer> collected = ImmutableList.builder();
    int cursor = 0;
    while (cursor < slice.length()) {
        int value = getCodePointAt(slice, cursor);
        // Skip past the bytes that encode the value just read.
        cursor += lengthOfCodePoint(slice, cursor);
        collected.add(value);
    }
    return collected.build();
}
/**
 * Returns the code points of {@code slice} as an immutable list, preserving their order.
 *
 * <p>Reading uses {@code getCodePointAt}; the byte offset is advanced with
 * {@code lengthOfCodePoint} after every read until it reaches {@code slice.length()}.
 *
 * @param slice the bytes to walk; presumably UTF-8 encoded text (confirm against callers)
 * @return an immutable list with one boxed entry per code point read
 */
private static List<Integer> toCodePoints(Slice slice)
{
    ImmutableList.Builder<Integer> builder = ImmutableList.builder();
    int byteOffset = 0;
    while (byteOffset < slice.length()) {
        int current = getCodePointAt(slice, byteOffset);
        // Move to the first byte of the next code point.
        byteOffset += lengthOfCodePoint(slice, byteOffset);
        builder.add(current);
    }
    return builder.build();
}
/**
 * Converts {@code slice} into an array of code points.
 *
 * <p>{@code safeCountCodePoints(slice)} decides the array length, and that many code points are
 * read sequentially starting at byte offset 0.
 *
 * @param slice the bytes to read from
 * @return an array with one entry per counted code point
 */
private static int[] castToCodePoints(Slice slice)
{
    int total = safeCountCodePoints(slice);
    int[] values = new int[total];
    int byteOffset = 0;
    int next = 0;
    while (next < values.length) {
        values[next] = getCodePointAt(slice, byteOffset);
        // Advance by the encoded width of the code point just stored.
        byteOffset += lengthOfCodePoint(slice, byteOffset);
        next++;
    }
    return values;
}
/**
 * Produces a primitive array holding the code points of {@code slice}.
 *
 * <p>The array length comes from {@code safeCountCodePoints(slice)}; the loop fills every entry
 * by reading consecutive code points from the beginning of the slice.
 *
 * @param slice the bytes to read from
 * @return an array with one entry per counted code point
 */
private static int[] castToCodePoints(Slice slice)
{
    int length = safeCountCodePoints(slice);
    int[] decoded = new int[length];
    int cursor = 0;
    int slot = 0;
    while (slot < decoded.length) {
        decoded[slot] = getCodePointAt(slice, cursor);
        // Jump to the start of the following code point.
        cursor += lengthOfCodePoint(slice, cursor);
        slot++;
    }
    return decoded;
}
/**
 * Decoding at offset 1 must fail with "UTF-8 sequence truncated" when the 3-byte start byte is
 * followed by only one continuation byte.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
public void testCodePointAtTruncated3()
{
    // 'x' occupies offset 0, so the incomplete sequence starts at offset 1.
    byte[] bytes = {(byte) 'x', START_3_BYTE, CONTINUATION_BYTE};
    getCodePointAt(wrappedBuffer(bytes), 1);
}
/**
 * A 3-byte start byte with a single continuation byte after it is expected to be reported as
 * "UTF-8 sequence truncated" when read at offset 1.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
public void testCodePointAtTruncated3()
{
    // The leading 'x' pushes the incomplete sequence to offset 1.
    byte[] input = {(byte) 'x', START_3_BYTE, CONTINUATION_BYTE};
    getCodePointAt(wrappedBuffer(input), 1);
}
/**
 * Reading at a 5-byte start byte must fail with "Illegal start 0xFB of code point", even though
 * four continuation bytes follow it.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFB of code point")
public void testCodePointAt5ByteSequence()
{
    // 'x' occupies offset 0; the rejected start byte sits at offset 1.
    byte[] bytes = {
            (byte) 'x',
            START_5_BYTE,
            CONTINUATION_BYTE,
            CONTINUATION_BYTE,
            CONTINUATION_BYTE,
            CONTINUATION_BYTE};
    getCodePointAt(wrappedBuffer(bytes), 1);
}
/**
 * Decoding at offset 1 must fail with "UTF-8 sequence truncated" when the 2-byte start byte is
 * the last byte of the buffer.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
public void testCodePointAtTruncated2()
{
    // 'x' occupies offset 0; the start byte at offset 1 has nothing after it.
    byte[] bytes = {(byte) 'x', START_2_BYTE};
    getCodePointAt(wrappedBuffer(bytes), 1);
}
/**
 * Decoding at offset 1 must fail with "UTF-8 sequence truncated" when the 4-byte start byte is
 * followed by only two continuation bytes.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
public void testCodePointAtTruncated4()
{
    // 'x' occupies offset 0, so the incomplete sequence starts at offset 1.
    byte[] bytes = {(byte) 'x', START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE};
    getCodePointAt(wrappedBuffer(bytes), 1);
}
/**
 * A 5-byte start byte is expected to be rejected with "Illegal start 0xFB of code point" when
 * read at offset 1, regardless of the continuation bytes that follow.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "Illegal start 0xFB of code point")
public void testCodePointAt5ByteSequence()
{
    // The leading 'x' places the rejected start byte at offset 1.
    byte[] input = {
            (byte) 'x',
            START_5_BYTE,
            CONTINUATION_BYTE,
            CONTINUATION_BYTE,
            CONTINUATION_BYTE,
            CONTINUATION_BYTE};
    getCodePointAt(wrappedBuffer(input), 1);
}
/**
 * A 4-byte start byte with only two continuation bytes after it is expected to be reported as
 * "UTF-8 sequence truncated" when read at offset 1.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
public void testCodePointAtTruncated4()
{
    // The leading 'x' pushes the incomplete sequence to offset 1.
    byte[] input = {(byte) 'x', START_4_BYTE, CONTINUATION_BYTE, CONTINUATION_BYTE};
    getCodePointAt(wrappedBuffer(input), 1);
}
/**
 * A 2-byte start byte at the very end of the buffer is expected to be reported as
 * "UTF-8 sequence truncated" when read at offset 1.
 */
@Test(expectedExceptions = InvalidUtf8Exception.class, expectedExceptionsMessageRegExp = "UTF-8 sequence truncated")
public void testCodePointAtTruncated2()
{
    // The leading 'x' places the dangling start byte at offset 1.
    byte[] input = {(byte) 'x', START_2_BYTE};
    getCodePointAt(wrappedBuffer(input), 1);
}
/**
 * SQL scalar function {@code codepoint}: maps a one-character string to its code point.
 *
 * @param slice the input string; it must contain exactly one code point
 * @return the code point found at offset 0
 */
@Description("returns Unicode code point of a single character string")
@ScalarFunction("codepoint")
@SqlType(StandardTypes.INTEGER)
public static long codepoint(@SqlType("varchar(1)") Slice slice)
{
    // Any count other than one, including zero, fails the check.
    boolean exactlyOne = countCodePoints(slice) == 1;
    checkCondition(exactlyOne, INVALID_FUNCTION_ARGUMENT, "Input string must be a single character string");
    return getCodePointAt(slice, 0);
}
/**
 * SQL scalar function {@code codepoint}: yields the code point of a string that holds a single
 * character.
 *
 * @param slice the input string; it must contain exactly one code point
 * @return the code point found at offset 0
 */
@Description("returns Unicode code point of a single character string")
@ScalarFunction("codepoint")
@SqlType(StandardTypes.INTEGER)
public static long codepoint(@SqlType("varchar(1)") Slice slice)
{
    // Validate the length first; getCodePointAt is only reached for one-character input.
    boolean hasOneCodePoint = countCodePoints(slice) == 1;
    checkCondition(hasOneCodePoint, INVALID_FUNCTION_ARGUMENT, "Input string must be a single character string");
    return getCodePointAt(slice, 0);
}
/**
 * Checks the code point length helpers against the JDK's UTF-8 encoder for every entry in
 * {@code ALL_CODE_POINTS}, and checks {@code lengthOfCodePointSafe} on each entry of
 * {@code INVALID_SEQUENCES}.
 */
@Test
public void testLengthOfCodePoint()
{
    // Each start-byte constant must report the sequence length it announces.
    assertEquals(lengthOfCodePointFromStartByte(START_1_BYTE), 1);
    assertEquals(lengthOfCodePointFromStartByte(START_2_BYTE), 2);
    assertEquals(lengthOfCodePointFromStartByte(START_3_BYTE), 3);
    assertEquals(lengthOfCodePointFromStartByte(START_4_BYTE), 4);

    for (int codePoint : ALL_CODE_POINTS) {
        String text = new String(new int[] {codePoint}, 0, 1);
        assertEquals(text.codePoints().count(), 1);

        // The JDK's UTF-8 encoding of the code point is the reference for every length check.
        Slice encoded = wrappedBuffer(text.getBytes(UTF_8));
        int encodedLength = encoded.length();

        assertEquals(lengthOfCodePoint(codePoint), encodedLength);
        assertEquals(lengthOfCodePoint(encoded, 0), encodedLength);
        assertEquals(lengthOfCodePointSafe(encoded, 0), encodedLength);
        assertEquals(lengthOfCodePointFromStartByte(encoded.getByte(0)), encodedLength);

        // Decoding forwards from the start and backwards from the end, then re-encoding, must round-trip.
        assertEquals(getCodePointAt(encoded, 0), codePoint);
        assertEquals(getCodePointBefore(encoded, encodedLength), codePoint);
        assertEquals(codePointToUtf8(codePoint), encoded);
    }

    for (byte[] sequence : INVALID_SEQUENCES) {
        int expectedLength = sequence.length;

        // The reported length must equal the sequence length alone, after an 'x', and before an 'x'.
        Slice alone = wrappedBuffer(sequence);
        assertEquals(lengthOfCodePointSafe(alone, 0), expectedLength);

        Slice prefixed = wrappedBuffer(concat(new byte[] {'x'}, sequence));
        assertEquals(lengthOfCodePointSafe(prefixed, 1), expectedLength);

        Slice suffixed = wrappedBuffer(concat(sequence, new byte[] {'x'}));
        assertEquals(lengthOfCodePointSafe(suffixed, 0), expectedLength);
    }
}
/**
 * Verifies the code point length helpers: start-byte lengths first, then every entry of
 * {@code ALL_CODE_POINTS} against the JDK's UTF-8 encoding, then {@code lengthOfCodePointSafe}
 * on each entry of {@code INVALID_SEQUENCES}.
 */
@Test
public void testLengthOfCodePoint()
{
    // Start bytes for 1- to 4-byte sequences.
    assertEquals(lengthOfCodePointFromStartByte(START_1_BYTE), 1);
    assertEquals(lengthOfCodePointFromStartByte(START_2_BYTE), 2);
    assertEquals(lengthOfCodePointFromStartByte(START_3_BYTE), 3);
    assertEquals(lengthOfCodePointFromStartByte(START_4_BYTE), 4);

    for (int codePoint : ALL_CODE_POINTS) {
        String single = new String(new int[] {codePoint}, 0, 1);
        assertEquals(single.codePoints().count(), 1);

        // Reference bytes come from the JDK encoder.
        Slice reference = wrappedBuffer(single.getBytes(UTF_8));
        int byteCount = reference.length();

        assertEquals(lengthOfCodePoint(codePoint), byteCount);
        assertEquals(lengthOfCodePoint(reference, 0), byteCount);
        assertEquals(lengthOfCodePointSafe(reference, 0), byteCount);
        assertEquals(lengthOfCodePointFromStartByte(reference.getByte(0)), byteCount);

        // Forward decode, backward decode, and re-encode must all agree with the reference.
        assertEquals(getCodePointAt(reference, 0), codePoint);
        assertEquals(getCodePointBefore(reference, byteCount), codePoint);
        assertEquals(codePointToUtf8(codePoint), reference);
    }

    for (byte[] sequence : INVALID_SEQUENCES) {
        int sequenceLength = sequence.length;

        // Standalone, preceded by 'x', and followed by 'x': the reported length is always the sequence length.
        Slice bare = wrappedBuffer(sequence);
        assertEquals(lengthOfCodePointSafe(bare, 0), sequenceLength);

        Slice withPrefix = wrappedBuffer(concat(new byte[] {'x'}, sequence));
        assertEquals(lengthOfCodePointSafe(withPrefix, 1), sequenceLength);

        Slice withSuffix = wrappedBuffer(concat(sequence, new byte[] {'x'}));
        assertEquals(lengthOfCodePointSafe(withSuffix, 0), sequenceLength);
    }
}