/**
 * Supplies the metadata reader for this implementation.
 *
 * @return a newly constructed {@link OrcMetadataReader}; a fresh instance is created on every call
 */
@Override
public MetadataReader createMetadataReader() {
    MetadataReader reader = new OrcMetadataReader();
    return reader;
}
statistics.getNumberOfValues(), minAverageValueBytes, statistics.hasBucketStatistics() ? toBooleanStatistics(statistics.getBucketStatistics()) : null, statistics.hasIntStatistics() ? toIntegerStatistics(statistics.getIntStatistics()) : null, statistics.hasDoubleStatistics() ? toDoubleStatistics(statistics.getDoubleStatistics()) : null, statistics.hasStringStatistics() ? toStringStatistics(hiveWriterVersion, statistics.getStringStatistics(), isRowGroup) : null, statistics.hasDateStatistics() ? toDateStatistics(hiveWriterVersion, statistics.getDateStatistics(), isRowGroup) : null, statistics.hasDecimalStatistics() ? toDecimalStatistics(statistics.getDecimalStatistics()) : null, statistics.hasBinaryStatistics() ? toBinaryStatistics(statistics.getBinaryStatistics()) : null, null);
if (findStringStatisticTruncationPositionForOriginalOrcWriter(value) == value.length()) { assertEquals(minStringTruncateToValidRange(value, ORIGINAL), value); assertEquals(minStringTruncateToValidRange(value, ORIGINAL), minSlice); if (findStringStatisticTruncationPositionForOriginalOrcWriter(value) == value.length()) { assertEquals(minStringTruncateToValidRange(value, ORIGINAL), value); assertEquals(minStringTruncateToValidRange(value, ORIGINAL), prefix);
/**
 * Converts DWRF protobuf string statistics into a {@link StringStatistics}.
 *
 * <p>Statistics written by the {@code ORIGINAL} writer are only used at row-group level;
 * for any other scope this method yields {@code null}. A bound that is absent in the protobuf
 * message is reported as {@code null}; a present bound is converted to a slice and passed
 * through the matching min/max truncation helper for the given writer version. An absent sum
 * is reported as 0.
 *
 * @param hiveWriterVersion writer version that produced the statistics
 * @param stringStatistics protobuf statistics to convert
 * @param isRowGroup whether the statistics describe a row group
 * @return the converted statistics, or {@code null} when they must be ignored
 */
@VisibleForTesting
static StringStatistics toStringStatistics(HiveWriterVersion hiveWriterVersion, DwrfProto.StringStatistics stringStatistics, boolean isRowGroup) {
    boolean usable = isRowGroup || hiveWriterVersion != ORIGINAL;
    if (!usable) {
        return null;
    }

    // Maximum is resolved before minimum, matching the established order.
    Slice maximum = null;
    if (stringStatistics.hasMaximum()) {
        Slice rawMaximum = byteStringToSlice(stringStatistics.getMaximumBytes());
        maximum = maxStringTruncateToValidRange(rawMaximum, hiveWriterVersion);
    }

    Slice minimum = null;
    if (stringStatistics.hasMinimum()) {
        Slice rawMinimum = byteStringToSlice(stringStatistics.getMinimumBytes());
        minimum = minStringTruncateToValidRange(rawMinimum, hiveWriterVersion);
    }

    long sum = 0;
    if (stringStatistics.hasSum()) {
        sum = stringStatistics.getSum();
    }

    return new StringStatistics(minimum, maximum, sum);
}
/**
 * Builds the {@link StringStatistics} a test expects for the given writer version.
 *
 * <p>The supplied bounds are run through the same min/max truncation helpers that the reader
 * applies, so the expectation accounts for any version-specific truncation.
 *
 * @param version writer version to truncate for
 * @param min untruncated expected minimum
 * @param max untruncated expected maximum
 * @param sum expected sum
 * @return statistics holding the truncated bounds and the given sum
 */
private static StringStatistics createExpectedStringStatistics(HiveWriterVersion version, Slice min, Slice max, int sum) {
    Slice expectedMinimum = minStringTruncateToValidRange(min, version);
    Slice expectedMaximum = maxStringTruncateToValidRange(max, version);
    return new StringStatistics(expectedMinimum, expectedMaximum, sum);
}
} // closes an enclosing scope whose opening lies outside this view
/**
 * Converts ORC protobuf column statistics into a {@link ColumnStatistics}.
 *
 * <p>Each typed statistics message is handed to its converter unconditionally; only the string
 * and date converters also receive the row-group flag.
 *
 * @param statistics protobuf column statistics to convert
 * @param isRowGroup whether the statistics describe a row group
 * @return the converted column statistics
 */
private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) {
    // Arguments are computed in the same left-to-right order as the constructor expects them.
    long valueCount = statistics.getNumberOfValues();
    BooleanStatistics booleanStatistics = toBooleanStatistics(statistics.getBucketStatistics());
    IntegerStatistics integerStatistics = toIntegerStatistics(statistics.getIntStatistics());
    DoubleStatistics doubleStatistics = toDoubleStatistics(statistics.getDoubleStatistics());
    StringStatistics stringStatistics = toStringStatistics(statistics.getStringStatistics(), isRowGroup);
    DateStatistics dateStatistics = toDateStatistics(statistics.getDateStatistics(), isRowGroup);

    return new ColumnStatistics(
            valueCount,
            booleanStatistics,
            integerStatistics,
            doubleStatistics,
            stringStatistics,
            dateStatistics);
}
if (firstSurrogateCharacter(value) == -1) { assertEquals(getMaxSlice(value), Slices.utf8Slice(value)); assertEquals(getMaxSlice(value), maxByte); Slice maxSlice = concatSlices(Slices.utf8Slice(prefix), maxByte); for (int i = startCodePoint; i < endCodePoint; i++) { String value = prefix + new String(new int[] { i }, 0, 1); if (firstSurrogateCharacter(value) == -1) { assertEquals(getMaxSlice(value), Slices.utf8Slice(value)); assertEquals(getMaxSlice(value), maxSlice);
// Decode the bloom filter indexes from inputStream using a freshly constructed OrcMetadataReader.
OrcMetadataReader metadataReader = new OrcMetadataReader(); List<HiveBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);
private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) { // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 if (!isRowGroup) { return null; } if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { return null; } Slice minimum = stringStatistics.hasMinimum() ? getMinSlice(stringStatistics.getMinimum()) : null; Slice maximum = stringStatistics.hasMaximum() ? getMaxSlice(stringStatistics.getMaximum()) : null; return new StringStatistics(minimum, maximum); }
assertNull(OrcMetadataReader.toStringStatistics( ORIGINAL, OrcProto.StringStatistics.newBuilder() OrcMetadataReader.toStringStatistics( ORC_HIVE_8732, OrcProto.StringStatistics.newBuilder() OrcMetadataReader.toStringStatistics( ORIGINAL, OrcProto.StringStatistics.newBuilder() OrcMetadataReader.toStringStatistics( ORC_HIVE_8732, OrcProto.StringStatistics.newBuilder() new StringStatistics(utf8Slice("ant"), null, 0)); assertEquals( OrcMetadataReader.toStringStatistics( ORC_HIVE_8732, OrcProto.StringStatistics.newBuilder() OrcMetadataReader.toStringStatistics( ORC_HIVE_8732, OrcProto.StringStatistics.newBuilder()
if (findStringStatisticTruncationPositionForOriginalOrcWriter(value) == value.length()) { assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), value); assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), maxByte); if (findStringStatisticTruncationPositionForOriginalOrcWriter(value) == value.length()) { assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), value); assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), maxSlice);
@VisibleForTesting public static Slice getMaxSlice(String maximum) { if (maximum == null) { return null; } int index = firstSurrogateCharacter(maximum); if (index == -1) { return Slices.utf8Slice(maximum); } // Append 0xFF so that it is larger than maximum return concatSlices(Slices.utf8Slice(maximum.substring(0, index)), MAX_BYTE); }
@Test public void testGetMinSlice() throws Exception { int startCodePoint = MIN_CODE_POINT; int endCodePoint = MAX_CODE_POINT; Slice minSlice = Slices.utf8Slice(""); for (int i = startCodePoint; i < endCodePoint; i++) { String value = new String(new int[] { i }, 0, 1); if (firstSurrogateCharacter(value) == -1) { assertEquals(getMinSlice(value), Slices.utf8Slice(value)); } else { assertEquals(getMinSlice(value), minSlice); } } // Test with prefix String prefix = "apple"; for (int i = startCodePoint; i < endCodePoint; i++) { String value = prefix + new String(new int[] { i }, 0, 1); if (firstSurrogateCharacter(value) == -1) { assertEquals(getMinSlice(value), Slices.utf8Slice(value)); } else { assertEquals(getMinSlice(value), Slices.utf8Slice(prefix)); } } }
/**
 * Produces a minimum string statistic that is safe to use as a lower bound.
 *
 * <p>Only values written by the {@code ORIGINAL} writer are touched: they are cut at the
 * position reported by {@code findStringStatisticTruncationPositionForOriginalOrcWriter}.
 * {@code null} values, values from other writer versions, and values whose truncation position
 * equals their length are returned as-is.
 *
 * @param value minimum value, may be {@code null}
 * @param version writer version that produced the value
 * @return the (possibly shortened) minimum, or {@code null} when {@code value} is {@code null}
 */
@VisibleForTesting
public static Slice minStringTruncateToValidRange(Slice value, HiveWriterVersion version) {
    // Nothing to truncate: null input or a writer version that needs no correction.
    if (value == null || version != ORIGINAL) {
        return value;
    }

    int truncationPosition = findStringStatisticTruncationPositionForOriginalOrcWriter(value);
    return truncationPosition == value.length()
            ? value
            : Slices.copyOf(value, 0, truncationPosition);
}
private static void testMaxStringTruncateAtFirstReplacementCharacter(Slice prefix, Slice suffix) { for (int testCodePoint : TEST_CODE_POINTS) { Slice codePoint = codePointToUtf8(testCodePoint); Slice value = concatSlice(prefix, codePoint, suffix); assertEquals(maxStringTruncateToValidRange(value, ORC_HIVE_8732), value); // For ORIGINAL, skip prefixes that truncate if (prefix.equals(maxStringTruncateToValidRange(prefix, ORIGINAL))) { if (testCodePoint == REPLACEMENT_CHARACTER_CODE_POINT || testCodePoint >= MIN_SUPPLEMENTARY_CODE_POINT) { // truncate at test code point assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), concatSlice(prefix, wrappedBuffer((byte) 0xFF))); } else { // truncate in suffix (if at all) assertEquals(maxStringTruncateToValidRange(value, ORIGINAL), concatSlice(prefix, codePoint, maxStringTruncateToValidRange(suffix, ORIGINAL))); } } } }
private static void testMinStringTruncateAtFirstReplacementCharacter(Slice prefix, Slice suffix) { for (int testCodePoint : TEST_CODE_POINTS) { Slice codePoint = codePointToUtf8(testCodePoint); Slice value = concatSlice(prefix, codePoint, suffix); assertEquals(minStringTruncateToValidRange(value, ORC_HIVE_8732), value); // For ORIGINAL, skip prefixes that truncate if (prefix.equals(minStringTruncateToValidRange(prefix, ORIGINAL))) { if (testCodePoint == REPLACEMENT_CHARACTER_CODE_POINT || testCodePoint >= MIN_SUPPLEMENTARY_CODE_POINT) { // truncate at test code point assertEquals(minStringTruncateToValidRange(value, ORIGINAL), prefix); } else { // truncate in suffix (if at all) assertEquals(minStringTruncateToValidRange(value, ORIGINAL), concatSlice(prefix, codePoint, minStringTruncateToValidRange(suffix, ORIGINAL))); } } } }
@VisibleForTesting public static Slice getMinSlice(String minimum) { if (minimum == null) { return null; } int index = firstSurrogateCharacter(minimum); if (index == -1) { return Slices.utf8Slice(minimum); } // truncate the string at the first surrogate character return Slices.utf8Slice(minimum.substring(0, index)); }
/**
 * Converts ORC protobuf string statistics into a {@link StringStatistics}.
 *
 * <p>Statistics written by the {@code ORIGINAL} writer are only used at row-group level;
 * for any other scope this method yields {@code null}. A bound that is absent in the protobuf
 * message is reported as {@code null}; a present bound is converted to a slice and passed
 * through the matching min/max truncation helper for the given writer version. An absent sum
 * is reported as 0.
 *
 * @param hiveWriterVersion writer version that produced the statistics
 * @param stringStatistics protobuf statistics to convert
 * @param isRowGroup whether the statistics describe a row group
 * @return the converted statistics, or {@code null} when they must be ignored
 */
static StringStatistics toStringStatistics(HiveWriterVersion hiveWriterVersion, OrcProto.StringStatistics stringStatistics, boolean isRowGroup) {
    boolean writtenByOriginal = hiveWriterVersion == ORIGINAL;
    if (!isRowGroup && writtenByOriginal) {
        return null;
    }

    // Maximum is resolved before minimum, matching the established order.
    Slice upperBound = null;
    if (stringStatistics.hasMaximum()) {
        Slice encodedMaximum = byteStringToSlice(stringStatistics.getMaximumBytes());
        upperBound = maxStringTruncateToValidRange(encodedMaximum, hiveWriterVersion);
    }

    Slice lowerBound = null;
    if (stringStatistics.hasMinimum()) {
        Slice encodedMinimum = byteStringToSlice(stringStatistics.getMinimumBytes());
        lowerBound = minStringTruncateToValidRange(encodedMinimum, hiveWriterVersion);
    }

    long total = 0;
    if (stringStatistics.hasSum()) {
        total = stringStatistics.getSum();
    }

    return new StringStatistics(lowerBound, upperBound, total);
}
/**
 * Builds the {@link StringStatistics} a test expects for the given writer version.
 *
 * <p>The supplied bounds are run through the same min/max truncation helpers that the reader
 * applies, so the expectation accounts for any version-specific truncation.
 *
 * @param version writer version to truncate for
 * @param min untruncated expected minimum
 * @param max untruncated expected maximum
 * @param sum expected sum
 * @return statistics holding the truncated bounds and the given sum
 */
private static StringStatistics createExpectedStringStatistics(HiveWriterVersion version, Slice min, Slice max, int sum) {
    Slice expectedMinimum = minStringTruncateToValidRange(min, version);
    Slice expectedMaximum = maxStringTruncateToValidRange(max, version);
    return new StringStatistics(expectedMinimum, expectedMaximum, sum);
}
// Decode the bloom filter indexes from inputStream using a freshly constructed OrcMetadataReader.
OrcMetadataReader metadataReader = new OrcMetadataReader(); List<HiveBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);