/**
 * Creates a metadata reader for the DWRF file format.
 *
 * @return a fresh {@link DwrfMetadataReader}
 */
@Override
public MetadataReader createMetadataReader()
{
    DwrfMetadataReader metadataReader = new DwrfMetadataReader();
    return metadataReader;
}
/**
 * Parses a DWRF stripe footer from the given stream.
 *
 * @param types the file's type tree, used to resolve per-column encodings
 * @param inputStream protobuf-encoded stripe footer bytes
 * @return the decoded stripe footer with its streams and column encodings
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public StripeFooter readStripeFooter(List<OrcType> types, InputStream inputStream)
        throws IOException
{
    CodedInputStream codedInput = CodedInputStream.newInstance(inputStream);
    DwrfProto.StripeFooter footer = DwrfProto.StripeFooter.parseFrom(codedInput);
    return new StripeFooter(
            toStream(footer.getStreamsList()),
            toColumnEncoding(types, footer.getColumnsList()));
}
private static List<ColumnEncoding> toColumnEncoding(List<OrcType> types, List<DwrfProto.ColumnEncoding> columnEncodings) { Map<Integer, List<DwrfProto.ColumnEncoding>> groupedColumnEncodings = new HashMap<>(columnEncodings.size()); for (int i = 0; i < columnEncodings.size(); i++) { DwrfProto.ColumnEncoding columnEncoding = columnEncodings.get(i); int column = columnEncoding.getColumn(); // DWRF prior to version 6.0.8 doesn't set the value of column, infer it from the index if (!columnEncoding.hasColumn()) { column = i; } groupedColumnEncodings.computeIfAbsent(column, key -> new ArrayList<>()).add(columnEncoding); } ImmutableList.Builder<ColumnEncoding> resultBuilder = ImmutableList.builder(); for (Map.Entry<Integer, List<DwrfProto.ColumnEncoding>> entry : groupedColumnEncodings.entrySet()) { OrcType type = types.get(entry.getKey()); DwrfProto.ColumnEncoding columnEncoding = entry.getValue().get(0); resultBuilder.add( new ColumnEncoding( toColumnEncodingKind(type.getOrcTypeKind(), columnEncoding.getKind()), columnEncoding.getDictionarySize(), toAdditionalSequenceEncodings(entry.getValue(), type))); } return resultBuilder.build(); }
// NOTE(review): this is a fragment of a ColumnStatistics constructor argument list — the
// enclosing method signature is not visible in this chunk, so it is left untouched. Each
// protobuf sub-statistics message is converted only when present (hasXxx), otherwise null
// is passed; TODO confirm argument order against the full constructor definition.
statistics.getNumberOfValues(), minAverageValueBytes, statistics.hasBucketStatistics() ? toBooleanStatistics(statistics.getBucketStatistics()) : null, statistics.hasIntStatistics() ? toIntegerStatistics(statistics.getIntStatistics()) : null, statistics.hasDoubleStatistics() ? toDoubleStatistics(statistics.getDoubleStatistics()) : null, statistics.hasStringStatistics() ? toStringStatistics(hiveWriterVersion, statistics.getStringStatistics(), isRowGroup) : null, null, null, statistics.hasBinaryStatistics() ? toBinaryStatistics(statistics.getBinaryStatistics()) : null, null);
/**
 * Parses an ORC file footer from the given stream.
 *
 * @param inputStream protobuf-encoded footer bytes
 * @return the decoded footer (row count, index stride, stripes, types, statistics)
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public Footer readFooter(InputStream inputStream)
        throws IOException
{
    CodedInputStream codedInput = CodedInputStream.newInstance(inputStream);
    OrcProto.Footer footer = OrcProto.Footer.parseFrom(codedInput);
    // file-level statistics are never row-group statistics, hence isRowGroup = false
    return new Footer(
            footer.getNumberOfRows(),
            footer.getRowIndexStride(),
            toStripeInformation(footer.getStripesList()),
            toType(footer.getTypesList()),
            toColumnStatistics(footer.getStatisticsList(), false));
}
// NOTE(review): this line is a garbled fragment of test code — several
// DwrfMetadataReader.toStringStatistics(...) call sites have been collapsed together and the
// StringStatistics.newBuilder() chains are truncated mid-expression. It does not form a
// compilable unit from this view and is left byte-identical; recover the full test method
// from the original test file before editing.
assertNull(DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORIGINAL, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORIGINAL, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder() new StringStatistics(Slices.utf8Slice("ant"), null, 0)); assertEquals( DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder()
/**
 * Converts a list of DWRF column statistics protos to the reader's representation.
 *
 * @param hiveWriterVersion writer version, forwarded to string-statistics decoding
 * @param columnStatistics raw protos; may be null, in which case an empty list is returned
 * @param isRowGroup whether these statistics describe a row group (vs stripe/file level)
 * @return an immutable list of converted statistics, one per input entry
 */
private static List<ColumnStatistics> toColumnStatistics(HiveWriterVersion hiveWriterVersion, List<DwrfProto.ColumnStatistics> columnStatistics, boolean isRowGroup)
{
    if (columnStatistics == null) {
        return ImmutableList.of();
    }
    ImmutableList.Builder<ColumnStatistics> converted = ImmutableList.builder();
    for (DwrfProto.ColumnStatistics statistics : columnStatistics) {
        converted.add(toColumnStatistics(hiveWriterVersion, statistics, isRowGroup));
    }
    return converted.build();
}
/**
 * Builds a DWRF sequence encoding from a raw column encoding proto.
 *
 * @param type the ORC type of the column, used to resolve the encoding kind
 * @param encoding the raw DWRF encoding carrying the sequence key and dictionary size
 * @return the sequence encoding keyed by the proto's key field
 */
private static DwrfSequenceEncoding toSequenceEncoding(OrcType type, DwrfProto.ColumnEncoding encoding)
{
    ColumnEncoding valueEncoding = new ColumnEncoding(
            toColumnEncodingKind(type.getOrcTypeKind(), encoding.getKind()),
            encoding.getDictionarySize());
    return new DwrfSequenceEncoding(encoding.getKey(), valueEncoding);
}
/**
 * Parses a DWRF post script from a byte range.
 *
 * @param data buffer containing the protobuf-encoded post script
 * @param offset start of the post script within {@code data}
 * @param length length of the post script in bytes
 * @return the decoded post script; metadata section length is always 0 for DWRF
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public PostScript readPostScript(byte[] data, int offset, int length)
        throws IOException
{
    CodedInputStream codedInput = CodedInputStream.newInstance(data, offset, length);
    DwrfProto.PostScript postScript = DwrfProto.PostScript.parseFrom(codedInput);

    // a positive writer version indicates the ORC_HIVE_8732 behavior; absent or
    // non-positive versions are treated as the original writer
    HiveWriterVersion writerVersion;
    if (postScript.hasWriterVersion() && postScript.getWriterVersion() > 0) {
        writerVersion = ORC_HIVE_8732;
    }
    else {
        writerVersion = ORIGINAL;
    }

    return new PostScript(
            ImmutableList.of(),
            postScript.getFooterLength(),
            0,
            toCompression(postScript.getCompression()),
            postScript.getCompressionBlockSize(),
            writerVersion);
}
/**
 * Converts ORC column encodings to the reader's representation.
 *
 * <p>ORC writes exactly one encoding per type node, so the two lists must be the
 * same size and are matched positionally.
 *
 * @param types the file's type tree
 * @param columnEncodings raw ORC encodings, one per type
 * @return converted encodings in type order
 * @throws IllegalArgumentException if the list sizes differ
 */
private static List<ColumnEncoding> toColumnEncoding(List<OrcType> types, List<OrcProto.ColumnEncoding> columnEncodings)
{
    // fix: the original checkArgument had no message, making a malformed-file failure
    // undiagnosable; report both sizes so the mismatch is visible in the exception
    checkArgument(
            types.size() == columnEncodings.size(),
            "types and columnEncodings sizes do not match: %s vs %s",
            types.size(),
            columnEncodings.size());
    ImmutableList.Builder<ColumnEncoding> encodings = ImmutableList.builder();
    for (int i = 0; i < types.size(); i++) {
        OrcType type = types.get(i);
        encodings.add(toColumnEncoding(type.getOrcTypeKind(), columnEncodings.get(i)));
    }
    return encodings.build();
}
/**
 * Converts a single ORC column statistics proto to the reader's representation.
 *
 * <p>Unlike the DWRF path, the sub-statistics are converted unconditionally from the
 * proto's (possibly default) sub-messages.
 *
 * @param statistics the raw ORC statistics proto
 * @param isRowGroup whether these statistics describe a row group
 * @return the converted statistics
 */
private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup)
{
    long valueCount = statistics.getNumberOfValues();
    return new ColumnStatistics(
            valueCount,
            toBooleanStatistics(statistics.getBucketStatistics()),
            toIntegerStatistics(statistics.getIntStatistics()),
            toDoubleStatistics(statistics.getDoubleStatistics()),
            toStringStatistics(statistics.getStringStatistics(), isRowGroup),
            null);
}
// NOTE(review): garbled fragment of a test helper — the method body's braces and the
// StringStatistics.newBuilder() chains are truncated mid-expression, so this does not form a
// compilable unit from this view and is left byte-identical. Recover the complete helper
// from the original test file before editing.
private static void testStringStatisticsTruncation(Slice testValue, HiveWriterVersion version) DwrfMetadataReader.toStringStatistics( version, DwrfProto.StringStatistics.newBuilder() createExpectedStringStatistics(version, testValue, testValue, 79)); assertEquals( DwrfMetadataReader.toStringStatistics( version, DwrfProto.StringStatistics.newBuilder() createExpectedStringStatistics(version, testValue, null, 79)); assertEquals( DwrfMetadataReader.toStringStatistics( version, DwrfProto.StringStatistics.newBuilder()
/**
 * Converts a DWRF row index entry into a row group index.
 *
 * <p>Checkpoint positions are stored as longs in the proto but the reader uses ints;
 * each value is narrowed and verified to fit.
 *
 * @param hiveWriterVersion writer version, forwarded to statistics decoding
 * @param rowIndexEntry the raw proto entry
 * @return the row group index with integer checkpoints and row-group statistics
 * @throws IllegalStateException if any checkpoint position exceeds the int range
 */
private static RowGroupIndex toRowGroupIndex(HiveWriterVersion hiveWriterVersion, DwrfProto.RowIndexEntry rowIndexEntry)
{
    List<Long> positionValues = rowIndexEntry.getPositionsList();
    ImmutableList.Builder<Integer> checkpoints = ImmutableList.builder();
    for (int index = 0; index < positionValues.size(); index++) {
        long position = positionValues.get(index);
        int narrowed = (int) position;
        // the narrowing cast must be lossless; a mismatch means the file is corrupt or huge
        checkState(narrowed == position, "Expected checkpoint position %s, to be an integer", index);
        checkpoints.add(narrowed);
    }
    return new RowGroupIndex(
            checkpoints.build(),
            toColumnStatistics(hiveWriterVersion, rowIndexEntry.getStatistics(), true));
}
/**
 * Converts a single ORC column encoding proto to the reader's representation.
 *
 * @param type the ORC type kind of the column
 * @param columnEncoding the raw encoding proto
 * @return the converted encoding with its dictionary size
 */
private static ColumnEncoding toColumnEncoding(OrcTypeKind type, OrcProto.ColumnEncoding columnEncoding)
{
    int dictionarySize = columnEncoding.getDictionarySize();
    return new ColumnEncoding(
            toColumnEncodingKind(type, columnEncoding.getKind()),
            dictionarySize);
}
/**
 * Parses an ORC post script from a byte range.
 *
 * @param data buffer containing the protobuf-encoded post script
 * @param offset start of the post script within {@code data}
 * @param length length of the post script in bytes
 * @return the decoded post script; metadata section length is always 0 here
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public PostScript readPostScript(byte[] data, int offset, int length)
        throws IOException
{
    OrcProto.PostScript postScript =
            OrcProto.PostScript.parseFrom(CodedInputStream.newInstance(data, offset, length));
    return new PostScript(
            ImmutableList.<Integer>of(),
            postScript.getFooterLength(),
            0,
            toCompression(postScript.getCompression()),
            postScript.getCompressionBlockSize());
}
// NOTE(review): fragment of a ColumnStatistics constructor argument list — the enclosing
// method signature is not visible in this chunk, so it is left untouched. Each protobuf
// sub-statistics message is converted only when present (hasXxx), otherwise null is passed;
// TODO confirm argument order against the full constructor definition.
statistics.getNumberOfValues(), minAverageValueBytes, statistics.hasBucketStatistics() ? toBooleanStatistics(statistics.getBucketStatistics()) : null, statistics.hasIntStatistics() ? toIntegerStatistics(statistics.getIntStatistics()) : null, statistics.hasDoubleStatistics() ? toDoubleStatistics(statistics.getDoubleStatistics()) : null, statistics.hasStringStatistics() ? toStringStatistics(hiveWriterVersion, statistics.getStringStatistics(), isRowGroup) : null, null, null, statistics.hasBinaryStatistics() ? toBinaryStatistics(statistics.getBinaryStatistics()) : null, null);
/**
 * Decodes a DWRF stripe footer.
 *
 * @param types the file's type tree, used when resolving column encodings
 * @param inputStream protobuf-encoded stripe footer bytes
 * @return a stripe footer built from the proto's streams and column encodings
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public StripeFooter readStripeFooter(List<OrcType> types, InputStream inputStream)
        throws IOException
{
    DwrfProto.StripeFooter stripeFooter =
            DwrfProto.StripeFooter.parseFrom(CodedInputStream.newInstance(inputStream));
    return new StripeFooter(
            toStream(stripeFooter.getStreamsList()),
            toColumnEncoding(types, stripeFooter.getColumnsList()));
}
/**
 * Returns a metadata reader that understands the DWRF format.
 */
@Override
public MetadataReader createMetadataReader()
{
    return new DwrfMetadataReader();
}
// NOTE(review): this line is a garbled fragment of test code — several
// DwrfMetadataReader.toStringStatistics(...) call sites have been collapsed together and the
// StringStatistics.newBuilder() chains are truncated mid-expression. It does not form a
// compilable unit from this view and is left byte-identical; recover the full test method
// from the original test file before editing.
assertNull(DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORIGINAL, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORIGINAL, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder() new StringStatistics(Slices.utf8Slice("ant"), null, 0)); assertEquals( DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder() DwrfMetadataReader.toStringStatistics( HiveWriterVersion.ORC_HIVE_8732, DwrfProto.StringStatistics.newBuilder()
/**
 * Converts DWRF column statistics protos to the reader's representation.
 *
 * @param hiveWriterVersion writer version, forwarded to string-statistics decoding
 * @param columnStatistics raw protos; null is treated as "no statistics"
 * @param isRowGroup whether these statistics describe a row group
 * @return an immutable list of converted statistics; empty when the input is null
 */
private static List<ColumnStatistics> toColumnStatistics(HiveWriterVersion hiveWriterVersion, List<DwrfProto.ColumnStatistics> columnStatistics, boolean isRowGroup)
{
    if (columnStatistics == null) {
        return ImmutableList.of();
    }
    ImmutableList.Builder<ColumnStatistics> result = ImmutableList.builder();
    for (DwrfProto.ColumnStatistics stats : columnStatistics) {
        result.add(toColumnStatistics(hiveWriterVersion, stats, isRowGroup));
    }
    return result.build();
}