/**
 * Maps each top-level field ordinal of the root struct to its file-level column statistics.
 * Fields whose statistics are missing (index out of range or null entry) are simply omitted
 * from the result.
 *
 * @param rootStructType the root type of the file; must be a STRUCT
 * @param fileStats flat list of per-type statistics, indexed by type index
 * @return map from field ordinal to its statistics; absent ordinals had no statistics
 */
private static Map<Integer, ColumnStatistics> getStatisticsByColumnOrdinal(OrcType rootStructType, List<ColumnStatistics> fileStats)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
    requireNonNull(fileStats, "fileStats is null");

    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        // Bounds-check the index actually used for the lookup: the field's type index in the
        // flattened type tree, which is generally larger than the ordinal. The previous guard
        // (fileStats.size() > ordinal) could still permit an out-of-bounds access.
        int fieldTypeIndex = rootStructType.getFieldTypeIndex(ordinal);
        if (fieldTypeIndex < fileStats.size()) {
            ColumnStatistics element = fileStats.get(fieldTypeIndex);
            if (element != null) {
                statistics.put(ordinal, element);
            }
        }
    }
    return statistics.build();
}
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) { requireNonNull(rootStructType, "rootStructType is null"); checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); requireNonNull(columnIndexes, "columnIndexes is null"); checkArgument(rowGroup >= 0, "rowGroup is negative"); Map<Integer, List<ColumnStatistics>> groupedColumnStatistics = new HashMap<>(); for (Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) { groupedColumnStatistics.computeIfAbsent(entry.getKey().getColumn(), key -> new ArrayList<>()) .add(entry.getValue().get(rowGroup).getColumnStatistics()); } ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder(); for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { List<ColumnStatistics> columnStatistics = groupedColumnStatistics.get(rootStructType.getFieldTypeIndex(ordinal)); if (columnStatistics != null) { if (columnStatistics.size() == 1) { statistics.put(ordinal, getOnlyElement(columnStatistics)); } else { // Merge statistics from different streams // This can happen if map is represented as struct (DWRF only) statistics.put(ordinal, mergeColumnStatistics(columnStatistics)); } } } return statistics.build(); }
private static List<ColumnEncoding> toColumnEncoding(List<OrcType> types, List<DwrfProto.ColumnEncoding> columnEncodings) { Map<Integer, List<DwrfProto.ColumnEncoding>> groupedColumnEncodings = new HashMap<>(columnEncodings.size()); for (int i = 0; i < columnEncodings.size(); i++) { DwrfProto.ColumnEncoding columnEncoding = columnEncodings.get(i); int column = columnEncoding.getColumn(); // DWRF prior to version 6.0.8 doesn't set the value of column, infer it from the index if (!columnEncoding.hasColumn()) { column = i; } groupedColumnEncodings.computeIfAbsent(column, key -> new ArrayList<>()).add(columnEncoding); } ImmutableList.Builder<ColumnEncoding> resultBuilder = ImmutableList.builder(); for (Map.Entry<Integer, List<DwrfProto.ColumnEncoding>> entry : groupedColumnEncodings.entrySet()) { OrcType type = types.get(entry.getKey()); DwrfProto.ColumnEncoding columnEncoding = entry.getValue().get(0); resultBuilder.add( new ColumnEncoding( toColumnEncodingKind(type.getOrcTypeKind(), columnEncoding.getKind()), columnEncoding.getDictionarySize(), toAdditionalSequenceEncodings(entry.getValue(), type))); } return resultBuilder.build(); }
/**
 * Recursively builds the stream descriptor tree for the given type index.
 * The stream name is the dotted path of field names from the root; STRUCT, LIST
 * and MAP types contribute nested descriptors for their children.
 */
private static StreamDescriptor createStreamDescriptor(String parentStreamName, String fieldName, int typeId, List<OrcType> types, OrcDataSource dataSource)
{
    OrcType type = types.get(typeId);

    // Extend the dotted stream path unless this node has no name (e.g. the root)
    String streamName = fieldName.isEmpty() ? parentStreamName : parentStreamName + "." + fieldName;

    ImmutableList.Builder<StreamDescriptor> nestedStreams = ImmutableList.builder();
    switch (type.getOrcTypeKind()) {
        case STRUCT:
            for (int fieldId = 0; fieldId < type.getFieldCount(); fieldId++) {
                nestedStreams.add(createStreamDescriptor(streamName, type.getFieldName(fieldId), type.getFieldTypeIndex(fieldId), types, dataSource));
            }
            break;
        case LIST:
            nestedStreams.add(createStreamDescriptor(streamName, "item", type.getFieldTypeIndex(0), types, dataSource));
            break;
        case MAP:
            nestedStreams.add(createStreamDescriptor(streamName, "key", type.getFieldTypeIndex(0), types, dataSource));
            nestedStreams.add(createStreamDescriptor(streamName, "value", type.getFieldTypeIndex(1), types, dataSource));
            break;
        default:
            // leaf type: no nested streams
            break;
    }

    return new StreamDescriptor(streamName, typeId, fieldName, type.getOrcTypeKind(), dataSource, nestedStreams.build());
}
private Map<StreamId, ValueInputStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, List<ColumnEncoding> columnEncodings) { ImmutableMap.Builder<StreamId, ValueInputStream<?>> valueStreams = ImmutableMap.builder(); for (Entry<StreamId, Stream> entry : streams.entrySet()) { StreamId streamId = entry.getKey(); Stream stream = entry.getValue(); ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()) .getColumnEncoding(stream.getSequence()) .getColumnEncodingKind(); // skip index and empty streams if (isIndexStream(stream) || stream.getLength() == 0) { continue; } OrcInputStream inputStream = streamsData.get(streamId); OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); } return valueStreams.build(); }
public InputStreamSources createDictionaryStreamSources(Map<StreamId, Stream> streams, Map<StreamId, ValueInputStream<?>> valueStreams, List<ColumnEncoding> columnEncodings) { ImmutableMap.Builder<StreamId, InputStreamSource<?>> dictionaryStreamBuilder = ImmutableMap.builder(); for (Entry<StreamId, Stream> entry : streams.entrySet()) { StreamId streamId = entry.getKey(); Stream stream = entry.getValue(); int column = stream.getColumn(); // only process dictionary streams ColumnEncodingKind columnEncoding = columnEncodings.get(column) .getColumnEncoding(stream.getSequence()) .getColumnEncodingKind(); if (!isDictionary(stream, columnEncoding)) { continue; } // skip streams without data ValueInputStream<?> valueStream = valueStreams.get(streamId); if (valueStream == null) { continue; } OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding); InputStreamSource<?> streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint); dictionaryStreamBuilder.put(streamId, streamSource); } return new InputStreamSources(dictionaryStreamBuilder.build()); }
// Look up the declared ORC type kind of this column and the set of stream kinds present for it.
// NOTE(review): fragment of a larger method — `columnTypes`, `streamKinds`, and `column` are
// defined outside the visible chunk; presumably populated from the stripe footer — confirm in caller.
OrcTypeKind columnType = columnTypes.get(column).getOrcTypeKind(); Set<StreamKind> availableStreams = streamKinds.get(column);
/**
 * Converts an internal {@code OrcType} into its protobuf {@code Type} representation,
 * carrying over subtype indexes, field names, and the optional length, precision, and scale.
 */
private static Type toType(OrcType type)
{
    Builder builder = Type.newBuilder()
            .setKind(toTypeKind(type.getOrcTypeKind()))
            .addAllSubtypes(type.getFieldTypeIndexes())
            .addAllFieldNames(type.getFieldNames());
    // Optional attributes are set only when present on the source type
    type.getLength().ifPresent(builder::setMaximumLength);
    type.getPrecision().ifPresent(builder::setPrecision);
    type.getScale().ifPresent(builder::setScale);
    return builder.build();
}
/**
 * Converts a DWRF protobuf column encoding into a sequence encoding keyed by the
 * encoding's sequence key.
 */
private static DwrfSequenceEncoding toSequenceEncoding(OrcType type, DwrfProto.ColumnEncoding columnEncoding)
{
    ColumnEncodingKind encodingKind = toColumnEncodingKind(type.getOrcTypeKind(), columnEncoding.getKind());
    ColumnEncoding encoding = new ColumnEncoding(encodingKind, columnEncoding.getDictionarySize());
    return new DwrfSequenceEncoding(columnEncoding.getKey(), encoding);
}
/**
 * Converts an internal {@code OrcType} into its protobuf {@code Type} representation,
 * carrying over subtype indexes and field names.
 */
private static Type toType(OrcType type)
{
    return Type.newBuilder()
            .setKind(toTypeKind(type.getOrcTypeKind()))
            .addAllSubtypes(type.getFieldTypeIndexes())
            .addAllFieldNames(type.getFieldNames())
            .build();
}
// Dispatch on the ORC type kind to select a column writer implementation.
// NOTE(review): fragment — this switch is unclosed here; the remaining cases and the
// enclosing method run past the visible chunk.
switch (orcType.getOrcTypeKind()) { case BOOLEAN: return new BooleanColumnWriter(columnIndex, type, compression, bufferSize);
/**
 * Converts ORC protobuf column encodings into the internal representation.
 * The two lists are parallel: entry {@code i} of both describes column {@code i}.
 */
private static List<ColumnEncoding> toColumnEncoding(List<OrcType> types, List<OrcProto.ColumnEncoding> columnEncodings)
{
    checkArgument(types.size() == columnEncodings.size());
    ImmutableList.Builder<ColumnEncoding> result = ImmutableList.builder();
    for (int column = 0; column < types.size(); column++) {
        result.add(toColumnEncoding(types.get(column).getOrcTypeKind(), columnEncodings.get(column)));
    }
    return result.build();
}
/**
 * Maps each top-level field ordinal of the root struct to its file-level column statistics.
 * Fields whose statistics are missing (index out of range or null entry) are simply omitted
 * from the result.
 *
 * @param rootStructType the root type of the file; must be a STRUCT
 * @param fileStats flat list of per-type statistics, indexed by type index
 * @return map from field ordinal to its statistics; absent ordinals had no statistics
 */
private static Map<Integer, ColumnStatistics> getStatisticsByColumnOrdinal(OrcType rootStructType, List<ColumnStatistics> fileStats)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
    requireNonNull(fileStats, "fileStats is null");

    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        // Guard against a truncated statistics list: the unguarded get() threw
        // IndexOutOfBoundsException when fileStats had fewer entries than the type tree,
        // whereas the sibling variants of this method deliberately skip missing entries.
        int fieldTypeIndex = rootStructType.getFieldTypeIndex(ordinal);
        if (fieldTypeIndex < fileStats.size()) {
            ColumnStatistics element = fileStats.get(fieldTypeIndex);
            if (element != null) {
                statistics.put(ordinal, element);
            }
        }
    }
    return statistics.build();
}
/**
 * Maps each top-level field ordinal of the root struct to its file-level column statistics.
 * Fields whose statistics are missing (index out of range or null entry) are simply omitted
 * from the result.
 *
 * @param rootStructType the root type of the file; must be a STRUCT
 * @param fileStats flat list of per-type statistics, indexed by type index
 * @return map from field ordinal to its statistics; absent ordinals had no statistics
 */
private static Map<Integer, ColumnStatistics> getStatisticsByColumnOrdinal(OrcType rootStructType, List<ColumnStatistics> fileStats)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
    requireNonNull(fileStats, "fileStats is null");

    ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder();
    for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
        // Bounds-check the index actually used for the lookup: the field's type index in the
        // flattened type tree, which is generally larger than the ordinal. The previous guard
        // (fileStats.size() > ordinal) could still permit an out-of-bounds access.
        int fieldTypeIndex = rootStructType.getFieldTypeIndex(ordinal);
        if (fieldTypeIndex < fileStats.size()) {
            ColumnStatistics element = fileStats.get(fieldTypeIndex);
            if (element != null) {
                statistics.put(ordinal, element);
            }
        }
    }
    return statistics.build();
}
/**
 * Computes the column statistics of a single row group, keyed by the ordinal of the
 * corresponding top-level field of the root struct. Columns without an index entry
 * are omitted.
 *
 * @param rootStructType the root type of the file; must be a STRUCT
 * @param columnIndexes per-column row group indexes
 * @param rowGroup zero-based row group number
 */
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<Integer, List<RowGroupIndex>> columnIndexes, int rowGroup)
{
    requireNonNull(rootStructType, "rootStructType is null");
    checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
    requireNonNull(columnIndexes, "columnIndexes is null");
    checkArgument(rowGroup >= 0, "rowGroup is negative");

    ImmutableMap.Builder<Integer, ColumnStatistics> result = ImmutableMap.builder();
    int fieldCount = rootStructType.getFieldCount();
    for (int ordinal = 0; ordinal < fieldCount; ordinal++) {
        List<RowGroupIndex> indexes = columnIndexes.get(rootStructType.getFieldTypeIndex(ordinal));
        if (indexes != null) {
            result.put(ordinal, indexes.get(rowGroup).getColumnStatistics());
        }
    }
    return result.build();
}
private Map<StreamId, ValueStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, List<ColumnEncoding> columnEncodings) { ImmutableMap.Builder<StreamId, ValueStream<?>> valueStreams = ImmutableMap.builder(); for (Entry<StreamId, Stream> entry : streams.entrySet()) { StreamId streamId = entry.getKey(); Stream stream = entry.getValue(); ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind(); // skip index and empty streams if (isIndexStream(stream) || stream.getLength() == 0) { continue; } OrcInputStream inputStream = streamsData.get(streamId); OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); } return valueStreams.build(); }
private Map<StreamId, ValueInputStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, List<ColumnEncoding> columnEncodings) { ImmutableMap.Builder<StreamId, ValueInputStream<?>> valueStreams = ImmutableMap.builder(); for (Entry<StreamId, Stream> entry : streams.entrySet()) { StreamId streamId = entry.getKey(); Stream stream = entry.getValue(); ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()) .getColumnEncoding(stream.getSequence()) .getColumnEncodingKind(); // skip index and empty streams if (isIndexStream(stream) || stream.getLength() == 0) { continue; } OrcInputStream inputStream = streamsData.get(streamId); OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); } return valueStreams.build(); }
/**
 * Converts an internal {@code OrcType} into its protobuf {@code Type} representation,
 * carrying over subtype indexes, field names, and the optional length, precision, and scale.
 */
private static Type toType(OrcType type)
{
    Builder builder = Type.newBuilder()
            .setKind(toTypeKind(type.getOrcTypeKind()))
            .addAllSubtypes(type.getFieldTypeIndexes())
            .addAllFieldNames(type.getFieldNames());
    // Optional attributes are set only when present on the source type
    type.getLength().ifPresent(builder::setMaximumLength);
    type.getPrecision().ifPresent(builder::setPrecision);
    type.getScale().ifPresent(builder::setScale);
    return builder.build();
}
/**
 * Converts a DWRF protobuf column encoding into a sequence encoding keyed by the
 * encoding's sequence key.
 */
private static DwrfSequenceEncoding toSequenceEncoding(OrcType type, DwrfProto.ColumnEncoding columnEncoding)
{
    ColumnEncodingKind encodingKind = toColumnEncodingKind(type.getOrcTypeKind(), columnEncoding.getKind());
    ColumnEncoding encoding = new ColumnEncoding(encodingKind, columnEncoding.getDictionarySize());
    return new DwrfSequenceEncoding(columnEncoding.getKey(), encoding);
}
/**
 * Converts an internal {@code OrcType} into its protobuf {@code Type} representation,
 * carrying over subtype indexes and field names.
 */
private static Type toType(OrcType type)
{
    return Type.newBuilder()
            .setKind(toTypeKind(type.getOrcTypeKind()))
            .addAllSubtypes(type.getFieldTypeIndexes())
            .addAllFieldNames(type.getFieldNames())
            .build();
}