/**
 * Converts a protobuf {@code RowIndexEntry} into a {@link RowGroupIndex},
 * narrowing each 64-bit checkpoint position to an {@code int}.
 *
 * @throws IllegalStateException if any recorded position does not fit in an int
 */
private static RowGroupIndex toRowGroupIndex(HiveWriterVersion hiveWriterVersion, RowIndexEntry rowIndexEntry)
{
    List<Long> positionsList = rowIndexEntry.getPositionsList();
    ImmutableList.Builder<Integer> positions = ImmutableList.builder();
    for (int index = 0; index < positionsList.size(); index++) {
        long longPosition = positionsList.get(index);
        int intPosition = (int) longPosition;
        // Report the offending value, not just its index, so overflow failures are diagnosable
        checkState(intPosition == longPosition, "Expected checkpoint position %s at index %s to be an integer", longPosition, index);
        positions.add(intPosition);
    }
    return new RowGroupIndex(positions.build(), toColumnStatistics(hiveWriterVersion, rowIndexEntry.getStatistics(), true));
}
/**
 * Reads the ROW_INDEX stream of every column in the stripe and, when bloom
 * filter indexes are available for a stream, attaches each bloom filter to
 * the column statistics of the corresponding row group.
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, Map<StreamId, List<HiveBloomFilter>> bloomFilterIndexes)
        throws IOException
{
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.builder();
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        if (entry.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }
        StreamId streamId = entry.getKey();
        List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, streamsData.get(streamId));
        List<HiveBloomFilter> bloomFilters = bloomFilterIndexes.get(streamId);
        if (bloomFilters != null && !bloomFilters.isEmpty()) {
            // Rebuild each row group index with its bloom filter folded into the statistics
            ImmutableList.Builder<RowGroupIndex> withBloomFilters = ImmutableList.builder();
            for (int groupId = 0; groupId < rowGroupIndexes.size(); groupId++) {
                RowGroupIndex rowGroupIndex = rowGroupIndexes.get(groupId);
                withBloomFilters.add(new RowGroupIndex(
                        rowGroupIndex.getPositions(),
                        rowGroupIndex.getColumnStatistics().withBloomFilter(bloomFilters.get(groupId))));
            }
            rowGroupIndexes = withBloomFilters.build();
        }
        columnIndexes.put(streamId, rowGroupIndexes);
    }
    return columnIndexes.build();
}
/**
 * Serializes a {@link RowGroupIndex} back into its protobuf form, widening
 * the int checkpoint positions to the protobuf's long positions.
 */
private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex)
{
    OrcProto.RowIndexEntry.Builder rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
    for (int position : rowGroupIndex.getPositions()) {
        // int widens to the protobuf's long position
        rowIndexEntry.addPositions(position);
    }
    return rowIndexEntry
            .setStatistics(toColumnStatistics(rowGroupIndex.getColumnStatistics()))
            .build();
}
/**
 * Assembles the per-column statistics actually observed for the given row
 * group into a {@code RowGroupStatistics} covering both validation modes.
 */
private static RowGroupStatistics buildActualRowGroupStatistics(int rowGroupIndex, Map<StreamId, List<RowGroupIndex>> actualRowGroupStatistics)
{
    Map<Integer, ColumnStatistics> statisticsByColumn = actualRowGroupStatistics.entrySet().stream()
            .collect(Collectors.toMap(
                    entry -> entry.getKey().getColumn(),
                    entry -> entry.getValue().get(rowGroupIndex).getColumnStatistics()));
    return new RowGroupStatistics(BOTH, statisticsByColumn);
}
// Checkpoint positions recorded for this stream's row group `rowGroupId`.
// NOTE(review): fragment of a larger method — `entry` and `rowGroupId` are defined in the enclosing scope; confirm there.
List<Integer> positionsList = entry.getValue().get(rowGroupId).getPositions();
/**
 * Serializes a {@link RowGroupIndex} back into its protobuf form, widening
 * the int checkpoint positions to the protobuf's long positions.
 */
private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex)
{
    RowIndexEntry.Builder rowIndexEntry = RowIndexEntry.newBuilder();
    for (int position : rowGroupIndex.getPositions()) {
        // int widens to the protobuf's long position
        rowIndexEntry.addPositions(position);
    }
    return rowIndexEntry
            .setStatistics(toColumnStatistics(rowGroupIndex.getColumnStatistics()))
            .build();
}
/**
 * Builds a {@code RowGroup} descriptor for each selected row group in the
 * stripe, including its stream checkpoints, row range, and an estimated
 * minimum average row size derived from the column statistics.
 */
private List<RowGroup> createRowGroups(
        int rowsInStripe,
        Map<StreamId, Stream> streams,
        Map<StreamId, ValueInputStream<?>> valueStreams,
        Map<StreamId, List<RowGroupIndex>> columnIndexes,
        Set<Integer> selectedRowGroups,
        List<ColumnEncoding> encodings)
        throws InvalidCheckpointException
{
    ImmutableList.Builder<RowGroup> rowGroups = ImmutableList.builder();
    for (int rowGroupId : selectedRowGroups) {
        Map<StreamId, StreamCheckpoint> checkpoints = getStreamCheckpoints(includedOrcColumns, types, decompressor.isPresent(), rowGroupId, encodings, streams, columnIndexes);
        int rowOffset = rowGroupId * rowsInRowGroup;
        // The last row group of a stripe may be shorter than rowsInRowGroup
        int rowCount = Math.min(rowsInStripe - rowOffset, rowsInRowGroup);

        // Sum the per-column lower bounds on average value size for this row group
        long minAverageRowBytes = 0;
        for (List<RowGroupIndex> rowGroupIndexes : columnIndexes.values()) {
            minAverageRowBytes += rowGroupIndexes.get(rowGroupId).getColumnStatistics().getMinAverageValueSizeInBytes();
        }

        rowGroups.add(createRowGroup(rowGroupId, rowOffset, rowCount, minAverageRowBytes, valueStreams, checkpoints));
    }
    return rowGroups.build();
}
// Checkpoint positions recorded for this stream's row group `rowGroupId`.
// NOTE(review): fragment of a larger method — `entry` and `rowGroupId` are defined in the enclosing scope; confirm there.
List<Integer> positionsList = entry.getValue().get(rowGroupId).getPositions();
/**
 * Converts a DWRF protobuf {@code RowIndexEntry} into a {@link RowGroupIndex},
 * narrowing each 64-bit checkpoint position to an {@code int}.
 *
 * @throws IllegalStateException if any recorded position does not fit in an int
 */
private static RowGroupIndex toRowGroupIndex(HiveWriterVersion hiveWriterVersion, DwrfProto.RowIndexEntry rowIndexEntry)
{
    List<Long> positionsList = rowIndexEntry.getPositionsList();
    ImmutableList.Builder<Integer> positions = ImmutableList.builder();
    for (int index = 0; index < positionsList.size(); index++) {
        long longPosition = positionsList.get(index);
        int intPosition = (int) longPosition;
        // Report the offending value, not just its index, so overflow failures are diagnosable
        checkState(intPosition == longPosition, "Expected checkpoint position %s at index %s to be an integer", longPosition, index);
        positions.add(intPosition);
    }
    return new RowGroupIndex(positions.build(), toColumnStatistics(hiveWriterVersion, rowIndexEntry.getStatistics(), true));
}
/**
 * Reads the ROW_INDEX stream of every column in the stripe and, when bloom
 * filter indexes are available for a stream, attaches each bloom filter to
 * the column statistics of the corresponding row group.
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, Map<StreamId, List<HiveBloomFilter>> bloomFilterIndexes)
        throws IOException
{
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> columnIndexes = ImmutableMap.builder();
    for (Entry<StreamId, Stream> entry : streams.entrySet()) {
        if (entry.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }
        StreamId streamId = entry.getKey();
        List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, streamsData.get(streamId));
        List<HiveBloomFilter> bloomFilters = bloomFilterIndexes.get(streamId);
        if (bloomFilters != null && !bloomFilters.isEmpty()) {
            // Rebuild each row group index with its bloom filter folded into the statistics
            ImmutableList.Builder<RowGroupIndex> withBloomFilters = ImmutableList.builder();
            for (int groupId = 0; groupId < rowGroupIndexes.size(); groupId++) {
                RowGroupIndex rowGroupIndex = rowGroupIndexes.get(groupId);
                withBloomFilters.add(new RowGroupIndex(
                        rowGroupIndex.getPositions(),
                        rowGroupIndex.getColumnStatistics().withBloomFilter(bloomFilters.get(groupId))));
            }
            rowGroupIndexes = withBloomFilters.build();
        }
        columnIndexes.put(streamId, rowGroupIndexes);
    }
    return columnIndexes.build();
}
/**
 * Serializes a {@link RowGroupIndex} back into its protobuf form, widening
 * the int checkpoint positions to the protobuf's long positions.
 */
private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex)
{
    OrcProto.RowIndexEntry.Builder rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
    for (int position : rowGroupIndex.getPositions()) {
        // int widens to the protobuf's long position
        rowIndexEntry.addPositions(position);
    }
    return rowIndexEntry
            .setStatistics(toColumnStatistics(rowGroupIndex.getColumnStatistics()))
            .build();
}
private static Map<Integer, ColumnStatistics> getRowGroupStatistics(OrcType rootStructType, Map<StreamId, List<RowGroupIndex>> columnIndexes, int rowGroup) { requireNonNull(rootStructType, "rootStructType is null"); checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); requireNonNull(columnIndexes, "columnIndexes is null"); checkArgument(rowGroup >= 0, "rowGroup is negative"); Map<Integer, List<ColumnStatistics>> groupedColumnStatistics = new HashMap<>(); for (Entry<StreamId, List<RowGroupIndex>> entry : columnIndexes.entrySet()) { groupedColumnStatistics.computeIfAbsent(entry.getKey().getColumn(), key -> new ArrayList<>()) .add(entry.getValue().get(rowGroup).getColumnStatistics()); } ImmutableMap.Builder<Integer, ColumnStatistics> statistics = ImmutableMap.builder(); for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { List<ColumnStatistics> columnStatistics = groupedColumnStatistics.get(rootStructType.getFieldTypeIndex(ordinal)); if (columnStatistics != null) { if (columnStatistics.size() == 1) { statistics.put(ordinal, getOnlyElement(columnStatistics)); } else { // Merge statistics from different streams // This can happen if map is represented as struct (DWRF only) statistics.put(ordinal, mergeColumnStatistics(columnStatistics)); } } } return statistics.build(); }
// Checkpoint positions for the given column's row group.
// NOTE(review): fragment of a larger method — assumes columnIndexes contains an entry for `column`; confirm in the enclosing scope.
List<Integer> positionsList = columnIndexes.get(column).get(rowGroupId).getPositions();
/**
 * Serializes the per-row-group checkpoints and column statistics gathered
 * while writing into a single ROW_INDEX stream for this column.
 * Must only be called after the writer has been closed.
 */
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter)
        throws IOException
{
    checkState(closed);

    List<LongStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();

    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    for (int groupId = 0; groupId < rowGroupColumnStatistics.size(); groupId++) {
        // Effectively-final copy for capture in the Optional.map lambda
        int currentGroup = groupId;
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(currentGroup));
        List<Integer> positions = createLongColumnPositionList(compressed, dataCheckpoints.get(groupId), presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, rowGroupColumnStatistics.get(groupId)));
    }

    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(column, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
/**
 * Serializes a {@link RowGroupIndex} back into its protobuf form, widening
 * the int checkpoint positions to the protobuf's long positions.
 */
private static RowIndexEntry toRowGroupIndex(RowGroupIndex rowGroupIndex)
{
    RowIndexEntry.Builder rowIndexEntry = RowIndexEntry.newBuilder();
    for (int position : rowGroupIndex.getPositions()) {
        // int widens to the protobuf's long position
        rowIndexEntry.addPositions(position);
    }
    return rowIndexEntry
            .setStatistics(toColumnStatistics(rowGroupIndex.getColumnStatistics()))
            .build();
}
// Compare the statistics actually written for this row group against the expected
// statistics for the same column, with stripe/row-group context in the failure message.
// NOTE(review): fragment of a larger method — `entry`, `expectedStatistics`, `orcDataSourceId`,
// `rowGroupIndex`, and `stripeOffset` are defined in the enclosing scope.
ColumnStatistics actual = entry.getValue().get(rowGroupIndex).getColumnStatistics(); ColumnStatistics expected = expectedStatistics.get(entry.getKey().getColumn()); validateColumnStatisticsEquivalent(orcDataSourceId, "Row group " + rowGroupIndex + " in stripe at offset " + stripeOffset, actual, expected);
/**
 * Serializes this struct column's ROW_INDEX stream, then appends the index
 * streams of every nested field writer.
 * Must only be called after the writer has been closed.
 */
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter)
        throws IOException
{
    checkState(closed);

    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();

    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    for (int groupId = 0; groupId < rowGroupColumnStatistics.size(); groupId++) {
        // Effectively-final copy for capture in the Optional.map lambda
        int currentGroup = groupId;
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(currentGroup));
        List<Integer> positions = createStructColumnPositionList(compressed, presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, rowGroupColumnStatistics.get(groupId)));
    }

    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(column, StreamKind.ROW_INDEX, slice.length(), false);

    ImmutableList.Builder<StreamDataOutput> indexStreams = ImmutableList.builder();
    indexStreams.add(new StreamDataOutput(slice, stream));
    // Nested fields contribute their own ROW_INDEX streams after this column's
    for (ColumnWriter structField : structFields) {
        indexStreams.addAll(structField.getIndexStreams(metadataWriter));
    }
    return indexStreams.build();
}
// NOTE(review): fragment of a larger method — `totalBytes` and the loop's closing
// braces live outside this view; only row groups that recorded an average value
// size contribute to the byte estimate below.
long totalRows = 0; for (RowGroupIndex rowGroupIndex : rowGroupIndexes) { ColumnStatistics columnStatistics = rowGroupIndex.getColumnStatistics(); if (columnStatistics.hasMinAverageValueSizeInBytes()) { totalBytes += columnStatistics.getMinAverageValueSizeInBytes() * columnStatistics.getNumberOfValues();
/**
 * Serializes the per-row-group checkpoints and column statistics gathered
 * while writing into a single ROW_INDEX stream for this float column.
 * Must only be called after the writer has been closed.
 */
@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter)
        throws IOException
{
    checkState(closed);

    List<FloatStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();

    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    for (int groupId = 0; groupId < rowGroupColumnStatistics.size(); groupId++) {
        // Effectively-final copy for capture in the Optional.map lambda
        int currentGroup = groupId;
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(currentGroup));
        List<Integer> positions = createFloatColumnPositionList(compressed, dataCheckpoints.get(groupId), presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, rowGroupColumnStatistics.get(groupId)));
    }

    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(column, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
/**
 * Assembles the per-column statistics actually observed for the given row
 * group into a {@code RowGroupStatistics} covering both validation modes.
 */
private static RowGroupStatistics buildActualRowGroupStatistics(int rowGroupIndex, Map<StreamId, List<RowGroupIndex>> actualRowGroupStatistics)
{
    Map<Integer, ColumnStatistics> statisticsByColumn = actualRowGroupStatistics.entrySet().stream()
            .collect(Collectors.toMap(
                    entry -> entry.getKey().getColumn(),
                    entry -> entry.getValue().get(rowGroupIndex).getColumnStatistics()));
    return new RowGroupStatistics(BOTH, statisticsByColumn);
}