private Map<StreamId, ValueInputStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, List<ColumnEncoding> columnEncodings) { ImmutableMap.Builder<StreamId, ValueInputStream<?>> valueStreams = ImmutableMap.builder(); for (Entry<StreamId, Stream> entry : streams.entrySet()) { StreamId streamId = entry.getKey(); Stream stream = entry.getValue(); ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()) .getColumnEncoding(stream.getSequence()) .getColumnEncodingKind(); // skip index and empty streams if (isIndexStream(stream) || stream.getLength() == 0) { continue; } OrcInputStream inputStream = streamsData.get(streamId); OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); } return valueStreams.build(); }
stripeReader = new StripeReader( orcDataSource, decompressor,
/**
 * Returns the ids of the row groups of {@code stripe} that {@code predicate} accepts.
 *
 * <p>Row groups are visited in ascending id order. Each one is offered to the predicate
 * together with its row count and the statistics obtained from {@code getRowGroupStatistics}.
 *
 * @param stripe the stripe whose row count determines how many row groups are examined
 * @param columnIndexes row-group indexes keyed by stream id, passed to {@code getRowGroupStatistics}
 * @return the set of accepted row-group ids
 */
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes) {
    int totalRows = toIntExact(stripe.getNumberOfRows());
    int groupCount = ceil(totalRows, rowsInRowGroup);

    ImmutableSet.Builder<Integer> accepted = ImmutableSet.builder();

    int rowsLeft = totalRows;
    int groupId = 0;
    while (groupId < groupCount) {
        // A group holds at most rowsInRowGroup rows, or whatever is still left in the stripe.
        int groupRows = Math.min(rowsLeft, rowsInRowGroup);

        Map<Integer, ColumnStatistics> groupStatistics = getRowGroupStatistics(types.get(0), columnIndexes, groupId);
        boolean matches = predicate.matches(groupRows, groupStatistics);
        if (matches) {
            accepted.add(groupId);
        }

        rowsLeft -= groupRows;
        ++groupId;
    }

    return accepted.build();
}
StripeFooter stripeFooter = readStripeFooter(stripe, systemMemoryUsage); List<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings(); Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams()); diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage); Map<StreamId, List<HiveBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData); Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes); if (writeValidation.isPresent()) { writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes); Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes); Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings); InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); List<RowGroup> rowGroups = createRowGroups( stripe.getNumberOfRows(), streams, for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) { StreamId streamId = entry.getKey(); if (streams.keySet().contains(streamId)) { Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
public StripeReader(OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor, List<OrcType> types, Set<Integer> includedColumns, int rowsInRowGroup, OrcPredicate predicate, HiveWriterVersion hiveWriterVersion, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation) { this.orcDataSource = requireNonNull(orcDataSource, "orcDataSource is null"); this.decompressor = requireNonNull(decompressor, "decompressor is null"); this.types = ImmutableList.copyOf(requireNonNull(types, "types is null")); this.includedOrcColumns = getIncludedOrcColumns(types, requireNonNull(includedColumns, "includedColumns is null")); this.rowsInRowGroup = rowsInRowGroup; this.predicate = requireNonNull(predicate, "predicate is null"); this.hiveWriterVersion = requireNonNull(hiveWriterVersion, "hiveWriterVersion is null"); this.metadataReader = requireNonNull(metadataReader, "metadataReader is null"); this.writeValidation = requireNonNull(writeValidation, "writeValidation is null"); }
/**
 * Builds a {@code RowGroup} for each id in {@code selectedRowGroups}, in iteration order.
 *
 * <p>For every id this resolves the stream checkpoints, derives the row offset and row count
 * of the group, adds up the minimum average value size reported by each column's statistics
 * for that group, and hands everything to {@code createRowGroup}.
 *
 * @param rowsInStripe number of rows in the stripe
 * @param streams stream descriptors keyed by stream id
 * @param valueStreams value streams keyed by stream id
 * @param columnIndexes row-group indexes keyed by stream id; each list is read at the row-group id
 * @param selectedRowGroups ids of the row groups to build
 * @param encodings column encodings passed to {@code getStreamCheckpoints}
 * @return the row groups, in the iteration order of {@code selectedRowGroups}
 * @throws InvalidCheckpointException declared by this method; the visible code does not show
 *         which callee raises it
 */
private List<RowGroup> createRowGroups(
        int rowsInStripe,
        Map<StreamId, Stream> streams,
        Map<StreamId, ValueInputStream<?>> valueStreams,
        Map<StreamId, List<RowGroupIndex>> columnIndexes,
        Set<Integer> selectedRowGroups,
        List<ColumnEncoding> encodings)
        throws InvalidCheckpointException {
    ImmutableList.Builder<RowGroup> result = ImmutableList.builder();

    for (int groupId : selectedRowGroups) {
        Map<StreamId, StreamCheckpoint> groupCheckpoints = getStreamCheckpoints(
                includedOrcColumns,
                types,
                decompressor.isPresent(),
                groupId,
                encodings,
                streams,
                columnIndexes);

        int firstRow = groupId * rowsInRowGroup;
        // Capped at rowsInRowGroup; smaller when fewer rows remain in the stripe past firstRow.
        int groupRows = Math.min(rowsInStripe - firstRow, rowsInRowGroup);

        // Sum, over all indexed columns, of the minimum average value size for this group.
        long minAverageRowBytes = 0;
        for (List<RowGroupIndex> indexes : columnIndexes.values()) {
            minAverageRowBytes += indexes.get(groupId)
                    .getColumnStatistics()
                    .getMinAverageValueSizeInBytes();
        }

        result.add(createRowGroup(groupId, firstRow, groupRows, minAverageRowBytes, valueStreams, groupCheckpoints));
    }

    return result.build();
}
StripeFooter stripeFooter = readStripeFooter(stripe, systemMemoryUsage); List<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings(); Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams()); diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage); Map<Integer, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData); Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes); Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings); StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); List<RowGroup> rowGroups = createRowGroups( stripe.getNumberOfRows(), streams, for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) { StreamId streamId = entry.getKey(); if (streamId.getStreamKind() != ROW_INDEX && streams.keySet().contains(streamId)) { Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage); Map<StreamId, ValueStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings); StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);
public StripeReader(OrcDataSource orcDataSource, CompressionKind compressionKind, List<OrcType> types, int bufferSize, Set<Integer> includedColumns, int rowsInRowGroup, OrcPredicate predicate, MetadataReader metadataReader) { this.orcDataSource = requireNonNull(orcDataSource, "orcDataSource is null"); this.compressionKind = requireNonNull(compressionKind, "compressionKind is null"); this.types = ImmutableList.copyOf(requireNonNull(types, "types is null")); this.bufferSize = bufferSize; this.includedOrcColumns = getIncludedOrcColumns(types, requireNonNull(includedColumns, "includedColumns is null")); this.rowsInRowGroup = rowsInRowGroup; this.predicate = requireNonNull(predicate, "predicate is null"); this.metadataReader = requireNonNull(metadataReader, "metadataReader is null"); }
/**
 * Builds a {@code RowGroup} for each id in {@code selectedRowGroups}, in iteration order.
 *
 * <p>For every id this resolves the stream checkpoints, derives the row offset and row count
 * of the group, and hands them to {@code createRowGroup}.
 *
 * @param rowsInStripe number of rows in the stripe
 * @param streams stream descriptors keyed by stream id
 * @param valueStreams value streams keyed by stream id
 * @param columnIndexes row-group indexes keyed by column, passed to {@code getStreamCheckpoints}
 * @param selectedRowGroups ids of the row groups to build
 * @param encodings column encodings passed to {@code getStreamCheckpoints}
 * @return the row groups, in the iteration order of {@code selectedRowGroups}
 * @throws InvalidCheckpointException declared by this method; the visible code does not show
 *         which callee raises it
 */
private List<RowGroup> createRowGroups(
        int rowsInStripe,
        Map<StreamId, Stream> streams,
        Map<StreamId, ValueStream<?>> valueStreams,
        Map<Integer, List<RowGroupIndex>> columnIndexes,
        Set<Integer> selectedRowGroups,
        List<ColumnEncoding> encodings)
        throws InvalidCheckpointException {
    ImmutableList.Builder<RowGroup> result = ImmutableList.builder();

    for (int groupId : selectedRowGroups) {
        Map<StreamId, StreamCheckpoint> groupCheckpoints = getStreamCheckpoints(
                includedOrcColumns,
                types,
                compressionKind,
                groupId,
                encodings,
                streams,
                columnIndexes);

        int firstRow = groupId * rowsInRowGroup;
        // Capped at rowsInRowGroup; smaller when fewer rows remain in the stripe past firstRow.
        int groupRows = Math.min(rowsInStripe - firstRow, rowsInRowGroup);

        RowGroup group = createRowGroup(groupId, firstRow, groupRows, valueStreams, groupCheckpoints);
        result.add(group);
    }

    return result.build();
}
StripeFooter stripeFooter = readStripeFooter(stripe, systemMemoryUsage); List<ColumnEncoding> columnEncodings = stripeFooter.getColumnEncodings(); Map<StreamId, DiskRange> diskRanges = getDiskRanges(stripeFooter.getStreams()); diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage); Map<StreamId, List<HiveBloomFilter>> bloomFilterIndexes = readBloomFilterIndexes(streams, streamsData); Map<StreamId, List<RowGroupIndex>> columnIndexes = readColumnIndexes(streams, streamsData, bloomFilterIndexes); if (writeValidation.isPresent()) { writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripe.getOffset(), columnIndexes); Set<Integer> selectedRowGroups = selectRowGroups(stripe, columnIndexes); Map<StreamId, ValueInputStream<?>> valueStreams = createValueStreams(streams, streamsData, columnEncodings); InputStreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); List<RowGroup> rowGroups = createRowGroups( stripe.getNumberOfRows(), streams, for (Entry<StreamId, DiskRange> entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) { StreamId streamId = entry.getKey(); if (streams.keySet().contains(streamId)) { Map<StreamId, OrcInputStream> streamsData = readDiskRanges(stripe.getOffset(), diskRanges, systemMemoryUsage);
/**
 * Returns the ids of the row groups of {@code stripe} that {@code predicate} accepts.
 *
 * <p>Row groups are visited in ascending id order. Each one is offered to the predicate
 * together with its row count and the statistics obtained from {@code getRowGroupStatistics}.
 *
 * @param stripe the stripe whose row count determines how many row groups are examined
 * @param columnIndexes row-group indexes keyed by stream id, passed to {@code getRowGroupStatistics}
 * @return the set of accepted row-group ids
 */
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<StreamId, List<RowGroupIndex>> columnIndexes) {
    int totalRows = toIntExact(stripe.getNumberOfRows());
    int groupCount = ceil(totalRows, rowsInRowGroup);

    ImmutableSet.Builder<Integer> accepted = ImmutableSet.builder();

    int rowsLeft = totalRows;
    int groupId = 0;
    while (groupId < groupCount) {
        // A group holds at most rowsInRowGroup rows, or whatever is still left in the stripe.
        int groupRows = Math.min(rowsLeft, rowsInRowGroup);

        Map<Integer, ColumnStatistics> groupStatistics = getRowGroupStatistics(types.get(0), columnIndexes, groupId);
        boolean matches = predicate.matches(groupRows, groupStatistics);
        if (matches) {
            accepted.add(groupId);
        }

        rowsLeft -= groupRows;
        ++groupId;
    }

    return accepted.build();
}
boolean dataStreamStarted = false; for (Stream stream : stripeFooter.getStreams()) { if (isIndexStream(stream)) { assertFalse(dataStreamStarted); continue;
public StripeReader(OrcDataSource orcDataSource, Optional<OrcDecompressor> decompressor, List<OrcType> types, Set<Integer> includedColumns, int rowsInRowGroup, OrcPredicate predicate, HiveWriterVersion hiveWriterVersion, MetadataReader metadataReader, Optional<OrcWriteValidation> writeValidation) { this.orcDataSource = requireNonNull(orcDataSource, "orcDataSource is null"); this.decompressor = requireNonNull(decompressor, "decompressor is null"); this.types = ImmutableList.copyOf(requireNonNull(types, "types is null")); this.includedOrcColumns = getIncludedOrcColumns(types, requireNonNull(includedColumns, "includedColumns is null")); this.rowsInRowGroup = rowsInRowGroup; this.predicate = requireNonNull(predicate, "predicate is null"); this.hiveWriterVersion = requireNonNull(hiveWriterVersion, "hiveWriterVersion is null"); this.metadataReader = requireNonNull(metadataReader, "metadataReader is null"); this.writeValidation = requireNonNull(writeValidation, "writeValidation is null"); }
/**
 * Builds a {@code RowGroup} for each id in {@code selectedRowGroups}, in iteration order.
 *
 * <p>For every id this resolves the stream checkpoints, derives the row offset and row count
 * of the group, adds up the minimum average value size reported by each column's statistics
 * for that group, and hands everything to {@code createRowGroup}.
 *
 * @param rowsInStripe number of rows in the stripe
 * @param streams stream descriptors keyed by stream id
 * @param valueStreams value streams keyed by stream id
 * @param columnIndexes row-group indexes keyed by stream id; each list is read at the row-group id
 * @param selectedRowGroups ids of the row groups to build
 * @param encodings column encodings passed to {@code getStreamCheckpoints}
 * @return the row groups, in the iteration order of {@code selectedRowGroups}
 * @throws InvalidCheckpointException declared by this method; the visible code does not show
 *         which callee raises it
 */
private List<RowGroup> createRowGroups(
        int rowsInStripe,
        Map<StreamId, Stream> streams,
        Map<StreamId, ValueInputStream<?>> valueStreams,
        Map<StreamId, List<RowGroupIndex>> columnIndexes,
        Set<Integer> selectedRowGroups,
        List<ColumnEncoding> encodings)
        throws InvalidCheckpointException {
    ImmutableList.Builder<RowGroup> result = ImmutableList.builder();

    for (int groupId : selectedRowGroups) {
        Map<StreamId, StreamCheckpoint> groupCheckpoints = getStreamCheckpoints(
                includedOrcColumns,
                types,
                decompressor.isPresent(),
                groupId,
                encodings,
                streams,
                columnIndexes);

        int firstRow = groupId * rowsInRowGroup;
        // Capped at rowsInRowGroup; smaller when fewer rows remain in the stripe past firstRow.
        int groupRows = Math.min(rowsInStripe - firstRow, rowsInRowGroup);

        // Sum, over all indexed columns, of the minimum average value size for this group.
        long minAverageRowBytes = 0;
        for (List<RowGroupIndex> indexes : columnIndexes.values()) {
            minAverageRowBytes += indexes.get(groupId)
                    .getColumnStatistics()
                    .getMinAverageValueSizeInBytes();
        }

        result.add(createRowGroup(groupId, firstRow, groupRows, minAverageRowBytes, valueStreams, groupCheckpoints));
    }

    return result.build();
}
this.currentStripeSystemMemoryContext = systemMemoryUsage.newAggregatedMemoryContext(); stripeReader = new StripeReader( orcDataSource, compressionKind,
/**
 * Returns the ids of the row groups of {@code stripe} that {@code predicate} accepts.
 *
 * <p>Row groups are visited in ascending id order. Each one is offered to the predicate
 * together with its row count and the statistics obtained from {@code getRowGroupStatistics}.
 *
 * @param stripe the stripe whose row count determines how many row groups are examined
 * @param columnIndexes row-group indexes keyed by column, passed to {@code getRowGroupStatistics}
 * @return the set of accepted row-group ids
 * @throws IOException declared by this method; the visible code does not show which callee raises it
 */
private Set<Integer> selectRowGroups(StripeInformation stripe, Map<Integer, List<RowGroupIndex>> columnIndexes)
        throws IOException {
    int totalRows = Ints.checkedCast(stripe.getNumberOfRows());
    int groupCount = ceil(totalRows, rowsInRowGroup);

    ImmutableSet.Builder<Integer> accepted = ImmutableSet.builder();

    int rowsLeft = totalRows;
    int groupId = 0;
    while (groupId < groupCount) {
        // A group holds at most rowsInRowGroup rows, or whatever is still left in the stripe.
        int groupRows = Math.min(rowsLeft, rowsInRowGroup);

        Map<Integer, ColumnStatistics> groupStatistics = getRowGroupStatistics(types.get(0), columnIndexes, groupId);
        boolean matches = predicate.matches(groupRows, groupStatistics);
        if (matches) {
            accepted.add(groupId);
        }

        rowsLeft -= groupRows;
        ++groupId;
    }

    return accepted.build();
}
private Map<StreamId, ValueStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, List<ColumnEncoding> columnEncodings) { ImmutableMap.Builder<StreamId, ValueStream<?>> valueStreams = ImmutableMap.builder(); for (Entry<StreamId, Stream> entry : streams.entrySet()) { StreamId streamId = entry.getKey(); Stream stream = entry.getValue(); ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind(); // skip index and empty streams if (isIndexStream(stream) || stream.getLength() == 0) { continue; } OrcInputStream inputStream = streamsData.get(streamId); OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); } return valueStreams.build(); }
stripeReader = new StripeReader( orcDataSource, decompressor,
private Map<StreamId, ValueInputStream<?>> createValueStreams(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, List<ColumnEncoding> columnEncodings) { ImmutableMap.Builder<StreamId, ValueInputStream<?>> valueStreams = ImmutableMap.builder(); for (Entry<StreamId, Stream> entry : streams.entrySet()) { StreamId streamId = entry.getKey(); Stream stream = entry.getValue(); ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()) .getColumnEncoding(stream.getSequence()) .getColumnEncodingKind(); // skip index and empty streams if (isIndexStream(stream) || stream.getLength() == 0) { continue; } OrcInputStream inputStream = streamsData.get(streamId); OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); } return valueStreams.build(); }
boolean dataStreamStarted = false; for (Stream stream : stripeFooter.getStreams()) { if (isIndexStream(stream)) { assertFalse(dataStreamStarted); continue;