requireNonNull(includedColumns, "includedColumns is null"), requireNonNull(predicate, "predicate is null"), footer.getNumberOfRows(), footer.getStripes(), footer.getFileStats(), metadata.getStripeStatsList(), orcDataSource, offset, length, footer.getTypes(), decompressor, footer.getRowsInRowGroup(), requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"), hiveWriterVersion, tinyStripeThreshold, maxBlockSize, footer.getUserMetadata(), systemMemoryUsage, writeValidation,
.collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue()))); Footer footer = new Footer( numberOfRows, rowGroupMaxRowCount,
Footer footer = new OrcReader(orcDataSource, ORC, dataSize, dataSize, dataSize, dataSize).getFooter(); for (StripeInformation stripe : footer.getStripes()) { StripeFooter stripeFooter = ORC.createMetadataReader().readStripeFooter(footer.getTypes(), inputStream);
/**
 * Returns the names of the file's top-level columns.
 *
 * <p>Entry 0 of the footer's type list is the root struct type describing
 * the file schema; its field names are the column names.
 */
public List<String> getColumnNames()
{
    return footer.getTypes().get(0).getFieldNames();
}
public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize, DataSize tinyStripeThreshold) throws IOException { OrcReader orcReader = new OrcReader(orcDataSource, ORC, maxMergeDistance, maxReadSize, tinyStripeThreshold, new DataSize(1, Unit.MEGABYTE)); // 1 for reading file footer assertEquals(orcDataSource.getReadCount(), 1); List<StripeInformation> stripes = orcReader.getFooter().getStripes(); // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode. assertGreaterThanOrEqual(stripes.size(), 3); //verify wrapped by CachingOrcReader assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold), CachingOrcDataSource.class); OrcRecordReader orcRecordReader = orcReader.createRecordReader( ImmutableMap.of(0, VARCHAR), (numberOfRows, statisticsByColumnIndex) -> true, HIVE_STORAGE_TIME_ZONE, newSimpleAggregatedMemoryContext(), INITIAL_BATCH_SIZE); int positionCount = 0; while (true) { int batchSize = orcRecordReader.nextBatch(); if (batchSize <= 0) { break; } Block block = orcRecordReader.readBlock(VARCHAR, 0); positionCount += block.getPositionCount(); } assertEquals(positionCount, POSITION_COUNT); }
/**
 * Verifies that user metadata written to an ORC file round-trips through
 * the reader unchanged.
 */
@Test
public void testReadUserMetadata()
        throws Exception
{
    try (TempFile tempFile = new TempFile()) {
        Map<String, String> expectedMetadata = ImmutableMap.of(
                "a", "ala",
                "b", "ma",
                "c", "kota");
        createFileWithOnlyUserMetadata(tempFile.getFile(), expectedMetadata);

        OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), true);
        OrcReader orcReader = new OrcReader(orcDataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));
        Footer footer = orcReader.getFooter();

        // User metadata values are stored as Slices; convert back to strings before comparing
        Map<String, String> actualMetadata = Maps.transformValues(footer.getUserMetadata(), Slice::toStringAscii);
        assertEquals(actualMetadata, expectedMetadata);
    }
}
Footer footer = new OrcReader(orcDataSource, ORC, dataSize, dataSize, dataSize, dataSize).getFooter(); for (StripeInformation stripe : footer.getStripes()) { StripeFooter stripeFooter = ORC.createMetadataReader().readStripeFooter(footer.getTypes(), inputStream);
/**
 * Returns the names of the file's top-level columns.
 *
 * <p>Entry 0 of the footer's type list is the root struct type describing
 * the file schema; its field names are the column names.
 */
public List<String> getColumnNames()
{
    return footer.getTypes().get(0).getFieldNames();
}
public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize, DataSize tinyStripeThreshold) throws IOException { OrcReader orcReader = new OrcReader(orcDataSource, ORC, maxMergeDistance, maxReadSize, tinyStripeThreshold, new DataSize(1, Unit.MEGABYTE)); // 1 for reading file footer assertEquals(orcDataSource.getReadCount(), 1); List<StripeInformation> stripes = orcReader.getFooter().getStripes(); // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode. assertGreaterThanOrEqual(stripes.size(), 3); //verify wrapped by CachingOrcReader assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold), CachingOrcDataSource.class); OrcRecordReader orcRecordReader = orcReader.createRecordReader( ImmutableMap.of(0, VARCHAR), (numberOfRows, statisticsByColumnIndex) -> true, HIVE_STORAGE_TIME_ZONE, newSimpleAggregatedMemoryContext(), INITIAL_BATCH_SIZE); int positionCount = 0; while (true) { int batchSize = orcRecordReader.nextBatch(); if (batchSize <= 0) { break; } Block block = orcRecordReader.readBlock(VARCHAR, 0); positionCount += block.getPositionCount(); } assertEquals(positionCount, POSITION_COUNT); }
/**
 * Extracts the file metadata stored under {@link OrcFileMetadata#KEY} in the
 * ORC user metadata, if any.
 *
 * @return the decoded metadata, or empty when the key is absent
 */
private static Optional<OrcFileMetadata> getOrcFileMetadata(OrcReader reader)
{
    return Optional
            .ofNullable(reader.getFooter().getUserMetadata().get(OrcFileMetadata.KEY))
            .map(slice -> {
                // The metadata is stored as JSON bytes inside the slice
                byte[] json = slice.getBytes();
                return METADATA_CODEC.fromJson(json);
            });
}
/**
 * Serializes the footer as an ORC protobuf {@code Footer} message and writes
 * it to the output.
 *
 * @return the number of bytes written
 */
@Override
public int writeFooter(SliceOutput output, Footer footer)
        throws IOException
{
    OrcProto.Footer.Builder builder = OrcProto.Footer.newBuilder();
    builder.setWriter(PRESTO_WRITER_ID);
    builder.setNumberOfRows(footer.getNumberOfRows());
    builder.setRowIndexStride(footer.getRowsInRowGroup());
    builder.addAllStripes(footer.getStripes().stream()
            .map(stripe -> toStripeInformation(stripe))
            .collect(toList()));
    builder.addAllTypes(footer.getTypes().stream()
            .map(type -> toType(type))
            .collect(toList()));
    builder.addAllStatistics(footer.getFileStats().stream()
            .map(statistics -> toColumnStatistics(statistics))
            .collect(toList()));
    builder.addAllMetadata(footer.getUserMetadata().entrySet().stream()
            .map(entry -> toUserMetadata(entry))
            .collect(toList()));

    return writeProtobufObject(output, builder.build());
}
/**
 * Returns the names of the file's top-level columns.
 *
 * <p>Entry 0 of the footer's type list is the root struct type describing
 * the file schema; its field names are the column names.
 */
public List<String> getColumnNames()
{
    return footer.getTypes().get(0).getFieldNames();
}
public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize) throws IOException { OrcReader orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), maxMergeDistance, maxReadSize); // 1 for reading file footer assertEquals(orcDataSource.getReadCount(), 1); List<StripeInformation> stripes = orcReader.getFooter().getStripes(); // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode. assertGreaterThanOrEqual(stripes.size(), 3); //verify wrapped by CachingOrcReader assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, maxReadSize), CachingOrcDataSource.class); OrcRecordReader orcRecordReader = orcReader.createRecordReader( ImmutableMap.of(0, VARCHAR), (numberOfRows, statisticsByColumnIndex) -> true, HIVE_STORAGE_TIME_ZONE, new AggregatedMemoryContext()); int positionCount = 0; while (true) { int batchSize = orcRecordReader.nextBatch(); if (batchSize <= 0) { break; } Block block = orcRecordReader.readBlock(VARCHAR, 0); positionCount += block.getPositionCount(); } assertEquals(positionCount, POSITION_COUNT); }
@Override public Footer readFooter(HiveWriterVersion hiveWriterVersion, InputStream inputStream) throws IOException { CodedInputStream input = CodedInputStream.newInstance(inputStream); DwrfProto.Footer footer = DwrfProto.Footer.parseFrom(input); // todo enable file stats when DWRF team verifies that the stats are correct // List<ColumnStatistics> fileStats = toColumnStatistics(hiveWriterVersion, footer.getStatisticsList(), false); List<ColumnStatistics> fileStats = ImmutableList.of(); return new Footer( footer.getNumberOfRows(), footer.getRowIndexStride(), toStripeInformation(footer.getStripesList()), toType(footer.getTypesList()), fileStats, toUserMetadata(footer.getMetadataList())); }
/**
 * Verifies that user metadata written to an ORC file round-trips through
 * the reader unchanged.
 */
@Test
public void testReadUserMetadata()
        throws Exception
{
    try (TempFile tempFile = new TempFile()) {
        Map<String, String> expectedMetadata = ImmutableMap.of(
                "a", "ala",
                "b", "ma",
                "c", "kota");
        createFileWithOnlyUserMetadata(tempFile.getFile(), expectedMetadata);

        OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), true);
        OrcReader orcReader = new OrcReader(orcDataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));
        Footer footer = orcReader.getFooter();

        // User metadata values are stored as Slices; convert back to strings before comparing
        Map<String, String> actualMetadata = Maps.transformValues(footer.getUserMetadata(), Slice::toStringAscii);
        assertEquals(actualMetadata, expectedMetadata);
    }
}
/**
 * Serializes the footer as a DWRF protobuf {@code Footer} message and writes
 * it to the output. In addition to the footer's own user metadata, the
 * writer's static metadata entries are appended.
 *
 * @return the number of bytes written
 */
@Override
public int writeFooter(SliceOutput output, Footer footer)
        throws IOException
{
    DwrfProto.Footer.Builder builder = DwrfProto.Footer.newBuilder();
    builder.setNumberOfRows(footer.getNumberOfRows());
    builder.setRowIndexStride(footer.getRowsInRowGroup());
    builder.addAllStripes(footer.getStripes().stream()
            .map(stripe -> toStripeInformation(stripe))
            .collect(toImmutableList()));
    builder.addAllTypes(footer.getTypes().stream()
            .map(type -> toType(type))
            .collect(toImmutableList()));
    builder.addAllStatistics(footer.getFileStats().stream()
            .map(statistics -> toColumnStatistics(statistics))
            .collect(toImmutableList()));
    builder.addAllMetadata(footer.getUserMetadata().entrySet().stream()
            .map(entry -> toUserMetadata(entry))
            .collect(toImmutableList()));
    // Static metadata is appended after the footer's own user metadata
    builder.addAllMetadata(STATIC_METADATA.entrySet().stream()
            .map(entry -> toUserMetadata(entry))
            .collect(toImmutableList()));

    return writeProtobufObject(output, builder.build());
}
private List<ColumnInfo> getColumnInfo(OrcReader reader) { Optional<OrcFileMetadata> metadata = getOrcFileMetadata(reader); if (metadata.isPresent()) { return getColumnInfoFromOrcUserMetadata(metadata.get()); } // support for legacy files without metadata return getColumnInfoFromOrcColumnTypes(reader.getColumnNames(), reader.getFooter().getTypes()); }
/**
 * Parses an ORC file footer from the given stream and converts it to the
 * reader's {@link Footer} representation.
 */
@Override
public Footer readFooter(HiveWriterVersion hiveWriterVersion, InputStream inputStream)
        throws IOException
{
    CodedInputStream input = CodedInputStream.newInstance(inputStream);
    // Cap the message size so a corrupt footer cannot trigger unbounded allocation
    input.setSizeLimit(PROTOBUF_MESSAGE_MAX_LIMIT);
    OrcProto.Footer protoFooter = OrcProto.Footer.parseFrom(input);

    return new Footer(
            protoFooter.getNumberOfRows(),
            protoFooter.getRowIndexStride(),
            toStripeInformation(protoFooter.getStripesList()),
            toType(protoFooter.getTypesList()),
            toColumnStatistics(hiveWriterVersion, protoFooter.getStatisticsList(), false),
            toUserMetadata(protoFooter.getMetadataList()));
}
this.footer = metadataReader.readFooter(hiveWriterVersion, footerInputStream); if (footer.getTypes().size() == 0) { throw new OrcCorruptionException(orcDataSource.getId(), "File has no columns"); validateWrite(validation -> validation.getRowGroupMaxRowCount() == footer.getRowsInRowGroup(), "Unexpected rows in group"); if (writeValidation.isPresent()) { writeValidation.get().validateMetadata(orcDataSource.getId(), footer.getUserMetadata()); writeValidation.get().validateFileStatistics(orcDataSource.getId(), footer.getFileStats()); writeValidation.get().validateStripeStatistics(orcDataSource.getId(), footer.getStripes(), metadata.getStripeStatsList());
.collect(Collectors.toMap(Entry::getKey, entry -> utf8Slice(entry.getValue()))); Footer footer = new Footer( numberOfRows, rowGroupMaxRowCount,