public TempFileReader(List<Type> types, OrcDataSource dataSource)
{
    this.types = ImmutableList.copyOf(requireNonNull(types, "types is null"));
    try {
        OrcReader orcReader = new OrcReader(
                dataSource,
                ORC,
                new DataSize(1, MEGABYTE),
                new DataSize(8, MEGABYTE),
                new DataSize(8, MEGABYTE),
                new DataSize(16, MEGABYTE));

        Map<Integer, Type> includedColumns = new HashMap<>();
        for (int i = 0; i < types.size(); i++) {
            includedColumns.put(i, types.get(i));
        }

        reader = orcReader.createRecordReader(
                includedColumns,
                OrcPredicate.TRUE,
                UTC,
                newSimpleAggregatedMemoryContext(),
                INITIAL_BATCH_SIZE);
    }
    catch (IOException e) {
        // pass the IOException as the cause so the original stack trace is preserved
        throw new PrestoException(HIVE_WRITER_DATA_ERROR, "Failed to read temporary data", e);
    }
}
OrcReader reader = new OrcReader(
        dataSource,
        ORC,
        readerAttributes.getMaxMergeDistance(),
        readerAttributes.getMaxReadSize(),
        readerAttributes.getTinyStripeThreshold(),
        HUGE_MAX_READ_BLOCK_SIZE);

Map<Long, Integer> indexMap = columnIdIndex(reader.getColumnNames());
ImmutableMap.Builder<Integer, Type> includedColumns = ImmutableMap.builder();
ImmutableList.Builder<Integer> columnIndexes = ImmutableList.builder();
// ... population of includedColumns and columnIndexes from the requested
// column ids (via indexMap) is elided in this excerpt ...

OrcRecordReader recordReader = reader.createRecordReader(
        includedColumns.build(),
        predicate,
        UTC,
        systemMemoryUsage,
        INITIAL_BATCH_SIZE);
private List<ColumnInfo> getColumnInfo(OrcReader reader)
{
    Optional<OrcFileMetadata> metadata = getOrcFileMetadata(reader);
    if (metadata.isPresent()) {
        return getColumnInfoFromOrcUserMetadata(metadata.get());
    }

    // support for legacy files without metadata
    return getColumnInfoFromOrcColumnTypes(reader.getColumnNames(), reader.getFooter().getTypes());
}
private static ColumnStats doComputeColumnStats(OrcReader orcReader, long columnId, Type type)
        throws IOException
{
    int columnIndex = columnIndex(orcReader.getColumnNames(), columnId);
    OrcRecordReader reader = orcReader.createRecordReader(
            ImmutableMap.of(columnIndex, type),
            OrcPredicate.TRUE,
            UTC,
            newSimpleAggregatedMemoryContext(),
            INITIAL_BATCH_SIZE);

    if (type.equals(BooleanType.BOOLEAN)) {
        return indexBoolean(type, reader, columnIndex, columnId);
    }
    if (type.equals(BigintType.BIGINT) ||
            type.equals(DateType.DATE) ||
            type.equals(TimestampType.TIMESTAMP)) {
        return indexLong(type, reader, columnIndex, columnId);
    }
    if (type.equals(DoubleType.DOUBLE)) {
        return indexDouble(type, reader, columnIndex, columnId);
    }
    if (type instanceof VarcharType) {
        return indexString(type, reader, columnIndex, columnId);
    }
    // no statistics are computed for unsupported types
    return null;
}
public void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize maxReadSize, DataSize tinyStripeThreshold)
        throws IOException
{
    OrcReader orcReader = new OrcReader(orcDataSource, ORC, maxMergeDistance, maxReadSize, tinyStripeThreshold, new DataSize(1, Unit.MEGABYTE));
    // 1 for reading file footer
    assertEquals(orcDataSource.getReadCount(), 1);
    List<StripeInformation> stripes = orcReader.getFooter().getStripes();
    // Sanity check the number of stripes. This can be three or higher because of the ORC writer's low-memory mode.
    assertGreaterThanOrEqual(stripes.size(), 3);
    // verify the data source is wrapped by CachingOrcDataSource
    assertInstanceOf(wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold), CachingOrcDataSource.class);

    OrcRecordReader orcRecordReader = orcReader.createRecordReader(
            ImmutableMap.of(0, VARCHAR),
            (numberOfRows, statisticsByColumnIndex) -> true,
            HIVE_STORAGE_TIME_ZONE,
            newSimpleAggregatedMemoryContext(),
            INITIAL_BATCH_SIZE);
    int positionCount = 0;
    while (true) {
        int batchSize = orcRecordReader.nextBatch();
        if (batchSize <= 0) {
            break;
        }
        Block block = orcRecordReader.readBlock(VARCHAR, 0);
        positionCount += block.getPositionCount();
    }
    assertEquals(positionCount, POSITION_COUNT);
}
@Test
public void testReadUserMetadata()
        throws Exception
{
    try (TempFile tempFile = new TempFile()) {
        Map<String, String> metadata = ImmutableMap.of(
                "a", "ala",
                "b", "ma",
                "c", "kota");
        createFileWithOnlyUserMetadata(tempFile.getFile(), metadata);

        OrcDataSource orcDataSource = new FileOrcDataSource(tempFile.getFile(), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), true);
        OrcReader orcReader = new OrcReader(orcDataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));
        Footer footer = orcReader.getFooter();
        Map<String, String> readMetadata = Maps.transformValues(footer.getUserMetadata(), Slice::toStringAscii);
        assertEquals(readMetadata, metadata);
    }
}
public static OrcRecordReader createReader(OrcDataSource dataSource, List<Long> columnIds, List<Type> types)
        throws IOException
{
    OrcReader orcReader = new OrcReader(dataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));

    List<String> columnNames = orcReader.getColumnNames();
    assertEquals(columnNames.size(), columnIds.size());

    Map<Integer, Type> includedColumns = new HashMap<>();
    int ordinal = 0;
    for (long columnId : columnIds) {
        assertEquals(columnNames.get(ordinal), String.valueOf(columnId));
        includedColumns.put(ordinal, types.get(ordinal));
        ordinal++;
    }

    return createRecordReader(orcReader, includedColumns);
}
// Excerpt from the OrcReader constructor (the signature and the parsing code
// between these statements are elided in this excerpt).
        throws IOException
{
    orcDataSource = wrapWithCacheIfTiny(orcDataSource, tinyStripeThreshold);
    this.orcDataSource = orcDataSource;
    requireNonNull(orcEncoding, "orcEncoding is null");
    // ...

    if (!isValidHeaderMagic(orcDataSource)) {
        throw new OrcCorruptionException(orcDataSource.getId(), "Not an ORC file");
    }
    // ...

    checkOrcVersion(orcDataSource, postScript.getVersion());
    validateWrite(validation -> validation.getVersion().equals(postScript.getVersion()), "Unexpected version");
    validateWrite(validation -> validation.getCompression() == compressionKind, "Unexpected compression");
    // ...

    validateWrite(validation -> validation.getColumnNames().equals(getColumnNames()), "Unexpected column names");
    validateWrite(validation -> validation.getRowGroupMaxRowCount() == footer.getRowsInRowGroup(), "Unexpected rows in group");
    if (writeValidation.isPresent()) {
        writeValidation.get().validateMetadata(orcDataSource.getId(), footer.getUserMetadata());
    }
}
public static OrcRecordReader createRecordReader(OrcReader orcReader, Map<Integer, Type> includedColumns)
{
    return orcReader.createRecordReader(includedColumns, OrcPredicate.TRUE, DateTimeZone.UTC, newSimpleAggregatedMemoryContext(), MAX_BATCH_SIZE);
}
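For context, a minimal sketch of how these test helpers compose into a full read: the file path and column layout below are hypothetical placeholders, and the read loop mirrors the nextBatch()/readBlock() pattern used in doIntegration above.

// A minimal usage sketch, assuming a test ORC file at a hypothetical path whose
// single column has column id 0 and type BIGINT (both placeholders).
OrcDataSource dataSource = new FileOrcDataSource(
        new File("/tmp/example.orc"), // hypothetical path
        new DataSize(1, MEGABYTE),
        new DataSize(1, MEGABYTE),
        new DataSize(1, MEGABYTE),
        true);
OrcRecordReader recordReader = createReader(dataSource, ImmutableList.of(0L), ImmutableList.of(BIGINT));
long rows = 0;
while (true) {
    int batchSize = recordReader.nextBatch();
    if (batchSize <= 0) {
        break;
    }
    rows += recordReader.readBlock(BIGINT, 0).getPositionCount();
}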
private List<ColumnStats> computeShardStats(File file)
{
    try (OrcDataSource dataSource = fileOrcDataSource(defaultReaderAttributes, file)) {
        OrcReader reader = new OrcReader(
                dataSource,
                ORC,
                defaultReaderAttributes.getMaxMergeDistance(),
                defaultReaderAttributes.getMaxReadSize(),
                defaultReaderAttributes.getTinyStripeThreshold(),
                HUGE_MAX_READ_BLOCK_SIZE);

        ImmutableList.Builder<ColumnStats> list = ImmutableList.builder();
        for (ColumnInfo info : getColumnInfo(reader)) {
            computeColumnStats(reader, info.getColumnId(), info.getType()).ifPresent(list::add);
        }
        return list.build();
    }
    catch (IOException e) {
        throw new PrestoException(RAPTOR_ERROR, "Failed to read file: " + file, e);
    }
}
private static Map<String, Integer> buildPhysicalNameOrdinalMap(OrcReader reader)
{
    ImmutableMap.Builder<String, Integer> physicalNameOrdinalMap = ImmutableMap.builder();

    int ordinal = 0;
    for (String physicalColumnName : reader.getColumnNames()) {
        physicalNameOrdinalMap.put(physicalColumnName, ordinal);
        ordinal++;
    }

    return physicalNameOrdinalMap.build();
}
private static Optional<OrcFileMetadata> getOrcFileMetadata(OrcReader reader)
{
    return Optional.ofNullable(reader.getFooter().getUserMetadata().get(OrcFileMetadata.KEY))
            .map(slice -> METADATA_CODEC.fromJson(slice.getBytes()));
}
public static OrcRecordReader createReaderNoRows(OrcDataSource dataSource)
        throws IOException
{
    OrcReader orcReader = new OrcReader(dataSource, ORC, new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE), new DataSize(1, MEGABYTE));

    assertEquals(orcReader.getColumnNames().size(), 0);

    return createRecordReader(orcReader, ImmutableMap.of());
}
public OrcRecordReader createRecordReader(
        Map<Integer, Type> includedColumns,
        OrcPredicate predicate,
        long offset,
        long length,
        DateTimeZone hiveStorageTimeZone,
        AggregatedMemoryContext systemMemoryUsage,
        int initialBatchSize)
{
    return new OrcRecordReader(
            requireNonNull(includedColumns, "includedColumns is null"),
            requireNonNull(predicate, "predicate is null"),
            footer.getNumberOfRows(),
            footer.getStripes(),
            footer.getFileStats(),
            metadata.getStripeStatsList(),
            orcDataSource,
            offset,
            length,
            footer.getTypes(),
            decompressor,
            footer.getRowsInRowGroup(),
            requireNonNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"),
            hiveWriterVersion,
            metadataReader,
            maxMergeDistance,
            tinyStripeThreshold,
            maxBlockSize,
            footer.getUserMetadata(),
            systemMemoryUsage,
            // ... remaining constructor arguments truncated in the original excerpt ...
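Since this overload drives split-based reads, here is a hedged sketch of how a caller might invoke it. The offset and length values are hypothetical placeholders for a split boundary, the column layout is assumed, and the sketch assumes the data source exposes its size via OrcDataSource.getSize().

// A hedged usage sketch: read one column from a single "split" of the file.
// A real caller would pass the byte range of the split it was assigned;
// stripes outside that range are not read.
OrcRecordReader splitReader = orcReader.createRecordReader(
        ImmutableMap.of(0, VARCHAR),   // column 0 as VARCHAR (placeholder layout)
        OrcPredicate.TRUE,
        0,                             // hypothetical split offset: start of file
        dataSource.getSize(),          // hypothetical split length: whole file
        DateTimeZone.UTC,
        newSimpleAggregatedMemoryContext(),
        INITIAL_BATCH_SIZE);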
private static List<HiveColumnHandle> getPhysicalHiveColumnHandles(List<HiveColumnHandle> columns, boolean useOrcColumnNames, OrcReader reader, Path path)
{
    if (!useOrcColumnNames) {
        return columns;
    }

    verifyFileHasColumnNames(reader.getColumnNames(), path);

    Map<String, Integer> physicalNameOrdinalMap = buildPhysicalNameOrdinalMap(reader);
    int nextMissingColumnIndex = physicalNameOrdinalMap.size();

    ImmutableList.Builder<HiveColumnHandle> physicalColumns = ImmutableList.builder();
    for (HiveColumnHandle column : columns) {
        Integer physicalOrdinal = physicalNameOrdinalMap.get(column.getName());
        if (physicalOrdinal == null) {
            // if the column is missing from the file, assign it a column number larger
            // than the number of columns in the file so the reader will fill it with nulls
            physicalOrdinal = nextMissingColumnIndex;
            nextMissingColumnIndex++;
        }
        physicalColumns.add(new HiveColumnHandle(column.getName(), column.getHiveType(), column.getTypeSignature(), physicalOrdinal, column.getColumnType(), column.getComment()));
    }
    return physicalColumns.build();
}
static void validateFile(
        OrcWriteValidation writeValidation,
        OrcDataSource input,
        List<Type> types,
        DateTimeZone hiveStorageTimeZone,
        OrcEncoding orcEncoding)
        throws OrcCorruptionException
{
    ImmutableMap.Builder<Integer, Type> readTypes = ImmutableMap.builder();
    for (int columnIndex = 0; columnIndex < types.size(); columnIndex++) {
        readTypes.put(columnIndex, types.get(columnIndex));
    }
    try {
        OrcReader orcReader = new OrcReader(
                input,
                orcEncoding,
                new DataSize(1, MEGABYTE),
                new DataSize(8, MEGABYTE),
                new DataSize(8, MEGABYTE),
                new DataSize(16, MEGABYTE),
                Optional.of(writeValidation));
        try (OrcRecordReader orcRecordReader = orcReader.createRecordReader(readTypes.build(), OrcPredicate.TRUE, hiveStorageTimeZone, newSimpleAggregatedMemoryContext(), INITIAL_BATCH_SIZE)) {
            while (orcRecordReader.nextBatch() >= 0) {
                // ignored
            }
        }
    }
    catch (IOException e) {
        throw new OrcCorruptionException(e, input.getId(), "Validation failed");
    }
}