@Test
public void testMapOriginalType() throws Exception {
  final String hiveColumnTypes = "map<string,string>";
  final String hiveColumnNames = "mapCol";
  final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);

  // this messageType has only one optional field, whose name is mapCol and whose original type is MAP
  assertEquals(1, messageTypeFound.getFieldCount());
  org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0);
  assertEquals("mapCol", topLevel.getName());
  assertEquals(OriginalType.MAP, topLevel.getOriginalType());
  assertEquals(Repetition.OPTIONAL, topLevel.getRepetition());

  // there is one repeated field under mapCol; its name is "map" and its original type is MAP_KEY_VALUE
  assertEquals(1, topLevel.asGroupType().getFieldCount());
  org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0);
  assertEquals("map", secondLevel.getName());
  assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType());
  assertEquals(Repetition.REPEATED, secondLevel.getRepetition());
}
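// Cross-check sketch (assumed scaffolding, not part of the original test): parse the
// schema the converter is expected to emit and compare it wholesale. The message name
// "hive_schema" and the key/value repetitions are assumptions about the converter's
// output; MessageTypeParser is the stock parquet-mr schema parser.
MessageType expected = MessageTypeParser.parseMessageType(
    "message hive_schema {\n" +
    "  optional group mapCol (MAP) {\n" +
    "    repeated group map (MAP_KEY_VALUE) {\n" +
    "      required binary key (UTF8);\n" +
    "      optional binary value (UTF8);\n" +
    "    }\n" +
    "  }\n" +
    "}");
assertEquals(expected, messageTypeFound);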
/**
 * Searches column names by index in a given Parquet file schema, and returns the
 * corresponding Parquet schema types.
 *
 * @param schema Message schema in which to search for column names.
 * @param colNames List of column names.
 * @param colIndexes List of column indexes.
 * @return A MessageType object containing the column names found.
 */
public static MessageType getSchemaByIndex(MessageType schema, List<String> colNames, List<Integer> colIndexes) {
  List<Type> schemaTypes = new ArrayList<Type>();
  for (Integer i : colIndexes) {
    if (i < colNames.size()) {
      if (i < schema.getFieldCount()) {
        schemaTypes.add(schema.getType(i));
      } else {
        // prefixing with '_mask_' to ensure no conflict with named
        // columns in the file schema
        schemaTypes.add(
            Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
      }
    }
  }
  return new MessageType(schema.getName(), schemaTypes);
}
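// Hypothetical usage of getSchemaByIndex: the schema string, column names, and
// indexes below are illustrative only. Index 2 is past the file schema's field
// count, so it comes back as an optional binary named "_mask_c".
MessageType schema = MessageTypeParser.parseMessageType(
    "message doc { optional binary a; optional int32 b; }");
MessageType projected = getSchemaByIndex(
    schema, Arrays.asList("a", "b", "c"), Arrays.asList(0, 2));
// projected: message doc { optional binary a; optional binary _mask_c; }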
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
      this.totalRowCount += block.getRowCount();
    }
  }
}
this.fileSchema = footer.getFileMetaData().getSchema();
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
if (rowsReturned != totalCountLoadedSoFar) return;
PageReadStore pages = reader.readNextRowGroup();
if (pages == null) {
  throw new IOException("expecting more rows but reached last block. Read "
      + rowsReturned + " out of " + totalRowCount);
}
List<ColumnDescriptor> columns = requestedSchema.getColumns();
List<Type> types = requestedSchema.getFields();
columnReaders = new VectorizedColumnReader[columns.size()];
// projected-read branch: reader i is mapped through colsToInclude into columnTypesList
for (int i = 0; i < types.size(); ++i) {
  columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)),
      types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
}
// read-all branch: the type list indexes columnTypesList directly
for (int i = 0; i < types.size(); ++i) {
  columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i), types.get(i), pages,
      requestedSchema.getColumns(), skipTimestampConversion, 0);
}
String name = fullSchema.getName();
// keep only the timestamp field and the metrics fields from the full schema
List<Type> partialFields = new ArrayList<>();
for (Type type : fullSchema.getFields()) {
  if (tsField.equals(type.getName()) || metricsFields.contains(type.getName())) {
    partialFields.add(type);
  }
}
return new MessageType(name, partialFields);
ParquetMetadata footer = readFooter(config, file, range(0, length));
List<BlockMetaData> blocks = footer.getBlocks();
this.fileSchema = footer.getFileMetaData().getSchema();
Types.MessageTypeBuilder builder = Types.buildMessage();
for (String s : columns) {
  if (!fileSchema.containsField(s)) {
    throw new IOException("Can only project existing columns. Unknown field: " + s
        + " File schema:\n" + fileSchema);
  }
  builder.addFields(fileSchema.getType(s));
}
this.reader = new ParquetFileReader(
    config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : blocks) {
  this.totalRowCount += block.getRowCount();
}
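// Presumed completion (not shown in the fragment): the projection builder still
// needs a message name before it yields a MessageType, and reusing the file
// schema's name is the natural choice; this would define the requestedSchema
// consulted when constructing the reader above.
this.requestedSchema = builder.named(fileSchema.getName());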
  footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  footer = readFooter(configuration, file, NO_FILTER);
  Set<Long> offsets = new HashSet<>();
  for (long offset : rowGroupOffsets) {
    offsets.add(offset);
  }
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
    }
  }
}
this.fileSchema = footer.getFileMetaData().getSchema();
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
    configuration, toSetMultiMap(fileMetadata), fileSchema));
this.requestedSchema = readContext.getRequestedSchema();
String sparkRequestedSchemaString =
    configuration.get(ParquetReadSupport$.MODULE$.SPARK_ROW_REQUESTED_SCHEMA());
this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString);
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : blocks) {
  this.totalRowCount += block.getRowCount();
}
private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }
    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: "
            + Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
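// Standalone sketch of the containsPath/getColumnDescription checks used above;
// the schema string and column names are illustrative only.
MessageType fileSchema = MessageTypeParser.parseMessageType(
    "message spark_schema { optional int32 id; }");
fileSchema.containsPath(new String[] {"id"});    // true
fileSchema.containsPath(new String[] {"name"});  // false -> treated as a missing column
ColumnDescriptor fd = fileSchema.getColumnDescription(new String[] {"id"});
fd.getMaxDefinitionLevel();                      // 1, since "id" is optional (nullable)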
try {
  this.operatorContext = context;
  schema = footer.getFileMetaData().getSchema();
  Schema arrowSchema;
  try {
    arrowSchema = DremioArrowSchema.fromMetaData(footer.getFileMetaData().getKeyValueMetaData());
  } catch (Exception e) {
    // no Arrow schema in the footer's key-value metadata; fall back to the Parquet schema
    arrowSchema = null;
  }
  for (ColumnChunkMetaData md : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    paths.put(md.getPath(), md);
  }
  // tail of a truncated call in the original fragment:
  //     fileSystem, filePath, inputStreamProvider);
  for (String[] path : schema.getPaths()) {
    Type type = schema.getType(path);
    if (type.isPrimitive()) {
      ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
      pageReadStore.addColumn(schema.getColumnDescription(path), md);
    }
  }
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  this.footer = readFooter(file, options, f, converter);
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
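// Minimal open sketch using the InputFile-based constructor above; the path is a
// placeholder, and HadoopInputFile/ParquetReadOptions ship with parquet-hadoop.
InputFile in = HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), new Configuration());
try (ParquetFileReader r = new ParquetFileReader(in, ParquetReadOptions.builder().build())) {
  MessageType schema = r.getFooter().getFileMetaData().getSchema();
  System.out.println(schema);
}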
final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

final ReadSupport.ReadContext readContext = new DataWritableReadSupport()
    .init(new InitContext(jobConf, null, fileMetaData.getSchema()));
schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
    .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
final long splitStart = ((FileSplit) oldSplit).getStart();
final long splitLength = ((FileSplit) oldSplit).getLength();

FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
if (filter != null) {
  filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
  if (filtedBlocks.isEmpty()) {
    LOG.debug("All row groups are dropped due to filter predicates");
  }
}

return new ParquetInputSplit(finalPath,
    splitStart,
    splitLength,
    oldSplit.getLocations(),
    filtedBlocks,
    readContext.getRequestedSchema().toString(),
    fileMetaData.getSchema().toString(),
    fileMetaData.getKeyValueMetaData(),
    readContext.getReadSupportMetadata());
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
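// Hedged driver for the accumulator above: read a footer (the file name is a
// placeholder) and feed it in; this readFooter overload is the classic, now
// deprecated, parquet-hadoop API.
ParquetMetadata footer = ParquetFileReader.readFooter(
    new Configuration(), new Path("/tmp/data.parquet"), ParquetMetadataConverter.NO_FILTER);
add(footer);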
@Test
public void testParquetTupleDomainPrimitiveArray() {
  HiveColumnHandle columnHandle = new HiveColumnHandle("my_array", HiveType.valueOf("array<int>"),
      parseTypeSignature(StandardTypes.ARRAY), 0, REGULAR, Optional.empty());
  TupleDomain<HiveColumnHandle> domain = withColumnDomains(
      ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(INTEGER))));

  MessageType fileSchema = new MessageType("hive_schema",
      new GroupType(OPTIONAL, "my_array",
          new GroupType(REPEATED, "bag",
              new PrimitiveType(OPTIONAL, INT32, "array_element"))));
  Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);

  TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain);
  assertTrue(tupleDomain.getDomains().get().isEmpty());
}
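// The same file schema written with the fluent Types builder instead of nested
// GroupType constructors (assuming the test's static import of INT32); both
// produce an identical MessageType.
MessageType viaBuilder = Types.buildMessage()
    .optionalGroup()
      .repeatedGroup()
        .optional(INT32).named("array_element")
      .named("bag")
    .named("my_array")
    .named("hive_schema");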
/**
 * Maps full column paths to the ColumnDescriptors in a file schema.
 *
 * @param footer Parquet file metadata
 * @return map from full column path to ColumnDescriptor object
 */
public static Map<String, ColumnDescriptor> getColNameToColumnDescriptorMapping(ParquetMetadata footer) {
  Map<String, ColumnDescriptor> colDescMap = new HashMap<>();
  List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
  for (ColumnDescriptor column : columns) {
    colDescMap.put(getFullColumnPath(column), column);
  }
  return colDescMap;
}
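// Illustrative walk over the mapping; the file path is a placeholder, and the map
// keys are whatever getFullColumnPath produces for each descriptor.
ParquetMetadata footer = ParquetFileReader.readFooter(
    new Configuration(), new Path("/tmp/data.parquet"), ParquetMetadataConverter.NO_FILTER);
for (Map.Entry<String, ColumnDescriptor> e : getColNameToColumnDescriptorMapping(footer).entrySet()) {
  System.out.println(e.getKey() + " -> " + Arrays.toString(e.getValue().getPath()));
}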
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
// variant of the initialize(...) above: total comes from getFilteredRecordCount(),
// so records dropped by row-group and page-level filtering are excluded from the count
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
@Override
public ParquetValueWriter<?> primitive(PrimitiveType primitive) {
  ColumnDescriptor desc = type.getColumnDescription(currentPath());

  if (primitive.getOriginalType() != null) {
    switch (primitive.getOriginalType()) {
      case ENUM:
      case JSON:
        return ParquetValueWriters.unboxed(desc);
      case DECIMAL:
        DecimalMetadata decimal = primitive.getDecimalMetadata();
        switch (primitive.getPrimitiveTypeName()) {
          case INT32:
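// Side sketch of the DECIMAL branch above: precision and scale come from the
// primitive type's DecimalMetadata (the schema string is illustrative).
MessageType schema = MessageTypeParser.parseMessageType(
    "message m { optional fixed_len_byte_array(16) amount (DECIMAL(38,10)); }");
PrimitiveType amount = schema.getType("amount").asPrimitiveType();
DecimalMetadata dm = amount.getDecimalMetadata();
int precision = dm.getPrecision();  // 38
int scale = dm.getScale();          // 10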
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) return;
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.asGroupType().getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  for (int i = 0; i < columns.size(); ++i) {
    if (missingColumns[i]) continue;
    columnReaders[i] = new VectorizedColumnReader(columns.get(i), types.get(i).getOriginalType(),
        pages.getPageReader(columns.get(i)), convertTz);
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
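// Generic row-group loop around readNextRowGroup(), independent of the vectorized
// readers; reader and requestedSchema mirror the fields used in the method above.
PageReadStore pages;
while ((pages = reader.readNextRowGroup()) != null) {
  for (ColumnDescriptor col : requestedSchema.getColumns()) {
    PageReader pageReader = pages.getPageReader(col);
    // hand pageReader to a per-column decoder here
  }
}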
"Failed to find related Parquet column descriptor with type " + type); if (fileSchema.getColumns().contains(descriptors.get(0))) { return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo); List<VectorizedColumnReader> fieldReaders = new ArrayList<>(); List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos(); List<Type> types = type.asGroupType().getFields(); for (int i = 0; i < fieldTypes.size(); i++) { VectorizedColumnReader r = while (groupType.getFieldCount() < 2) { if (nestGroup > MAP_DEFINITION_LEVEL_MAX) { throw new RuntimeException( "Failed to get the field types for Map with type " + type); groupType = groupType.getFields().get(0).asGroupType(); nestGroup++;
private Map<Integer, Converter> buildFieldToConverter(final MessageType schema) {
  final Map<Integer, Converter> fieldToConverter = new HashMap<>(fieldCount);
  int i = 0;
  for (final Type field : schema.getFields()) {
    if (field.isPrimitive()) {
      fieldToConverter.put(i, new PrimitiveConverter(parquetColumnToObject,
          field.asPrimitiveType().getPrimitiveTypeName().javaType.getSimpleName(),
          new String[]{field.getName()}, field.getOriginalType()));
    } else {
      fieldToConverter.put(i, new BypassGroupConverter(parquetColumnToObject,
          field.asGroupType(), new String[]{field.getName()}));
    }
    i++;
  }
  return fieldToConverter;
}
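// Hypothetical driver for buildFieldToConverter: the schema is illustrative, and
// PrimitiveConverter/BypassGroupConverter are the project-local classes referenced
// above rather than standard parquet-mr types.
MessageType schema = MessageTypeParser.parseMessageType(
    "message rec { optional int64 id; optional group tags { repeated binary tag; } }");
Map<Integer, Converter> fieldToConverter = buildFieldToConverter(schema);
Converter idConverter = fieldToConverter.get(0);   // primitive field -> PrimitiveConverter
Converter tagsConverter = fieldToConverter.get(1); // group field -> BypassGroupConverter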