@Test
public void testMapOriginalType() throws Exception {
  final String hiveColumnTypes = "map<string,string>";
  final String hiveColumnNames = "mapCol";
  final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);

  // this messageType has only one optional field, whose name is mapCol and whose original type is MAP
  assertEquals(1, messageTypeFound.getFieldCount());
  org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0);
  assertEquals("mapCol", topLevel.getName());
  assertEquals(OriginalType.MAP, topLevel.getOriginalType());
  assertEquals(Repetition.OPTIONAL, topLevel.getRepetition());

  // there is one repeated field under mapCol; its name is "map" and its original type is MAP_KEY_VALUE
  assertEquals(1, topLevel.asGroupType().getFieldCount());
  org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0);
  assertEquals("map", secondLevel.getName());
  assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType());
  assertEquals(Repetition.REPEATED, secondLevel.getRepetition());
}
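// Cross-check sketch (assumed scaffolding, not part of the original test): parse the
// schema the converter is expected to emit and compare it wholesale. The message name
// "hive_schema" and the key/value repetitions are assumptions about the converter's
// output; MessageTypeParser is the stock parquet-mr schema parser.
MessageType expected = MessageTypeParser.parseMessageType(
    "message hive_schema {\n" +
    "  optional group mapCol (MAP) {\n" +
    "    repeated group map (MAP_KEY_VALUE) {\n" +
    "      required binary key (UTF8);\n" +
    "      optional binary value (UTF8);\n" +
    "    }\n" +
    "  }\n" +
    "}");
assertEquals(expected, messageTypeFound);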
/**
 * Searches column names by index in a given Parquet file schema, and returns the
 * corresponding Parquet schema types.
 *
 * @param schema Message schema in which to search for column names.
 * @param colNames List of column names.
 * @param colIndexes List of column indexes.
 * @return A MessageType object containing the column names found.
 */
public static MessageType getSchemaByIndex(MessageType schema, List<String> colNames, List<Integer> colIndexes) {
  List<Type> schemaTypes = new ArrayList<Type>();
  for (Integer i : colIndexes) {
    if (i < colNames.size()) {
      if (i < schema.getFieldCount()) {
        schemaTypes.add(schema.getType(i));
      } else {
        // prefixing with '_mask_' to ensure no conflict with named
        // columns in the file schema
        schemaTypes.add(
            Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
      }
    }
  }
  return new MessageType(schema.getName(), schemaTypes);
}
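// Hypothetical usage of getSchemaByIndex: the schema string, column names, and
// indexes below are illustrative only. Index 2 is past the file schema's field
// count, so it comes back as an optional binary named "_mask_c".
MessageType schema = MessageTypeParser.parseMessageType(
    "message doc { optional binary a; optional int32 b; }");
MessageType projected = getSchemaByIndex(
    schema, Arrays.asList("a", "b", "c"), Arrays.asList(0, 2));
// projected: message doc { optional binary a; optional binary _mask_c; }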
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
      this.totalRowCount += block.getRowCount();
    }
  }
}
this.fileSchema = footer.getFileMetaData().getSchema();
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
if (rowsReturned != totalCountLoadedSoFar) return;
PageReadStore pages = reader.readNextRowGroup();
if (pages == null) {
  throw new IOException("expecting more rows but reached last block. Read "
      + rowsReturned + " out of " + totalRowCount);
}
List<ColumnDescriptor> columns = requestedSchema.getColumns();
List<Type> types = requestedSchema.getFields();
columnReaders = new VectorizedColumnReader[columns.size()];
// projected-read branch: reader i is mapped through colsToInclude into columnTypesList
for (int i = 0; i < types.size(); ++i) {
  columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(colsToInclude.get(i)),
      types.get(i), pages, requestedSchema.getColumns(), skipTimestampConversion, 0);
}
// read-all branch: the type list indexes columnTypesList directly
for (int i = 0; i < types.size(); ++i) {
  columnReaders[i] = buildVectorizedParquetReader(columnTypesList.get(i), types.get(i), pages,
      requestedSchema.getColumns(), skipTimestampConversion, 0);
}
String name = fullSchema.getName();
// keep only the timestamp field and the metrics fields from the full schema
List<Type> partialFields = new ArrayList<>();
for (Type type : fullSchema.getFields()) {
  if (tsField.equals(type.getName()) || metricsFields.contains(type.getName())) {
    partialFields.add(type);
  }
}
return new MessageType(name, partialFields);
ParquetMetadata footer = readFooter(config, file, range(0, length));
List<BlockMetaData> blocks = footer.getBlocks();
this.fileSchema = footer.getFileMetaData().getSchema();
Types.MessageTypeBuilder builder = Types.buildMessage();
for (String s : columns) {
  if (!fileSchema.containsField(s)) {
    throw new IOException("Can only project existing columns. Unknown field: " + s
        + " File schema:\n" + fileSchema);
  }
  builder.addFields(fileSchema.getType(s));
}
this.reader = new ParquetFileReader(
    config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : blocks) {
  this.totalRowCount += block.getRowCount();
}
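// Presumed completion (not shown in the fragment): the projection builder still
// needs a message name before it yields a MessageType, and reusing the file
// schema's name is the natural choice; this would define the requestedSchema
// consulted when constructing the reader above.
this.requestedSchema = builder.named(fileSchema.getName());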
  footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
  MessageType fileSchema = footer.getFileMetaData().getSchema();
  FilterCompat.Filter filter = getFilter(configuration);
  blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
  footer = readFooter(configuration, file, NO_FILTER);
  Set<Long> offsets = new HashSet<>();
  for (long offset : rowGroupOffsets) {
    offsets.add(offset);
  }
  for (BlockMetaData block : footer.getBlocks()) {
    if (offsets.contains(block.getStartingPos())) {
      blocks.add(block);
    }
  }
}
this.fileSchema = footer.getFileMetaData().getSchema();
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
    configuration, toSetMultiMap(fileMetadata), fileSchema));
this.requestedSchema = readContext.getRequestedSchema();
String sparkRequestedSchemaString =
    configuration.get(ParquetReadSupport$.MODULE$.SPARK_ROW_REQUESTED_SCHEMA());
this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString);
this.reader = new ParquetFileReader(
    configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
for (BlockMetaData block : blocks) {
  this.totalRowCount += block.getRowCount();
}
private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }
    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: "
            + Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
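// Standalone sketch of the containsPath/getColumnDescription checks used above;
// the schema string and column names are illustrative only.
MessageType fileSchema = MessageTypeParser.parseMessageType(
    "message spark_schema { optional int32 id; }");
fileSchema.containsPath(new String[] {"id"});    // true
fileSchema.containsPath(new String[] {"name"});  // false -> treated as a missing column
ColumnDescriptor fd = fileSchema.getColumnDescription(new String[] {"id"});
fd.getMaxDefinitionLevel();                      // 1, since "id" is optional (nullable)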
try {
  this.operatorContext = context;
  schema = footer.getFileMetaData().getSchema();
  Schema arrowSchema;
  try {
    arrowSchema = DremioArrowSchema.fromMetaData(footer.getFileMetaData().getKeyValueMetaData());
  } catch (Exception e) {
    // no Arrow schema in the footer's key-value metadata; fall back to the Parquet schema
    arrowSchema = null;
  }
  for (ColumnChunkMetaData md : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    paths.put(md.getPath(), md);
  }
  // tail of a truncated call in the original fragment:
  //     fileSystem, filePath, inputStreamProvider);
  for (String[] path : schema.getPaths()) {
    Type type = schema.getType(path);
    if (type.isPrimitive()) {
      ColumnChunkMetaData md = paths.get(ColumnPath.get(path));
      pageReadStore.addColumn(schema.getColumnDescription(path), md);
    }
  }
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  this.footer = readFooter(file, options, f, converter);
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
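// Minimal open sketch using the InputFile-based constructor above; the path is a
// placeholder, and HadoopInputFile/ParquetReadOptions ship with parquet-hadoop.
InputFile in = HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), new Configuration());
try (ParquetFileReader r = new ParquetFileReader(in, ParquetReadOptions.builder().build())) {
  MessageType schema = r.getFooter().getFileMetaData().getSchema();
  System.out.println(schema);
}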
final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

final ReadSupport.ReadContext readContext = new DataWritableReadSupport()
    .init(new InitContext(jobConf, null, fileMetaData.getSchema()));
schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
    .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
final long splitStart = ((FileSplit) oldSplit).getStart();
final long splitLength = ((FileSplit) oldSplit).getLength();

FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
if (filter != null) {
  filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
  if (filtedBlocks.isEmpty()) {
    LOG.debug("All row groups are dropped due to filter predicates");
  }
}

return new ParquetInputSplit(finalPath,
    splitStart,
    splitLength,
    oldSplit.getLocations(),
    filtedBlocks,
    readContext.getRequestedSchema().toString(),
    fileMetaData.getSchema().toString(),
    fileMetaData.getKeyValueMetaData(),
    readContext.getReadSupportMetadata());
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
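// Hedged driver for the accumulator above: read a footer (the file name is a
// placeholder) and feed it in; this readFooter overload is the classic, now
// deprecated, parquet-hadoop API.
ParquetMetadata footer = ParquetFileReader.readFooter(
    new Configuration(), new Path("/tmp/data.parquet"), ParquetMetadataConverter.NO_FILTER);
add(footer);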
@Test
public void testParquetTupleDomainPrimitiveArray() {
  HiveColumnHandle columnHandle = new HiveColumnHandle("my_array", HiveType.valueOf("array<int>"),
      parseTypeSignature(StandardTypes.ARRAY), 0, REGULAR, Optional.empty());
  TupleDomain<HiveColumnHandle> domain = withColumnDomains(
      ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(INTEGER))));

  MessageType fileSchema = new MessageType("hive_schema",
      new GroupType(OPTIONAL, "my_array",
          new GroupType(REPEATED, "bag",
              new PrimitiveType(OPTIONAL, INT32, "array_element"))));
  Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);

  TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain);
  assertTrue(tupleDomain.getDomains().get().isEmpty());
}
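// The same file schema written with the fluent Types builder instead of nested
// GroupType constructors (assuming the test's static import of INT32); both
// produce an identical MessageType.
MessageType viaBuilder = Types.buildMessage()
    .optionalGroup()
      .repeatedGroup()
        .optional(INT32).named("array_element")
      .named("bag")
    .named("my_array")
    .named("hive_schema");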
/**
 * Maps full column paths to the ColumnDescriptors in a file schema.
 *
 * @param footer Parquet file metadata
 * @return map from full column path to ColumnDescriptor object
 */
public static Map<String, ColumnDescriptor> getColNameToColumnDescriptorMapping(ParquetMetadata footer) {
  Map<String, ColumnDescriptor> colDescMap = new HashMap<>();
  List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();
  for (ColumnDescriptor column : columns) {
    colDescMap.put(getFullColumnPath(column), column);
  }
  return colDescMap;
}
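// Illustrative walk over the mapping; the file path is a placeholder, and the map
// keys are whatever getFullColumnPath produces for each descriptor.
ParquetMetadata footer = ParquetFileReader.readFooter(
    new Configuration(), new Path("/tmp/data.parquet"), ParquetMetadataConverter.NO_FILTER);
for (Map.Entry<String, ColumnDescriptor> e : getColNameToColumnDescriptorMapping(footer).entrySet()) {
  System.out.println(e.getKey() + " -> " + Arrays.toString(e.getValue().getPath()));
}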
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
// variant of the initialize(...) above: total comes from getFilteredRecordCount(),
// so records dropped by row-group and page-level filtering are excluded from the count
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
@Override
public ParquetValueWriter<?> primitive(PrimitiveType primitive) {
  ColumnDescriptor desc = type.getColumnDescription(currentPath());

  if (primitive.getOriginalType() != null) {
    switch (primitive.getOriginalType()) {
      case ENUM:
      case JSON:
        return ParquetValueWriters.unboxed(desc);
      case DECIMAL:
        DecimalMetadata decimal = primitive.getDecimalMetadata();
        switch (primitive.getPrimitiveTypeName()) {
          case INT32:
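// Side sketch of the DECIMAL branch above: precision and scale come from the
// primitive type's DecimalMetadata (the schema string is illustrative).
MessageType schema = MessageTypeParser.parseMessageType(
    "message m { optional fixed_len_byte_array(16) amount (DECIMAL(38,10)); }");
PrimitiveType amount = schema.getType("amount").asPrimitiveType();
DecimalMetadata dm = amount.getDecimalMetadata();
int precision = dm.getPrecision();  // 38
int scale = dm.getScale();          // 10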
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) return;
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.asGroupType().getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  for (int i = 0; i < columns.size(); ++i) {
    if (missingColumns[i]) continue;
    columnReaders[i] = new VectorizedColumnReader(columns.get(i), types.get(i).getOriginalType(),
        pages.getPageReader(columns.get(i)), convertTz);
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
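// Generic row-group loop around readNextRowGroup(), independent of the vectorized
// readers; reader and requestedSchema mirror the fields used in the method above.
PageReadStore pages;
while ((pages = reader.readNextRowGroup()) != null) {
  for (ColumnDescriptor col : requestedSchema.getColumns()) {
    PageReader pageReader = pages.getPageReader(col);
    // hand pageReader to a per-column decoder here
  }
}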
"Failed to find related Parquet column descriptor with type " + type); if (fileSchema.getColumns().contains(descriptors.get(0))) { return new VectorizedPrimitiveColumnReader(descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo); List<VectorizedColumnReader> fieldReaders = new ArrayList<>(); List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos(); List<Type> types = type.asGroupType().getFields(); for (int i = 0; i < fieldTypes.size(); i++) { VectorizedColumnReader r = while (groupType.getFieldCount() < 2) { if (nestGroup > MAP_DEFINITION_LEVEL_MAX) { throw new RuntimeException( "Failed to get the field types for Map with type " + type); groupType = groupType.getFields().get(0).asGroupType(); nestGroup++;
private Map<Integer, Converter> buildFieldToConverter(final MessageType schema) {
  final Map<Integer, Converter> fieldToConverter = new HashMap<>(fieldCount);
  int i = 0;
  for (final Type field : schema.getFields()) {
    if (field.isPrimitive()) {
      fieldToConverter.put(i, new PrimitiveConverter(parquetColumnToObject,
          field.asPrimitiveType().getPrimitiveTypeName().javaType.getSimpleName(),
          new String[]{field.getName()}, field.getOriginalType()));
    } else {
      fieldToConverter.put(i, new BypassGroupConverter(parquetColumnToObject,
          field.asGroupType(), new String[]{field.getName()}));
    }
    i++;
  }
  return fieldToConverter;
}
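// Hypothetical driver for buildFieldToConverter: the schema is illustrative, and
// PrimitiveConverter/BypassGroupConverter are the project-local classes referenced
// above rather than standard parquet-mr types.
MessageType schema = MessageTypeParser.parseMessageType(
    "message rec { optional int64 id; optional group tags { repeated binary tag; } }");
Map<Integer, Converter> fieldToConverter = buildFieldToConverter(schema);
Converter idConverter = fieldToConverter.get(0);   // primitive field -> PrimitiveConverter
Converter tagsConverter = fieldToConverter.get(1); // group field -> BypassGroupConverter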