/**
 * Resolves {@code requestedSchema} against {@code fileSchema} and returns the
 * leaf (primitive) columns of the resulting column I/O tree.
 */
public static List<PrimitiveColumnIO> getColumns(MessageType fileSchema, MessageType requestedSchema)
{
    MessageColumnIO messageColumnIO = new ColumnIOFactory().getColumnIO(requestedSchema, fileSchema, true);
    return messageColumnIO.getLeaves();
}
/**
 * Descends through single-child wrapper groups (the annotation levels of a Parquet MAP)
 * until reaching a group with more than one child — the key_value group.
 */
public static GroupColumnIO getMapKeyValueColumn(GroupColumnIO groupColumnIO)
{
    GroupColumnIO current = groupColumnIO;
    while (current.getChildrenCount() == 1) {
        current = (GroupColumnIO) current.getChild(0);
    }
    return current;
}
/**
 * Builds the column I/O tree for {@code requestedSchema} resolved against {@code fileSchema}.
 */
public static MessageColumnIO getColumnIO(MessageType fileSchema, MessageType requestedSchema)
{
    return new ColumnIOFactory().getColumnIO(requestedSchema, fileSchema, true);
}
boolean required = columnIO.getType().getRepetition() != OPTIONAL; int repetitionLevel = columnIO.getRepetitionLevel(); int definitionLevel = columnIO.getDefinitionLevel(); if (ROW.equals(type.getTypeSignature().getBase())) { GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; NamedTypeSignature namedTypeSignature = fields.get(i).getNamedTypeSignature(); String name = namedTypeSignature.getName().get().toLowerCase(Locale.ENGLISH); Optional<Field> field = constructField(parameters.get(i), lookupColumnByName(groupColumnIO, name)); structHasParameters |= field.isPresent(); fieldsBuilder.add(field); MapType mapType = (MapType) type; GroupColumnIO keyValueColumnIO = getMapKeyValueColumn(groupColumnIO); if (keyValueColumnIO.getChildrenCount() != 2) { return Optional.empty(); Optional<Field> keyField = constructField(mapType.getKeyType(), keyValueColumnIO.getChild(0)); Optional<Field> valueField = constructField(mapType.getValueType(), keyValueColumnIO.getChild(1)); return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(keyField, valueField))); GroupColumnIO groupColumnIO = (GroupColumnIO) columnIO; List<Type> types = type.getTypeParameters(); if (groupColumnIO.getChildrenCount() != 1) { return Optional.empty(); Optional<Field> field = constructField(types.get(0), getArrayElementColumn(groupColumnIO.getChild(0))); return Optional.of(new GroupField(type, repetitionLevel, definitionLevel, required, ImmutableList.of(field)));
/**
 * Finds the column holding the element values of a Parquet LIST.
 */
public static ColumnIO getArrayElementColumn(ColumnIO columnIO)
{
    // Descend through wrapper groups until the repeated level is reached
    ColumnIO current = columnIO;
    while (current instanceof GroupColumnIO && !current.getType().isRepetition(REPEATED)) {
        current = ((GroupColumnIO) current).getChild(0);
    }

    /* If array has a standard 3-level structure with middle level repeated group with a single field:
     *  optional group my_list (LIST) {
     *     repeated group element {
     *        required binary str (UTF8);
     *     };
     *  }
     * then unwrap one more level to reach the element field.
     * The "array" / "<parent>_tuple" names are legacy 2-level markers and must not be unwrapped.
     */
    if (current instanceof GroupColumnIO
            && current.getType().getOriginalType() == null
            && ((GroupColumnIO) current).getChildrenCount() == 1
            && !current.getName().equals("array")
            && !current.getName().equals(current.getParent().getName() + "_tuple")) {
        return ((GroupColumnIO) current).getChild(0);
    }

    /* Backward-compatibility support for 2-level arrays where a repeated field is not a group:
     *   optional group my_list (LIST) {
     *      repeated int32 element;
     *   }
     */
    return current;
}
/**
 * Maps a parquet-format {@link Encoding} to the reader's internal {@link ParquetEncoding}.
 *
 * @throws ParquetDecodingException if the encoding is not supported by this reader
 */
public static ParquetEncoding getParquetEncoding(Encoding encoding)
{
    switch (encoding) {
        case PLAIN:
            return ParquetEncoding.PLAIN;
        case RLE:
            return ParquetEncoding.RLE;
        case BIT_PACKED:
            return ParquetEncoding.BIT_PACKED;
        case PLAIN_DICTIONARY:
            return ParquetEncoding.PLAIN_DICTIONARY;
        case DELTA_BINARY_PACKED:
            return ParquetEncoding.DELTA_BINARY_PACKED;
        case DELTA_LENGTH_BYTE_ARRAY:
            return ParquetEncoding.DELTA_LENGTH_BYTE_ARRAY;
        case DELTA_BYTE_ARRAY:
            return ParquetEncoding.DELTA_BYTE_ARRAY;
        case RLE_DICTIONARY:
            return ParquetEncoding.RLE_DICTIONARY;
        default:
            throw new ParquetDecodingException("Unsupported Parquet encoding: " + encoding);
    }
}
/**
 * Parquet column names are case-sensitive unlike Hive, which converts all column names to lowercase.
 * Therefore, when we look up columns we first check for exact match, and if that fails
 * we look for a case-insensitive match.
 *
 * @return the matching child column, or {@code null} if none matches
 */
public static ColumnIO lookupColumnByName(GroupColumnIO groupColumnIO, String columnName)
{
    // Exact (case-sensitive) match takes precedence
    ColumnIO exactMatch = groupColumnIO.getChild(columnName);
    if (exactMatch != null) {
        return exactMatch;
    }
    // Fall back to a case-insensitive scan of the children
    for (int childIndex = 0; childIndex < groupColumnIO.getChildrenCount(); childIndex++) {
        ColumnIO child = groupColumnIO.getChild(childIndex);
        if (child.getName().equalsIgnoreCase(columnName)) {
            return child;
        }
    }
    return null;
}
// Creates one primitive column reader per leaf column, indexed by the column's id.
private void initializeColumnReaders()
{
    for (PrimitiveColumnIO columnIO : columns) {
        RichColumnDescriptor descriptor = new RichColumnDescriptor(
                columnIO.getColumnDescriptor(),
                columnIO.getType().asPrimitiveType());
        columnReaders[columnIO.getId()] = PrimitiveColumnReader.createReader(descriptor);
    }
}
/**
 * Looks up the column descriptor for the given nested path among the leaf columns.
 *
 * @return the descriptor, or {@link Optional#empty()} if no column matches the path
 */
public static Optional<RichColumnDescriptor> getDescriptor(List<PrimitiveColumnIO> columns, List<String> path)
{
    checkArgument(path.size() >= 1, "Parquet nested path should have at least one component");
    int index = getPathIndex(columns, path);
    if (index < 0) {
        return Optional.empty();
    }
    PrimitiveColumnIO columnIO = columns.get(index);
    RichColumnDescriptor descriptor = new RichColumnDescriptor(columnIO.getColumnDescriptor(), columnIO.getType().asPrimitiveType());
    return Optional.of(descriptor);
}
/**
 * Returns the index of the leaf column whose path matches {@code path} case-insensitively,
 * or -1 if no column matches. When several columns match, the last one wins
 * (the full list is always scanned, matching historical behavior).
 */
private static int getPathIndex(List<PrimitiveColumnIO> columns, List<String> path)
{
    int maxLevel = path.size();
    int index = -1;
    for (int columnIndex = 0; columnIndex < columns.size(); columnIndex++) {
        ColumnIO[] fields = columns.get(columnIndex).getPath();
        // fields[0] is the message root, so path component i corresponds to fields[i + 1]
        if (fields.length <= maxLevel) {
            continue;
        }
        if (!fields[maxLevel].getName().equalsIgnoreCase(path.get(maxLevel - 1))) {
            continue;
        }
        boolean match = true;
        for (int level = 0; match && level < maxLevel - 1; level++) {
            match = fields[level + 1].getName().equalsIgnoreCase(path.get(level));
        }
        if (match) {
            index = columnIndex;
        }
    }
    return index;
}
/**
 * Creates a reader over the given row groups of {@code dataSource}, reading the
 * leaf columns described by {@code messageColumnIO}.
 */
public ParquetReader(MessageColumnIO messageColumnIO, List<BlockMetaData> blocks, ParquetDataSource dataSource, AggregatedMemoryContext systemMemoryContext)
{
    // Fix: blocks and messageColumnIO were previously not null-checked, unlike the
    // other constructor parameters; fail fast with a consistent message instead.
    requireNonNull(messageColumnIO, "messageColumnIO is null");
    this.blocks = requireNonNull(blocks, "blocks is null");
    this.dataSource = requireNonNull(dataSource, "dataSource is null");
    this.systemMemoryContext = requireNonNull(systemMemoryContext, "systemMemoryContext is null");
    this.currentRowGroupMemoryContext = systemMemoryContext.newAggregatedMemoryContext();
    columns = messageColumnIO.getLeaves();
    columnReaders = new PrimitiveColumnReader[columns.size()];
}
fieldsBuilder.add(constructField(type, lookupColumnByName(messageColumnIO, columnName)));
// Reads the next int from the decoder, translating the checked IOException
// into the unchecked ParquetDecodingException used throughout this reader.
// NOTE(review): the trailing '}' closes the enclosing class, whose declaration is outside this view.
private int readInt() { try { return decoder.readInt(); } catch (IOException e) { throw new ParquetDecodingException(e); } } }
// Reads one repetition/definition level from the delegate reader, translating the
// checked IOException into ParquetDecodingException.
// NOTE(review): the trailing '}' closes the enclosing class, whose declaration is outside this view.
@Override public int readLevel() { try { return delegate.readInt(); } catch (IOException e) { throw new ParquetDecodingException(e); } } }
// Creates and initializes the values reader for a data page: dictionary-based when the
// encoding uses one (failing if no dictionary page was read), plain otherwise.
// initFromPage positions the reader at `offset` within `bytes` for `valueCount` values.
// NOTE(review): the trailing '}' closes the enclosing class, whose declaration is outside this view.
private ValuesReader initDataReader(ParquetEncoding dataEncoding, byte[] bytes, int offset, int valueCount) { ValuesReader valuesReader; if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException("Dictionary is missing for Page"); } valuesReader = dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary); } else { valuesReader = dataEncoding.getValuesReader(columnDescriptor, VALUES); } try { valuesReader.initFromPage(valueCount, bytes, offset); return valuesReader; } catch (IOException e) { throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e); } } }
/**
 * Returns the maximum level for the given values type: the column's max repetition or
 * definition level, or 1 for boolean VALUES (booleans are RLE-encoded like levels).
 *
 * @throws ParquetDecodingException for VALUES of any non-boolean type
 */
static int getMaxLevel(ColumnDescriptor descriptor, ValuesType valuesType)
{
    switch (valuesType) {
        case REPETITION_LEVEL:
            return descriptor.getMaxRepetitionLevel();
        case DEFINITION_LEVEL:
            return descriptor.getMaxDefinitionLevel();
        case VALUES:
            if (descriptor.getType() == BOOLEAN) {
                return 1;
            }
            // deliberate fall-through: non-boolean VALUES are unsupported here
        default:
            throw new ParquetDecodingException("Unsupported values type: " + valuesType);
    }
}
/**
 * Attaches a page reader for the current row group and eagerly decodes its
 * dictionary page (if any) so data pages can resolve dictionary-encoded values.
 *
 * @throws ParquetDecodingException if the dictionary page cannot be decoded
 * @throws IllegalArgumentException if the page reader reports no values
 */
public void setPageReader(PageReader pageReader)
{
    // Fix: message now follows the "<name> is null" convention used by the
    // other requireNonNull checks in this file.
    this.pageReader = requireNonNull(pageReader, "pageReader is null");

    DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
    if (dictionaryPage != null) {
        try {
            dictionary = dictionaryPage.getEncoding().initDictionary(columnDescriptor, dictionaryPage);
        }
        catch (IOException e) {
            throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, e);
        }
    }
    else {
        dictionary = null;
    }
    checkArgument(pageReader.getTotalValueCount() > 0, "page is empty");
    totalValueCount = pageReader.getTotalValueCount();
}
// Builds the in-memory dictionary for a column from its dictionary page, choosing the
// dictionary implementation by physical type (BINARY/FIXED_LEN_BYTE_ARRAY/INT96 share
// BinaryDictionary with different fixed lengths). Throws for types that cannot be
// dictionary-encoded.
// NOTE(review): the trailing '},' closes an enclosing anonymous class declared outside this view.
@Override public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage) throws IOException { switch (descriptor.getType()) { case BINARY: return new BinaryDictionary(dictionaryPage); case FIXED_LEN_BYTE_ARRAY: return new BinaryDictionary(dictionaryPage, descriptor.getTypeLength()); case INT96: return new BinaryDictionary(dictionaryPage, INT96_TYPE_LENGTH); case INT64: return new LongDictionary(dictionaryPage); case DOUBLE: return new DoubleDictionary(dictionaryPage); case INT32: return new IntegerDictionary(dictionaryPage); case FLOAT: return new FloatDictionary(dictionaryPage); default: throw new ParquetDecodingException("Dictionary encoding does not support: " + descriptor.getType()); } } },
/**
 * Creates a PLAIN-encoding values reader for the column's physical type.
 * INT96 and FIXED_LEN_BYTE_ARRAY both use the fixed-length reader, differing
 * only in the configured byte length.
 *
 * @throws ParquetDecodingException if the type has no plain reader
 */
@Override
public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType)
{
    switch (descriptor.getType()) {
        case BOOLEAN:
            return new BooleanPlainValuesReader();
        case BINARY:
            return new BinaryPlainValuesReader();
        case FLOAT:
            return new FloatPlainValuesReader();
        case DOUBLE:
            return new DoublePlainValuesReader();
        case INT32:
            return new IntegerPlainValuesReader();
        case INT64:
            return new LongPlainValuesReader();
        case INT96:
            return new FixedLenByteArrayPlainValuesReader(INT96_TYPE_LENGTH);
        case FIXED_LEN_BYTE_ARRAY:
            return new FixedLenByteArrayPlainValuesReader(descriptor.getTypeLength());
        default:
            throw new ParquetDecodingException("Plain values reader does not support: " + descriptor.getType());
    }
}
/**
 * Prepares a V1 data page for reading: initializes the repetition-level,
 * definition-level, and data sections, which are laid out back to back in the
 * page bytes (each reader's getNextOffset() marks where the next section starts).
 *
 * @return the initialized values reader for the data section
 */
private ValuesReader readPageV1(DataPageV1 page)
{
    ValuesReader repetitionLevelReader = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
    ValuesReader definitionLevelReader = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
    repetitionReader = new LevelValuesReader(repetitionLevelReader);
    definitionReader = new LevelValuesReader(definitionLevelReader);
    try {
        byte[] bytes = page.getSlice().getBytes();
        repetitionLevelReader.initFromPage(page.getValueCount(), bytes, 0);
        definitionLevelReader.initFromPage(page.getValueCount(), bytes, repetitionLevelReader.getNextOffset());
        return initDataReader(page.getValueEncoding(), bytes, definitionLevelReader.getNextOffset(), page.getValueCount());
    }
    catch (IOException e) {
        throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e);
    }
}