/**
 * Collects the column descriptors whose path component at {@code depth} matches the
 * given type's name.
 *
 * @param depth index into each descriptor's path to compare against
 * @param type the schema node whose name is matched
 * @param columns candidate descriptors to filter
 * @return the descriptors matching the type name at the given depth (possibly empty)
 * @throws InvalidSchemaException if any descriptor's path is shorter than {@code depth + 1}
 */
private List<ColumnDescriptor> getAllColumnDescriptorByType(
    int depth, Type type, List<ColumnDescriptor> columns) throws ParquetRuntimeException {
  String typeName = type.getName();
  List<ColumnDescriptor> matches = new ArrayList<>();
  for (ColumnDescriptor candidate : columns) {
    String[] path = candidate.getPath();
    if (path.length <= depth) {
      throw new InvalidSchemaException("Corrupted Parquet schema");
    }
    if (typeName.equals(path[depth])) {
      matches.add(candidate);
    }
  }
  return matches;
}
/**
 * Wraps the given file path with the LLAP cache-aware filesystem so that the Parquet
 * reader's reads go through the cache. Returns the path unchanged when caching is not
 * possible (no file key or no cache instance).
 *
 * Side effects: registers the cache-aware FS implementation in {@code configuration}
 * and records the wrapped path in {@code this.cacheFsPath}.
 *
 * @param path the original file path
 * @param fileKey identity of the file for cache lookups; {@code null} disables caching
 * @param configuration job conf to register the cache-aware scheme into (mutated)
 * @param blocks row-group metadata used to index the column-chunk byte ranges
 * @param tag diagnostic tag passed through to the cache-aware FS
 * @return the cache-aware path, or the original path when caching is disabled
 */
private Path wrapPathForCache(Path path, Object fileKey, JobConf configuration,
    List<BlockMetaData> blocks, String tag) throws IOException {
  if (fileKey == null || cache == null) {
    return path;
  }
  // Only chunks of projected columns are worth caching.
  HashSet<ColumnPath> includedCols = new HashSet<>();
  for (ColumnDescriptor col : requestedSchema.getColumns()) {
    includedCols.add(ColumnPath.get(col.getPath()));
  }
  // We could make some assumptions given how the reader currently does the work (consecutive
  // chunks, etc.; blocks and columns stored in offset order in the lists), but we won't -
  // just save all the chunk boundaries and lengths for now.
  // Maps chunk start offset -> end offset (exclusive) for every included column chunk.
  TreeMap<Long, Long> chunkIndex = new TreeMap<>();
  for (BlockMetaData block : blocks) {
    for (ColumnChunkMetaData mc : block.getColumns()) {
      if (!includedCols.contains(mc.getPath())) continue;
      chunkIndex.put(mc.getStartingPos(), mc.getStartingPos() + mc.getTotalSize());
    }
  }
  // Register the cache-aware path so that Parquet reader would go thru it.
  configuration.set("fs." + LlapCacheAwareFs.SCHEME + ".impl",
      LlapCacheAwareFs.class.getCanonicalName());
  path = LlapCacheAwareFs.registerFile(cache, path, fileKey, chunkIndex, configuration, tag);
  // Remembered so the file can be unregistered later — see the field's other uses.
  this.cacheFsPath = path;
  return path;
}
/**
 * Helper function to construct exception for parquet schema mismatch: reports the
 * Parquet column path, its physical type, and the catalog type it could not convert to.
 */
private SchemaColumnConvertNotSupportedException constructConvertNotSupportedException(
    ColumnDescriptor descriptor, WritableColumnVector column) {
  String columnPath = Arrays.toString(descriptor.getPath());
  String physicalType = descriptor.getPrimitiveType().getPrimitiveTypeName().toString();
  String catalogType = column.dataType().catalogString();
  return new SchemaColumnConvertNotSupportedException(columnPath, physicalType, catalogType);
}
/**
 * Helper function to construct exception for parquet schema mismatch.
 * The message combines the Parquet column path, the on-disk primitive type, and
 * the requested catalog type.
 */
private SchemaColumnConvertNotSupportedException constructConvertNotSupportedException(
    ColumnDescriptor descriptor, WritableColumnVector column) {
  final String parquetType =
      descriptor.getPrimitiveType().getPrimitiveTypeName().toString();
  return new SchemaColumnConvertNotSupportedException(
      Arrays.toString(descriptor.getPath()),
      parquetType,
      column.dataType().catalogString());
}
/** Renders one line per column: its path and the bytes currently buffered by its writer. */
@Override
public String toString() {
  StringBuilder out = new StringBuilder();
  for (Entry<ColumnDescriptor, ColumnWriterV2> entry : columns.entrySet()) {
    out.append(Arrays.toString(entry.getKey().getPath()))
        .append(": ")
        .append(entry.getValue().getTotalBufferedSize())
        .append(" bytes")
        .append("\n");
  }
  return out.toString();
}
/** One line per column: path followed by its writer's in-memory buffered size. */
@Override
public String toString() {
  StringBuilder report = new StringBuilder();
  for (Entry<ColumnDescriptor, ColumnWriterV1> column : columns.entrySet()) {
    report.append(Arrays.toString(column.getKey().getPath()))
        .append(": ")
        .append(column.getValue().getBufferedSizeInMemory())
        .append(" bytes")
        .append("\n");
  }
  return report.toString();
}
/** Summarizes every column writer as "path: N bytes", one per line. */
@Override
public String toString() {
  StringBuilder summary = new StringBuilder();
  for (Entry<ColumnDescriptor, ColumnWriterBase> e : columns.entrySet()) {
    summary.append(Arrays.toString(e.getKey().getPath()))
        .append(": ")
        .append(e.getValue().getTotalBufferedSize())
        .append(" bytes")
        .append("\n");
  }
  return summary.toString();
}
/**
 * Replaces the current projection: clears the path-to-descriptor index and rebuilds it
 * from the given schema's columns.
 */
public void setRequestedSchema(MessageType projection) {
  paths.clear();
  projection.getColumns().forEach(
      descriptor -> paths.put(ColumnPath.get(descriptor.getPath()), descriptor));
}
/**
 * Finds the reader whose descriptor path equals the filter path and binds the
 * predicate to it.
 *
 * @throws IllegalArgumentException if no reader matches the filter's column path
 */
@Override
public RecordFilter bind(Iterable<ColumnReader> readers) {
  for (ColumnReader candidate : readers) {
    boolean sameColumn = Arrays.equals(candidate.getDescriptor().getPath(), filterPath);
    if (sameColumn) {
      return new ColumnRecordFilter(candidate, predicate);
    }
  }
  throw new IllegalArgumentException("Column " + columnPath + " does not exist.");
}
};
/** Resets the column-path index so it reflects exactly the given projected schema. */
public void setRequestedSchema(MessageType projection) {
  paths.clear();
  for (ColumnDescriptor projectedColumn : projection.getColumns()) {
    ColumnPath key = ColumnPath.get(projectedColumn.getPath());
    paths.put(key, projectedColumn);
  }
}
/**
 * Binds this filter's predicate to the reader whose column path matches
 * {@code filterPath}; fails if the column is absent from the readers.
 */
@Override
public RecordFilter bind(Iterable<ColumnReader> readers) {
  for (ColumnReader reader : readers) {
    if (!Arrays.equals(reader.getDescriptor().getPath(), filterPath)) {
      continue;
    }
    return new ColumnRecordFilter(reader, predicate);
  }
  throw new IllegalArgumentException("Column " + columnPath + " does not exist.");
}
};
/** Indexes every column descriptor of the schema by its column path for later lookup. */
private SchemaCompatibilityValidator(MessageType schema) {
  for (ColumnDescriptor descriptor : schema.getColumns()) {
    columnsAccordingToSchema.put(ColumnPath.get(descriptor.getPath()), descriptor);
  }
}
/**
 * Reads the page data for one column out of the given row group.
 *
 * @param blockIndex index of the row group within {@code blocks}
 * @param columnDescriptor the column to read
 * @return a page reader for the column's chunk, or empty if the row group has no
 *     chunk for that column path
 * @throws RuntimeException if the row group reports zero rows
 */
Optional<PageReader> readColumnInBlock(int blockIndex, ColumnDescriptor columnDescriptor) {
  BlockMetaData block = blocks.get(blockIndex);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  return findColumnByPath(block, columnDescriptor.getPath())
      .map(column -> readChunk(
          f,
          new ChunkDescriptor(
              columnDescriptor, column, column.getStartingPos(), (int) column.getTotalSize())));
}
/**
 * Records, for each leaf column, the comparator of its primitive type keyed by the
 * column's path.
 */
public IncrementallyUpdatedFilterPredicateBuilderBase(List<PrimitiveColumnIO> leaves) {
  for (PrimitiveColumnIO leaf : leaves) {
    ColumnDescriptor descriptor = leaf.getColumnDescriptor();
    comparatorsByColumn.put(
        ColumnPath.get(descriptor.getPath()),
        descriptor.getPrimitiveType().comparator());
  }
}
/**
 * Builds a descriptor that copies the path and level limits of the given descriptor
 * while taking the type name and length from the given primitive type, and keeps the
 * full {@link PrimitiveType} available alongside the base descriptor fields.
 *
 * @param descriptor source of the path and max repetition/definition levels
 * @param primitiveType source of the physical type, type length, and repetition
 */
public RichColumnDescriptor(
    ColumnDescriptor descriptor,
    PrimitiveType primitiveType) {
  super(descriptor.getPath(),
      primitiveType.getPrimitiveTypeName(),
      primitiveType.getTypeLength(),
      descriptor.getMaxRepetitionLevel(),
      descriptor.getMaxDefinitionLevel());
  this.primitiveType = primitiveType;
  // Any repetition other than OPTIONAL (i.e. REQUIRED or REPEATED) counts as required here.
  this.required = primitiveType.getRepetition() != OPTIONAL;
}
/**
 * Finds the chunk metadata for the given column within the current row group.
 *
 * @param columnDescriptor the column to look up
 * @return the matching chunk metadata
 * @throws ParquetCorruptionException if the current block has no chunk for the column
 */
private ColumnChunkMetaData getColumnChunkMetaData(ColumnDescriptor columnDescriptor)
    throws IOException {
  // Hoisted out of the loop: the target path is loop-invariant, and the original
  // rebuilt it once per chunk.
  ColumnPath targetPath = ColumnPath.get(columnDescriptor.getPath());
  for (ColumnChunkMetaData metadata : currentBlockMetadata.getColumns()) {
    if (metadata.getPath().equals(targetPath)) {
      return metadata;
    }
  }
  throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
/**
 * Walks the schema and converter trees in lock-step along the column's path and
 * returns the primitive converter at the leaf.
 */
private PrimitiveConverter getPrimitiveConverter(ColumnDescriptor path) {
  Type node = schema;
  Converter converter = recordConverter;
  for (String fieldName : path.getPath()) {
    final GroupType group = node.asGroupType();
    converter = converter.asGroupConverter().getConverter(group.getFieldIndex(fieldName));
    node = group.getType(fieldName);
  }
  return converter.asPrimitiveConverter();
}
/**
 * Descends the converter tree following the column's path; each path element selects
 * the child converter matching that field. The leaf must be a primitive converter.
 */
private PrimitiveConverter getPrimitiveConverter(ColumnDescriptor path) {
  Converter cursor = recordConverter;
  Type schemaNode = schema;
  for (String fieldName : path.getPath()) {
    final GroupType asGroup = schemaNode.asGroupType();
    final int childIndex = asGroup.getFieldIndex(fieldName);
    cursor = cursor.asGroupConverter().getConverter(childIndex);
    schemaNode = asGroup.getType(fieldName);
  }
  return cursor.asPrimitiveConverter();
}
/**
 * Opens the file, reads its footer, filters its row groups, and indexes the schema's
 * columns by path. The stream opened here ({@code f}) stays open for subsequent reads.
 *
 * @param file the Parquet file to read
 * @param options read options (also configure the metadata converter)
 * @throws IOException if opening the stream or reading the footer fails
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  // Opened before the footer read below; ownership stays with this reader.
  this.f = file.newStream();
  this.options = options;
  this.footer = readFooter(file, options, f, converter);
  this.fileMetaData = footer.getFileMetaData();
  // Row groups are filtered up front (e.g. by the options' filters) — see filterRowGroups.
  this.blocks = filterRowGroups(footer.getBlocks());
  // Index every column descriptor by its path for chunk lookups.
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
/**
 * Resolves this column's Drill type from the Parquet schema element and builds the
 * materialized field. Mutates {@code se}, {@code type}, {@code field}, and
 * {@code length} in order; {@code length} must be computed last since
 * getDataTypeLength presumably reads the fields set above — TODO confirm.
 *
 * @param schemaElements Parquet schema elements keyed by full column path
 * @param options session options consulted during the type conversion
 */
public void resolveDrillType(Map<String, SchemaElement> schemaElements, OptionManager options) {
  se = schemaElements.get(ParquetReaderUtility.getFullColumnPath(column));
  type = ParquetToDrillTypeConverter.toMajorType(column.getType(), column.getTypeLength(),
      getDataMode(column), se, options);
  // The field name is the last segment of the column's path.
  field = MaterializedField.create(
      toFieldName(column.getPath()).getLastSegment().getNameSegment().getPath(), type);
  length = getDataTypeLength();
}