private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }
    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: " +
          Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
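// A minimal, hypothetical sketch (demo class and schemas invented for illustration) of the
// missing-column check above: a requested path absent from the file schema is detected with
// MessageType.containsPath, and only nullable columns (max definition level > 0) may be missing.
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MissingColumnDemo {
  public static void main(String[] args) {
    // The file lacks the requested optional "age" column.
    MessageType fileSchema = MessageTypeParser.parseMessageType(
      "message file { required int32 id; }");
    MessageType requestedSchema = MessageTypeParser.parseMessageType(
      "message requested { required int32 id; optional int32 age; }");
    for (String[] path : requestedSchema.getPaths()) {
      // Prints: id missing=false, age missing=true
      System.out.println(String.join(".", path) + " missing=" + !fileSchema.containsPath(path));
    }
  }
}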
public List<String[]> getPaths() {
  return this.getPaths(0);
}
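// A minimal usage sketch (hypothetical demo class) of what getPaths() returns: one
// String[] per leaf column, in schema order, with nested leaves carrying their full path.
import java.util.Arrays;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class PathsDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional group user { optional binary name (UTF8); } }");
    // Prints [id] and [user, name].
    for (String[] path : schema.getPaths()) {
      System.out.println(Arrays.toString(path));
    }
  }
}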
public static Predicate buildPredicate(
    MessageType requestedSchema,
    TupleDomain<ColumnDescriptor> parquetTupleDomain,
    Map<List<String>, RichColumnDescriptor> descriptorsByPath) {
  ImmutableList.Builder<RichColumnDescriptor> columnReferences = ImmutableList.builder();
  for (String[] paths : requestedSchema.getPaths()) {
    RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(paths));
    if (descriptor != null) {
      columnReferences.add(descriptor);
    }
  }
  return new TupleDomainParquetPredicate(parquetTupleDomain, columnReferences.build());
}
public static Map<List<String>, RichColumnDescriptor> getDescriptors(MessageType fileSchema, MessageType requestedSchema) {
  Map<List<String>, RichColumnDescriptor> descriptorsByPath = new HashMap<>();
  List<PrimitiveColumnIO> columns = getColumns(fileSchema, requestedSchema);
  for (String[] paths : fileSchema.getPaths()) {
    List<String> columnPath = Arrays.asList(paths);
    getDescriptor(columns, columnPath)
        .ifPresent(richColumnDescriptor -> descriptorsByPath.put(columnPath, richColumnDescriptor));
  }
  return descriptorsByPath;
}
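// RichColumnDescriptor, getColumns, and getDescriptor above are Presto-specific helpers.
// A hypothetical sketch of the same path-keyed lookup using only plain parquet-mr types:
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class DescriptorLookupDemo {
  public static void main(String[] args) {
    MessageType fileSchema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional binary name (UTF8); }");
    // Arrays.asList supplies a List<String> key; raw String[] arrays don't hash by content.
    Map<List<String>, ColumnDescriptor> descriptorsByPath = new HashMap<>();
    for (String[] path : fileSchema.getPaths()) {
      descriptorsByPath.put(Arrays.asList(path), fileSchema.getColumnDescription(path));
    }
    System.out.println(descriptorsByPath.keySet());
  }
}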
private List<ColumnOrder> getColumnOrders(MessageType schema) {
  List<ColumnOrder> columnOrders = new ArrayList<>();
  // Currently, only TypeDefinedOrder is supported, so we create a column order for each column with
  // TypeDefinedOrder even if some types (e.g. INT96) have undefined column orders.
  for (int i = 0, n = schema.getPaths().size(); i < n; ++i) {
    ColumnOrder columnOrder = new ColumnOrder();
    columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER);
    columnOrders.add(columnOrder);
  }
  return columnOrders;
}
public List<ColumnDescriptor> getColumns() {
  List<String[]> paths = this.getPaths(0);
  List<ColumnDescriptor> columns = new ArrayList<>(paths.size());
  for (String[] path : paths) {
    // TODO: optimize this
    PrimitiveType primitiveType = getType(path).asPrimitiveType();
    columns.add(new ColumnDescriptor(
        path,
        primitiveType,
        getMaxRepetitionLevel(path),
        getMaxDefinitionLevel(path)));
  }
  return columns;
}
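// A hypothetical sketch of getColumns() output: each leaf becomes a ColumnDescriptor whose
// max repetition/definition levels are derived from its optional and repeated ancestors.
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnsDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional group user { optional binary name (UTF8); } }");
    for (ColumnDescriptor column : schema.getColumns()) {
      // user.name sits under an optional group and is itself optional, so its max definition level is 2.
      System.out.println(column + " maxRep=" + column.getMaxRepetitionLevel()
        + " maxDef=" + column.getMaxDefinitionLevel());
    }
  }
}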
private static byte[] roughGuessTypes(MessageType messageType) {
  byte[] types = new byte[messageType.getPaths().size()];
  for (int i = 0; i < types.length; i++) {
    Type parquetType = messageType.getType(i);
    assert parquetType.isPrimitive();
    switch (parquetType.asPrimitiveType().getPrimitiveTypeName()) {
      case INT32:
      case BOOLEAN:
      case FLOAT:
      case DOUBLE:
        types[i] = Vec.T_NUM;
        break;
      case INT96:
        types[i] = Vec.T_TIME;
        break;
      case INT64:
        types[i] = OriginalType.TIMESTAMP_MILLIS.equals(parquetType.getOriginalType()) ? Vec.T_TIME : Vec.T_NUM;
        break;
      default:
        types[i] = Vec.T_BAD;
    }
  }
  return types;
}
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  // Variant of the method above: uses the raw record count rather than the filtered one.
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
private static void checkCompatibility(ParquetMetadata metadata) {
  // make sure we can map Parquet blocks to Chunks
  for (BlockMetaData block : metadata.getBlocks()) {
    if (block.getRowCount() > Integer.MAX_VALUE) {
      IcedHashMapGeneric.IcedHashMapStringObject dbg = new IcedHashMapGeneric.IcedHashMapStringObject();
      dbg.put("startingPos", block.getStartingPos());
      dbg.put("rowCount", block.getRowCount());
      // we map each block to a single H2O Chunk
      throw new H2OUnsupportedDataFileException("Unsupported Parquet file (technical limitation).",
          "Current implementation doesn't support Parquet files with blocks larger than " +
              Integer.MAX_VALUE + " rows.", dbg);
    }
  }
  // check that the file doesn't have nested structures
  MessageType schema = metadata.getFileMetaData().getSchema();
  for (String[] path : schema.getPaths()) {
    if (path.length != 1) {
      throw new H2OUnsupportedDataFileException("Parquet files with nested structures are not supported.",
          "Detected a column with a nested structure " + Arrays.asList(path));
    }
  }
}
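// A hypothetical sketch of the nested-structure test above: a leaf inside a group has a
// path longer than one element, which is exactly what the path.length != 1 check catches.
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class NestedCheckDemo {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
      "message demo { required int32 id; optional group user { optional binary name (UTF8); } }");
    // Prints: id nested=false, user.name nested=true
    for (String[] path : schema.getPaths()) {
      System.out.println(String.join(".", path) + " nested=" + (path.length != 1));
    }
  }
}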
public void initialize(FileMetaData parquetFileMetadata, Path file, List<BlockMetaData> blocks,
                       Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  // The file schema must be assigned before it is handed to ReadSupport.init below,
  // otherwise the InitContext would see a null schema.
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}