public static parquet.schema.Type getParquetTypeByName(String columnName, MessageType messageType) { if (messageType.containsField(columnName)) { return messageType.getType(columnName); } // parquet is case-sensitive, but hive is not. all hive columns get converted to lowercase // check for direct match above but if no match found, try case-insensitive match for (parquet.schema.Type type : messageType.getFields()) { if (type.getName().equalsIgnoreCase(columnName)) { return type; } } return null; }
/**
 * Finds the Parquet type for a Hive column, either by (case-insensitive) name
 * or by the column's ordinal position in the file schema.
 *
 * @param column the Hive column handle to resolve
 * @param messageType the Parquet file schema
 * @param useParquetColumnNames when {@code true}, match by name; otherwise by Hive column index
 * @return the resolved Parquet type, or {@code null} when the column cannot be found
 */
public static parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) {
    if (useParquetColumnNames) {
        return getParquetTypeByName(column.getName(), messageType);
    }
    int index = column.getHiveColumnIndex();
    // Out-of-range index means the file predates this column; report it as absent.
    return index < messageType.getFieldCount() ? messageType.getType(index) : null;
}
}
.toArray(String[]::new); ColumnPath columnPath = ColumnPath.get(path); PrimitiveTypeName primitiveTypeName = messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(); ColumnChunkMetaData column = ColumnChunkMetaData.get( columnPath,
/**
 * Returns the type of the field addressed by {@code path}, resolving the path
 * starting from this group (field index 0 of the lookup).
 *
 * @param path field names leading from this group down to the requested field
 * @return the type found at the given path
 */
public Type getType(String ... path) { return getType(path, 0); }
/**
 * Returns the type of the field addressed by {@code path}, resolving the path
 * starting from this group (field index 0 of the lookup).
 *
 * @param path field names leading from this group down to the requested field
 * @return the type found at the given path
 */
public Type getType(String ... path) { return getType(path, 0); }
private static List<Mapping> computeMappingByPosition( DataModelDescriptor target, MessageType source) { if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format( "Mapping columns by their position: model={0}", //$NON-NLS-1$ target.getDataModelClass().getName())); } List<ColumnDescriptor> sources = source.getColumns(); List<? extends PropertyDescriptor> targets = target.getPropertyDescriptors(); List<Mapping> mappings = new ArrayList<>(); int limit = Math.min(sources.size(), targets.size()); for (int i = 0; i < limit; i++) { ColumnDescriptor s = sources.get(i); Type sType = source.getType(s.getPath()); PropertyDescriptor t = targets.get(i); mappings.add(new Mapping(s, sType, t)); } for (int i = limit, n = sources.size(); i < n; i++) { ColumnDescriptor s = sources.get(i); Type sType = source.getType(s.getPath()); mappings.add(new Mapping(s, sType, null)); } for (int i = limit, n = targets.size(); i < n; i++) { mappings.add(new Mapping(null, null, targets.get(i))); } return mappings; }
private static parquet.schema.Type getParquetTypeByName(String columnName, MessageType messageType) { if (messageType.containsField(columnName)) { return messageType.getType(columnName); } // parquet is case-sensitive, but hive is not. all hive columns get converted to lowercase // check for direct match above but if no match found, try case-insensitive match for (Type type : messageType.getFields()) { if (type.getName().equalsIgnoreCase(columnName)) { return type; } } return null; } }
public List<ColumnDescriptor> getColumns() { List<String[]> paths = this.getPaths(0); List<ColumnDescriptor> columns = new ArrayList<ColumnDescriptor>(paths.size()); for (String[] path : paths) { // TODO: optimize this columns.add(new ColumnDescriptor(path, getType(path).asPrimitiveType().getPrimitiveTypeName(), getMaxRepetitionLevel(path), getMaxDefinitionLevel(path))); } return columns; }
/**
 * Indexes every leaf column of the schema by its path so that predicate column
 * references can later be checked against both the declared primitive type and
 * the original (logical) type, when one is present.
 *
 * @param schema the file schema whose columns are indexed
 */
private SchemaCompatibilityValidator(MessageType schema) {
    for (ColumnDescriptor descriptor : schema.getColumns()) {
        ColumnPath path = ColumnPath.get(descriptor.getPath());
        columnsAccordingToSchema.put(path, descriptor);
        // Only record the original type when the column actually declares one.
        OriginalType originalType = schema.getType(descriptor.getPath()).getOriginalType();
        if (originalType != null) {
            originalTypes.put(path, originalType);
        }
    }
}
/** * Searchs column names by index on a given Parquet file schema, and returns its corresponded * Parquet schema types. * * @param schema Message schema where to search for column names. * @param colNames List of column names. * @param colIndexes List of column indexes. * @return A MessageType object of the column names found. */ private static MessageType getSchemaByIndex(MessageType schema, List<String> colNames, List<Integer> colIndexes) { List<Type> schemaTypes = new ArrayList<Type>(); for (Integer i : colIndexes) { if (i < colNames.size()) { if (i < schema.getFieldCount()) { schemaTypes.add(schema.getType(i)); } else { //prefixing with '_mask_' to ensure no conflict with named //columns in the file schema schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i))); } } } return new MessageType(schema.getName(), schemaTypes); }
private static List<Mapping> computeMappingByName( DataModelDescriptor target, MessageType source) { if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format( "Mapping columns by their name: model={0}", //$NON-NLS-1$ target.getDataModelClass().getName())); } Set<PropertyDescriptor> rest = new LinkedHashSet<>(target.getPropertyDescriptors()); List<Mapping> mappings = new ArrayList<>(); for (ColumnDescriptor s : source.getColumns()) { String name = s.getPath()[0]; Type sType = source.getType(s.getPath()); PropertyDescriptor t = target.findPropertyDescriptor(name); if (t != null) { mappings.add(new Mapping(s, sType, t)); rest.remove(t); } else { mappings.add(new Mapping(s, sType, null)); } } for (PropertyDescriptor t : rest) { mappings.add(new Mapping(null, null, t)); } return mappings; }
public List<ColumnDescriptor> getColumns() { List<String[]> paths = this.getPaths(0); List<ColumnDescriptor> columns = new ArrayList<ColumnDescriptor>(paths.size()); for (String[] path : paths) { // TODO: optimize this PrimitiveType primitiveType = getType(path).asPrimitiveType(); columns.add(new ColumnDescriptor( path, primitiveType.getPrimitiveTypeName(), primitiveType.getTypeLength(), getMaxRepetitionLevel(path), getMaxDefinitionLevel(path))); } return columns; }
/**
 * Intersects a Parquet file schema with the requested Cascading fields,
 * keeping only the top-level fields whose names were requested. The result is
 * exposed as both the matched {@code sourceFields} and a pruned
 * {@code requestedSchema} for reading.
 *
 * @param fileSchema the full schema of the Parquet file
 * @param requestedFields the fields the caller asked for; {@code UNKNOWN} means all
 */
public SchemaIntersection(MessageType fileSchema, Fields requestedFields) {
    if (requestedFields == Fields.UNKNOWN) {
        requestedFields = Fields.ALL;
    }
    Fields matchedFields = Fields.NONE;
    List<Type> keptTypes = new ArrayList<Type>();
    for (int index = 0, count = fileSchema.getFieldCount(); index < count; index++) {
        Type fieldType = fileSchema.getType(index);
        Fields fieldName = new Fields(fieldType.getName());
        if (requestedFields.contains(fieldName)) {
            matchedFields = matchedFields.append(fieldName);
            keptTypes.add(fieldType);
        }
    }
    this.sourceFields = matchedFields;
    this.requestedSchema = new MessageType(fileSchema.getName(), keptTypes);
}
/**
 * Finds the Parquet type for a Hive column, either by (case-insensitive) name
 * or by the column's ordinal position in the file schema.
 *
 * @param column the Hive column handle to resolve
 * @param messageType the Parquet file schema
 * @param useParquetColumnNames when {@code true}, match by name; otherwise by Hive column index
 * @return the resolved Parquet type, or {@code null} when the column cannot be found
 */
public static parquet.schema.Type getParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames) {
    if (useParquetColumnNames) {
        return getParquetTypeByName(column.getName(), messageType);
    }
    int fieldIndex = column.getHiveColumnIndex();
    if (fieldIndex >= messageType.getFieldCount()) {
        // The file predates this column; report it as absent.
        return null;
    }
    return messageType.getType(fieldIndex);
}
/**
 * Describes the leaf column addressed by {@code path}: its primitive type and
 * its maximum repetition and definition levels within this schema.
 *
 * @param path field names from the root down to the leaf column
 * @return the descriptor for that leaf column
 */
public ColumnDescriptor getColumnDescription(String[] path) {
    int maxRepetitionLevel = getMaxRepetitionLevel(path);
    int maxDefinitionLevel = getMaxDefinitionLevel(path);
    PrimitiveTypeName leafTypeName = getType(path).asPrimitiveType().getPrimitiveTypeName();
    return new ColumnDescriptor(path, leafTypeName, maxRepetitionLevel, maxDefinitionLevel);
}
/**
 * Describes the leaf column addressed by {@code path}: its primitive type,
 * fixed type length, and its maximum repetition and definition levels within
 * this schema.
 *
 * @param path field names from the root down to the leaf column
 * @return the descriptor for that leaf column
 */
public ColumnDescriptor getColumnDescription(String[] path) {
    int maxRepetitionLevel = getMaxRepetitionLevel(path);
    int maxDefinitionLevel = getMaxDefinitionLevel(path);
    PrimitiveType leafType = getType(path).asPrimitiveType();
    return new ColumnDescriptor(
            path,
            leafType.getPrimitiveTypeName(),
            leafType.getTypeLength(),
            maxRepetitionLevel,
            maxDefinitionLevel);
}
ColumnChunkMetaData column = ColumnChunkMetaData.get( columnPath, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(), CompressionCodecName.fromParquet(metaData.codec), readEncodings(metaData.encodings), readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()), metaData.data_page_offset, metaData.dictionary_page_offset,
ColumnChunkMetaData column = ColumnChunkMetaData.get( path, messageType.getType(path.toArray()).asPrimitiveType().getPrimitiveTypeName(), CompressionCodecName.fromParquet(metaData.codec), fromFormatEncodings(metaData.encodings), fromParquetStatistics(metaData.statistics, messageType.getType(path.toArray()).asPrimitiveType().getPrimitiveTypeName()), metaData.data_page_offset, metaData.dictionary_page_offset,
@Test
public void testNestedTypes() {
    // Round-trip the Dremel paper schema through its string form before probing it.
    MessageType schema = MessageTypeParser.parseMessageType(Paper.schema.toString());

    // Nested lookup by path segments resolves the Links.Backward leaf.
    Type backward = schema.getType("Links", "Backward");
    assertEquals(PrimitiveTypeName.INT64, backward.asPrimitiveType().getPrimitiveTypeName());

    // Max repetition levels at increasing nesting depth.
    assertEquals(0, schema.getMaxRepetitionLevel("DocId"));
    assertEquals(1, schema.getMaxRepetitionLevel("Name"));
    assertEquals(2, schema.getMaxRepetitionLevel("Name", "Language"));

    // Max definition levels at increasing nesting depth.
    assertEquals(0, schema.getMaxDefinitionLevel("DocId"));
    assertEquals(1, schema.getMaxDefinitionLevel("Links"));
    assertEquals(2, schema.getMaxDefinitionLevel("Links", "Backward"));
}
}