/**
 * Checks whether a parquet type has the shape of a logical 'list' type.
 *
 * <p>The type qualifies when it is not primitive, carries the
 * {@code OriginalType.LIST} annotation, has exactly one field, and that
 * single field is repeated.
 *
 * @param listType the parquet type to inspect
 * @return {@code true} when every condition above holds
 */
private static boolean isLogicalListType(Type listType) {
  if (listType.isPrimitive()) {
    return false;
  }
  OriginalType annotation = listType.getOriginalType();
  if (annotation == null || !annotation.equals(OriginalType.LIST)) {
    return false;
  }
  if (listType.asGroupType().getFieldCount() != 1) {
    return false;
  }
  // The only child must be the repeated wrapper.
  return listType.asGroupType().getFields().get(0).isRepetition(Type.Repetition.REPEATED);
}
/**
 * Checks whether a parquet type has the shape of a logical 'map' type.
 *
 * <p>The type qualifies when it is a non-repeated group annotated with
 * {@code OriginalType.MAP} or {@code OriginalType.MAP_KEY_VALUE} whose only
 * child is a repeated group of exactly two fields: a primitive field named
 * "key" followed by a field named "value" (names compared ignoring case).
 *
 * @param groupType the parquet type to inspect
 * @return {@code true} when the type matches the layout described above
 */
private static boolean isLogicalMapType(Type groupType) {
  OriginalType annotation = groupType.getOriginalType();
  if (groupType.isPrimitive()
      || annotation == null
      || groupType.isRepetition(Type.Repetition.REPEATED)) {
    return false;
  }
  boolean annotatedAsMap =
      annotation.equals(OriginalType.MAP) || annotation.equals(OriginalType.MAP_KEY_VALUE);
  if (!annotatedAsMap) {
    return false;
  }

  GroupType mapGroup = groupType.asGroupType();
  // Check the field count before touching the first child.
  if (mapGroup.getFieldCount() != 1 || mapGroup.getFields().get(0).isPrimitive()) {
    return false;
  }

  GroupType entryGroup = mapGroup.getFields().get(0).asGroupType();
  if (!entryGroup.isRepetition(Type.Repetition.REPEATED) || entryGroup.getFieldCount() != 2) {
    return false;
  }
  Type keyField = entryGroup.getFields().get(0);
  Type valueField = entryGroup.getFields().get(1);
  return keyField.getName().equalsIgnoreCase("key")
      && keyField.isPrimitive()
      && valueField.getName().equalsIgnoreCase("value");
}
private void readTimestamp(int total, TimestampColumnVector c, int rowId) throws IOException { int left = total; while (left > 0) { readRepetitionAndDefinitionLevels(); if (definitionLevel >= maxDefLevel) { switch (descriptor.getType()) { //INT64 is not yet supported case INT96: c.set(rowId, dataColumn.readTimestamp().toSqlTimestamp()); break; default: throw new IOException( "Unsupported parquet logical type: " + type.getOriginalType() + " for timestamp"); } c.isNull[rowId] = false; c.isRepeating = c.isRepeating && ((c.time[0] == c.time[rowId]) && (c.nanos[0] == c.nanos[rowId])); } else { setNullValue(c, rowId); } rowId++; left--; } }
public static void testConversion( final String columnNamesStr, final String columnsTypeStr, final String actualSchema) throws Exception { final List<String> columnNames = createHiveColumnsFrom(columnNamesStr); final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(columnsTypeStr); final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); final MessageType expectedMT = MessageTypeParser.parseMessageType(actualSchema); assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + actualSchema, expectedMT, messageTypeFound); // Required to check the original types manually as PrimitiveType.equals does not care about it List<Type> expectedFields = expectedMT.getFields(); List<Type> actualFields = messageTypeFound.getFields(); for (int i = 0, n = expectedFields.size(); i < n; ++i) { OriginalType exp = expectedFields.get(i).getOriginalType(); OriginalType act = actualFields.get(i).getOriginalType(); assertEquals("Original types of the field do not match", exp, act); } } }
@Test public void testMapOriginalType() throws Exception { final String hiveColumnTypes = "map<string,string>"; final String hiveColumnNames = "mapCol"; final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames); final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); // this messageType only has one optional field, whose name is mapCol, original Type is MAP assertEquals(1, messageTypeFound.getFieldCount()); org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0); assertEquals("mapCol",topLevel.getName()); assertEquals(OriginalType.MAP, topLevel.getOriginalType()); assertEquals(Repetition.OPTIONAL, topLevel.getRepetition()); assertEquals(1, topLevel.asGroupType().getFieldCount()); org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0); //there is one repeated field for mapCol, the field name is "map" and its original Type is MAP_KEY_VALUE; assertEquals("map", secondLevel.getName()); assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType()); assertEquals(Repetition.REPEATED, secondLevel.getRepetition()); } }
// Read the original-type annotation attached to `type`.
// NOTE(review): presumably null when the type carries no annotation — confirm before dereferencing.
OriginalType originalType = type.getOriginalType();
/**
 * Loads the next row group once every row loaded so far has been returned.
 *
 * <p>Does nothing while {@code rowsReturned} differs from
 * {@code totalCountLoadedSoFar}. Otherwise it reads the next row group,
 * rebuilds {@code columnReaders} for every column not flagged in
 * {@code missingColumns}, and adds the group's row count to
 * {@code totalCountLoadedSoFar}.
 *
 * @throws IOException if the reader has no further row group
 */
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) {
    return;
  }

  final PageReadStore rowGroup = reader.readNextRowGroup();
  if (rowGroup == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }

  final List<ColumnDescriptor> descriptors = requestedSchema.getColumns();
  // NOTE(review): fields are indexed with the column-descriptor index below;
  // presumably the two lists line up only for flat schemas — confirm.
  final List<Type> fieldTypes = requestedSchema.asGroupType().getFields();
  final int columnCount = descriptors.size();
  columnReaders = new VectorizedColumnReader[columnCount];
  for (int col = 0; col < columnCount; col++) {
    if (!missingColumns[col]) {
      final ColumnDescriptor descriptor = descriptors.get(col);
      columnReaders[col] = new VectorizedColumnReader(
          descriptor,
          fieldTypes.get(col).getOriginalType(),
          rowGroup.getPageReader(descriptor),
          convertTz);
    }
  }
  totalCountLoadedSoFar += rowGroup.getRowCount();
}
}
/**
 * Advances to the next row group when the rows loaded so far are exhausted.
 *
 * <p>Returns immediately unless {@code rowsReturned} equals
 * {@code totalCountLoadedSoFar}. On advance, a fresh
 * {@code VectorizedColumnReader} is created for each column that is not
 * marked missing, and {@code totalCountLoadedSoFar} grows by the new group's
 * row count.
 *
 * @throws IOException if no further row group is available
 */
private void checkEndOfRowGroup() throws IOException {
  final boolean currentGroupExhausted = rowsReturned == totalCountLoadedSoFar;
  if (!currentGroupExhausted) {
    return;
  }

  PageReadStore nextGroup = reader.readNextRowGroup();
  if (nextGroup == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }

  List<ColumnDescriptor> requestedColumns = requestedSchema.getColumns();
  // NOTE(review): the column index is reused to look up the field list;
  // presumably valid only when fields and columns align one-to-one — confirm.
  List<Type> requestedFields = requestedSchema.asGroupType().getFields();
  columnReaders = new VectorizedColumnReader[requestedColumns.size()];
  int position = 0;
  while (position < requestedColumns.size()) {
    if (!missingColumns[position]) {
      columnReaders[position] = new VectorizedColumnReader(
          requestedColumns.get(position),
          requestedFields.get(position).getOriginalType(),
          nextGroup.getPageReader(requestedColumns.get(position)),
          convertTz);
    }
    position++;
  }
  totalCountLoadedSoFar += nextGroup.getRowCount();
}
}
/**
 * Maps a parquet field type to a {@code ColumnMappingType}.
 *
 * <p>When the type has no original-type annotation, the decision is based on
 * the primitive type name: BOOLEAN maps to Boolean, FLOAT/DOUBLE and the INT
 * family map to Number, INT96 maps to Date. A group type without an
 * annotation has no primitive name and maps to Unknown.
 *
 * <p>When an annotation is present, DATE maps to Date, TIMESTAMP_MILLIS to
 * DateTime, names starting with "UTF" to String, and DECIMAL plus the
 * INT_* / UINT_* annotations to Number.
 *
 * @param type parquet field type to classify
 * @return the matching mapping type, or {@code ColumnMappingType.Unknown}
 */
private ColumnMappingType getColumnType(Type type) {
  OriginalType originalType = type.getOriginalType();
  if (originalType == null) {
    // An unannotated group is not a primitive; asPrimitiveType() would throw
    // ClassCastException for it, so report it as Unknown instead.
    if (!type.isPrimitive()) {
      return ColumnMappingType.Unknown;
    }
    String primitiveType = type.asPrimitiveType().getPrimitiveTypeName().toString();
    if (primitiveType.equals("BOOLEAN")) {
      return ColumnMappingType.Boolean;
    }
    // FLOAT is numeric in the same way DOUBLE is.
    if (primitiveType.equals("FLOAT") || primitiveType.equals("DOUBLE")) {
      return ColumnMappingType.Number;
    }
    // INT96 must be tested before the generic INT prefix below.
    if (primitiveType.equals("INT96")) {
      return ColumnMappingType.Date;
    }
    if (primitiveType.startsWith("INT")) {
      return ColumnMappingType.Number;
    }
    return ColumnMappingType.Unknown;
  }

  String original = originalType.toString();
  if (original.equals("DATE")) {
    return ColumnMappingType.Date;
  }
  if (original.equals("TIMESTAMP_MILLIS")) {
    return ColumnMappingType.DateTime;
  }
  if (original.startsWith("UTF")) {
    return ColumnMappingType.String;
  }
  // DECIMAL and the (U)INT_n names are original-type annotations, not
  // primitive type names, so they are matched here rather than above.
  if (original.equals("DECIMAL")
      || original.startsWith("INT_")
      || original.startsWith("UINT_")) {
    return ColumnMappingType.Number;
  }
  return ColumnMappingType.Unknown;
}
/**
 * Walks down from {@code type} along {@code path}, starting at index
 * {@code depth}, until a primitive type is reached, and returns that
 * primitive's original type.
 *
 * @param type type to start from
 * @param path field names to follow; {@code path[depth]} is resolved first
 * @param depth index into {@code path} of the next field to descend into
 * @return the original type of the primitive reached
 */
private OriginalType getOriginalType(Type type, String[] path, int depth) {
  Type current = type;
  for (int level = depth; !current.isPrimitive(); level++) {
    current = ((GroupType) current).getType(path[level]);
  }
  return current.getOriginalType();
}
/**
 * Builds the type for the path {@code pathSegments[0..depth]} within
 * {@code schema}.
 *
 * <p>At the last segment the schema's own type is returned. For any earlier
 * segment a new {@code GroupType} is created with the same repetition, name
 * and original type, containing only the child built for the next segment.
 *
 * @param pathSegments full path of field names
 * @param depth index of the segment being resolved
 * @param schema schema in which the path is looked up
 * @return the type at the last segment, or a single-child group wrapping the deeper levels
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  final int nextDepth = depth + 1;
  final Type current = schema.getType(Arrays.copyOfRange(pathSegments, 0, nextDepth));
  if (nextDepth == pathSegments.length) {
    return current;
  }

  // Only a group can have the remaining segments beneath it.
  Preconditions.checkState(!current.isPrimitive());
  final Type onlyChild = getType(pathSegments, nextDepth, schema);
  return new GroupType(
      current.getRepetition(), current.getName(), current.getOriginalType(), onlyChild);
}
/**
 * Creates one converter per top-level field of {@code schema}, keyed by the
 * field's position.
 *
 * <p>Primitive fields get a {@code PrimitiveConverter}; all other fields get
 * a {@code BypassGroupConverter}. Each converter receives a one-element
 * column path holding the field's name.
 *
 * @param schema message schema whose fields need converters
 * @return map from field index to its converter
 */
private Map<Integer, Converter> buildFieldToConverter(final MessageType schema) {
  final Map<Integer, Converter> converters = new HashMap<>(fieldCount);
  int position = 0;
  for (final Type field : schema.getFields()) {
    final String[] fieldPath = {field.getName()};
    final Converter converter;
    if (field.isPrimitive()) {
      final String javaTypeName =
          field.asPrimitiveType().getPrimitiveTypeName().javaType.getSimpleName();
      converter = new PrimitiveConverter(
          parquetColumnToObject, javaTypeName, fieldPath, field.getOriginalType());
    } else {
      converter = new BypassGroupConverter(
          parquetColumnToObject, field.asGroupType(), fieldPath);
    }
    converters.put(position, converter);
    position++;
  }
  return converters;
}
/** * Converts {@link ColumnDescriptor} to {@link SchemaPath} and converts any parquet LOGICAL LIST to something * the execution engine can understand (removes the extra 'list' and 'element' fields from the name) */ private static SchemaPath convertColumnDescriptor(final MessageType schema, final ColumnDescriptor columnDescriptor) { List<String> path = Lists.newArrayList(columnDescriptor.getPath()); // go through the path and find all logical lists int index = 0; Type type = schema; while (!type.isPrimitive()) { // don't bother checking the last element in the path as it is a primitive type type = type.asGroupType().getType(path.get(index)); if (type.getOriginalType() == OriginalType.LIST && LogicalListL1Converter.isSupportedSchema(type.asGroupType())) { // remove 'list' type = type.asGroupType().getType(path.get(index+1)); path.remove(index+1); // remove 'element' type = type.asGroupType().getType(path.get(index+1)); path.remove(index+1); } index++; } String[] schemaColDesc = new String[path.size()]; path.toArray(schemaColDesc); return SchemaPath.getCompoundPath(schemaColDesc); }
/**
 * {@inheritDoc}
 *
 * <p>A group equals another type only when the other type is not primitive,
 * the superclass comparison succeeds, both carry the same original type, and
 * their field lists are equal.
 */
@Override
protected boolean equals(Type otherType) {
  if (otherType.isPrimitive()) {
    return false;
  }
  if (!super.equals(otherType)) {
    return false;
  }
  if (getOriginalType() != otherType.getOriginalType()) {
    return false;
  }
  return getFields().equals(otherType.asGroupType().getFields());
}
/**
 * Creates one converter per field of {@code schema}, keyed by the field's
 * position.
 *
 * <p>Each converter receives its own column path: a copy of
 * {@code columnPath} with the field's name appended. Primitive fields get a
 * {@code PrimitiveConverter}; all other fields get a
 * {@code BypassGroupConverter}.
 *
 * @param schema group whose fields need converters
 * @return map from field index to its converter
 */
private Map<Integer, Converter> buildFieldToConverter(final GroupType schema) {
  final Map<Integer, Converter> converters = new HashMap<>(fieldCount);
  final int parentDepth = columnPath.length;
  int position = 0;
  for (final Type field : schema.getFields()) {
    // A fresh array per field: parent path followed by this field's name.
    final String[] fieldPath = new String[parentDepth + 1];
    System.arraycopy(columnPath, 0, fieldPath, 0, parentDepth);
    fieldPath[parentDepth] = field.getName();

    final Converter converter;
    if (field.isPrimitive()) {
      final String javaTypeName =
          field.asPrimitiveType().getPrimitiveTypeName().javaType.getSimpleName();
      converter = new PrimitiveConverter(
          parquetColumnToObject, javaTypeName, fieldPath, field.getOriginalType());
    } else {
      converter = new BypassGroupConverter(
          parquetColumnToObject, field.asGroupType(), fieldPath);
    }
    converters.put(position, converter);
    position++;
  }
  return converters;
}
/**
 * Merges {@code toMerge} into this group and returns the resulting group.
 *
 * <p>The result takes its repetition and original type from {@code toMerge},
 * its name and id from this group, and its fields from {@code mergeFields}.
 *
 * @param toMerge type to merge in; must be a group
 * @param strict not read by this method — NOTE(review): it is not forwarded
 *     to {@code mergeFields}; confirm whether that is intended
 * @return the merged group type
 * @throws IncompatibleSchemaModificationException if {@code toMerge} is primitive
 */
@Override
protected Type union(Type toMerge, boolean strict) {
  if (!toMerge.isPrimitive()) {
    return new GroupType(
        toMerge.getRepetition(),
        getName(),
        toMerge.getOriginalType(),
        mergeFields(toMerge.asGroupType()),
        getId());
  }
  throw new IncompatibleSchemaModificationException("can not merge primitive type " + toMerge + " into group type " + this);
}
/** * Checks if the schema is similar to the following: * <pre> * optional group <name> (LIST) { * repeated group <list-name> { * <element-repetition> <element-type> <element-name>; * } * } * </pre> * * @param schema parquet group type * @return true is supported */ static boolean isSupportedSchema(GroupType schema) { if (schema.getFieldCount() == 1) { Type type = schema.getType(0); // check: repeated group if (type.isPrimitive() || !type.isRepetition(REPEATED) || type.getOriginalType() != null) { return false; } return type.asGroupType().getFieldCount() == 1; } return false; }
/**
 * Reads the parquet schema found under {@code tableDirPath} and asserts that
 * every column whose configured type is a numeric SQL type (per
 * {@code isNumericSqlType}) has the DECIMAL original type.
 *
 * <p>The i-th configured type is checked against the i-th schema field.
 */
private void verifyParquetSchema() {
  MessageType parquetSchema = new ParquetReader(tableDirPath).readParquetSchema();
  String[] sqlTypes = configuration.getTypes();

  for (int column = 0; column < sqlTypes.length; column++) {
    if (!isNumericSqlType(sqlTypes[column])) {
      continue;
    }
    assertEquals(OriginalType.DECIMAL, parquetSchema.getFields().get(column).getOriginalType());
  }
}
/**
 * Descends from {@code type} along {@code path}, starting at index
 * {@code depth}, until a primitive type is reached, and describes that
 * primitive.
 *
 * <p>The result carries the primitive's original type, its decimal precision
 * and scale (both 0 when it has no decimal metadata), and the maximum
 * repetition and definition levels that {@code schema} reports for
 * {@code path}.
 *
 * @param schema schema used to compute the max repetition/definition levels
 * @param type type to start from
 * @param path field names to follow; {@code path[depth]} is resolved first
 * @param depth index into {@code path} of the next field to descend into
 * @return the column type info of the primitive reached
 */
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
  Type leaf = type;
  for (int level = depth; !leaf.isPrimitive(); level++) {
    leaf = ((GroupType) leaf).getType(path[level]);
  }

  PrimitiveType primitive = (PrimitiveType) leaf;
  boolean hasDecimalMetadata = primitive.getDecimalMetadata() != null;
  int precision = hasDecimalMetadata ? primitive.getDecimalMetadata().getPrecision() : 0;
  int scale = hasDecimalMetadata ? primitive.getDecimalMetadata().getScale() : 0;

  int maxRepetition = schema.getMaxRepetitionLevel(path);
  int maxDefinition = schema.getMaxDefinitionLevel(path);
  return new ColTypeInfo(leaf.getOriginalType(), precision, scale, maxRepetition, maxDefinition);
}
/**
 * Returns a copy of {@code childType} renamed to 'element', replacing the
 * list's inner '$data$' vector name in the schema.
 *
 * <p>A group keeps its repetition, original type and fields. A primitive
 * keeps its repetition, primitive type name, type length, original type and
 * decimal metadata.
 *
 * @param childType the list's child type
 * @return an equivalent type named "element"
 */
private Type renameChildTypeToElement(Type childType) {
  if (!childType.isPrimitive()) {
    return new GroupType(
        childType.getRepetition(),
        "element",
        childType.getOriginalType(),
        childType.asGroupType().getFields());
  }

  PrimitiveType primitive = childType.asPrimitiveType();
  return new PrimitiveType(
      childType.getRepetition(),
      primitive.getPrimitiveTypeName(),
      primitive.getTypeLength(),
      "element",
      primitive.getOriginalType(),
      primitive.getDecimalMetadata(),
      null);
}