Refine search
/** * check if a parquet type is a valid 'map' type */ private static boolean isLogicalMapType(Type groupType) { OriginalType ot = groupType.getOriginalType(); if (groupType.isPrimitive() || ot == null || groupType.isRepetition(Type.Repetition.REPEATED)) { return false; } if (groupType.getOriginalType().equals(OriginalType.MAP) || groupType.getOriginalType().equals(OriginalType.MAP_KEY_VALUE)) { GroupType myMapType = groupType.asGroupType(); if (myMapType.getFieldCount() != 1 || myMapType.getFields().get(0).isPrimitive()) { return false; } GroupType mapItemType = myMapType.getFields().get(0).asGroupType(); return mapItemType.isRepetition(Type.Repetition.REPEATED) && mapItemType.getFieldCount() == 2 && mapItemType.getFields().get(0).getName().equalsIgnoreCase("key") && mapItemType.getFields().get(0).isPrimitive() && mapItemType.getFields().get(1).getName().equalsIgnoreCase("value"); } return false; }
case STRUCT: List<Type> groupFields = getProjectedGroupFields( fieldType.asGroupType(), ((StructTypeInfo) colType).getAllStructFieldNames(), ((StructTypeInfo) colType).getAllStructFieldTypeInfos() return Types.buildGroup(fieldType.getRepetition()) .addFields(typesArray) .named(fieldType.getName()); case LIST: TypeInfo elemType = ((ListTypeInfo) colType).getListElementTypeInfo(); if (elemType.getCategory() == ObjectInspector.Category.STRUCT) { Type subFieldType = fieldType.asGroupType().getType(0); if (!subFieldType.isPrimitive()) { String subFieldName = subFieldType.getName(); Text name = new Text(subFieldName); if (name.equals(ParquetHiveSerDe.ARRAY) || name.equals(ParquetHiveSerDe.LIST)) { subFieldType = new GroupType(Repetition.REPEATED, subFieldName, getProjectedType(elemType, subFieldType.asGroupType().getType(0))); } else { subFieldType = getProjectedType(elemType, subFieldType);
public ListDataWriter(ListObjectInspector inspector, GroupType groupType) { this.inspector = inspector; // Get the internal array structure GroupType repeatedType = groupType.getType(0).asGroupType(); this.repeatedGroupName = repeatedType.getName(); Type elementType = repeatedType.getType(0); this.elementName = elementType.getName(); ObjectInspector elementInspector = this.inspector.getListElementObjectInspector(); this.elementWriter = createWriter(elementInspector, elementType); }
private static GroupType buildProjectedGroupType( GroupType originalType, List<Type> types) { if (types == null || types.isEmpty()) { return null; } return new GroupType(originalType.getRepetition(), originalType.getName(), types); }
static boolean isWrappedListPrimitive(Object o) { if (o instanceof Group) { Group g = (Group) o; return g.getType().isRepetition(Type.Repetition.REPEATED) && !g.getType().isPrimitive() && g.getType().asGroupType().getFieldCount() == 1 && g.getType().getFields().get(0).isPrimitive(); } return false; }
/** * check if a parquet type is a valid 'list' type */ private static boolean isLogicalListType(Type listType) { return !listType.isPrimitive() && listType.getOriginalType() != null && listType.getOriginalType().equals(OriginalType.LIST) && listType.asGroupType().getFieldCount() == 1 && listType.asGroupType().getFields().get(0).isRepetition(Type.Repetition.REPEATED); }
this.parent = parent; this.avroSchema = avroSchema; int schemaSize = parquetSchema.getFieldCount(); this.converters = new Converter[schemaSize]; this.specificClass = getDatumClass(baseModel, avroSchema); for (Schema.Field field: avroSchema.getFields()) { avroFieldIndexes.put(field.name(), avroFieldIndex++); for (Type parquetField: parquetSchema.getFields()) { Schema.Field avroField = getAvroField(parquetField.getName()); Schema nonNullSchema = AvroSchemaConverter.getNonNull(avroField.schema()); final int finalAvroIndex = avroFieldIndexes.remove(avroField.name()); Schema.Field field = avroSchema.getField(fieldName); if (field.schema().getType() == Schema.Type.NULL) { continue; // skip null since Parquet does not write nulls
public AvroUnionConverter(ParentValueContainer parent, Type parquetSchema, Schema avroSchema, GenericData model) { this.parent = parent; GroupType parquetGroup = parquetSchema.asGroupType(); this.memberConverters = new Converter[ parquetGroup.getFieldCount()]; int parquetIndex = 0; for (int index = 0; index < avroSchema.getTypes().size(); index++) { Schema memberSchema = avroSchema.getTypes().get(index); if (!memberSchema.getType().equals(Schema.Type.NULL)) { Type memberType = parquetGroup.getType(parquetIndex); memberConverters[parquetIndex] = newConverter(memberSchema, memberType, model, new ParentValueContainer() { @Override public void add(Object value) { Preconditions.checkArgument(memberValue==null, "Union is resolving to more than one type"); memberValue = value; } }); parquetIndex++; // Note for nulls the parquetIndex id not increased } } }
private void writeRecordFields(GroupType schema, Schema avroSchema, Object record) { List<Type> fields = schema.getFields(); List<Schema.Field> avroFields = avroSchema.getFields(); int index = 0; // parquet ignores Avro nulls, so index may differ for (int avroIndex = 0; avroIndex < avroFields.size(); avroIndex++) { Schema.Field avroField = avroFields.get(avroIndex); if (avroField.schema().getType().equals(Schema.Type.NULL)) { continue; } Type fieldType = fields.get(index); Object value = model.getField(record, avroField.name(), avroIndex); if (value != null) { recordConsumer.startField(fieldType.getName(), index); writeValue(fieldType, avroField.schema(), value); recordConsumer.endField(fieldType.getName(), index); } else if (fieldType.isRepetition(Type.Repetition.REQUIRED)) { throw new RuntimeException("Null-value for required field: " + avroField.name()); } index++; } }
OriginalType originalType = parquetGroupType.getOriginalType(); if (originalType != null) { switch(originalType) { case LIST: if (parquetGroupType.getFieldCount()!= 1) { throw new UnsupportedOperationException("Invalid list type " + parquetGroupType); Type repeatedType = parquetGroupType.getType(0); if (isElementType(repeatedType, parquetGroupType.getName())) { return Schema.createArray(convertField(repeatedType)); } else { Type elementType = repeatedType.asGroupType().getType(0); if (elementType.isRepetition(Type.Repetition.OPTIONAL)) { return Schema.createArray(optional(convertField(elementType))); } else { return Schema.createArray(convertField(elementType)); if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) { throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType(); if (!mapKeyValType.isRepetition(REPEATED) || mapKeyValType.getFieldCount()!=2) { throw new UnsupportedOperationException("Invalid map type " + parquetGroupType); Type keyType = mapKeyValType.getType(0); if (!keyType.isPrimitive() ||
private void writeUnion(GroupType parquetSchema, Schema avroSchema, Object value) { recordConsumer.startGroup(); // ResolveUnion will tell us which of the union member types to // deserialise. int avroIndex = model.resolveUnion(avroSchema, value); // For parquet's schema we skip nulls GroupType parquetGroup = parquetSchema.asGroupType(); int parquetIndex = avroIndex; for (int i = 0; i < avroIndex; i++) { if (avroSchema.getTypes().get(i).getType().equals(Schema.Type.NULL)) { parquetIndex--; } } // TODO: what if the value is null? // Sparsely populated method of encoding unions, each member has its own // set of columns. String memberName = "member" + parquetIndex; recordConsumer.startField(memberName, parquetIndex); writeValue(parquetGroup.getType(parquetIndex), avroSchema.getTypes().get(avroIndex), value); recordConsumer.endField(memberName, parquetIndex); recordConsumer.endGroup(); }
private Type convertUnion(String fieldName, Schema schema, Type.Repetition repetition) { List<Schema> nonNullSchemas = new ArrayList<Schema>(schema.getTypes().size()); for (Schema childSchema : schema.getTypes()) { if (childSchema.getType().equals(Schema.Type.NULL)) { if (Type.Repetition.REQUIRED == repetition) { repetition = Type.Repetition.OPTIONAL; } } else { nonNullSchemas.add(childSchema); } } // If we only get a null and one other type then its a simple optional field // otherwise construct a union container switch (nonNullSchemas.size()) { case 0: throw new UnsupportedOperationException("Cannot convert Avro union of only nulls"); case 1: return convertField(fieldName, nonNullSchemas.get(0), repetition); default: // complex union type List<Type> unionTypes = new ArrayList<Type>(nonNullSchemas.size()); int index = 0; for (Schema childSchema : nonNullSchemas) { unionTypes.add( convertField("member" + index++, childSchema, Type.Repetition.OPTIONAL)); } return new GroupType(repetition, fieldName, unionTypes); } }
/** * Returns whether the given type is the element type of a list or is a * synthetic group with one field that is the element type. This is * determined by checking whether the type can be a synthetic group and by * checking whether a potential synthetic group matches the expected schema. * <p> * Unlike {@link AvroSchemaConverter#isElementType(Type, String)}, this * method never guesses because the expected schema is known. * * @param repeatedType a type that may be the element type * @param elementSchema the expected Schema for list elements * @return {@code true} if the repeatedType is the element schema */ static boolean isElementType(Type repeatedType, Schema elementSchema) { if (repeatedType.isPrimitive() || repeatedType.asGroupType().getFieldCount() > 1 || repeatedType.asGroupType().getType(0).isRepetition(REPEATED)) { // The repeated type must be the element type because it is an invalid // synthetic wrapper. Must be a group with one optional or required field return true; } else if (elementSchema != null && elementSchema.getType() == Schema.Type.RECORD) { Schema schemaFromRepeated = CONVERTER.convert(repeatedType.asGroupType()); if (checkReaderWriterCompatibility(elementSchema, schemaFromRepeated) .getType() == COMPATIBLE) { return true; } } return false; }
public AvroArrayConverter(ParentValueContainer parent, GroupType type, Schema avroSchema, GenericData model, Class<?> arrayClass) { this.parent = parent; this.avroSchema = avroSchema; Preconditions.checkArgument(arrayClass.isArray(), "Cannot convert non-array: " + arrayClass.getName()); this.elementClass = arrayClass.getComponentType(); ParentValueContainer setter = createSetterAndContainer(); Schema elementSchema = this.avroSchema.getElementType(); Type repeatedType = type.getType(0); // always determine whether the repeated type is the element type by // matching it against the element schema. if (isElementType(repeatedType, elementSchema)) { // the element type is the repeated type (and required) converter = newConverter(elementSchema, repeatedType, model, elementClass, setter); } else { // the element is wrapped in a synthetic group and may be optional converter = new ArrayElementConverter( repeatedType.asGroupType(), elementSchema, model, setter); } }
OriginalType annotation = group.getOriginalType(); if (annotation != null) { switch (annotation) { case LIST: Preconditions.checkArgument(!group.isRepetition(REPEATED), "Invalid list: top-level group is repeated: " + group); Preconditions.checkArgument(group.getFieldCount() == 1, "Invalid list: does not contain single repeated field: " + group); GroupType repeatedElement = group.getFields().get(0).asGroupType(); Preconditions.checkArgument(repeatedElement.isRepetition(REPEATED), "Invalid list: inner group is not repeated"); Preconditions.checkArgument(repeatedElement.getFieldCount() <= 1, "Invalid list: repeated group is not a single field: " + group); visitor.fieldNames.push(repeatedElement.getName()); try { T elementResult = null; if (repeatedElement.getFieldCount() > 0) { elementResult = visitField(repeatedElement.getType(0), visitor); Preconditions.checkArgument(!group.isRepetition(REPEATED), "Invalid map: top-level group is repeated: " + group); Preconditions.checkArgument(group.getFieldCount() == 1, "Invalid map: does not contain single repeated field: " + group); GroupType repeatedKeyValue = group.getType(0).asGroupType(); Preconditions.checkArgument(repeatedKeyValue.isRepetition(REPEATED), "Invalid map: inner group is not repeated");
private <V> void writeMap(GroupType schema, Schema avroSchema, Map<CharSequence, V> map) { GroupType innerGroup = schema.getType(0).asGroupType(); Type keyType = innerGroup.getType(0); Type valueType = innerGroup.getType(1); recordConsumer.startGroup(); // group wrapper (original type MAP) if (map.size() > 0) { recordConsumer.startField(MAP_REPEATED_NAME, 0); for (Map.Entry<CharSequence, V> entry : map.entrySet()) { recordConsumer.startGroup(); // repeated group key_value, middle layer recordConsumer.startField(MAP_KEY_NAME, 0); writeValue(keyType, MAP_KEY_SCHEMA, entry.getKey()); recordConsumer.endField(MAP_KEY_NAME, 0); V value = entry.getValue(); if (value != null) { recordConsumer.startField(MAP_VALUE_NAME, 1); writeValue(valueType, avroSchema.getValueType(), value); recordConsumer.endField(MAP_VALUE_NAME, 1); } else if (!valueType.isRepetition(Type.Repetition.OPTIONAL)) { throw new RuntimeException("Null map value for " + avroSchema.getName()); } recordConsumer.endGroup(); } recordConsumer.endField(MAP_REPEATED_NAME, 0); } recordConsumer.endGroup(); }
@Override public ParquetValueReader<?> list(Types.ListType expectedList, GroupType array, ParquetValueReader<?> elementReader) { GroupType repeated = array.getFields().get(0).asGroupType(); String[] repeatedPath = currentPath(); int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1; int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1; Type elementType = repeated.getType(0); int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1; return new ArrayReader<>(repeatedD, repeatedR, option(elementType, elementD, elementReader)); }
@Test public void testMapOriginalType() throws Exception { final String hiveColumnTypes = "map<string,string>"; final String hiveColumnNames = "mapCol"; final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames); final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); // this messageType only has one optional field, whose name is mapCol, original Type is MAP assertEquals(1, messageTypeFound.getFieldCount()); org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0); assertEquals("mapCol",topLevel.getName()); assertEquals(OriginalType.MAP, topLevel.getOriginalType()); assertEquals(Repetition.OPTIONAL, topLevel.getRepetition()); assertEquals(1, topLevel.asGroupType().getFieldCount()); org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0); //there is one repeated field for mapCol, the field name is "map" and its original Type is MAP_KEY_VALUE; assertEquals("map", secondLevel.getName()); assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType()); assertEquals(Repetition.REPEATED, secondLevel.getRepetition()); } }
private static Type getType(String[] pathSegments, int depth, MessageType schema) { Type type = schema.getType(Arrays.copyOfRange(pathSegments, 0, depth + 1)); if (depth + 1 == pathSegments.length) { return type; } else { Preconditions.checkState(!type.isPrimitive()); return new GroupType(type.getRepetition(), type.getName(), type.getOriginalType(), getType(pathSegments, depth + 1, schema)); } }
private static boolean isElementType(Type repeatedType, String parentName) { if (repeatedType.isPrimitive() || (repeatedType.asGroupType().getFieldCount() != 1)) { return true; } else if (repeatedType.getName().equals("array")) { return true; // existing avro data } else if (repeatedType.getName().equals(parentName + "_tuple")) { return true; // existing thrift data } // false for the following cases: // * name is "list", which matches the spec // * name is "bag", which indicates existing hive or pig data // * ambiguous case, which should be assumed is 3-level according to spec return false; } }