/**
 * Collects the names of all primitive top-level fields in the given group's schema.
 *
 * @param obj parquet group whose schema is inspected
 * @return set of field names whose type is primitive (non-nested)
 */
@Override
public Set<String> discoverRootFields(Group obj)
{
  return obj.getType()
            .getFields()
            .stream()
            .filter(field -> field.isPrimitive())
            .map(field -> field.getName())
            .collect(Collectors.toSet());
}
@Override public int length(final Object o) { if (o instanceof List) { return ((List) o).size(); } else if (o instanceof Group) { // both lists and maps are 'Group' type, but we should only have a group here in a map context Group g = (Group) o; return g.getType().getFields().size(); } else { return 0; } }
/**
 * Returns the property names of a parquet-derived value.
 *
 * @param o a {@link Map}, a {@link Group}, or {@code null}
 * @return the map's keys rendered as strings, the group's field names, or an empty set for null
 * @throws UnsupportedOperationException for any other type
 */
@Override
public Collection<String> getPropertyKeys(final Object o)
{
  if (o == null) {
    return Collections.emptySet();
  } else if (o instanceof Map) {
    // Bounded wildcard: only keys are read here, so the former unchecked cast to
    // Map<Object, Object> is unnecessary.
    return ((Map<?, ?>) o).keySet().stream().map(String::valueOf).collect(Collectors.toSet());
  } else if (o instanceof Group) {
    return ((Group) o).getType().getFields().stream().map(f -> f.getName()).collect(Collectors.toSet());
  } else {
    throw new UnsupportedOperationException(o.getClass().getName());
  }
}
/**
 * Searches a parquet {@link GroupType} for a field by name, ignoring string case.
 * {@code GroupType#getType(String fieldName)} is case sensitive, hence this helper.
 *
 * @param groupType group of field types in which to search
 * @param fieldName the field name being searched for
 * @return the {@link Type} of the first case-insensitive match; {@code null} if none
 */
private static Type getFieldTypeIgnoreCase(GroupType groupType, String fieldName)
{
  return groupType.getFields()
                  .stream()
                  .filter(type -> type.getName().equalsIgnoreCase(fieldName))
                  .findFirst()
                  .orElse(null);
}
/**
 * Resolves the primitive element type backing {@code type} for the vectorized reader.
 *
 * A primitive type is returned as-is. Otherwise the type is assumed to be a single-field
 * list-style wrapper, and the primitive is taken two group levels down
 * (wrapper -> repeated group -> element). NOTE(review): the double get(0).asGroupType()
 * navigation assumes exactly that standard 3-level list layout — other nestings would
 * throw a ClassCastException rather than the RuntimeException below; confirm upstream
 * schema validation guarantees this shape.
 *
 * @param type the parquet type to unwrap
 * @return the underlying primitive element type
 * @throws RuntimeException if the group has more than one field (nested types unsupported)
 */
private PrimitiveType getElementType(Type type) {
  if (type.isPrimitive()) {
    return type.asPrimitiveType();
  }
  if (type.asGroupType().getFields().size() > 1) {
    throw new RuntimeException(
        "Current Parquet Vectorization reader doesn't support nested type");
  }
  return type.asGroupType().getFields().get(0).asGroupType().getFields().get(0)
      .asPrimitiveType();
}
/**
 * Convert a primitive group field to a "ingestion friendly" java object
 *
 * @return "ingestion ready" java object, or null
 */
@Nullable
private static Object convertPrimitiveField(Group g, int fieldIndex, boolean binaryAsString)
{
  PrimitiveType pt = (PrimitiveType) g.getType().getFields().get(fieldIndex);
  // A repeated primitive with more than one value becomes a list; in every other
  // case we hand back the single converted value at repetition index 0.
  if (pt.isRepetition(Type.Repetition.REPEATED)) {
    int repetitions = g.getFieldRepetitionCount(fieldIndex);
    if (repetitions > 1) {
      List<Object> converted = new ArrayList<>();
      for (int i = 0; i < repetitions; i++) {
        converted.add(convertPrimitiveField(g, fieldIndex, i, binaryAsString));
      }
      return converted;
    }
  }
  return convertPrimitiveField(g, fieldIndex, 0, binaryAsString);
}
/**
 * Checks whether a parquet type is a valid logical 'map' type: an optional/required group
 * annotated MAP or MAP_KEY_VALUE, containing exactly one repeated group of
 * (primitive "key", "value") fields.
 *
 * @param groupType the parquet type to check
 * @return true if the type matches the logical map layout
 */
private static boolean isLogicalMapType(Type groupType)
{
  OriginalType ot = groupType.getOriginalType();
  if (groupType.isPrimitive() || ot == null || groupType.isRepetition(Type.Repetition.REPEATED)) {
    return false;
  }
  // Reuse the already-fetched original type; enum constants compare safely with ==.
  if (ot == OriginalType.MAP || ot == OriginalType.MAP_KEY_VALUE) {
    GroupType myMapType = groupType.asGroupType();
    if (myMapType.getFieldCount() != 1 || myMapType.getFields().get(0).isPrimitive()) {
      return false;
    }
    GroupType mapItemType = myMapType.getFields().get(0).asGroupType();
    return mapItemType.isRepetition(Type.Repetition.REPEATED) &&
           mapItemType.getFieldCount() == 2 &&
           mapItemType.getFields().get(0).getName().equalsIgnoreCase("key") &&
           mapItemType.getFields().get(0).isPrimitive() &&
           mapItemType.getFields().get(1).getName().equalsIgnoreCase("value");
  }
  return false;
}
/**
 * Converts a repeated field into a list of primitives or groups.
 *
 * @param g              the group holding the repeated field
 * @param fieldIndex     index of the repeated field within the group's schema
 * @param binaryAsString whether binary primitives should be decoded as strings
 * @return one entry per repetition: converted primitives, or raw sub-groups
 */
private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, boolean binaryAsString)
{
  Type t = g.getType().getFields().get(fieldIndex);
  // enum constants compare safely with ==
  assert t.getRepetition() == Type.Repetition.REPEATED;
  int repeated = g.getFieldRepetitionCount(fieldIndex);
  // presize: the repetition count is known up front
  List<Object> vals = new ArrayList<>(repeated);
  // t.isPrimitive() is loop-invariant — decide once instead of per iteration
  if (t.isPrimitive()) {
    for (int i = 0; i < repeated; i++) {
      vals.add(convertPrimitiveField(g, fieldIndex, i, binaryAsString));
    }
  } else {
    for (int i = 0; i < repeated; i++) {
      vals.add(g.getGroup(fieldIndex, i));
    }
  }
  return vals;
}
/**
 * Checks whether {@code subtype} is structurally contained in {@code groupType}.
 *
 * Primitive or repeated subtypes must be contained verbatim in the group's field list.
 * For non-repeated group subtypes, a same-named group field is searched and every one of
 * the subtype's fields must recursively be a subtype of it.
 *
 * NOTE(review): a group subtype with ZERO fields can never match — containsAll starts
 * false and the inner loop never runs — so the method falls through to return false.
 * Preserve this quirk if refactoring; callers may rely on it.
 *
 * @param groupType the candidate containing group
 * @param subtype   the type being looked for
 * @return true if subtype is contained as described above
 */
private boolean isSubType(
    final GroupType groupType,
    final Type subtype) {
  if (subtype.isPrimitive() || subtype.isRepetition(Type.Repetition.REPEATED)) {
    // exact match required (Type equality includes name, repetition, and type details)
    return groupType.getFields().contains(subtype);
  } else {
    for (Type g : groupType.getFields()) {
      if (!g.isPrimitive() && g.getName().equals(subtype.getName())) {
        // check all elements are contained in g
        boolean containsAll = false;
        for (Type subSubType : subtype.asGroupType().getFields()) {
          containsAll = isSubType(g.asGroupType(), subSubType);
          if (!containsAll) {
            break;
          }
        }
        if (containsAll) {
          return containsAll;
        }
      }
    }
    return false;
  }
}
if (type instanceof GroupType) { GroupType groupType = type.asGroupType(); List<Type> ts = projectLeafTypes(groupType.getFields(), f.getNodes()); GroupType g = buildProjectedGroupType(groupType, ts); if (g != null) {
boolean isListItemPrimitive = g.getType().getFields().get(0).isPrimitive(); List<Object> vals = new ArrayList<>();
/**
 * Checks whether a parquet type is a valid logical 'list' type: a group annotated LIST
 * containing exactly one repeated field.
 *
 * @param listType the parquet type to check
 * @return true if the type matches the logical list layout
 */
private static boolean isLogicalListType(Type listType)
{
  if (listType.isPrimitive()) {
    return false;
  }
  // Enum identity comparison: '== OriginalType.LIST' is false for a null original type,
  // so the former explicit null check is folded in.
  GroupType group = listType.asGroupType();
  return listType.getOriginalType() == OriginalType.LIST &&
         group.getFieldCount() == 1 &&
         group.getFields().get(0).isRepetition(Type.Repetition.REPEATED);
}
/**
 * Tests whether the given object is a parquet list wrapper around a single primitive:
 * a repeated, non-primitive group with exactly one field, that field being primitive.
 *
 * @param o any value; only {@link Group} instances can match
 * @return true if o is such a single-primitive list wrapper
 */
static boolean isWrappedListPrimitive(Object o)
{
  if (!(o instanceof Group)) {
    return false;
  }
  Group wrapper = (Group) o;
  if (!wrapper.getType().isRepetition(Type.Repetition.REPEATED) || wrapper.getType().isPrimitive()) {
    return false;
  }
  return wrapper.getType().asGroupType().getFieldCount() == 1
      && wrapper.getType().getFields().get(0).isPrimitive();
}
List<VectorizedColumnReader> fieldReaders = new ArrayList<>(); List<TypeInfo> fieldTypes = structTypeInfo.getAllStructFieldTypeInfos(); List<Type> types = type.asGroupType().getFields(); for (int i = 0; i < fieldTypes.size(); i++) { VectorizedColumnReader r = "Failed to get the field types for Map with type " + type); groupType = groupType.getFields().get(0).asGroupType(); nestGroup++; List<Type> kvTypes = groupType.getFields(); VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader( descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion,
/**
 * Initializes this converter for the selected (projected) schema, creating one child
 * converter per selected field resolved against the containing (file) schema.
 *
 * @param selectedGroupType   the projected group schema to convert
 * @param parent              parent converter; its metadata is inherited when non-null
 * @param index               index of this converter within its parent (unused here directly)
 * @param containingGroupType the full group schema that must contain every selected field
 * @param hiveTypeInfo        optional Hive type info; struct field names/types are captured
 *                            when it is a STRUCT
 * @throws IllegalStateException if a selected field is missing from the containing schema
 */
private void init(final GroupType selectedGroupType,
                  final ConverterParent parent,
                  final int index,
                  final GroupType containingGroupType,
                  TypeInfo hiveTypeInfo) {
  if (parent != null) {
    setMetadata(parent.getMetadata());
  }
  final int selectedFieldCount = selectedGroupType.getFieldCount();
  converters = new Converter[selectedFieldCount];
  // diamond operator: element type is inferred from the field declaration
  this.repeatedConverters = new ArrayList<>();

  if (hiveTypeInfo != null && hiveTypeInfo.getCategory().equals(ObjectInspector.Category.STRUCT)) {
    this.hiveFieldNames = ((StructTypeInfo) hiveTypeInfo).getAllStructFieldNames();
    this.hiveFieldTypeInfos = ((StructTypeInfo) hiveTypeInfo).getAllStructFieldTypeInfos();
  }

  List<Type> selectedFields = selectedGroupType.getFields();
  for (int i = 0; i < selectedFieldCount; i++) {
    Type subtype = selectedFields.get(i);
    if (isSubType(containingGroupType, subtype)) {
      int fieldIndex = containingGroupType.getFieldIndex(subtype.getName());
      // renamed from '_hiveTypeInfo' — leading underscore violates Java naming conventions
      TypeInfo fieldHiveTypeInfo = getFieldTypeIgnoreCase(hiveTypeInfo, subtype.getName(), fieldIndex);
      converters[i] = getFieldConverter(subtype, fieldIndex, fieldHiveTypeInfo);
    } else {
      throw new IllegalStateException("Group type [" + containingGroupType +
          "] does not contain requested field: " + subtype);
    }
  }
}
Type fieldType = g.getType().getFields().get(fieldIndex);
private static Object convertPrimitiveField(Group g, int fieldIndex, int index, boolean binaryAsString) PrimitiveType pt = (PrimitiveType) g.getType().getFields().get(fieldIndex); OriginalType ot = pt.getOriginalType();
@Test public void testMapOriginalType() throws Exception { final String hiveColumnTypes = "map<string,string>"; final String hiveColumnNames = "mapCol"; final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames); final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); // this messageType only has one optional field, whose name is mapCol, original Type is MAP assertEquals(1, messageTypeFound.getFieldCount()); org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0); assertEquals("mapCol",topLevel.getName()); assertEquals(OriginalType.MAP, topLevel.getOriginalType()); assertEquals(Repetition.OPTIONAL, topLevel.getRepetition()); assertEquals(1, topLevel.asGroupType().getFieldCount()); org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0); //there is one repeated field for mapCol, the field name is "map" and its original Type is MAP_KEY_VALUE; assertEquals("map", secondLevel.getName()); assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType()); assertEquals(Repetition.REPEATED, secondLevel.getRepetition()); } }
/**
 * Advances to the next parquet row group once every row loaded so far has been returned,
 * rebuilding one VectorizedColumnReader per requested (non-missing) column.
 *
 * @throws IOException if more rows were expected but no further row group exists
 */
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) {
    return;
  }
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.asGroupType().getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  for (int i = 0; i < columns.size(); ++i) {
    // columns flagged missing keep a null reader slot
    if (missingColumns[i]) {
      continue;
    }
    ColumnDescriptor descriptor = columns.get(i);
    columnReaders[i] = new VectorizedColumnReader(
        descriptor,
        types.get(i).getOriginalType(),
        pages.getPageReader(descriptor),
        convertTz);
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
}
/**
 * Advances to the next parquet row group once every row loaded so far has been returned,
 * rebuilding one VectorizedColumnReader per requested (non-missing) column.
 *
 * @throws IOException if more rows were expected but no further row group exists
 */
private void checkEndOfRowGroup() throws IOException {
  // still serving rows from the current row group — nothing to do yet
  if (rowsReturned != totalCountLoadedSoFar) return;
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<Type> types = requestedSchema.asGroupType().getFields();
  columnReaders = new VectorizedColumnReader[columns.size()];
  for (int i = 0; i < columns.size(); ++i) {
    // columns flagged missing keep a null reader slot
    if (missingColumns[i]) continue;
    columnReaders[i] = new VectorizedColumnReader(columns.get(i),
        types.get(i).getOriginalType(), pages.getPageReader(columns.get(i)), convertTz);
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
}