/**
 * Converts a repeated field of the given group into a list of values.
 * Primitive repetitions are converted via {@code convertPrimitiveField};
 * group repetitions are returned as the raw {@link Group} instances.
 *
 * @param g the group containing the repeated field
 * @param fieldIndex index of the repeated field within the group's schema
 * @param binaryAsString whether binary primitives should be rendered as strings
 * @return one list entry per repetition, in order
 */
private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, boolean binaryAsString) {
  Type t = g.getType().getFields().get(fieldIndex);
  // isRepetition(...) for consistency with the other schema checks in this file
  // (previously t.getRepetition().equals(...)).
  assert t.isRepetition(Type.Repetition.REPEATED);
  int repeated = g.getFieldRepetitionCount(fieldIndex);
  // Presize: the repetition count is known up front.
  List<Object> vals = new ArrayList<>(repeated);
  for (int i = 0; i < repeated; i++) {
    if (t.isPrimitive()) {
      vals.add(convertPrimitiveField(g, fieldIndex, i, binaryAsString));
    } else {
      vals.add(g.getGroup(fieldIndex, i));
    }
  }
  return vals;
}
private static boolean isElementType(Type repeatedType, String parentName) { if (repeatedType.isPrimitive() || (repeatedType.asGroupType().getFieldCount() != 1)) { return true; } else if (repeatedType.getName().equals("array")) { return true; // existing avro data } else if (repeatedType.getName().equals(parentName + "_tuple")) { return true; // existing thrift data } // false for the following cases: // * name is "list", which matches the spec // * name is "bag", which indicates existing hive or pig data // * ambiguous case, which should be assumed is 3-level according to spec return false; } }
/**
 * Resolves the primitive element type of the given (possibly list-wrapped) type.
 * A primitive is returned directly; otherwise the standard 3-level list layout is
 * assumed: group -> repeated group -> primitive element.
 * NOTE(review): a 2-level list whose single child is primitive would fail the inner
 * asGroupType() cast here — confirm callers only pass 3-level lists.
 *
 * @throws RuntimeException if the group has more than one field (nested types unsupported)
 */
private PrimitiveType getElementType(Type type) {
  if (type.isPrimitive()) {
    return type.asPrimitiveType();
  }
  GroupType listGroup = type.asGroupType();
  if (listGroup.getFields().size() > 1) {
    throw new RuntimeException(
        "Current Parquet Vectorization reader doesn't support nested type");
  }
  // Descend two levels: LIST wrapper -> repeated group -> element.
  Type repeatedGroup = listGroup.getFields().get(0);
  return repeatedGroup.asGroupType().getFields().get(0).asPrimitiveType();
}
/**
 * Checks whether a parquet type is a valid logical 'map' type: a non-repeated group
 * annotated MAP or MAP_KEY_VALUE, wrapping exactly one repeated group of two fields —
 * a primitive named "key" and a field named "value" (name match is case-insensitive).
 *
 * @param groupType the type to inspect
 * @return true if the type matches the logical map layout
 */
private static boolean isLogicalMapType(Type groupType) {
  OriginalType ot = groupType.getOriginalType();
  if (groupType.isPrimitive() || ot == null || groupType.isRepetition(Type.Repetition.REPEATED)) {
    return false;
  }
  // Reuse the cached annotation instead of calling getOriginalType() twice more.
  if (ot.equals(OriginalType.MAP) || ot.equals(OriginalType.MAP_KEY_VALUE)) {
    GroupType myMapType = groupType.asGroupType();
    if (myMapType.getFieldCount() != 1 || myMapType.getFields().get(0).isPrimitive()) {
      return false;
    }
    GroupType mapItemType = myMapType.getFields().get(0).asGroupType();
    return mapItemType.isRepetition(Type.Repetition.REPEATED)
        && mapItemType.getFieldCount() == 2
        && mapItemType.getFields().get(0).getName().equalsIgnoreCase("key")
        && mapItemType.getFields().get(0).isPrimitive()
        && mapItemType.getFields().get(1).getName().equalsIgnoreCase("value");
  }
  return false;
}
/**
 * Dispatches to the primitive- or group-typed converter factory overload based on
 * the kind of the given type. Returns null when no type is supplied.
 */
protected static Converter getConverterFromDescription(Type type, int index, ConverterParent parent, TypeInfo hiveTypeInfo) {
  if (type == null) {
    return null;
  }
  return type.isPrimitive()
      ? getConverterFromDescription(type.asPrimitiveType(), index, parent, hiveTypeInfo)
      : getConverterFromDescription(type.asGroupType(), index, parent, hiveTypeInfo);
}
schemaTypes.add(schema.getType(i)); } else { if (t.isPrimitive()) {
boolean isListItemPrimitive = g.getType().getFields().get(0).isPrimitive(); List<Object> vals = new ArrayList<>();
/**
 * Checks whether a parquet type is a valid logical 'list' type: a group annotated
 * LIST whose single child field is repeated.
 */
private static boolean isLogicalListType(Type listType) {
  if (listType.isPrimitive()) {
    return false;
  }
  OriginalType annotation = listType.getOriginalType();
  if (annotation == null || !annotation.equals(OriginalType.LIST)) {
    return false;
  }
  GroupType listGroup = listType.asGroupType();
  return listGroup.getFieldCount() == 1
      && listGroup.getFields().get(0).isRepetition(Type.Repetition.REPEATED);
}
private boolean isSubType( final GroupType groupType, final Type subtype) { if (subtype.isPrimitive() || subtype.isRepetition(Type.Repetition.REPEATED)) { return groupType.getFields().contains(subtype); } else { for (Type g : groupType.getFields()) { if (!g.isPrimitive() && g.getName().equals(subtype.getName())) { // check all elements are contained in g boolean containsAll = false; for (Type subSubType : subtype.asGroupType().getFields()) { containsAll = isSubType(g.asGroupType(), subSubType); if (!containsAll) { break; } } if (containsAll) { return containsAll; } } } return false; } }
/**
 * Returns true when the object is a repeated single-field group whose only field is
 * primitive — i.e. the "wrapped" list-of-primitives encoding.
 */
static boolean isWrappedListPrimitive(Object o) {
  if (!(o instanceof Group)) {
    return false;
  }
  Group wrapper = (Group) o;
  return wrapper.getType().isRepetition(Type.Repetition.REPEATED)
      && !wrapper.getType().isPrimitive()
      && wrapper.getType().asGroupType().getFieldCount() == 1
      && wrapper.getType().getFields().get(0).isPrimitive();
}
/**
 * Builds the converter for the field at {@code fieldIndex}. Non-repeated fields use
 * the generic factory; repeated fields get a dedicated Repeated* converter which is
 * also registered in {@code repeatedConverters}.
 */
private Converter getFieldConverter(Type type, int fieldIndex, TypeInfo hiveTypeInfo) {
  if (!type.isRepetition(Type.Repetition.REPEATED)) {
    return getConverterFromDescription(type, fieldIndex, this, hiveTypeInfo);
  }
  Converter converter;
  if (type.isPrimitive()) {
    converter = new Repeated.RepeatedPrimitiveConverter(
        type.asPrimitiveType(), this, fieldIndex, hiveTypeInfo);
  } else {
    // A repeated group is treated as a list; unwrap the element type info if present.
    TypeInfo elementInfo =
        hiveTypeInfo == null ? null : ((ListTypeInfo) hiveTypeInfo).getListElementTypeInfo();
    converter = new Repeated.RepeatedGroupConverter(
        type.asGroupType(), this, fieldIndex, elementInfo);
  }
  repeatedConverters.add((Repeated) converter);
  return converter;
}
/**
 * Prints the given group (one row of a Parquet file) to stdout, recursing into
 * nested groups. Each primitive value is printed as "fieldName value"; a blank
 * line terminates the group.
 *
 * @param g The given group.
 */
private static void printGroup(Group g) {
  int fieldCnt = g.getType().getFieldCount();
  for (int field = 0; field < fieldCnt; field++) {
    int valCnt = g.getFieldRepetitionCount(field);
    Type fieldType = g.getType().getType(field);
    String fieldName = fieldType.getName();
    for (int idx = 0; idx < valCnt; idx++) {
      // Braces added: unbraced single-statement if/else invites maintenance bugs.
      if (fieldType.isPrimitive()) {
        System.out.println(fieldName + " " + g.getValueToString(field, idx));
      } else {
        printGroup(g.getGroup(field, idx));
      }
    }
  }
  System.out.println();
}
if (fieldType.isPrimitive()) {
if (elemType.getCategory() == ObjectInspector.Category.STRUCT) { Type subFieldType = fieldType.asGroupType().getType(0); if (!subFieldType.isPrimitive()) { String subFieldName = subFieldType.getName(); Text name = new Text(subFieldName);
if (type.isPrimitive()) { checkInspectorCategory(inspector, ObjectInspector.Category.PRIMITIVE); PrimitiveObjectInspector primitiveInspector = (PrimitiveObjectInspector)inspector;
private void initializeInternal() throws IOException, UnsupportedOperationException { // Check that the requested schema is supported. missingColumns = new boolean[requestedSchema.getFieldCount()]; List<ColumnDescriptor> columns = requestedSchema.getColumns(); List<String[]> paths = requestedSchema.getPaths(); for (int i = 0; i < requestedSchema.getFieldCount(); ++i) { Type t = requestedSchema.getFields().get(i); if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) { throw new UnsupportedOperationException("Complex types not supported."); } String[] colPath = paths.get(i); if (fileSchema.containsPath(colPath)) { ColumnDescriptor fd = fileSchema.getColumnDescription(colPath); if (!fd.equals(columns.get(i))) { throw new UnsupportedOperationException("Schema evolution not supported."); } missingColumns[i] = false; } else { if (columns.get(i).getMaxDefinitionLevel() == 0) { // Column is missing in data but the required data is non-nullable. This file is invalid. throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath)); } missingColumns[i] = true; } } }
private void initializeInternal() throws IOException, UnsupportedOperationException { // Check that the requested schema is supported. missingColumns = new boolean[requestedSchema.getFieldCount()]; List<ColumnDescriptor> columns = requestedSchema.getColumns(); List<String[]> paths = requestedSchema.getPaths(); for (int i = 0; i < requestedSchema.getFieldCount(); ++i) { Type t = requestedSchema.getFields().get(i); if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) { throw new UnsupportedOperationException("Complex types not supported."); } String[] colPath = paths.get(i); if (fileSchema.containsPath(colPath)) { ColumnDescriptor fd = fileSchema.getColumnDescription(colPath); if (!fd.equals(columns.get(i))) { throw new UnsupportedOperationException("Schema evolution not supported."); } missingColumns[i] = false; } else { if (columns.get(i).getMaxDefinitionLevel() == 0) { // Column is missing in data but the required data is non-nullable. This file is invalid. throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath)); } missingColumns[i] = true; } } }
/**
 * Checks that the requested schema is supported and fills {@code missingColumns}:
 * an entry is true when that requested column is absent from the file schema but
 * nullable, and false when the file provides a matching column descriptor.
 *
 * @throws UnsupportedOperationException if a requested column is a group or repeated
 *         type, or if the file's column descriptor differs from the requested one
 * @throws IOException if a required (max definition level 0) column is missing
 */
private void initializeInternal() throws IOException, UnsupportedOperationException {
  // Check that the requested schema is supported.
  missingColumns = new boolean[requestedSchema.getFieldCount()];
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  List<String[]> paths = requestedSchema.getPaths();
  for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
    Type t = requestedSchema.getFields().get(i);
    // Only flat, non-repeated primitive columns are readable by this path.
    if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
      throw new UnsupportedOperationException("Complex types not supported.");
    }
    String[] colPath = paths.get(i);
    if (fileSchema.containsPath(colPath)) {
      // The file must expose exactly the descriptor the request expects.
      ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
      if (!fd.equals(columns.get(i))) {
        throw new UnsupportedOperationException("Schema evolution not supported.");
      }
      missingColumns[i] = false;
    } else {
      if (columns.get(i).getMaxDefinitionLevel() == 0) {
        // Column is missing in data but the required data is non-nullable. This file is invalid.
        throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath));
      }
      missingColumns[i] = true;
    }
  }
}
/**
 * Walks the schema along {@code path} starting at {@code depth} and returns the
 * OriginalType annotation of the leaf primitive type reached.
 */
private OriginalType getOriginalType(Type type, String[] path, int depth) {
  // Primitive leaf: the walk is done.
  if (type.isPrimitive()) {
    return type.getOriginalType();
  }
  GroupType group = (GroupType) type;
  return getOriginalType(group.getType(path[depth]), path, depth + 1);
}
/**
 * Rebuilds a single-branch projection of {@code schema} along {@code pathSegments}:
 * each intermediate level is reproduced with the same repetition, name, and
 * annotation, containing only the subtree for the next path segment.
 *
 * @param pathSegments full path to the target field
 * @param depth current level being resolved (0-based)
 * @param schema the source message schema
 * @return the projected type for the path suffix starting at {@code depth}
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  Type type = schema.getType(Arrays.copyOfRange(pathSegments, 0, depth + 1));
  if (depth + 1 == pathSegments.length) {
    return type;
  } else {
    // A failure message was added so a bad path reports what went wrong
    // instead of a bare IllegalStateException.
    Preconditions.checkState(!type.isPrimitive(),
        "Expected a group type at depth " + depth + " for path "
            + Arrays.toString(pathSegments));
    return new GroupType(type.getRepetition(), type.getName(), type.getOriginalType(),
        getType(pathSegments, depth + 1, schema));
  }
}