@Override
public Map<Integer, Accessor<InternalRow>> struct(
    Types.StructType struct, List<Map<Integer, Accessor<InternalRow>>> fieldResults) {
  // Builds a map from field id to an accessor that extracts that field's value
  // from an InternalRow representing this struct.
  Map<Integer, Accessor<InternalRow>> accessors = Maps.newHashMap();
  List<Types.NestedField> fields = struct.fields();
  for (int i = 0; i < fieldResults.size(); i += 1) {
    Types.NestedField field = fields.get(i);
    Map<Integer, Accessor<InternalRow>> result = fieldResults.get(i);
    if (result != null) {
      // A non-null child result carries accessors for a nested struct; the
      // asStructType() call implies this field must be a struct here.
      // Wrap each nested accessor so it first navigates into position i.
      for (Map.Entry<Integer, Accessor<InternalRow>> entry : result.entrySet()) {
        accessors.put(entry.getKey(),
            newAccessor(i, field.isOptional(), field.type().asNestedType().asStructType(),
                entry.getValue()));
      }
    } else {
      // Leaf field: a direct accessor that reads position i.
      accessors.put(field.fieldId(), newAccessor(i, field.type()));
    }
  }

  // NOTE(review): returns null (not an empty map) when nothing was collected —
  // callers appear to treat null as "no accessors"; confirm before changing.
  if (accessors.isEmpty()) {
    return null;
  }

  return accessors;
}
/**
 * Applies column additions registered at the table root to the visited schema result.
 */
@Override
public Type schema(Schema schema, Type structResult) {
  Collection<Types.NestedField> rootAdditions = adds.get(TABLE_ROOT_ID);
  if (rootAdditions == null) {
    // no top-level columns to add; pass the visited struct through unchanged
    return structResult;
  }
  return addFields(structResult.asNestedType().asStructType(), rootAdditions);
}
/** * Used by Avro reflection to instantiate this class when reading manifest files. */ public PartitionData(Schema schema) { this.partitionType = AvroSchemaUtil.convert(schema).asNestedType().asStructType(); this.size = partitionType.fields().size(); this.data = new Object[size]; this.stringSchema = schema.toString(); this.schema = schema; }
switch (type.typeId()) {
  case STRUCT:
    // Visit every field of the struct; results are presumably collected
    // further down (the rest of this case is outside the visible chunk).
    Types.StructType struct = type.asNestedType().asStructType();
    List<VisitFieldFuture<T>> results = Lists
        .newArrayListWithExpectedSize(struct.fields().size());
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 *
 * <p>Builds a position mapping from the (possibly projected) Avro read schema to the
 * canonical {@code PartitionFieldSummary} schema by matching field ids.
 *
 * @param avroSchema the Avro schema this summary was read with
 * @throws IllegalArgumentException if a projected field id is not in the canonical schema
 */
public GenericPartitionFieldSummary(Schema avroSchema) {
  this.avroSchema = avroSchema;

  List<Types.NestedField> fields = AvroSchemaUtil.convert(avroSchema)
      .asNestedType()
      .asStructType()
      .fields();
  List<Types.NestedField> allFields = PartitionFieldSummary.getType().fields();

  this.fromProjectionPos = new int[fields.size()];
  for (int i = 0; i < fromProjectionPos.length; i += 1) {
    boolean found = false;
    for (int j = 0; j < allFields.size(); j += 1) {
      if (fields.get(i).fieldId() == allFields.get(j).fieldId()) {
        found = true;
        fromProjectionPos[i] = j;
        break; // field ids are unique; stop scanning once matched
      }
    }

    if (!found) {
      throw new IllegalArgumentException("Cannot find projected field: " + fields.get(i));
    }
  }
}
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 *
 * <p>Resolves the partition struct type (if projected) and builds a position mapping
 * from the projected Avro read schema to the expected {@code DataFile} schema by
 * matching field ids.
 *
 * @param avroSchema the Avro schema the data file entry was read with
 * @throws IllegalArgumentException if a projected field id is not in the expected schema
 */
public GenericDataFile(org.apache.avro.Schema avroSchema) {
  this.avroSchema = avroSchema;

  Types.StructType schema = AvroSchemaUtil.convert(avroSchema).asNestedType().asStructType();

  // partition type may be null if the field was not projected
  Type partType = schema.fieldType("partition");
  if (partType != null) {
    this.partitionType = partType.asNestedType().asStructType();
  } else {
    this.partitionType = EMPTY_STRUCT_TYPE;
  }

  List<Types.NestedField> fields = schema.fields();
  List<Types.NestedField> allFields = DataFile.getType(partitionType).fields();

  this.fromProjectionPos = new int[fields.size()];
  for (int i = 0; i < fromProjectionPos.length; i += 1) {
    boolean found = false;
    for (int j = 0; j < allFields.size(); j += 1) {
      if (fields.get(i).fieldId() == allFields.get(j).fieldId()) {
        found = true;
        fromProjectionPos[i] = j;
        break; // field ids are unique; stop scanning once matched
      }
    }

    if (!found) {
      throw new IllegalArgumentException("Cannot find projected field: " + fields.get(i));
    }
  }

  this.partitionData = new PartitionData(partitionType);
}
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 *
 * <p>Builds a position mapping from the (possibly projected) Avro read schema to the
 * canonical {@code ManifestFile} schema by matching field ids.
 *
 * @param avroSchema the Avro schema this manifest file entry was read with
 * @throws IllegalArgumentException if a projected field id is not in the canonical schema
 */
public GenericManifestFile(org.apache.avro.Schema avroSchema) {
  this.avroSchema = avroSchema;

  List<Types.NestedField> fields = AvroSchemaUtil.convert(avroSchema)
      .asNestedType()
      .asStructType()
      .fields();
  List<Types.NestedField> allFields = ManifestFile.schema().asStruct().fields();

  this.fromProjectionPos = new int[fields.size()];
  for (int i = 0; i < fromProjectionPos.length; i += 1) {
    boolean found = false;
    for (int j = 0; j < allFields.size(); j += 1) {
      if (fields.get(i).fieldId() == allFields.get(j).fieldId()) {
        found = true;
        fromProjectionPos[i] = j;
        break; // field ids are unique; stop scanning once matched
      }
    }

    if (!found) {
      throw new IllegalArgumentException("Cannot find projected field: " + fields.get(i));
    }
  }
}
/**
 * Projects one Avro record field against the expected Iceberg struct.
 * Returns null when the field was not selected by the projection.
 */
@Override
public Schema.Field field(Schema.Field field, Supplier<Schema> fieldResult) {
  Types.StructType struct = current.asNestedType().asStructType();
  int fieldId = AvroSchemaUtil.getFieldId(field);
  Types.NestedField expectedField = struct.field(fieldId);
  // TODO: what if there are no ids?

  // if the field isn't present, it was not selected
  if (expectedField == null) {
    return null;
  }

  String expectedName = expectedField.name();

  // push the field's type as the current context before visiting the child,
  // and restore the struct in finally so the visitor state stays consistent
  // even if fieldResult.get() throws
  this.current = expectedField.type();
  try {
    Schema schema = fieldResult.get();
    if (schema != field.schema() || !expectedName.equals(field.name())) {
      // add an alias for the field
      return copyField(field, schema, expectedName);
    } else {
      // always copy because fields can't be reused
      return copyField(field, field.schema(), field.name());
    }
  } finally {
    this.current = struct;
  }
}
/**
 * Writes a type as JSON, dispatching to the primitive, struct, list, or map serializer.
 *
 * @throws IOException if the generator fails to write
 * @throws IllegalArgumentException if the type id is not a known nested type
 */
static void toJson(Type type, JsonGenerator generator) throws IOException {
  // primitives have their own serialization path
  if (type.isPrimitiveType()) {
    toJson(type.asPrimitiveType(), generator);
    return;
  }

  Type.NestedType nested = type.asNestedType();
  switch (type.typeId()) {
    case STRUCT:
      toJson(nested.asStructType(), generator);
      return;
    case LIST:
      toJson(nested.asListType(), generator);
      return;
    case MAP:
      toJson(nested.asMapType(), generator);
      return;
    default:
      throw new IllegalArgumentException("Cannot write unknown type: " + type);
  }
}
/**
 * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
 * <p>
 * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
 * <p>
 * The given {@link Expression} is used to ensure that columns referenced by the filter
 * are projected.
 *
 * @param schema a Schema
 * @param requestedType a projection of the Spark representation of the Schema
 * @param filter a filter expression whose referenced columns must be kept
 * @return a Schema corresponding to the Spark projection
 * @throws IllegalArgumentException if the Spark type does not match the Schema
 */
public static Schema prune(Schema schema, StructType requestedType, Expression filter) {
  Set<Integer> filterRefs =
      Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter));
  PruneColumnsWithoutReordering pruner =
      new PruneColumnsWithoutReordering(requestedType, filterRefs);
  return new Schema(visit(schema, pruner).asNestedType().asStructType().fields());
}
/**
 * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
 * <p>
 * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
 * <p>
 * The filters list of {@link Expression} is used to ensure that columns referenced by filters
 * are projected.
 *
 * @param schema a Schema
 * @param requestedType a projection of the Spark representation of the Schema
 * @param filters a list of filters whose referenced columns must be kept
 * @return a Schema corresponding to the Spark projection
 * @throws IllegalArgumentException if the Spark type does not match the Schema
 */
public static Schema prune(Schema schema, StructType requestedType, List<Expression> filters) {
  Set<Integer> filterRefs = Binder.boundReferences(schema.asStruct(), filters);
  PruneColumnsWithoutReordering pruner =
      new PruneColumnsWithoutReordering(requestedType, filterRefs);
  return new Schema(visit(schema, pruner).asNestedType().asStructType().fields());
}
/**
 * Converts an Iceberg field to the corresponding Parquet type, preserving the field id,
 * name, and optional/required repetition.
 *
 * @throws UnsupportedOperationException if the nested type is not struct, map, or list
 */
public Type field(NestedField field) {
  int id = field.fieldId();
  String name = field.name();
  Type.Repetition repetition = field.isOptional()
      ? Type.Repetition.OPTIONAL
      : Type.Repetition.REQUIRED;

  if (field.type().isPrimitiveType()) {
    return primitive(field.type().asPrimitiveType(), repetition, id, name);
  }

  NestedType nested = field.type().asNestedType();
  if (nested.isStructType()) {
    return struct(nested.asStructType(), repetition, id, name);
  }
  if (nested.isMapType()) {
    return map(nested.asMapType(), repetition, id, name);
  }
  if (nested.isListType()) {
    return list(nested.asListType(), repetition, id, name);
  }

  throw new UnsupportedOperationException("Can't convert unknown type: " + nested);
}
/**
 * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids.
 * <p>
 * This conversion assigns fresh ids.
 * <p>
 * Some data types are represented as the same Spark type. These are converted to a default type.
 * <p>
 * To convert using a reference schema for field ids and ambiguous types, use
 * {@link #convert(Schema, StructType)}.
 *
 * @param sparkType a Spark StructType
 * @return the equivalent Schema
 * @throws IllegalArgumentException if the type cannot be converted
 */
public static Schema convert(StructType sparkType) {
  Type result = visit(sparkType, new SparkTypeToType(sparkType));
  return new Schema(result.asNestedType().asStructType().fields());
}
/**
 * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
 * <p>
 * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
 *
 * @param schema a Schema
 * @param requestedType a projection of the Spark representation of the Schema
 * @return a Schema corresponding to the Spark projection
 * @throws IllegalArgumentException if the Spark type does not match the Schema
 */
public static Schema prune(Schema schema, StructType requestedType) {
  // no filters, so no extra columns need to be retained
  PruneColumnsWithoutReordering pruner =
      new PruneColumnsWithoutReordering(requestedType, ImmutableSet.of());
  return new Schema(visit(schema, pruner).asNestedType().asStructType().fields());
}
/**
 * Applies pending deletes, type updates, and nested additions to a single field.
 * Returns null to drop a deleted field, the updated type for a type change, or the
 * (possibly extended) visited result otherwise.
 */
@Override
public Type field(Types.NestedField field, Type fieldResult) {
  // the API validates deletes, updates, and additions don't conflict
  int fieldId = field.fieldId();
  if (deletes.contains(fieldId)) {
    // deleted fields are removed by returning null
    return null;
  }

  // NOTE(review): identity comparison (!=) rather than equals — presumably type
  // instances are reused when unchanged, so this detects a real type update; confirm.
  Types.NestedField update = updates.get(field.fieldId());
  if (update != null && update.type() != field.type()) {
    // rename is handled in struct
    return update.type();
  }

  Collection<Types.NestedField> newFields = adds.get(fieldId);
  if (newFields != null && !newFields.isEmpty()) {
    // additions keyed by this field id assume the field is a struct;
    // asStructType() would fail otherwise — upstream validation presumably ensures this
    return addFields(fieldResult.asNestedType().asStructType(), newFields);
  }

  return fieldResult;
}
/**
 * Returns a {@link Schema} for the given table with fresh field ids.
 * <p>
 * This creates a Schema for an existing table by looking up the table's schema with Spark and
 * converting that schema. Spark/Hive partition columns are included in the schema.
 *
 * @param spark a Spark session
 * @param name a table name and (optional) database
 * @return a Schema for the table, if found
 */
public static Schema schemaForTable(SparkSession spark, String name) {
  // ask Spark for the table's type, then convert it with fresh ids
  StructType sparkType = spark.table(name).schema();
  Type result = visit(sparkType, new SparkTypeToType(sparkType));
  return new Schema(result.asNestedType().asStructType().fields());
}
@Test
public void testStructs() throws Exception {
  Types.StructType struct = Types.StructType.of(
      Types.NestedField.required(34, "Name!", Types.StringType.get()),
      Types.NestedField.optional(35, "col", Types.DecimalType.of(38, 2)));

  Type roundTripped = TestHelpers.roundTripSerialize(struct);
  Assert.assertEquals("Struct serialization should be equal to starting type",
      struct, roundTripped);

  // look up fields on the deserialized struct by name and by id
  Types.StructType copied = roundTripped.asNestedType().asStructType();
  Assert.assertSame("Struct serialization should preserve identity type",
      Types.StringType.get(), copied.fieldType("Name!"));
  Assert.assertEquals("Struct serialization should support id lookup",
      Types.DecimalType.of(38, 2), copied.field(35).type());
}
private ConvertColumnFilterToParquet(Schema schema, String column) {
  super(schema);
  // cache the struct type of the column this filter converter targets
  this.partitionStruct = schema.findField(column)
      .type()
      .asNestedType()
      .asStructType();
}
/**
 * Produces a new Schema by applying the accumulated deletes, updates, and adds
 * to the given schema.
 */
private static Schema applyChanges(Schema schema, List<Integer> deletes,
                                   Map<Integer, Types.NestedField> updates,
                                   Multimap<Integer, Types.NestedField> adds) {
  ApplyChanges changeVisitor = new ApplyChanges(deletes, updates, adds);
  return new Schema(
      TypeUtil.visit(schema, changeVisitor).asNestedType().asStructType().fields());
}
/**
 * Parses a Schema from JSON, requiring the root type to be a struct.
 *
 * @throws IllegalArgumentException if the parsed root type is not a struct
 */
public static Schema fromJson(JsonNode json) {
  Type type = typeFromJson(json);
  boolean isStruct = type.isNestedType() && type.asNestedType().isStructType();
  Preconditions.checkArgument(isStruct,
      "Cannot create schema, not a struct type: %s", type);
  return new Schema(type.asNestedType().asStructType().fields());
}