@Override public Schema record(Schema record, List<String> names, Iterable<Schema.Field> schemaIterable) { Preconditions.checkArgument( current.isNestedType() && current.asNestedType().isStructType(), "Cannot project non-struct: %s", current); Types.StructType struct = current.asNestedType().asStructType();
public static <T> T visit(Type type, SchemaVisitor<T> visitor) { switch (type.typeId()) { case STRUCT: Types.StructType struct = type.asNestedType().asStructType(); List<T> results = Lists.newArrayListWithExpectedSize(struct.fields().size()); for (Types.NestedField field : struct.fields()) { Types.ListType list = type.asNestedType().asListType(); T elementResult; Types.MapType map = type.asNestedType().asMapType(); T keyResult; T valueResult;
/**
 * Projects a {@link Schema} down to the fields identified by {@code fieldIds}.
 * <p>
 * The schema is visited with a {@code PruneColumns} visitor. If the visitor returns the schema's
 * own struct instance, the original schema is returned unchanged. Otherwise a new Schema is
 * built from the resulting fields, or from an empty field list when the visitor returns null.
 * <p>
 * Aliases from the original schema are carried over only when they are non-null; when they are
 * null the single-argument constructor is used on every path, including the empty-result path.
 *
 * @param schema a Schema, must not be null
 * @param fieldIds ids of the fields to keep, must not be null
 * @return the original schema when nothing was pruned, otherwise a new projected Schema
 */
public static Schema select(Schema schema, Set<Integer> fieldIds) {
  Preconditions.checkNotNull(schema, "Schema cannot be null");
  Preconditions.checkNotNull(fieldIds, "Field ids cannot be null");

  Type result = visit(schema, new PruneColumns(fieldIds));
  if (schema.asStruct() == result) {
    // identity comparison: the visitor handed back the very same struct, so reuse the schema
    return schema;
  }

  boolean hasAliases = schema.getAliases() != null;

  if (result != null) {
    if (hasAliases) {
      return new Schema(result.asNestedType().fields(), schema.getAliases());
    }
    return new Schema(result.asNestedType().fields());
  }

  // the visitor produced no type at all: return an empty schema, applying the same null-alias
  // guard as above instead of forwarding a possibly-null alias map
  if (hasAliases) {
    return new Schema(ImmutableList.of(), schema.getAliases());
  }
  return new Schema(ImmutableList.of());
}
Assert.assertTrue("Should expect a Record", expected instanceof Record); Assert.assertTrue("Should be a Row", actual instanceof Row); assertEqualsSafe(type.asNestedType().asStructType(), (Record) expected, (Row) actual); break; case LIST: Assert.assertTrue("Should be a Seq", actual instanceof Seq); List<?> asList = seqAsJavaListConverter((Seq<?>) actual).asJava(); assertEqualsSafe(type.asNestedType().asListType(), (Collection) expected, asList); break; case MAP: Map<String, ?> asMap = mapAsJavaMapConverter( (scala.collection.Map<String, ?>) actual).asJava(); assertEqualsSafe(type.asNestedType().asMapType(), (Map<String, ?>) expected, asMap); break; case TIME:
return primitive; }).asNestedType().asStructType().fields());
Assert.assertTrue("Should expect a Record", expected instanceof Record); Assert.assertTrue("Should be an InternalRow", actual instanceof InternalRow); assertEqualsUnsafe(type.asNestedType().asStructType(), (Record) expected, (InternalRow) actual); break; case LIST: Assert.assertTrue("Should expect a Collection", expected instanceof Collection); Assert.assertTrue("Should be an ArrayData", actual instanceof ArrayData); assertEqualsUnsafe(type.asNestedType().asListType(), (Collection) expected, (ArrayData) actual); break; case MAP: Assert.assertTrue("Should expect a Map", expected instanceof Map); Assert.assertTrue("Should be an ArrayBasedMapData", actual instanceof MapData); assertEqualsUnsafe(type.asNestedType().asMapType(), (Map) expected, (MapData) actual); break; case TIME:
(current.isMapType() && isKeyValueSchema(array.getElementType()))) { Preconditions.checkArgument(current.isMapType(), "Incompatible projected type: %s", current); Types.MapType m = current.asNestedType().asMapType(); this.current = Types.StructType.of(m.fields()); // create a struct to correspond to element try { Preconditions.checkArgument(current.isListType(), "Incompatible projected type: %s", current); Types.ListType list = current.asNestedType().asListType(); this.current = list.elementType(); try {
switch (type.typeId()) { case STRUCT: Types.StructType struct = type.asNestedType().asStructType(); List<VisitFieldFuture<T>> results = Lists .newArrayListWithExpectedSize(struct.fields().size()); Types.ListType list = type.asNestedType().asListType(); return visitor.list(list, new VisitFuture<>(list.elementType(), visitor)); Types.MapType map = type.asNestedType().asMapType(); return visitor.map(map, new VisitFuture<>(map.keyType(), visitor),
if (parentType.isNestedType()) { Type.NestedType nested = parentType.asNestedType(); if (nested.isMapType()) { parentField = nested.asMapType().fields().get(1); } else if (nested.isListType()) { parentField = nested.asListType().fields().get(0); parentField.type().isNestedType() && parentField.type().asNestedType().isStructType(), "Cannot add to non-struct column: %s: %s", parent, parentField.type()); parentId = parentField.fieldId();
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 * <p>
 * Converts the Avro schema to a struct, resolves the partition struct type from its
 * {@code "partition"} field, and records, for each projected field, the position of the field
 * with the same id in {@code DataFile.getType(partitionType)}.
 *
 * @param avroSchema the Avro schema of the projection being read
 * @throws IllegalArgumentException if a projected field id has no match in the full type
 */
public GenericDataFile(org.apache.avro.Schema avroSchema) {
  this.avroSchema = avroSchema;

  Types.StructType projection = AvroSchemaUtil.convert(avroSchema).asNestedType().asStructType();

  // partition type may be null if the field was not projected
  Type projectedPartition = projection.fieldType("partition");
  this.partitionType = projectedPartition == null ?
      EMPTY_STRUCT_TYPE :
      projectedPartition.asNestedType().asStructType();

  List<Types.NestedField> projectedFields = projection.fields();
  List<Types.NestedField> fullFields = DataFile.getType(partitionType).fields();

  this.fromProjectionPos = new int[projectedFields.size()];
  for (int pos = 0; pos < fromProjectionPos.length; pos += 1) {
    Types.NestedField projected = projectedFields.get(pos);

    // scan every candidate; when several share the id, the last one is kept
    int match = -1;
    for (int candidate = 0; candidate < fullFields.size(); candidate += 1) {
      if (projected.fieldId() == fullFields.get(candidate).fieldId()) {
        match = candidate;
      }
    }

    if (match < 0) {
      throw new IllegalArgumentException("Cannot find projected field: " + projected);
    }
    fromProjectionPos[pos] = match;
  }

  this.partitionData = new PartitionData(partitionType);
}
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 * <p>
 * Records, for each field of the converted Avro schema, the position of the field with the same
 * id in {@code PartitionFieldSummary.getType()}.
 *
 * @param avroSchema the Avro schema of the projection being read
 * @throws IllegalArgumentException if a projected field id has no match in the full type
 */
public GenericPartitionFieldSummary(Schema avroSchema) {
  this.avroSchema = avroSchema;

  Types.StructType projection = AvroSchemaUtil.convert(avroSchema).asNestedType().asStructType();
  List<Types.NestedField> projectedFields = projection.fields();
  List<Types.NestedField> fullFields = PartitionFieldSummary.getType().fields();

  this.fromProjectionPos = new int[projectedFields.size()];
  for (int pos = 0; pos < fromProjectionPos.length; pos += 1) {
    Types.NestedField projected = projectedFields.get(pos);

    // scan every candidate; when several share the id, the last one is kept
    int match = -1;
    for (int candidate = 0; candidate < fullFields.size(); candidate += 1) {
      if (projected.fieldId() == fullFields.get(candidate).fieldId()) {
        match = candidate;
      }
    }

    if (match < 0) {
      throw new IllegalArgumentException("Cannot find projected field: " + projected);
    }
    fromProjectionPos[pos] = match;
  }
}
/**
 * Used by Avro reflection to instantiate this class when reading manifest files.
 * <p>
 * Records, for each field of the converted Avro schema, the position of the field with the same
 * id in {@code ManifestFile.schema()}.
 *
 * @param avroSchema the Avro schema of the projection being read
 * @throws IllegalArgumentException if a projected field id has no match in the full schema
 */
public GenericManifestFile(org.apache.avro.Schema avroSchema) {
  this.avroSchema = avroSchema;

  Types.StructType projection = AvroSchemaUtil.convert(avroSchema).asNestedType().asStructType();
  List<Types.NestedField> projectedFields = projection.fields();
  List<Types.NestedField> fullFields = ManifestFile.schema().asStruct().fields();

  this.fromProjectionPos = new int[projectedFields.size()];
  for (int pos = 0; pos < fromProjectionPos.length; pos += 1) {
    Types.NestedField projected = projectedFields.get(pos);

    // scan every candidate; when several share the id, the last one is kept
    int match = -1;
    for (int candidate = 0; candidate < fullFields.size(); candidate += 1) {
      if (projected.fieldId() == fullFields.get(candidate).fieldId()) {
        match = candidate;
      }
    }

    if (match < 0) {
      throw new IllegalArgumentException("Cannot find projected field: " + projected);
    }
    fromProjectionPos[pos] = match;
  }
}
@Override public Schema.Field field(Schema.Field field, Supplier<Schema> fieldResult) { Types.StructType struct = current.asNestedType().asStructType(); int fieldId = AvroSchemaUtil.getFieldId(field); Types.NestedField expectedField = struct.field(fieldId); // TODO: what if there are no ids? // if the field isn't present, it was not selected if (expectedField == null) { return null; } String expectedName = expectedField.name(); this.current = expectedField.type(); try { Schema schema = fieldResult.get(); if (schema != field.schema() || !expectedName.equals(field.name())) { // add an alias for the field return copyField(field, schema, expectedName); } else { // always copy because fields can't be reused return copyField(field, field.schema(), field.name()); } } finally { this.current = struct; } }
/**
 * Builds the accessors for a struct from the results of visiting its fields.
 * <p>
 * A field whose result is null gets an accessor created from its position and type, keyed by
 * its own field id. A field with a non-null result has each entry of that result wrapped in an
 * accessor created from the field's position, optionality and struct type, keyed by the entry's
 * id.
 *
 * @param struct the struct type being visited
 * @param fieldResults per-field results, indexed like {@code struct.fields()}
 * @return a map from id to accessor, or null if no accessors were produced
 */
@Override
public Map<Integer, Accessor<InternalRow>> struct(
    Types.StructType struct, List<Map<Integer, Accessor<InternalRow>>> fieldResults) {
  Map<Integer, Accessor<InternalRow>> byId = Maps.newHashMap();
  List<Types.NestedField> structFields = struct.fields();

  for (int pos = 0; pos < fieldResults.size(); pos += 1) {
    Types.NestedField field = structFields.get(pos);
    Map<Integer, Accessor<InternalRow>> nested = fieldResults.get(pos);

    if (nested == null) {
      byId.put(field.fieldId(), newAccessor(pos, field.type()));
      continue;
    }

    // the struct cast stays inside the loop so it is only evaluated when there is an entry
    for (Map.Entry<Integer, Accessor<InternalRow>> inner : nested.entrySet()) {
      Accessor<InternalRow> wrapped = newAccessor(
          pos, field.isOptional(), field.type().asNestedType().asStructType(), inner.getValue());
      byId.put(inner.getKey(), wrapped);
    }
  }

  return byId.isEmpty() ? null : byId;
}
/**
 * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
 * <p>
 * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
 * <p>
 * The filter {@link Expression} is used to ensure that columns referenced by the filter are
 * projected.
 *
 * @param schema a Schema
 * @param requestedType a projection of the Spark representation of the Schema
 * @param filter a filter expression
 * @return a Schema corresponding to the Spark projection
 * @throws IllegalArgumentException if the Spark type does not match the Schema
 */
public static Schema prune(Schema schema, StructType requestedType, Expression filter) {
  // ids of the columns the filter is bound to
  Set<Integer> filterRefs =
      Binder.boundReferences(schema.asStruct(), Collections.singletonList(filter));

  PruneColumnsWithoutReordering pruner =
      new PruneColumnsWithoutReordering(requestedType, filterRefs);

  return new Schema(visit(schema, pruner).asNestedType().asStructType().fields());
}
/**
 * Collects the errors found when reading the current type as the map {@code readMap}.
 * <p>
 * While the key and value suppliers run, {@code currentType} is pointed at the current map's
 * key type and value type respectively; it is restored to the map afterwards, even if a
 * supplier throws.
 *
 * @param readMap the map type being read
 * @param keyErrors supplier of the errors found for the map keys
 * @param valueErrors supplier of the errors found for the map values
 * @return the error messages; a single message if the current type is not a map
 */
@Override
public List<String> map(
    Types.MapType readMap,
    Supplier<List<String>> keyErrors,
    Supplier<List<String>> valueErrors) {
  if (!currentType.isMapType()) {
    return ImmutableList.of(String.format(": %s cannot be read as a map", currentType));
  }

  Types.MapType currentMap = currentType.asNestedType().asMapType();
  List<String> problems = Lists.newArrayList();

  try {
    boolean optionalReadAsRequired = readMap.isValueRequired() && currentMap.isValueOptional();
    if (optionalReadAsRequired) {
      problems.add(": values should be required, but are optional");
    }

    this.currentType = currentMap.keyType();
    problems.addAll(keyErrors.get());

    this.currentType = currentMap.valueType();
    problems.addAll(valueErrors.get());

  } finally {
    this.currentType = currentMap;
  }

  return problems;
}
/**
 * Prune columns from a {@link Schema} using a {@link StructType Spark type} projection.
 * <p>
 * This requires that the Spark type is a projection of the Schema. Nullability and types must
 * match.
 * <p>
 * The list of filter {@link Expression expressions} is used to ensure that columns referenced
 * by the filters are projected.
 *
 * @param schema a Schema
 * @param requestedType a projection of the Spark representation of the Schema
 * @param filters a list of filters
 * @return a Schema corresponding to the Spark projection
 * @throws IllegalArgumentException if the Spark type does not match the Schema
 */
public static Schema prune(Schema schema, StructType requestedType, List<Expression> filters) {
  // ids of the columns the filters are bound to
  Set<Integer> filterRefs = Binder.boundReferences(schema.asStruct(), filters);

  PruneColumnsWithoutReordering pruner =
      new PruneColumnsWithoutReordering(requestedType, filterRefs);

  return new Schema(visit(schema, pruner).asNestedType().asStructType().fields());
}
/**
 * Writes a type to the generator, dispatching to the overload for its concrete kind.
 * <p>
 * Primitive types go to the primitive overload; nested types are dispatched on their type id to
 * the struct, list or map overload.
 *
 * @param type the type to write
 * @param generator the JSON generator to write to
 * @throws IOException declared for the delegated write calls
 * @throws IllegalArgumentException if a non-primitive type is not a struct, list or map
 */
static void toJson(Type type, JsonGenerator generator) throws IOException {
  if (type.isPrimitiveType()) {
    toJson(type.asPrimitiveType(), generator);
    return;
  }

  Type.NestedType nestedType = type.asNestedType();
  switch (type.typeId()) {
    case STRUCT:
      toJson(nestedType.asStructType(), generator);
      return;

    case LIST:
      toJson(nestedType.asListType(), generator);
      return;

    case MAP:
      toJson(nestedType.asMapType(), generator);
      return;

    default:
      throw new IllegalArgumentException("Cannot write unknown type: " + type);
  }
}
/**
 * Convert a Spark {@link StructType struct} to a {@link Schema} with new field ids.
 * <p>
 * This conversion assigns fresh ids.
 * <p>
 * Some data types are represented as the same Spark type. These are converted to a default type.
 * <p>
 * To convert using a reference schema for field ids and ambiguous types, use
 * {@link #convert(Schema, StructType)}.
 *
 * @param sparkType a Spark StructType
 * @return the equivalent Schema
 * @throws IllegalArgumentException if the type cannot be converted
 */
public static Schema convert(StructType sparkType) {
  SparkTypeToType converter = new SparkTypeToType(sparkType);

  // the visit result is expected to be a struct; its fields become the schema's columns
  return new Schema(visit(sparkType, converter).asNestedType().asStructType().fields());
}
@Override public Type field(Types.NestedField field, Type fieldResult) { // the API validates deletes, updates, and additions don't conflict int fieldId = field.fieldId(); if (deletes.contains(fieldId)) { return null; } Types.NestedField update = updates.get(field.fieldId()); if (update != null && update.type() != field.type()) { // rename is handled in struct return update.type(); } Collection<Types.NestedField> newFields = adds.get(fieldId); if (newFields != null && !newFields.isEmpty()) { return addFields(fieldResult.asNestedType().asStructType(), newFields); } return fieldResult; }