/**
 * Converts a Spark {@link DataType} into the equivalent {@link Type}, assigning fresh field ids.
 * <p>
 * Several data types share a single Spark representation; those are converted to a default
 * type.
 * <p>
 * When field ids and ambiguous types should be taken from a reference schema, use
 * {@link #convert(Schema, StructType)} instead.
 *
 * @param sparkType a Spark DataType
 * @return the equivalent Type
 * @throws IllegalArgumentException if the type cannot be converted
 */
public static Type convert(DataType sparkType) {
  // No root struct is supplied to the visitor on this path.
  SparkTypeToType visitor = new SparkTypeToType();
  return visit(sparkType, visitor);
}
/**
 * Converts a Spark {@link StructType} into a {@link Schema}, assigning fresh field ids.
 * <p>
 * Several data types share a single Spark representation; those are converted to a default
 * type.
 * <p>
 * When field ids and ambiguous types should be taken from a reference schema, use
 * {@link #convert(Schema, StructType)} instead.
 *
 * @param sparkType a Spark StructType
 * @return the equivalent Schema
 * @throws IllegalArgumentException if the type cannot be converted
 */
public static Schema convert(StructType sparkType) {
  // The struct being converted is also handed to the visitor as its root.
  SparkTypeToType visitor = new SparkTypeToType(sparkType);
  Type structAsType = visit(sparkType, visitor);

  // The schema is built from the fields of the converted struct.
  return new Schema(structAsType.asNestedType().asStructType().fields());
}
/** * Convert a Spark {@link StructType struct} to a {@link Schema} based on the given schema. * <p> * This conversion does not assign new ids; it uses ids from the base schema. * <p> * Data types, field order, and nullability will match the spark type. This conversion may return * a schema that is not compatible with base schema. * * @param baseSchema a Schema on which conversion is based * @param sparkType a Spark StructType * @return the equivalent Schema * @throws IllegalArgumentException if the type cannot be converted or there are missing ids */ public static Schema convert(Schema baseSchema, StructType sparkType) { // convert to a type with fresh ids Types.StructType struct = visit(sparkType, new SparkTypeToType(sparkType)).asStructType(); // reassign ids to match the base schema Schema schema = TypeUtil.reassignIds(new Schema(struct.fields()), baseSchema); // fix types that can't be represented in Spark (UUID and Fixed) return FixupTypes.fixup(schema, baseSchema); }
/**
 * Builds a {@link Schema} with fresh field ids for the named table.
 * <p>
 * The table's schema is looked up through Spark and then converted. Spark/Hive partition
 * columns are included in the resulting schema.
 *
 * @param spark a Spark session
 * @param name a table name and (optional) database
 * @return a Schema for the table, if found
 */
public static Schema schemaForTable(SparkSession spark, String name) {
  // Ask Spark for the table's schema.
  StructType tableStruct = spark.table(name).schema();

  // Convert it, passing the same struct to the visitor as its root.
  SparkTypeToType visitor = new SparkTypeToType(tableStruct);
  Type tableType = visit(tableStruct, visitor);

  return new Schema(tableType.asNestedType().asStructType().fields());
}