private StructType lazyType() {
  if (type == null) {
    // Convert the lazily-loaded Iceberg schema to a Spark StructType once and cache it.
    this.type = convert(lazySchema());
  }
  return type;
}
@Override
public ValueWriter<?> record(Schema record, List<String> names, List<ValueWriter<?>> fields) {
  // Look up the Spark type for each Avro field by its Iceberg field ID so the
  // struct writer knows how to read each value out of the InternalRow.
  List<DataType> types = Lists.newArrayList();
  for (Schema.Field field : record.getFields()) {
    types.add(convert(schema.findType(getFieldId(field))));
  }
  return SparkValueWriters.struct(fields, types);
}
@Override
public ValueWriter<?> map(Schema map, ValueWriter<?> valueWriter) {
  // Avro map keys are always strings, so only the value writer is passed in.
  Type keyType = schema.findType(AvroSchemaUtil.getKeyId(map));
  Type valueType = schema.findType(AvroSchemaUtil.getValueId(map));
  return SparkValueWriters.map(
      SparkValueWriters.strings(), convert(keyType),
      valueWriter, convert(valueType));
}
@Override
public Optional<DataSourceWriter> createWriter(
    String jobId, StructType dfStruct, SaveMode mode, DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);

  Table table = findTable(options);

  // Convert the incoming Spark struct using the table schema so field IDs line up,
  // then validate that the dataframe can be written to the table.
  Schema dfSchema = SparkSchemaUtil.convert(table.schema(), dfStruct);
  List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
  if (!errors.isEmpty()) {
    StringBuilder sb = new StringBuilder();
    sb.append("Cannot write incompatible dataframe to table with schema:\n")
        .append(table.schema()).append("\nProblems:");
    for (String error : errors) {
      sb.append("\n* ").append(error);
    }
    throw new IllegalArgumentException(sb.toString());
  }

  // The write option takes precedence over the table's default file format property.
  Optional<String> formatOption = options.get("iceberg.write.format");
  FileFormat format;
  if (formatOption.isPresent()) {
    format = FileFormat.valueOf(formatOption.get().toUpperCase(Locale.ENGLISH));
  } else {
    format = FileFormat.valueOf(table.properties()
        .getOrDefault(DEFAULT_FILE_FORMAT, DEFAULT_FILE_FORMAT_DEFAULT)
        .toUpperCase(Locale.ENGLISH));
  }

  return Optional.of(new Writer(table, lazyConf(), format));
}
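// For context, a hedged sketch of how this writer is reached from the DataFrame
// API. "iceberg.write.format" is the option checked above; the format value and
// the save path are illustrative, not taken from this codebase.
df.write()
    .format("iceberg")
    .option("iceberg.write.format", "avro")  // overrides the table's default format
    .mode(SaveMode.Append)                   // the only mode accepted above
    .save("hdfs://nn:8020/warehouse/db/table");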
@Override
public ValueWriter<?> array(Schema array, ValueWriter<?> elementWriter) {
  LogicalType logical = array.getLogicalType();
  if (logical != null && "map".equals(logical.getName())) {
    // A map with non-string keys is stored in Avro as an array of key/value
    // records; reuse the struct writer's key and value writers directly.
    Type keyType = schema.findType(getFieldId(array.getElementType().getField("key")));
    Type valueType = schema.findType(getFieldId(array.getElementType().getField("value")));
    ValueWriter<?>[] writers = ((SparkValueWriters.StructWriter) elementWriter).writers;
    return SparkValueWriters.arrayMap(
        writers[0], convert(keyType),
        writers[1], convert(valueType));
  }

  Type elementType = schema.findType(AvroSchemaUtil.getElementId(array));
  return SparkValueWriters.array(elementWriter, convert(elementType));
}
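// A brief note on the array-map branch: Iceberg allows map keys of any type,
// but Avro's native map encoding supports only string keys, so non-string-key
// maps are represented as an array of {key, value} records tagged with the
// "map" logical type that the branch above detects. A minimal sketch of such a
// type using the Iceberg type API; the field IDs are arbitrary.
Types.MapType intKeyMap = Types.MapType.ofRequired(
    1, 2,                     // key and value field IDs
    Types.IntegerType.get(),  // non-string key type forces the array encoding
    Types.StringType.get());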
private static void assertEqualsUnsafe(Types.ListType list, Collection<?> expected, ArrayData actual) {
  Type elementType = list.elementType();
  List<?> expectedElements = Lists.newArrayList(expected);
  for (int i = 0; i < expectedElements.size(); i += 1) {
    Object expectedValue = expectedElements.get(i);
    Object actualValue = actual.get(i, convert(elementType));
    assertEqualsUnsafe(elementType, expectedValue, actualValue);
  }
}
PartitionRowConverter(Schema partitionSchema, PartitionSpec spec) {
  StructType partitionType = SparkSchemaUtil.convert(partitionSchema);
  StructField[] fields = partitionType.fields();

  this.types = new DataType[fields.length];
  this.positions = new int[types.length];
  this.javaTypes = new Class<?>[types.length];
  this.reusedRow = new GenericInternalRow(types.length);

  List<PartitionField> partitionFields = spec.fields();
  for (int rowIndex = 0; rowIndex < fields.length; rowIndex += 1) {
    this.types[rowIndex] = fields[rowIndex].dataType();

    int sourceId = partitionSchema.columns().get(rowIndex).fieldId();
    for (int specIndex = 0; specIndex < partitionFields.size(); specIndex += 1) {
      PartitionField field = partitionFields.get(specIndex);
      // Only identity-transformed partition fields can be read back as column values.
      if (field.sourceId() == sourceId && "identity".equals(field.transform().toString())) {
        positions[rowIndex] = specIndex;
        javaTypes[rowIndex] = spec.javaClasses()[specIndex];
        break;
      }
    }
  }
}
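// To illustrate the identity check in the inner loop, a hedged sketch of a spec
// this converter can fully map. Both fields use the identity transform, so each
// partition value is surfaced directly as a column; a bucket or truncate
// transform would be skipped by the check above. Column names are hypothetical.
PartitionSpec spec = PartitionSpec.builderFor(tableSchema)
    .identity("category")
    .identity("event_date")
    .build();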
private static void assertEqualsUnsafe(Types.MapType map, Map<?, ?> expected, MapData actual) {
  Type keyType = map.keyType();
  Type valueType = map.valueType();

  List<Map.Entry<?, ?>> expectedElements = Lists.newArrayList(expected.entrySet());
  ArrayData actualKeys = actual.keyArray();
  ArrayData actualValues = actual.valueArray();

  for (int i = 0; i < expectedElements.size(); i += 1) {
    Map.Entry<?, ?> expectedPair = expectedElements.get(i);
    Object actualKey = actualKeys.get(i, convert(keyType));
    // Values must be read with the value type, not the key type.
    Object actualValue = actualValues.get(i, convert(valueType));
    assertEqualsUnsafe(keyType, expectedPair.getKey(), actualKey);
    assertEqualsUnsafe(valueType, expectedPair.getValue(), actualValue);
  }
}
private static UnsafeProjection projection(Schema finalSchema, Schema readSchema) {
  StructType struct = convert(readSchema);

  List<AttributeReference> refs = seqAsJavaListConverter(struct.toAttributes()).asJava();
  List<Attribute> attrs = Lists.newArrayListWithExpectedSize(struct.fields().length);
  List<org.apache.spark.sql.catalyst.expressions.Expression> exprs =
      Lists.newArrayListWithExpectedSize(struct.fields().length);

  for (AttributeReference ref : refs) {
    attrs.add(ref.toAttribute());
  }

  // Select the read schema's attributes in the column order of the final schema.
  for (Types.NestedField field : finalSchema.columns()) {
    int indexInReadSchema = struct.fieldIndex(field.name());
    exprs.add(refs.get(indexInReadSchema));
  }

  return UnsafeProjection.create(
      asScalaBufferConverter(exprs).asScala().toSeq(),
      asScalaBufferConverter(attrs).asScala().toSeq());
}
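// A short usage sketch: the projection reorders a row read in readSchema column
// order into finalSchema order. Variable names here are illustrative. Note that
// UnsafeProjection.apply returns a row backed by a buffer that is reused across
// calls, so the result must be copied if it is retained.
UnsafeProjection reorder = projection(finalSchema, readSchema);
InternalRow projected = reorder.apply(rowInReadOrder);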
public static void assertEqualsUnsafe(Types.StructType struct, Record rec, InternalRow row) {
  List<Types.NestedField> fields = struct.fields();
  for (int i = 0; i < fields.size(); i += 1) {
    Type fieldType = fields.get(i).type();
    Object expectedValue = rec.get(i);
    Object actualValue = row.get(i, convert(fieldType));
    assertEqualsUnsafe(fieldType, expectedValue, actualValue);
  }
}
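// A minimal sketch of how these helpers are used in a test, assuming "expected"
// is a generic Avro Record and "actual" is the InternalRow read back through the
// Spark reader for the same schema; both names are hypothetical.
assertEqualsUnsafe(schema.asStruct(), expected, actual);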
// The pruned schema keeps the requested columns plus any extra columns
// referenced by the residual filter.
Schema requiredSchema = prune(tableSchema, convert(finalSchema), task.residual());
boolean hasJoinedPartitionColumns = !idColumns.isEmpty();
boolean hasExtraFilterColumns = requiredSchema.columns().size() != finalSchema.columns().size();
return spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(schema), false);
switch (format) {
  case PARQUET:
    String jsonSchema = convert(schema).json();
    return Parquet.write(file)
        .writeSupport(new ParquetWriteSupport())