@Override public RecordMaterializer<GenericRecord> prepareForRead( Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext ) { // coercing this value to false by default here to be friendlier default behavior // see https://github.com/apache/incubator-druid/issues/5433#issuecomment-388539306 String jobProp = "parquet.avro.add-list-element-records"; Boolean explicitlySet = configuration.getBoolean(jobProp, false); if (!explicitlySet) { configuration.setBoolean(jobProp, false); } MessageType parquetSchema = readContext.getRequestedSchema(); Schema avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema); Class<? extends AvroDataSupplier> suppClass = configuration.getClass( AVRO_DATA_SUPPLIER, SpecificDataSupplier.class, AvroDataSupplier.class ); AvroDataSupplier supplier = ReflectionUtils.newInstance(suppClass, configuration); return new AvroRecordMaterializer<>(parquetSchema, avroSchema, supplier.get()); } }
{
  // Example: write a single GenericRecord to a Snappy-compressed Parquet file
  // on HDFS, deriving the Parquet schema from an Avro schema file.
  final String schemaLocation = "/tmp/avro_format.json";
  final Schema avroSchema = new Schema.Parser().parse(new File(schemaLocation));
  final MessageType parquetSchema = new AvroSchemaConverter().convert(avroSchema);

  // FIX: the write support emits GenericRecord, not Pojo — the old code
  // declared WriteSupport<Pojo> with a raw AvroWriteSupport, hiding the
  // mismatch behind unchecked warnings. Type it consistently with the writer.
  final WriteSupport<GenericRecord> writeSupport =
      new AvroWriteSupport<GenericRecord>(parquetSchema, avroSchema);

  final String parquetFile = "hdfs://<hostname>:<port>/tmp/parquet/data.parquet";
  final Path path = new Path(parquetFile);

  // FIX: close the writer even if write() throws (try-with-resources);
  // the old code leaked the writer on any exception.
  try (ParquetWriter<GenericRecord> parquetWriter = new ParquetWriter<GenericRecord>(
      path, writeSupport, CompressionCodecName.SNAPPY, BLOCK_SIZE, PAGE_SIZE)) {
    final GenericRecord record = new GenericData.Record(avroSchema);
    record.put("id", 1);
    record.put("age", 10);
    record.put("name", "ABC");
    record.put("place", "BCD");
    parquetWriter.write(record);
  }
}
/** * Returns whether the given type is the element type of a list or is a * synthetic group with one field that is the element type. This is * determined by checking whether the type can be a synthetic group and by * checking whether a potential synthetic group matches the expected schema. * <p> * Unlike {@link AvroSchemaConverter#isElementType(Type, String)}, this * method never guesses because the expected schema is known. * * @param repeatedType a type that may be the element type * @param elementSchema the expected Schema for list elements * @return {@code true} if the repeatedType is the element schema */ static boolean isElementType(Type repeatedType, Schema elementSchema) { if (repeatedType.isPrimitive() || repeatedType.asGroupType().getFieldCount() > 1 || repeatedType.asGroupType().getType(0).isRepetition(REPEATED)) { // The repeated type must be the element type because it is an invalid // synthetic wrapper. Must be a group with one optional or required field return true; } else if (elementSchema != null && elementSchema.getType() == Schema.Type.RECORD) { Schema schemaFromRepeated = CONVERTER.convert(repeatedType.asGroupType()); if (checkReaderWriterCompatibility(elementSchema, schemaFromRepeated) .getType() == COMPATIBLE) { return true; } } return false; }
// NOTE(review): this line is a garbled, incomplete fragment — pieces of
// AvroSchemaConverter's Parquet->Avro convertField(GroupType) have been
// spliced together. The condition after "annotation != DECIMAL ||" is
// truncated mid-expression, and the LIST/MAP/record branches are interleaved
// without their enclosing structure. It does NOT compile as-is; recover the
// complete method from version control before editing.
LogicalType logicalType = convertOriginalType( annotation, asPrimitive.getDecimalMetadata()); if (logicalType != null && (annotation != DECIMAL || throw new UnsupportedOperationException("Invalid list type " + parquetGroupType); if (isElementType(repeatedType, parquetGroupType.getName())) { return Schema.createArray(convertField(repeatedType)); } else { Type elementType = repeatedType.asGroupType().getType(0); if (elementType.isRepetition(Type.Repetition.OPTIONAL)) { return Schema.createArray(optional(convertField(elementType))); } else { return Schema.createArray(convertField(elementType)); return Schema.createMap(optional(convertField(valueType))); } else { return Schema.createMap(convertField(valueType)); return convertFields(parquetGroupType.getName(), parquetGroupType.getFields());
// NOTE(review): this line is a garbled, incomplete fragment of the
// Avro->Parquet convertField(String, Schema, Repetition) dispatch — the
// STRING/RECORD/ENUM/ARRAY/MAP/FIXED/UNION branches appear without their
// enclosing if/else chain, and several branches are truncated (e.g. the MAP
// branch stops after building valType). It does NOT compile as-is; recover
// the complete method from version control before editing.
builder = Types.primitive(BINARY, repetition).as(UTF8); } else if (type.equals(Schema.Type.RECORD)) { return new GroupType(repetition, fieldName, convertFields(schema.getFields())); } else if (type.equals(Schema.Type.ENUM)) { builder = Types.primitive(BINARY, repetition).as(ENUM); if (writeOldListStructure) { return ConversionPatterns.listType(repetition, fieldName, convertField("array", schema.getElementType(), REPEATED)); } else { return ConversionPatterns.listOfElements(repetition, fieldName, convertField(AvroWriteSupport.LIST_ELEMENT_NAME, schema.getElementType())); Type valType = convertField("value", schema.getValueType()); .length(schema.getFixedSize()); } else if (type.equals(Schema.Type.UNION)) { return convertUnion(fieldName, schema, repetition); } else { throw new UnsupportedOperationException("Cannot convert Avro type " + type); OriginalType annotation = convertLogicalType(logicalType); if (annotation != null) { builder.as(annotation);
private Schema convertFields(String name, List<Type> parquetFields) { List<Schema.Field> fields = new ArrayList<Schema.Field>(); for (Type parquetType : parquetFields) { Schema fieldSchema = convertField(parquetType); if (parquetType.isRepetition(REPEATED)) { throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + parquetType); } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) { fields.add(new Schema.Field( parquetType.getName(), optional(fieldSchema), null, NULL_VALUE)); } else { // REQUIRED fields.add(new Schema.Field( parquetType.getName(), fieldSchema, null, (Object) null)); } } Schema schema = Schema.createRecord(name, null, null, false); schema.setFields(fields); return schema; }
/**
 * Converts an Avro schema into a Parquet field with REQUIRED repetition;
 * union handling downstream may relax this to OPTIONAL.
 */
private Type convertField(String fieldName, Schema schema) {
  return convertField(fieldName, schema, Type.Repetition.REQUIRED);
}
/**
 * Converts a top-level Avro record schema into a Parquet message type.
 *
 * @throws IllegalArgumentException if the schema is not a record — only
 *         records can form a Parquet message (the top-level group)
 */
public MessageType convert(Schema avroSchema) {
  if (avroSchema.getType() != Schema.Type.RECORD) {
    throw new IllegalArgumentException("Avro schema must be a record.");
  }
  String messageName = avroSchema.getFullName();
  return new MessageType(messageName, convertFields(avroSchema.getFields()));
}
// NOTE(review): duplicate of the writer example above; its closing brace is
// cut off at this chunk boundary, so it is left byte-identical here.
// Review points for whoever owns this snippet:
//  - WriteSupport<Pojo> is declared but the writer emits GenericRecord —
//    the raw AvroWriteSupport constructor hides the type mismatch.
//  - ParquetWriter is never closed on an exception path; prefer
//    try-with-resources.
//  - record.put(...) keys ("id", "age", "name", "place") presumably match
//    fields in /tmp/avro_format.json — TODO confirm against that schema.
{ final String schemaLocation = "/tmp/avro_format.json"; final Schema avroSchema = new Schema.Parser().parse(new File(schemaLocation)); final MessageType parquetSchema = new AvroSchemaConverter().convert(avroSchema); final WriteSupport<Pojo> writeSupport = new AvroWriteSupport(parquetSchema, avroSchema); final String parquetFile = "hdfs://<hostname>:<port>/tmp/parquet/data.parquet"; final Path path = new Path(parquetFile); ParquetWriter<GenericRecord> parquetWriter = new ParquetWriter(path, writeSupport, CompressionCodecName.SNAPPY, BLOCK_SIZE, PAGE_SIZE); final GenericRecord record = new GenericData.Record(avroSchema); record.put("id", 1); record.put("age", 10); record.put("name", "ABC"); record.put("place", "BCD"); parquetWriter.write(record); parquetWriter.close();
/** Converts a single Avro record field using the field's own name and schema. */
private Type convertField(Schema.Field field) {
  return convertField(field.name(), field.schema());
}
/**
 * Converts a Parquet message type into an Avro record schema. A MessageType
 * is just the root group, so the group-conversion path is reused directly.
 */
public Schema convert(MessageType parquetSchema) {
  String recordName = parquetSchema.getName();
  return convertFields(recordName, parquetSchema.getFields());
}
/**
 * Creates Avro write support for {@code avroSchema}, deriving the Parquet
 * schema with a converter configured from the given job configuration.
 */
private static <T> WriteSupport<T> writeSupport(Configuration conf,
                                                Schema avroSchema,
                                                GenericData model) {
  MessageType parquetSchema = new AvroSchemaConverter(conf).convert(avroSchema);
  return new AvroWriteSupport<T>(parquetSchema, avroSchema, model);
}
private List<Type> convertFields(List<Schema.Field> fields) { List<Type> types = new ArrayList<Type>(); for (Schema.Field field : fields) { if (field.schema().getType().equals(Schema.Type.NULL)) { continue; // Avro nulls are not encoded, unless they are null unions } types.add(convertField(field)); } return types; }
/** Converts a Parquet group into an Avro record schema named after the group. */
Schema convert(GroupType parquetSchema) {
  String recordName = parquetSchema.getName();
  return convertFields(recordName, parquetSchema.getFields());
}
/**
 * Creates Avro write support for {@code avroSchema} using a
 * default-configured schema converter.
 */
private static <T> WriteSupport<T> writeSupport(Schema avroSchema,
                                                GenericData model) {
  MessageType parquetSchema = new AvroSchemaConverter().convert(avroSchema);
  return new AvroWriteSupport<T>(parquetSchema, avroSchema, model);
}
private Type convertUnion(String fieldName, Schema schema, Type.Repetition repetition) { List<Schema> nonNullSchemas = new ArrayList<Schema>(schema.getTypes().size()); for (Schema childSchema : schema.getTypes()) { if (childSchema.getType().equals(Schema.Type.NULL)) { if (Type.Repetition.REQUIRED == repetition) { repetition = Type.Repetition.OPTIONAL; } } else { nonNullSchemas.add(childSchema); } } // If we only get a null and one other type then its a simple optional field // otherwise construct a union container switch (nonNullSchemas.size()) { case 0: throw new UnsupportedOperationException("Cannot convert Avro union of only nulls"); case 1: return convertField(fieldName, nonNullSchemas.get(0), repetition); default: // complex union type List<Type> unionTypes = new ArrayList<Type>(nonNullSchemas.size()); int index = 0; for (Schema childSchema : nonNullSchemas) { unionTypes.add( convertField("member" + index++, childSchema, Type.Repetition.OPTIONAL)); } return new GroupType(repetition, fieldName, unionTypes); } }
/**
 * Reads the Parquet schema from the file at {@code parquetFilePath} and
 * converts it into the equivalent Avro schema.
 */
public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
  MessageType parquetSchema = readSchema(configuration, parquetFilePath);
  return new AvroSchemaConverter().convert(parquetSchema);
}
/**
 * Prepares the read context: narrows the file schema to a requested Avro
 * projection (if configured) and records the user's read schema and
 * compatibility flag in the read-support metadata.
 */
@Override
public ReadContext init(Configuration configuration,
                        Map<String, String> keyValueMetaData,
                        MessageType fileSchema) {
  Map<String, String> metadata = new LinkedHashMap<String, String>();

  // Default to reading every column; narrow if a projection was requested.
  MessageType projection = fileSchema;
  String requestedProjectionString = configuration.get(AVRO_REQUESTED_PROJECTION);
  if (requestedProjectionString != null) {
    Schema requestedAvro = new Schema.Parser().parse(requestedProjectionString);
    projection = new AvroSchemaConverter(configuration).convert(requestedAvro);
  }

  // Pass the user-supplied Avro read schema through to prepareForRead.
  String avroReadSchema = configuration.get(AVRO_READ_SCHEMA);
  if (avroReadSchema != null) {
    metadata.put(AVRO_READ_SCHEMA_METADATA_KEY, avroReadSchema);
  }

  if (configuration.getBoolean(AVRO_COMPATIBILITY, AVRO_DEFAULT_COMPATIBILITY)) {
    metadata.put(AVRO_COMPATIBILITY, "true");
  }
  return new ReadContext(projection, metadata);
}
/**
 * Prepares the write context: lazily resolves the root Avro schema, its
 * Parquet translation, and the data model from the job configuration,
 * selects the list encoding, and embeds the Avro schema in file metadata.
 */
@Override
public WriteContext init(Configuration configuration) {
  // Lazy init: a schema/model injected before init() wins over the conf.
  if (rootAvroSchema == null) {
    this.rootAvroSchema = new Schema.Parser().parse(configuration.get(AVRO_SCHEMA));
    this.rootSchema = new AvroSchemaConverter().convert(rootAvroSchema);
  }
  if (model == null) {
    this.model = getDataModel(configuration);
  }

  // Legacy two-level lists vs. standard three-level lists.
  boolean writeOldListStructure = configuration.getBoolean(
      WRITE_OLD_LIST_STRUCTURE, WRITE_OLD_LIST_STRUCTURE_DEFAULT);
  this.listWriter = writeOldListStructure
      ? new TwoLevelListWriter()
      : new ThreeLevelListWriter();

  // Store the Avro schema in file metadata so readers can recover it.
  Map<String, String> extraMetaData = new HashMap<String, String>();
  extraMetaData.put(AvroReadSupport.AVRO_SCHEMA_METADATA_KEY, rootAvroSchema.toString());
  return new WriteContext(rootSchema, extraMetaData);
}
@Override public RecordMaterializer<T> prepareForRead( Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) { Map<String, String> metadata = readContext.getReadSupportMetadata(); MessageType parquetSchema = readContext.getRequestedSchema(); Schema avroSchema; if (metadata.get(AVRO_READ_SCHEMA_METADATA_KEY) != null) { // use the Avro read schema provided by the user avroSchema = new Schema.Parser().parse(metadata.get(AVRO_READ_SCHEMA_METADATA_KEY)); } else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) { // use the Avro schema from the file metadata if present avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY)); } else if (keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY) != null) { // use the Avro schema from the file metadata if present avroSchema = new Schema.Parser().parse(keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY)); } else { // default to converting the Parquet schema into an Avro schema avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema); } GenericData model = getDataModel(configuration); String compatEnabled = metadata.get(AvroReadSupport.AVRO_COMPATIBILITY); if (compatEnabled != null && Boolean.valueOf(compatEnabled)) { return newCompatMaterializer(parquetSchema, avroSchema, model); } return new AvroRecordMaterializer<T>(parquetSchema, avroSchema, model); }