@Override
public RecordMaterializer<GenericRecord> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadContext readContext
)
{
  // coercing this value to false by default here to be friendlier default behavior
  // see https://github.com/apache/incubator-druid/issues/5433#issuecomment-388539306
  String jobProp = "parquet.avro.add-list-element-records";
  Boolean explicitlySet = configuration.getBoolean(jobProp, false);
  if (!explicitlySet) {
    configuration.setBoolean(jobProp, false);
  }

  MessageType parquetSchema = readContext.getRequestedSchema();
  Schema avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);

  Class<? extends AvroDataSupplier> suppClass = configuration.getClass(
      AVRO_DATA_SUPPLIER,
      SpecificDataSupplier.class,
      AvroDataSupplier.class
  );
  AvroDataSupplier supplier = ReflectionUtils.newInstance(suppClass, configuration);
  return new AvroRecordMaterializer<>(parquetSchema, avroSchema, supplier.get());
}
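A minimal caller-side sketch, assuming parquet-avro's AvroParquetReader and a hypothetical input path, that pins the same property before reading:

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

public class ReadWithoutListElementRecords {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Mirror the coercion above: disable synthetic list-element records
    // so repeated groups come back as plain Avro arrays.
    conf.setBoolean("parquet.avro.add-list-element-records", false);

    Path path = new Path(args[0]); // hypothetical input file
    try (ParquetReader<GenericRecord> reader =
        AvroParquetReader.<GenericRecord>builder(path).withConf(conf).build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}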
@Override
public ReadContext init(Configuration configuration,
                        Map<String, String> keyValueMetaData,
                        MessageType fileSchema) {
  MessageType projection = fileSchema;
  Map<String, String> metadata = new LinkedHashMap<String, String>();

  String requestedProjectionString = configuration.get(AVRO_REQUESTED_PROJECTION);
  if (requestedProjectionString != null) {
    Schema avroRequestedProjection = new Schema.Parser().parse(requestedProjectionString);
    projection = new AvroSchemaConverter(configuration).convert(avroRequestedProjection);
  }

  String avroReadSchema = configuration.get(AVRO_READ_SCHEMA);
  if (avroReadSchema != null) {
    metadata.put(AVRO_READ_SCHEMA_METADATA_KEY, avroReadSchema);
  }

  if (configuration.getBoolean(AVRO_COMPATIBILITY, AVRO_DEFAULT_COMPATIBILITY)) {
    metadata.put(AVRO_COMPATIBILITY, "true");
  }

  return new ReadContext(projection, metadata);
}
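The AVRO_REQUESTED_PROJECTION and AVRO_READ_SCHEMA keys consulted above are normally populated through AvroReadSupport's static setters. A minimal sketch, assuming a hypothetical "User" record with a "name" field:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroReadSupport;

public class ProjectionSetup {
  public static Configuration configure() {
    Configuration conf = new Configuration();

    // Project only the columns we need; init() above converts this Avro
    // schema back into a Parquet MessageType for the scan.
    Schema projection = SchemaBuilder.record("User").fields()
        .requiredString("name")
        .endRecord();
    AvroReadSupport.setRequestedProjection(conf, projection);

    // Optionally pin the Avro schema used to materialize records
    // (surfaced to prepareForRead via AVRO_READ_SCHEMA_METADATA_KEY).
    AvroReadSupport.setAvroReadSchema(conf, projection);
    return conf;
  }
}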
public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
  return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath));
}
@Override
public WriteContext init(Configuration configuration) {
  if (rootAvroSchema == null) {
    this.rootAvroSchema = new Schema.Parser().parse(configuration.get(AVRO_SCHEMA));
    this.rootSchema = new AvroSchemaConverter().convert(rootAvroSchema);
  }

  if (model == null) {
    this.model = getDataModel(configuration);
  }

  boolean writeOldListStructure = configuration.getBoolean(
      WRITE_OLD_LIST_STRUCTURE, WRITE_OLD_LIST_STRUCTURE_DEFAULT);
  if (writeOldListStructure) {
    this.listWriter = new TwoLevelListWriter();
  } else {
    this.listWriter = new ThreeLevelListWriter();
  }

  Map<String, String> extraMetaData = new HashMap<String, String>();
  extraMetaData.put(AvroReadSupport.AVRO_SCHEMA_METADATA_KEY, rootAvroSchema.toString());
  return new WriteContext(rootSchema, extraMetaData);
}
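On the write side, the AVRO_SCHEMA and WRITE_OLD_LIST_STRUCTURE keys read by init() are usually set through AvroWriteSupport's static helpers. A minimal sketch, assuming a hypothetical record with one array field:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroWriteSupport;

public class WriteSetup {
  public static Configuration configure() {
    Configuration conf = new Configuration();
    Schema avroSchema = SchemaBuilder.record("Point").fields()
        .name("xs").type().array().items().intType().noDefault()
        .endRecord();

    // Populates AVRO_SCHEMA, which init() parses into rootAvroSchema.
    AvroWriteSupport.setSchema(conf, avroSchema);

    // false selects ThreeLevelListWriter above (the standard LIST layout).
    AvroWriteSupport.setWriteOldListStructure(conf, false);
    return conf;
  }
}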
@Override
public RecordMaterializer<T> prepareForRead(
    Configuration configuration, Map<String, String> keyValueMetaData,
    MessageType fileSchema, ReadContext readContext) {
  Map<String, String> metadata = readContext.getReadSupportMetadata();
  MessageType parquetSchema = readContext.getRequestedSchema();
  Schema avroSchema;

  if (metadata.get(AVRO_READ_SCHEMA_METADATA_KEY) != null) {
    // use the Avro read schema provided by the user
    avroSchema = new Schema.Parser().parse(metadata.get(AVRO_READ_SCHEMA_METADATA_KEY));
  } else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) {
    // use the Avro schema from the file metadata if present
    avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY));
  } else if (keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY) != null) {
    // use the Avro schema from the file metadata under the older property name
    avroSchema = new Schema.Parser().parse(keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY));
  } else {
    // default to converting the Parquet schema into an Avro schema
    avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);
  }

  GenericData model = getDataModel(configuration);
  String compatEnabled = metadata.get(AvroReadSupport.AVRO_COMPATIBILITY);
  if (compatEnabled != null && Boolean.valueOf(compatEnabled)) {
    return newCompatMaterializer(parquetSchema, avroSchema, model);
  }
  return new AvroRecordMaterializer<T>(parquetSchema, avroSchema, model);
}
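The model returned by getDataModel(configuration) is controlled by the AVRO_DATA_SUPPLIER key, which callers can switch with AvroReadSupport.setAvroDataSupplier. A small sketch using the reflect-based supplier:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.avro.ReflectDataSupplier;

Configuration conf = new Configuration();
// Makes getDataModel(configuration) hand back ReflectData instead of the
// default, so records materialize as reflect-based POJOs.
AvroReadSupport.setAvroDataSupplier(conf, ReflectDataSupplier.class);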
private static <T> WriteSupport<T> writeSupport(Configuration conf,
                                                Schema avroSchema,
                                                GenericData model) {
  return new AvroWriteSupport<T>(
      new AvroSchemaConverter(conf).convert(avroSchema), avroSchema, model);
}
private static <T> WriteSupport<T> writeSupport(Schema avroSchema,
                                                GenericData model) {
  return new AvroWriteSupport<T>(
      new AvroSchemaConverter().convert(avroSchema), avroSchema, model);
}
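These two overloads mirror what AvroParquetWriter's builder assembles internally. A minimal write sketch, assuming a hypothetical output path and the generic data model:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;

public class WriteExample {
  public static void main(String[] args) throws Exception {
    Schema schema = SchemaBuilder.record("Event").fields()
        .requiredString("id")
        .endRecord();

    // Equivalent in spirit to writeSupport(schema, GenericData.get()):
    // the builder converts the Avro schema and wires in the data model.
    try (ParquetWriter<GenericRecord> writer =
        AvroParquetWriter.<GenericRecord>builder(new Path("/tmp/events.parquet"))
            .withSchema(schema)
            .withDataModel(GenericData.get())
            .build()) {
      writer.write(new GenericRecordBuilder(schema).set("id", "e-1").build());
    }
  }
}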
public static Schema fromParquet(FileSystem fs, Path location) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), location);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
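A hedged usage sketch of the helper above (the file path is hypothetical):

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
// Falls back to converting the Parquet schema when no Avro schema
// was stored in the footer metadata.
Schema schema = fromParquet(fs, new Path("/data/users.parquet"));
System.out.println(schema.toString(true));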
return new AvroSchemaConverter(conf).convert(resSchema);
/**
 * Returns whether the given type is the element type of a list or is a
 * synthetic group with one field that is the element type. This is
 * determined by checking whether the type can be a synthetic group and by
 * checking whether a potential synthetic group matches the expected schema.
 * <p>
 * Unlike {@link AvroSchemaConverter#isElementType(Type, String)}, this
 * method never guesses because the expected schema is known.
 *
 * @param repeatedType a type that may be the element type
 * @param elementSchema the expected Schema for list elements
 * @return {@code true} if the repeatedType is the element type
 */
static boolean isElementType(Type repeatedType, Schema elementSchema) {
  if (repeatedType.isPrimitive() ||
      repeatedType.asGroupType().getFieldCount() > 1 ||
      repeatedType.asGroupType().getType(0).isRepetition(REPEATED)) {
    // The repeated type must be the element type because it is an invalid
    // synthetic wrapper; a wrapper must be a group with one optional or
    // required field.
    return true;
  } else if (elementSchema != null &&
      elementSchema.getType() == Schema.Type.RECORD) {
    Schema schemaFromRepeated = CONVERTER.convert(repeatedType.asGroupType());
    if (checkReaderWriterCompatibility(elementSchema, schemaFromRepeated)
        .getType() == COMPATIBLE) {
      return true;
    }
  }
  return false;
}
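To make the two list layouts this method disambiguates concrete, here is a sketch using parquet's MessageTypeParser; the isElementType results are shown as comments since the method is package-private:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Type;

// Standard 3-level list: the repeated group "list" is a synthetic wrapper.
MessageType threeLevel = MessageTypeParser.parseMessageType(
    "message m { required group points (LIST) {"
        + " repeated group list { required binary element (UTF8); } } }");

// Legacy 2-level list: the repeated type IS the element type.
MessageType twoLevel = MessageTypeParser.parseMessageType(
    "message m { required group points (LIST) { repeated binary element (UTF8); } }");

Type threeLevelRepeated = threeLevel.getType(0).asGroupType().getType(0);
Type twoLevelRepeated = twoLevel.getType(0).asGroupType().getType(0);

// isElementType(twoLevelRepeated, stringSchema)   -> true: a primitive
//   repeated type can only be the element itself.
// isElementType(threeLevelRepeated, stringSchema) -> false: the single-field
//   group is a valid synthetic wrapper for a non-record element schema.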
private void writeParquetFile(String filePath, List<String> rowKeys) throws Exception {
  // Write out a parquet file
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = new BloomFilter(1000, 0.0001);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport,
      CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    filter.add(rowKey);
  }
  writer.close();
}
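A hedged usage example for the test helper above (the path and keys are made up):

writeParquetFile("/tmp/hoodie/partition/test.parquet",
    java.util.Arrays.asList("key-001", "key-002", "key-003"));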
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R>
    newParquetStorageWriter(String commitTime, Path path, HoodieWriteConfig config,
        Schema schema, HoodieTable hoodieTable) throws IOException {
  BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(),
      config.getBloomFilterFPP());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);

  HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport,
      CompressionCodecName.GZIP, config.getParquetBlockSize(), config.getParquetPageSize(),
      config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(),
      config.getParquetCompressionRatio());

  return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema);
}
@SuppressWarnings({"unchecked", "deprecation"}) private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple) throws IOException, URISyntaxException, InterruptedException { Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema()); org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema); BloomFilter filter = new BloomFilter(1000, 0.0001); HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter); ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf()); List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil .generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100)); testRecords.forEach(s -> { try { writer.write(s); } catch (IOException e) { fail("IOException while writing test records as parquet" + e.toString()); } }); writer.close(); }
filter = new BloomFilter(10000, 0.0000001);
HoodieAvroWriteSupport writeSupport =
    new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
String commitTime = FSUtils.getCommitTime(filename);