/**
 * Builds a Parquet writer that serializes records with the given Avro schema and data model.
 *
 * @param schemaString the Avro schema as a JSON string; parsed here
 * @param dataModel    the Avro data model used to interpret records
 * @param out          destination the Parquet file is written to
 * @return a configured {@link ParquetWriter} for records of type {@code T}
 * @throws IOException if the writer cannot be created
 */
private static <T> ParquetWriter<T> createAvroParquetWriter(
    String schemaString,
    GenericData dataModel,
    OutputFile out) throws IOException {
  final Schema parsedSchema = new Schema.Parser().parse(schemaString);
  final AvroParquetWriter.Builder<T> builder = AvroParquetWriter.<T>builder(out);
  return builder
      .withSchema(parsedSchema)
      .withDataModel(dataModel)
      .build();
}
@Override public RecordMaterializer<GenericRecord> prepareForRead( Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext ) { // coercing this value to false by default here to be friendlier default behavior // see https://github.com/apache/incubator-druid/issues/5433#issuecomment-388539306 String jobProp = "parquet.avro.add-list-element-records"; Boolean explicitlySet = configuration.getBoolean(jobProp, false); if (!explicitlySet) { configuration.setBoolean(jobProp, false); } MessageType parquetSchema = readContext.getRequestedSchema(); Schema avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema); Class<? extends AvroDataSupplier> suppClass = configuration.getClass( AVRO_DATA_SUPPLIER, SpecificDataSupplier.class, AvroDataSupplier.class ); AvroDataSupplier supplier = ReflectionUtils.newInstance(suppClass, configuration); return new AvroRecordMaterializer<>(parquetSchema, avroSchema, supplier.get()); } }
/**
 * Computes the projection to read by intersecting the file schema with the
 * requested partial read schema.
 */
@Override
public ReadContext init(InitContext context) {
  final MessageType projection =
      getSchemaForRead(context.getFileSchema(), getPartialReadSchema(context));
  return new ReadContext(projection);
}
/**
 * Builds Avro write support for the given schema and data model, converting the
 * Avro schema to its Parquet message type using the supplied Hadoop configuration.
 */
private static <T> WriteSupport<T> writeSupport(Configuration conf, Schema avroSchema, GenericData model) {
  final MessageType parquetSchema = new AvroSchemaConverter(conf).convert(avroSchema);
  return new AvroWriteSupport<T>(parquetSchema, avroSchema, model);
}
/**
 * Creates an Avro-backed Parquet writer targeting the given output stream.
 *
 * Return type is parameterized as {@code ParquetWriter<GenericRecord>} (the builder already
 * was) instead of the raw {@code ParquetWriter} — private method, so callers are unaffected.
 *
 * @param context  the NiFi process context supplying writer configuration
 * @param flowFile the flow file whose attributes may influence configuration
 * @param out      stream the Parquet data is written to
 * @param schema   Avro schema for the records being written
 * @throws IOException if the writer cannot be built
 */
private ParquetWriter<GenericRecord> createParquetWriter(final ProcessContext context, final FlowFile flowFile,
                                                         final OutputStream out, final Schema schema)
        throws IOException {

    final NifiParquetOutputFile nifiParquetOutputFile = new NifiParquetOutputFile(out);

    final AvroParquetWriter.Builder<GenericRecord> parquetWriter = AvroParquetWriter
            .<GenericRecord>builder(nifiParquetOutputFile)
            .withSchema(schema);

    final Configuration conf = new Configuration();
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, true);
    // Prefer the modern 3-level Parquet list layout over the legacy Avro list encodings.
    conf.setBoolean("parquet.avro.add-list-element-records", false);
    conf.setBoolean("parquet.avro.write-old-list-structure", false);

    ParquetUtils.applyCommonConfig(parquetWriter, context, flowFile, conf, this);

    return parquetWriter.build();
}
/**
 * Opens a new Avro Parquet writer for the given log file path, using the schema
 * registered for the path's topic and the requested compression codec.
 *
 * @param logFilePath location (and topic) of the log file to write
 * @param codec       compression codec to use; may be null for none
 * @throws IOException if the writer cannot be created
 */
public AvroParquetFileWriter(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    final Path path = new Path(logFilePath.getLogFilePath());
    LOG.debug("Creating Brand new Writer for path {}", path);

    final CompressionCodecName codecName =
        CompressionCodecName.fromCompressionCodec(codec == null ? null : codec.getClass());
    topic = logFilePath.getTopic();

    // blockSize, pageSize, enableDictionary, and validation deliberately left at defaults.
    writer = AvroParquetWriter.builder(path)
        .withSchema(schemaRegistryClient.getSchema(topic))
        .withCompressionCodec(codecName)
        .build();
}
/**
 * Builds an Avro-backed Parquet reader for the given path and wraps it as an
 * HDFS record reader for NiFi.
 *
 * @throws IOException if the underlying Parquet reader cannot be opened
 */
@Override
public HDFSRecordReader createHDFSRecordReader(final ProcessContext context, final FlowFile flowFile,
                                               final Configuration conf, final Path path)
        throws IOException {
    final ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(path).withConf(conf).build();
    return new AvroParquetHDFSRecordReader(reader);
}
/**
 * Opens an Avro Parquet reader for the given log file path and prepares a datum
 * writer for the topic's registered schema.
 *
 * Fix: the {@code SpecificDatumWriter} was instantiated as a raw type; it is now
 * parameterized with {@code GenericRecord} to match the reader.
 *
 * NOTE(review): the {@code codec} parameter is unused here — presumably the codec is
 * discovered from the file itself; confirm before removing it from the signature.
 *
 * @param logFilePath location (and topic) of the log file to read
 * @param codec       compression codec hint (currently unused)
 * @throws IOException if the reader cannot be opened
 */
public AvroParquetFileReader(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    Path path = new Path(logFilePath.getLogFilePath());
    String topic = logFilePath.getTopic();
    Schema schema = schemaRegistryClient.getSchema(topic);
    reader = AvroParquetReader.<GenericRecord>builder(path).build();
    writer = new SpecificDatumWriter<GenericRecord>(schema);
    offset = logFilePath.getOffset();
}
/**
 * Picks the string converter matching how the data model materializes string
 * values for this schema: UTF8 for CharSequence, plain String, or a generic
 * "stringable" class constructed from the string form.
 */
private static Converter newStringConverter(Schema schema, GenericData model,
                                            ParentValueContainer parent) {
  final Class<?> stringType = getStringableClass(schema, model);
  // The checks are mutually exclusive, so order does not affect the outcome.
  if (stringType == CharSequence.class) {
    return new AvroConverters.FieldUTF8Converter(parent);
  }
  if (stringType == String.class) {
    return new FieldStringConverter(parent);
  }
  return new FieldStringableConverter(parent, stringType);
}
/**
 * Appends one entity by delegating to the wrapped Avro Parquet writer.
 *
 * @param entity the record to write
 * @throws IOException if the underlying writer fails
 */
@Override public void append(E entity) throws IOException { avroParquetWriter.write(entity); }
/**
 * Finishes the current record: fills unset fields from their defaults, then
 * hands the record to the parent container if one exists.
 */
@Override
public void end() {
  fillInDefaults();
  if (parent == null) {
    return;
  }
  parent.add(currentRecord);
}
@Override public void end() { fillInDefaults(); if (parent != null) { parent.add(currentRecord); } else { // this applies any converters needed for the root value rootContainer.add(currentRecord); } }
/** Forwards a primitive boolean value to the parent container unchanged. */
@Override final public void addBoolean(boolean value) { parent.addBoolean(value); } }
/** Forwards a primitive long value to the parent container unchanged. */
@Override final public void addLong(long value) { parent.addLong(value); } }
/** Forwards a primitive double value to the parent container unchanged. */
@Override final public void addDouble(double value) { parent.addDouble(value); } }
/**
 * Wraps a compat record materializer and casts it to the caller's record type.
 * The unchecked cast is confined to this factory and suppressed here.
 */
@SuppressWarnings("unchecked")
private static <T> RecordMaterializer<T> newCompatMaterializer(
    MessageType parquetSchema, Schema avroSchema, GenericData model) {
  final AvroCompatRecordMaterializer materializer =
      new AvroCompatRecordMaterializer(parquetSchema, avroSchema, model);
  return (RecordMaterializer<T>) materializer;
}
/**
 * Receives an INT32 value from Parquet and forwards it to the parent as a char.
 * NOTE(review): the narrowing (char) cast assumes the stored int fits in 16 bits
 * (i.e. the column actually encodes a character) — confirm with the writing side.
 */
@Override public void addInt(int value) { parent.addChar((char) value); } }
/**
 * Builds Avro write support for the given schema and data model, converting the
 * Avro schema to its Parquet message type with a default-configured converter.
 */
private static <T> WriteSupport<T> writeSupport(Schema avroSchema, GenericData model) {
  final MessageType parquetSchema = new AvroSchemaConverter().convert(avroSchema);
  return new AvroWriteSupport<T>(parquetSchema, avroSchema, model);
}
/**
 * Builds an Avro-backed Parquet writer for the given path, applying the common
 * NiFi Parquet configuration, and wraps it as an HDFS record writer.
 *
 * @throws IOException             if the underlying Parquet writer cannot be created
 * @throws SchemaNotFoundException if the record schema cannot be converted to Avro
 */
@Override
public HDFSRecordWriter createHDFSRecordWriter(final ProcessContext context, final FlowFile flowFile,
                                               final Configuration conf, final Path path,
                                               final RecordSchema schema)
        throws IOException, SchemaNotFoundException {

    final Schema avroSchema = AvroTypeUtil.extractAvroSchema(schema);

    final AvroParquetWriter.Builder<GenericRecord> builder = AvroParquetWriter
            .<GenericRecord>builder(path)
            .withSchema(avroSchema);

    ParquetUtils.applyCommonConfig(builder, context, flowFile, conf, this);

    return new AvroParquetHDFSRecordWriter(builder.build(), avroSchema);
}