private ParquetReader<GenericRecord> initReader() throws IOException {
  Configuration configuration = getFs().getConf();
  if (this.schema != null) {
    AvroReadSupport.setAvroReadSchema(configuration, this.schema);
  }
  if (this.projection != null) {
    AvroReadSupport.setRequestedProjection(configuration, this.projection);
  }
  return AvroParquetReader.<GenericRecord>builder(getFilePath())
      .withConf(configuration)
      .build();
}
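// A hedged sketch (not part of the original snippet) of how a reader built by a method like
// initReader() is typically consumed: ParquetReader.read() returns null at end of input.
// process(...) below is a hypothetical consumer, not an existing method.
try (ParquetReader<GenericRecord> reader = initReader()) {
  GenericRecord record;
  while ((record = reader.read()) != null) {
    process(record);
  }
}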
/**
 * @param conf a configuration
 * @param file a file path
 * @param unboundRecordFilter an unbound record filter (from the old filter API)
 * @throws IOException if there is an error while reading
 * @deprecated will be removed in 2.0.0; use {@link #builder(InputFile)} instead.
 */
@Deprecated
public AvroParquetReader(Configuration conf, Path file, UnboundRecordFilter unboundRecordFilter)
    throws IOException {
  super(conf, file, new AvroReadSupport<T>(), unboundRecordFilter);
}
@Override
public RecordMaterializer<T> prepareForRead(
    Configuration configuration, Map<String, String> keyValueMetaData,
    MessageType fileSchema, ReadContext readContext) {
  Map<String, String> metadata = readContext.getReadSupportMetadata();
  MessageType parquetSchema = readContext.getRequestedSchema();
  Schema avroSchema;

  if (metadata.get(AVRO_READ_SCHEMA_METADATA_KEY) != null) {
    // use the Avro read schema provided by the user
    avroSchema = new Schema.Parser().parse(metadata.get(AVRO_READ_SCHEMA_METADATA_KEY));
  } else if (keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY) != null) {
    // use the Avro schema from the file metadata if present
    avroSchema = new Schema.Parser().parse(keyValueMetaData.get(AVRO_SCHEMA_METADATA_KEY));
  } else if (keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY) != null) {
    // use the Avro schema stored under the old metadata key, if present
    avroSchema = new Schema.Parser().parse(keyValueMetaData.get(OLD_AVRO_SCHEMA_METADATA_KEY));
  } else {
    // default to converting the Parquet schema into an Avro schema
    avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);
  }

  GenericData model = getDataModel(configuration);
  String compatEnabled = metadata.get(AvroReadSupport.AVRO_COMPATIBILITY);
  if (compatEnabled != null && Boolean.valueOf(compatEnabled)) {
    return newCompatMaterializer(parquetSchema, avroSchema, model);
  }
  return new AvroRecordMaterializer<T>(parquetSchema, avroSchema, model);
}
/**
 * Override the Avro schema to use for reading. If not set, the Avro schema used for
 * writing is used.
 * <p>
 * Differences between the read and write schemas are resolved using
 * <a href="http://avro.apache.org/docs/current/spec.html#Schema+Resolution">Avro's schema resolution rules</a>.
 *
 * @param job a job
 * @param avroReadSchema the requested schema
 * @see #setRequestedProjection(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
 * @see org.apache.parquet.avro.AvroParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
 */
public static void setAvroReadSchema(Job job, Schema avroReadSchema) {
  AvroReadSupport.setAvroReadSchema(ContextUtil.getConfiguration(job), avroReadSchema);
}
/**
 * Set the subset of columns to read (projection pushdown). Specified as an Avro
 * schema, the requested projection is converted into a Parquet schema for Parquet
 * column projection.
 * <p>
 * This is useful if the full schema is large and you only want to read a few
 * columns, since it saves time by not reading unused columns.
 * <p>
 * If a requested projection is set, then the Avro schema used for reading
 * must be compatible with the projection. For instance, if a column is not included
 * in the projection then it must either not be included or be optional in the read
 * schema. Use {@link #setAvroReadSchema(org.apache.hadoop.mapreduce.Job,
 * org.apache.avro.Schema)} to set a read schema, if needed.
 *
 * @param job a job
 * @param requestedProjection the requested projection schema
 * @see #setAvroReadSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
 * @see org.apache.parquet.avro.AvroParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)
 */
public static void setRequestedProjection(Job job, Schema requestedProjection) {
  AvroReadSupport.setRequestedProjection(ContextUtil.getConfiguration(job), requestedProjection);
}
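// A minimal sketch of combining the two hooks above on a MapReduce job. The "User" record and
// its fields are hypothetical; the projection must be a subset of the columns in the file schema.
Schema projection = SchemaBuilder.record("User").fields()
    .requiredLong("id")
    .optionalString("email")
    .endRecord();

Job job = Job.getInstance();
job.setInputFormatClass(AvroParquetInputFormat.class);
AvroParquetInputFormat.setRequestedProjection(job, projection);
AvroParquetInputFormat.setAvroReadSchema(job, projection);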
@Override
public ReadContext init(Configuration configuration,
                        Map<String, String> keyValueMetaData, MessageType fileSchema) {
  if (avroType != null) {
    setRequestedProjection(configuration, avroType.getSchema());
  }
  return super.init(configuration, keyValueMetaData, fileSchema);
}
private static void setConfigProperties(Configuration conf, Format format,
                                        Schema schema, Class<?> type) {
  GenericData model = DataModelUtil.getDataModelForType(type);
  if (Formats.AVRO.equals(format)) {
    setModel.invoke(conf, model.getClass());
    conf.set(AVRO_SCHEMA_INPUT_KEY, schema.toString());
  } else if (Formats.PARQUET.equals(format)) {
    // TODO: update to a version of Parquet with setAvroDataSupplier
    //AvroReadSupport.setAvroDataSupplier(conf,
    //    DataModelUtil.supplierClassFor(model));
    AvroReadSupport.setAvroReadSchema(conf, schema);
  }
}
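// Sketch of what the commented-out TODO above becomes once the project is on a parquet-avro
// release that ships AvroReadSupport.setAvroDataSupplier (an assumption about the upgrade,
// reusing the snippet's own DataModelUtil helper):
AvroReadSupport.setAvroDataSupplier(conf, DataModelUtil.supplierClassFor(model));
AvroReadSupport.setAvroReadSchema(conf, schema);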
try {
  Schema chukwaAvroSchema = ChukwaAvroSchema.getSchema();
  AvroReadSupport.setRequestedProjection(conf, chukwaAvroSchema);
  reader = new AvroParquetReader<GenericRecord>(conf, new Path(dataSinkFile));
conf.addResource(getFs(filePath.toString(), conf).getConf());
Schema readSchema = HoodieAvroUtils.getRecordKeySchema();
AvroReadSupport.setAvroReadSchema(conf, readSchema);
AvroReadSupport.setRequestedProjection(conf, readSchema);
Set<String> rowKeys = new HashSet<>();
try (ParquetReader reader = AvroParquetReader.builder(filePath).withConf(conf).build()) {
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
      "A reader may not be opened more than once - current state:%s", state);
  LOG.debug("Opening reader on path:{}", path);

  try {
    final Configuration conf = fileSystem.getConf();
    AvroReadSupport.setAvroReadSchema(conf, readerSchema);
    reader = new AvroParquetReader<E>(conf, fileSystem.makeQualified(path));
  } catch (IOException e) {
    throw new DatasetIOException("Unable to create reader path:" + path, e);
  }

  advance();
  state = ReaderWriterState.OPEN;
}
/**
 * @param file a file path
 * @throws IOException if there is an error while reading
 * @deprecated will be removed in 2.0.0; use {@link #builder(InputFile)} instead.
 */
@Deprecated
public AvroParquetReader(Path file) throws IOException {
  super(file, new AvroReadSupport<T>());
}
AvroReadSupport.setRequestedProjection(conf, projectionSchema);
AvroReadSupport.setAvroReadSchema(conf, readerSchema);
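// A hedged illustration of the difference between the two calls above. projectionSchema lists
// only the columns to fetch from Parquet; readerSchema may add fields for Avro schema
// resolution, but any field absent from the projection must be optional. The "Event" record
// and its fields are made up and would need to match the file's writer schema.
Schema projectionSchema = SchemaBuilder.record("Event").fields()
    .requiredString("id")
    .requiredLong("ts")
    .endRecord();
Schema readerSchema = SchemaBuilder.record("Event").fields()
    .requiredString("id")
    .requiredLong("ts")
    .optionalString("source")   // not in the projection, so it is optional in the read schema
    .endRecord();
AvroReadSupport.setRequestedProjection(conf, projectionSchema);
AvroReadSupport.setAvroReadSchema(conf, readerSchema);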
AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), new Schema.Parser().parse(schemaStr));
ParquetInputFormat.setReadSupportClass(job, AvroReadSupport.class);
/**
 * @param file a file path
 * @param unboundRecordFilter an unbound record filter (from the old filter API)
 * @throws IOException if there is an error while reading
 * @deprecated will be removed in 2.0.0; use {@link #builder(InputFile)} instead.
 */
@Deprecated
public AvroParquetReader(Path file, UnboundRecordFilter unboundRecordFilter) throws IOException {
  super(file, new AvroReadSupport<T>(), unboundRecordFilter);
}
AvroReadSupport.setRequestedProjection(configuration,
    AvroSchemaUtil.convert(expectedSchema, projection.getName()));

org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
    AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
    expectedSchema, ImmutableMap.of());

AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));
/**
 * @param conf a configuration
 * @param file a file path
 * @throws IOException if there is an error while reading
 * @deprecated will be removed in 2.0.0; use {@link #builder(InputFile)} instead.
 */
@Deprecated
public AvroParquetReader(Configuration conf, Path file) throws IOException {
  super(conf, file, new AvroReadSupport<T>());
}
"Error in finding the old file path at commit " + commitTime + " for fileId: " + fileId); } else { AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema()); BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null; try (ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath())
@Override
protected ReadSupport<T> getReadSupport() {
  if (isReflect) {
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
  } else {
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, enableCompatibility);
  }
  return new AvroReadSupport<T>(model);
}
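// For context, a hedged sketch of setting the same compatibility flag when building a reader
// directly; the file path is a placeholder. Reflect-based data models need the flag turned off,
// which is exactly what the isReflect branch above enforces.
Configuration conf = new Configuration();
conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
ParquetReader<GenericRecord> reader =
    AvroParquetReader.<GenericRecord>builder(new Path("/path/to/file.parquet"))
        .withConf(conf)
        .build();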
"Error in finding the old file path at commit " + commitTime + " for fileId: " + fileId); } else { AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema()); ParquetReader<IndexedRecord> reader = AvroParquetReader.builder(upsertHandle.getOldFilePath()) .withConf(getHadoopConf()).build();