@Override
public ReadContext init(InitContext context) {
  MessageType requestedProjection =
      getSchemaForRead(context.getFileSchema(), getPartialReadSchema(context));
  return new ReadContext(requestedProjection);
}
@Override
public WriteContext init(Configuration configuration) {
  return new WriteContext(type, metadata);
}
/**
 * Creates the ReadContext for the Parquet side, carrying the schema requested
 * during the init phase.
 *
 * @param context the init context, carrying the job configuration and the file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  Configuration configuration = context.getConfiguration();
  MessageType fileSchema = context.getFileSchema();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  Map<String, String> contextMetadata = new HashMap<String, String>();
  boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);

  if (columnNames != null) {
    List<String> columnNamesList = getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    List<TypeInfo> columnTypesList = getColumnTypes(columnTypes);

    MessageType tableSchema =
        getRequestedSchemaForIndexAccess(indexAccess, columnNamesList, columnTypesList, fileSchema);

    contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, tableSchema.toString());
    contextMetadata.put(PARQUET_COLUMN_INDEX_ACCESS, String.valueOf(indexAccess));
    this.hiveTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);

    return new ReadContext(
        getRequestedPrunedSchema(columnNamesList, tableSchema, configuration), contextMetadata);
  } else {
    contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
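// The same pattern can be reproduced in a standalone ReadSupport. Below is a minimal
// sketch, not taken from the Hive code above: the class name, the
// "parquet.example.projected.columns" property, and the use of Group records are all
// assumptions for illustration. init() keeps only the columns named in the
// Configuration; prepareForRead() materializes records against that pruned schema.
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

public class ProjectingGroupReadSupport extends ReadSupport<Group> {

  // Hypothetical configuration key holding a comma-separated column list.
  public static final String PROJECTED_COLUMNS = "parquet.example.projected.columns";

  @Override
  public ReadContext init(InitContext context) {
    MessageType fileSchema = context.getFileSchema();
    String columns = context.getConfiguration().get(PROJECTED_COLUMNS);
    if (columns == null) {
      // No projection requested: read the full file schema.
      return new ReadContext(fileSchema);
    }
    List<Type> kept = new ArrayList<>();
    for (String name : columns.split(",")) {
      if (fileSchema.containsField(name.trim())) {
        kept.add(fileSchema.getType(name.trim()));
      }
    }
    return new ReadContext(new MessageType(fileSchema.getName(), kept));
  }

  @Override
  public RecordMaterializer<Group> prepareForRead(Configuration configuration,
      Map<String, String> keyValueMetaData, MessageType fileSchema, ReadContext readContext) {
    // Materialize records against the pruned schema chosen in init().
    return new GroupRecordConverter(readContext.getRequestedSchema());
  }
}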
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
    taskAttemptContext.getConfiguration(), toSetMultiMap(fileMetadata), fileSchema));
this.requestedSchema = readContext.getRequestedSchema();
String sparkRequestedSchemaString =
    configuration.get(ParquetReadSupport$.MODULE$.SPARK_ROW_REQUESTED_SCHEMA());
/**
 * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
 *
 * @param context the initialisation context
 * @return the readContext that defines how to read the file
 */
public ReadContext init(InitContext context) {
  return init(context.getConfiguration(), context.getMergedKeyValueMetaData(), context.getFileSchema());
}
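// The Javadoc above notes that init() is driven from InputFormat#getSplits on the front
// end. A hedged sketch of how a MapReduce job typically registers a ReadSupport so that
// ParquetInputFormat can instantiate it and call init(): MyReadSupport is a placeholder
// class, and the job is otherwise left unconfigured (no mapper or output settings shown).
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class ParquetJobSetup {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "parquet-read");
    job.setInputFormatClass(ParquetInputFormat.class);
    // Tell ParquetInputFormat which ReadSupport to use; its init() decides the
    // requested projection before records are materialized.
    ParquetInputFormat.setReadSupportClass(job, MyReadSupport.class); // placeholder ReadSupport
    FileInputFormat.addInputPath(job, new Path(args[0]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}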
readContext = new DataWritableReadSupport().init(new InitContext(jobConf, null, fileMetaData.getSchema()));
schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
    .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();
final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
// ... (intervening code elided in this excerpt)
    oldSplit.getLocations(),
    filtedBlocks,
    readContext.getRequestedSchema().toString(),
    fileMetaData.getSchema().toString(),
    fileMetaData.getKeyValueMetaData(),
    readContext.getReadSupportMetadata());
return split;
} else {
/**
 * Creates the Hive read support used to materialize Parquet data as Hive records.
 *
 * @param configuration // unused
 * @param keyValueMetaData
 * @param fileSchema // unused
 * @param readContext containing the requested schema and the schema of the hive table
 * @return Record Materializer for Hive
 */
@Override
public RecordMaterializer<ArrayWritable> prepareForRead(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema,
    final org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  final Map<String, String> metadata = readContext.getReadSupportMetadata();
  if (metadata == null) {
    throw new IllegalStateException("ReadContext not initialized properly. " +
        "Don't know the Hive Schema.");
  }
  String key = HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION.varname;
  if (!metadata.containsKey(key)) {
    metadata.put(key, String.valueOf(HiveConf.getBoolVar(
        configuration, HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION)));
  }
  return new DataWritableRecordConverter(readContext.getRequestedSchema(), metadata, hiveTypeInfo);
}
}
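// Outside of Hive's InputFormat plumbing, the same init()/prepareForRead() pair is
// exercised whenever a ParquetReader is built around a ReadSupport. A minimal sketch
// using the GroupReadSupport example class that ships with parquet-hadoop; the file
// path is made up.
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ReadGroups {
  public static void main(String[] args) throws Exception {
    Path file = new Path("/tmp/example.parquet"); // hypothetical path
    // ParquetReader calls GroupReadSupport.init() to pick the requested schema and
    // prepareForRead() to obtain the RecordMaterializer used for each record.
    try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).build()) {
      Group record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}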
MessageType fullSchema = context.getFileSchema();
HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromConfiguration(context.getConfiguration());
ParseSpec parseSpec = config.getParser().getParseSpec();
@Override
public RecordMaterializer<GenericRecord> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadContext readContext
) {
  // coercing this value to false by default here to be friendlier default behavior
  // see https://github.com/apache/incubator-druid/issues/5433#issuecomment-388539306
  String jobProp = "parquet.avro.add-list-element-records";
  Boolean explicitlySet = configuration.getBoolean(jobProp, false);
  if (!explicitlySet) {
    configuration.setBoolean(jobProp, false);
  }

  MessageType parquetSchema = readContext.getRequestedSchema();
  Schema avroSchema = new AvroSchemaConverter(configuration).convert(parquetSchema);

  Class<? extends AvroDataSupplier> suppClass = configuration.getClass(
      AVRO_DATA_SUPPLIER,
      SpecificDataSupplier.class,
      AvroDataSupplier.class
  );
  AvroDataSupplier supplier = ReflectionUtils.newInstance(suppClass, configuration);
  return new AvroRecordMaterializer<>(parquetSchema, avroSchema, supplier.get());
}
}
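// The materializer returned above yields Avro GenericRecords. For comparison, the stock
// parquet-avro reader does the same conversion end to end; a hedged sketch, with the
// file path being an assumption.
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;

public class ReadAvro {
  public static void main(String[] args) throws Exception {
    Path file = new Path("/tmp/example.parquet"); // hypothetical path
    // AvroParquetReader wires in AvroReadSupport, whose prepareForRead converts the
    // Parquet schema to an Avro schema much like the AvroSchemaConverter call above.
    try (ParquetReader<GenericRecord> reader =
             AvroParquetReader.<GenericRecord>builder(file).build()) {
      GenericRecord record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}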
@Override
public RecordMaterializer<T> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadSupport.ReadContext readContext) {
  return delegate.prepareForRead(configuration, keyValueMetaData, fileSchema, readContext);
}
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
  delegate.prepareForWrite(recordConsumer);
}
@Override
public WriteContext init(final Configuration configuration) {
  schema = getSchema(configuration);
  return new WriteContext(schema, new HashMap<String, String>());
}
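// The write side mirrors the read side: init(Configuration) declares the file schema and
// key/value metadata, prepareForWrite receives the RecordConsumer, and write() emits one
// record at a time. A minimal sketch of a custom WriteSupport; the schema and the
// String[] record type are illustrative, not taken from the snippet above.
import java.util.HashMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class PairWriteSupport extends WriteSupport<String[]> {

  // Illustrative schema: an int32 id column and a UTF8 name column.
  private static final MessageType SCHEMA = MessageTypeParser.parseMessageType(
      "message pair { required int32 id; required binary name (UTF8); }");

  private RecordConsumer recordConsumer;

  @Override
  public WriteContext init(Configuration configuration) {
    return new WriteContext(SCHEMA, new HashMap<String, String>());
  }

  @Override
  public void prepareForWrite(RecordConsumer recordConsumer) {
    this.recordConsumer = recordConsumer;
  }

  @Override
  public void write(String[] record) {
    // record[0] is the id, record[1] is the name.
    recordConsumer.startMessage();
    recordConsumer.startField("id", 0);
    recordConsumer.addInteger(Integer.parseInt(record[0]));
    recordConsumer.endField("id", 0);
    recordConsumer.startField("name", 1);
    recordConsumer.addBinary(Binary.fromString(record[1]));
    recordConsumer.endField("name", 1);
    recordConsumer.endMessage();
  }
}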
@Override
public ReadContext init(final InitContext context) {
  return new ReadContext(context.getFileSchema());
}
}
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
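// Returning context.getFileSchema() unchanged, as above, requests every column. When only
// some columns are needed, the same one-liner can pass a partial schema string to the
// static ReadSupport.getSchemaForRead helper, which projects the file schema onto it.
// Sketch of such an init() inside a ReadSupport subclass; the column names are
// illustrative and assume the file actually contains an int32 id and a binary name.
@Override
public ReadContext init(InitContext context) {
  String partial = "message projected { required int32 id; required binary name (UTF8); }";
  return new ReadContext(getSchemaForRead(context.getFileSchema(), partial));
}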