org.apache.parquet.hadoop.api.ReadSupport Java code examples

// Excerpt from a larger method: footer, configuration, taskAttemptContext and fileSchema
// are defined outside the lines shown here.
// Read the key/value metadata stored in the file footer.
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
// Instantiate the ReadSupport class resolved from the configuration.
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
// Ask the ReadSupport for a ReadContext, passing the task configuration, the footer
// metadata converted by toSetMultiMap, and the file schema.
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
  taskAttemptContext.getConfiguration(), toSetMultiMap(fileMetadata), fileSchema));
// Keep the schema that the ReadContext says should be read.
this.requestedSchema = readContext.getRequestedSchema();

/**
 * Forwards materializer creation to {@code delegate}, passing every argument through unchanged.
 *
 * @param configuration    the configuration handed to the delegate
 * @param keyValueMetaData the key/value metadata handed to the delegate
 * @param fileSchema       the file schema handed to the delegate
 * @param readContext      the read context handed to the delegate
 * @return whatever the delegate's {@code prepareForRead} returns
 */
@Override
public RecordMaterializer<T> prepareForRead(
  Configuration configuration,
  Map<String, String> keyValueMetaData,
  MessageType fileSchema,
  ReadSupport.ReadContext readContext) {
 final RecordMaterializer<T> materializer =
   delegate.prepareForRead(configuration, keyValueMetaData, fileSchema, readContext);
 return materializer;
}

/**
 * Resolves the schema to read from a textual projection schema.
 * <p>
 * A {@code null} projection string means "no projection": the file schema is returned as is.
 * Otherwise the string is parsed with {@link MessageTypeParser#parseMessageType(String)} and the
 * result is handed to the {@code (MessageType, MessageType)} overload of this method.
 *
 * @param fileMessageType         the typed schema of the source
 * @param partialReadSchemaString the requested projection schema, may be {@code null}
 * @return the typed schema that should be used to read
 */
public static MessageType getSchemaForRead(MessageType fileMessageType, String partialReadSchemaString) {
 if (partialReadSchemaString != null) {
  return getSchemaForRead(fileMessageType, MessageTypeParser.parseMessageType(partialReadSchemaString));
 }
 return fileMessageType;
}

/**
 * Binds this record reader to an already opened {@link ParquetFileReader}.
 * <p>
 * The file schema and key/value metadata are taken from the reader's footer, a read context is
 * obtained from {@code readSupport}, and the requested schema from that context is pushed back
 * to the file reader. The total record count comes from {@code reader.getRecordCount()}.
 *
 * @param reader        the file reader to read from
 * @param configuration the configuration passed to the read support and used for the
 *                      {@code STRICT_TYPE_CHECKING} and {@code RECORD_FILTERING_ENABLED} flags
 * @throws IOException declared by the signature
 */
public void initialize(ParquetFileReader reader, Configuration configuration)
  throws IOException {
 this.reader = reader;

 // everything below is derived from the footer of the file being read
 final FileMetaData footerMetaData = reader.getFooter().getFileMetaData();
 this.fileSchema = footerMetaData.getSchema();
 final Map<String, String> keyValues = footerMetaData.getKeyValueMetaData();

 final InitContext initContext =
   new InitContext(configuration, toSetMultiMap(keyValues), fileSchema);
 final ReadSupport.ReadContext context = readSupport.init(initContext);

 this.columnIOFactory = new ColumnIOFactory(footerMetaData.getCreatedBy());
 this.requestedSchema = context.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter =
   readSupport.prepareForRead(configuration, keyValues, fileSchema, context);
 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);

 this.total = reader.getRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);

 // tell the file reader which columns will actually be consumed
 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

/**
 * Binds this record reader to an already opened {@link ParquetFileReader}.
 * <p>
 * Schema and key/value metadata are read from the reader's footer, {@code readSupport} supplies
 * the read context and the record converter, and the requested schema is set on the file reader.
 * The total record count comes from {@code reader.getFilteredRecordCount()}.
 *
 * @param reader        the file reader to read from
 * @param configuration the configuration passed to the read support and used for the
 *                      {@code STRICT_TYPE_CHECKING} and {@code RECORD_FILTERING_ENABLED} flags
 * @throws IOException declared by the signature
 */
public void initialize(ParquetFileReader reader, Configuration configuration)
  throws IOException {
 this.reader = reader;

 // pull schema and key/value pairs out of the footer
 final FileMetaData footer = reader.getFooter().getFileMetaData();
 this.fileSchema = footer.getSchema();
 final Map<String, String> footerKeyValues = footer.getKeyValueMetaData();

 // let the read support decide what to read
 final ReadSupport.ReadContext ctx = readSupport.init(
   new InitContext(configuration, toSetMultiMap(footerKeyValues), fileSchema));

 this.columnIOFactory = new ColumnIOFactory(footer.getCreatedBy());
 this.requestedSchema = ctx.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter =
   readSupport.prepareForRead(configuration, footerKeyValues, fileSchema, ctx);
 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);

 this.total = reader.getFilteredRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);

 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

/**
 * Initializes this record reader for one file, opening a new {@link ParquetFileReader} over the
 * given blocks.
 * <p>
 * The file schema is taken from {@code parquetFileMetadata} <em>before</em> the read support is
 * initialized, so that {@code ReadSupport.init} and {@code ReadSupport.prepareForRead} both see
 * the schema of the file being opened.
 *
 * @param parquetFileMetadata the file-level metadata (schema, key/value pairs, created-by)
 * @param file                the path of the file to read
 * @param blocks              the blocks to read; their row counts are added to {@code total}
 * @param configuration       the configuration passed to the read support and the file reader
 * @throws IOException declared by the signature
 */
public void initialize(FileMetaData parquetFileMetadata,
            Path file, List<BlockMetaData> blocks, Configuration configuration)
  throws IOException {
 // Must be assigned first: the InitContext below reads the fileSchema field, which would
 // otherwise still hold its previous (possibly null) value.
 this.fileSchema = parquetFileMetadata.getSchema();
 // initialize a ReadContext for this file
 Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
 ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
   configuration, toSetMultiMap(fileMetadata), fileSchema));
 this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.file = file;
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(
   configuration, fileMetadata, fileSchema, readContext);
 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 // only the requested columns are handed to the file reader
 List<ColumnDescriptor> columns = requestedSchema.getColumns();
 reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
 // the number of records to read is the sum of the row counts of the given blocks
 for (BlockMetaData block : blocks) {
  total += block.getRowCount();
 }
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 LOG.info("RecordReader initialized will read a total of " + total + " records.");
}

// Excerpt from a larger method: footer, configuration, taskAttemptContext and fileSchema
// are defined outside the lines shown here.
// Read the key/value metadata stored in the file footer.
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
// Instantiate the ReadSupport class resolved from the configuration.
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
// Ask the ReadSupport for a ReadContext, passing the task configuration, the footer
// metadata converted by toSetMultiMap, and the file schema.
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
  taskAttemptContext.getConfiguration(), toSetMultiMap(fileMetadata), fileSchema));
// Keep the schema that the ReadContext says should be read.
this.requestedSchema = readContext.getRequestedSchema();

/**
 * Hands the call straight to {@code delegate} and returns its record materializer.
 *
 * @param configuration    the configuration handed to the delegate
 * @param keyValueMetaData the key/value metadata handed to the delegate
 * @param fileSchema       the file schema handed to the delegate
 * @param readContext      the read context handed to the delegate
 * @return whatever the delegate's {@code prepareForRead} returns
 */
@Override
public RecordMaterializer<T> prepareForRead(
  Configuration configuration,
  Map<String, String> keyValueMetaData,
  MessageType fileSchema,
  ReadSupport.ReadContext readContext) {
 final RecordMaterializer<T> delegated =
   delegate.prepareForRead(configuration, keyValueMetaData, fileSchema, readContext);
 return delegated;
}

/**
 * Builds the schema to read from a projection schema given as text.
 * <p>
 * When no projection string is supplied ({@code null}) the file schema itself is returned.
 * Otherwise the string is parsed with {@link MessageTypeParser#parseMessageType(String)} and
 * passed, together with the file schema, to the {@code (MessageType, MessageType)} overload.
 *
 * @param fileMessageType         the typed schema of the source
 * @param partialReadSchemaString the requested projection schema, may be {@code null}
 * @return the typed schema that should be used to read
 */
public static MessageType getSchemaForRead(MessageType fileMessageType, String partialReadSchemaString) {
 final boolean hasProjection = partialReadSchemaString != null;
 if (!hasProjection) {
  return fileMessageType;
 }
 final MessageType projection = MessageTypeParser.parseMessageType(partialReadSchemaString);
 return getSchemaForRead(fileMessageType, projection);
}

/**
 * Initializes this record reader for one file, opening a new {@link ParquetFileReader} over the
 * given blocks.
 * <p>
 * The file schema is taken from {@code parquetFileMetadata} <em>before</em> the read support is
 * initialized, so that {@code ReadSupport.init} and {@code ReadSupport.prepareForRead} both see
 * the schema of the file being opened.
 *
 * @param parquetFileMetadata the file-level metadata (schema, key/value pairs, created-by)
 * @param file                the path of the file to read
 * @param blocks              the blocks to read; their row counts are added to {@code total}
 * @param configuration       the configuration passed to the read support and the file reader
 * @throws IOException declared by the signature
 */
public void initialize(FileMetaData parquetFileMetadata,
            Path file, List<BlockMetaData> blocks, Configuration configuration)
  throws IOException {
 // Must be assigned first: the InitContext below reads the fileSchema field, which would
 // otherwise still hold its previous (possibly null) value.
 this.fileSchema = parquetFileMetadata.getSchema();
 // initialize a ReadContext for this file
 Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
 ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
   configuration, toSetMultiMap(fileMetadata), fileSchema));
 this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.file = file;
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(
   configuration, fileMetadata, fileSchema, readContext);
 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 // only the requested columns are handed to the file reader
 List<ColumnDescriptor> columns = requestedSchema.getColumns();
 reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
 // the number of records to read is the sum of the row counts of the given blocks
 for (BlockMetaData block : blocks) {
  total += block.getRowCount();
 }
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 LOG.info("RecordReader initialized will read a total of " + total + " records.");
}

// Excerpt from a larger method: footer, configuration, taskAttemptContext and fileSchema
// are defined outside the lines shown here.
// Read the key/value metadata stored in the file footer.
Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData();
// Instantiate the ReadSupport class resolved from the configuration.
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
// Ask the ReadSupport for a ReadContext, passing the task configuration, the footer
// metadata converted by toSetMultiMap, and the file schema.
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
  taskAttemptContext.getConfiguration(), toSetMultiMap(fileMetadata), fileSchema));
// Keep the schema that the ReadContext says should be read.
this.requestedSchema = readContext.getRequestedSchema();

/**
 * Delegates to the wrapped read support, substituting the file schema with a Parquet schema
 * converted from {@code expectedSchema}.
 *
 * @param configuration   passed through to the wrapped read support
 * @param fileMetadata    passed through to the wrapped read support
 * @param fileMessageType only its name is used, as the name of the converted schema
 * @param readContext     passed through to the wrapped read support
 * @return the materializer produced by the wrapped read support
 */
@Override
public RecordMaterializer<T> prepareForRead(Configuration configuration,
                      Map<String, String> fileMetadata,
                      MessageType fileMessageType,
                      ReadContext readContext) {
 // The incoming type was created in init from the file's schema, but the wrapped ReadSupport
 // must see names that match the expected schema. Instead of renaming the file's schema, the
 // expected schema is converted to Parquet; this relies on files having been written with the
 // correct schema.
 // TODO: this breaks when columns are reordered.
 final MessageType convertedExpected =
   ParquetSchemaUtil.convert(expectedSchema, fileMessageType.getName());
 final RecordMaterializer<T> materializer =
   wrapped.prepareForRead(configuration, fileMetadata, convertedExpected, readContext);
 return materializer;
}

/**
 * Binds this record reader to an already opened {@link ParquetFileReader} using
 * {@link ParquetReadOptions}.
 * <p>
 * A Hadoop {@code Configuration} is prepared for the read support: the one carried by
 * {@code HadoopReadOptions} when available, otherwise a new one. Every property of
 * {@code options} is then set on it. The total record count comes from
 * {@code reader.getRecordCount()}.
 *
 * @param reader  the file reader to read from
 * @param options the read options supplying properties, the strict type checking flag and the
 *                record filter flag
 */
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
 // build the Configuration handed to the ReadSupport, carrying over custom properties
 Configuration readSupportConf = new Configuration();
 if (options instanceof HadoopReadOptions) {
  final HadoopReadOptions hadoopOptions = (HadoopReadOptions) options;
  readSupportConf = hadoopOptions.getConf();
 }
 for (String key : options.getPropertyNames()) {
  final String value = options.getProperty(key);
  readSupportConf.set(key, value);
 }

 this.reader = reader;

 // schema and key/value pairs come from the footer of the file
 final FileMetaData footerMetaData = reader.getFooter().getFileMetaData();
 this.fileSchema = footerMetaData.getSchema();
 final Map<String, String> keyValues = footerMetaData.getKeyValueMetaData();

 final InitContext initContext =
   new InitContext(readSupportConf, toSetMultiMap(keyValues), fileSchema);
 final ReadSupport.ReadContext context = readSupport.init(initContext);

 this.columnIOFactory = new ColumnIOFactory(footerMetaData.getCreatedBy());
 this.requestedSchema = context.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter =
   readSupport.prepareForRead(readSupportConf, keyValues, fileSchema, context);
 this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);

 this.total = reader.getRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
 this.filterRecords = options.useRecordFilter();

 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

/**
 * Forwards initialization to {@code delegate}.
 *
 * @param context the initialization context, passed through unchanged
 * @return the read context produced by the delegate
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
 final ReadSupport.ReadContext delegated = delegate.init(context);
 return delegated;
}

/**
 * Binds this record reader to an already opened {@link ParquetFileReader} using
 * {@link ParquetReadOptions}.
 * <p>
 * The {@code Configuration} given to the read support is the one carried by
 * {@code HadoopReadOptions} when {@code options} is of that type, otherwise a new one; every
 * property of {@code options} is set on it. The total record count comes from
 * {@code reader.getFilteredRecordCount()}.
 *
 * @param reader  the file reader to read from
 * @param options the read options supplying properties, the strict type checking flag and the
 *                record filter flag
 */
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
 // custom properties from the options are copied onto the ReadSupport's Configuration
 Configuration hadoopConf = new Configuration();
 if (options instanceof HadoopReadOptions) {
  hadoopConf = ((HadoopReadOptions) options).getConf();
 }
 for (String name : options.getPropertyNames()) {
  hadoopConf.set(name, options.getProperty(name));
 }

 this.reader = reader;

 // footer metadata supplies the schema and the key/value pairs
 final FileMetaData footer = reader.getFooter().getFileMetaData();
 this.fileSchema = footer.getSchema();
 final Map<String, String> footerKeyValues = footer.getKeyValueMetaData();

 final ReadSupport.ReadContext ctx = readSupport.init(
   new InitContext(hadoopConf, toSetMultiMap(footerKeyValues), fileSchema));

 this.columnIOFactory = new ColumnIOFactory(footer.getCreatedBy());
 this.requestedSchema = ctx.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter =
   readSupport.prepareForRead(hadoopConf, footerKeyValues, fileSchema, ctx);
 this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);

 this.total = reader.getFilteredRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
 this.filterRecords = options.useRecordFilter();

 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

/**
 * Passes the initialization context on to {@code delegate} and returns its result.
 *
 * @param context the initialization context, passed through unchanged
 * @return the read context produced by the delegate
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
 final ReadSupport.ReadContext result = delegate.init(context);
 return result;
}

/**
 * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
 * <p>
 * This implementation unpacks the context (configuration, merged key/value metadata, file
 * schema) and forwards the three values to the three-argument {@code init} overload.
 *
 * @param context the initialisation context
 * @return the readContext that defines how to read the file
 */
public ReadContext init(InitContext context) {
 final Configuration configuration = context.getConfiguration();
 final Map<String, String> mergedKeyValues = context.getMergedKeyValueMetaData();
 final MessageType schema = context.getFileSchema();
 return init(configuration, mergedKeyValues, schema);
}

/**
 * called in {@link org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} in the front end
 * <p>
 * The configuration, merged key/value metadata and file schema are read from the context and
 * handed to the three-argument {@code init} overload, whose result is returned.
 *
 * @param context the initialisation context
 * @return the readContext that defines how to read the file
 */
public ReadContext init(InitContext context) {
 final Configuration conf = context.getConfiguration();
 final Map<String, String> keyValues = context.getMergedKeyValueMetaData();
 final MessageType fileSchema = context.getFileSchema();
 final ReadContext readContext = init(conf, keyValues, fileSchema);
 return readContext;
}

/**
 * Plans input splits from already-read file footers.
 * <p>
 * Split size bounds are read from {@code mapred.max.split.size} and {@code mapred.min.split.size}
 * (the minimum is never below {@code getFormatMinSplitSize()}). The footers are merged into global
 * metadata, the read support is initialized from it, and the splits are computed by a
 * {@code ClientSideMetadataSplitStrategy}.
 *
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @throws ParquetDecodingException if either split size bound is negative
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
 final boolean strict = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
 final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));

 // both bounds must be non-negative
 final boolean boundsValid = maxSplitSize >= 0 && minSplitSize >= 0;
 if (!boundsValid) {
  throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize + "; minSplitSize = " + minSplitSize);
 }

 // merge all footers, then let the read support decide how the files are read
 final GlobalMetaData merged = ParquetFileWriter.getGlobalMetaData(footers, strict);
 final ReadContext context = getReadSupport(configuration).init(
   new InitContext(configuration, merged.getKeyValueMetaData(), merged.getSchema()));

 final ClientSideMetadataSplitStrategy strategy = new ClientSideMetadataSplitStrategy();
 return strategy.getSplits(configuration, footers, maxSplitSize, minSplitSize, context);
}

/**
 * Computes the input splits for a set of file footers.
 * <p>
 * The maximum and minimum split sizes come from {@code mapred.max.split.size} and
 * {@code mapred.min.split.size}, with {@code getFormatMinSplitSize()} as a lower bound for the
 * minimum. Global metadata merged from the footers is used to initialize the read support, and a
 * {@code ClientSideMetadataSplitStrategy} produces the splits.
 *
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @throws ParquetDecodingException if either split size bound is negative
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
 final boolean strictTypes = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
 final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));

 // reject negative bounds before doing any planning work
 if (!(maxSplitSize >= 0 && minSplitSize >= 0)) {
  throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize + "; minSplitSize = " + minSplitSize);
 }

 final GlobalMetaData global = ParquetFileWriter.getGlobalMetaData(footers, strictTypes);
 final ReadContext planningContext = getReadSupport(configuration).init(
   new InitContext(configuration, global.getKeyValueMetaData(), global.getSchema()));

 final ClientSideMetadataSplitStrategy splitStrategy = new ClientSideMetadataSplitStrategy();
 final List<ParquetInputSplit> splits =
   splitStrategy.getSplits(configuration, footers, maxSplitSize, minSplitSize, planningContext);
 return splits;
}

Javadoc

Abstraction used by the org.apache.parquet.hadoop.ParquetInputFormat to materialize records

Most used methods

init
called in org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
prepareForRead
called in org.apache.hadoop.mapreduce.RecordReader#initialize(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
getSchemaForRead

Popular in Java

Reactive rest calls using spring rest template
scheduleAtFixedRate (Timer)
getExternalFilesDir (Context)
findViewById (Activity)
String (java.lang)
System (java.lang)
Provides access to system-related information and resources including standard input and output. Ena
SocketTimeoutException (java.net)
This exception is thrown when a timeout expired on a socket read or accept operation.
MessageDigest (java.security)
Uses a one-way hash function to turn an arbitrary number of bytes into a fixed-length byte sequence.
HashSet (java.util)
HashSet is an implementation of a Set. All optional operations (adding and removing) are supported.
Pattern (java.util.regex)
Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches
Best plugins for Eclipse

How to use ReadSupport in org.apache.parquet.hadoop.api

Best Java code snippets using org.apache.parquet.hadoop.api.ReadSupport (Showing top 20 results out of 315)

How to use
ReadSupport
in
org.apache.parquet.hadoop.api