org.apache.parquet.io.ColumnIOFactory java code examples

/**
 * Reads a Decision Tree model from a parquet file.
 *
 * <p>Every row of every row group is converted to a {@code NodeData} and stored by node id
 * in a sorted map, which is then handed to {@code buildDecisionTreeModel}. If an
 * {@link IOException} occurs, a message and the stack trace are printed and {@code null}
 * is returned.
 *
 * @param pathToMdl Path to model.
 * @return Model built from the nodes read, or {@code null} if the file could not be read.
 */
private static Model loadDecisionTreeModel(String pathToMdl) {
  try {
    final HadoopInputFile modelFile = HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(modelFile)) {
      final MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema();
      final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(fileSchema);
      final Map<Integer, NodeData> nodesById = new TreeMap<>();

      // Walk the file one row group at a time until the reader reports no more groups.
      PageReadStore rowGroup = reader.readNextRowGroup();
      while (rowGroup != null) {
        final long rowCnt = rowGroup.getRowCount();
        final RecordReader rowReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(fileSchema));
        int rowIdx = 0;
        while (rowIdx < rowCnt) {
          final SimpleGroup row = (SimpleGroup)rowReader.read();
          final NodeData node = extractNodeDataFromParquetRow(row);
          nodesById.put(node.id, node);
          rowIdx++;
        }
        rowGroup = reader.readNextRowGroup();
      }
      return buildDecisionTreeModel(nodesById);
    }
  }
  catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return null;
}

/**
 * Builds the column IO for a requested schema against a file schema by delegating to the
 * three-argument overload with {@code true} as the last argument.
 *
 * @param requestedSchema the requestedSchema we want to read/write
 * @param fileSchema the file schema (when reading it can be different from the requested schema)
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType requestedSchema, MessageType fileSchema) {
 final MessageColumnIO columnIO = getColumnIO(requestedSchema, fileSchema, true);
 return columnIO;
}

/**
 * Prepares this record reader to read from an already opened parquet file.
 *
 * <p>Takes the schema and key/value metadata from the file footer, lets the read support
 * derive the requested schema and the record converter, reads the strict-type-checking and
 * record-filtering flags from the configuration (both default to {@code true}), and tells
 * the file reader which schema is requested.
 *
 * @param reader the open file reader to read from
 * @param configuration the configuration passed to the read support and used for the flags
 * @throws IOException declared by the signature
 */
public void initialize(ParquetFileReader reader, Configuration configuration)
  throws IOException {
 this.reader = reader;

 // Footer-derived state: file schema and user key/value metadata.
 final FileMetaData footerMetaData = reader.getFooter().getFileMetaData();
 this.fileSchema = footerMetaData.getSchema();
 final Map<String, String> keyValueMetaData = footerMetaData.getKeyValueMetaData();

 // Let the read support decide which schema is actually requested.
 final InitContext initContext = new InitContext(configuration, toSetMultiMap(keyValueMetaData), fileSchema);
 final ReadSupport.ReadContext readContext = readSupport.init(initContext);
 this.columnIOFactory = new ColumnIOFactory(footerMetaData.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(configuration, keyValueMetaData, fileSchema, readContext);

 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 this.total = reader.getRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);

 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

/**
 * Two-schema convenience overload: forwards both schemas to the three-argument
 * {@code getColumnIO} and always passes {@code true} for its last argument.
 *
 * @param requestedSchema the requestedSchema we want to read/write
 * @param fileSchema the file schema (when reading it can be different from the requested schema)
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType requestedSchema, MessageType fileSchema) {
 final boolean lastArg = true;
 return getColumnIO(requestedSchema, fileSchema, lastArg);
}

/**
 * Prepares this record reader to read from an already opened parquet file, using the
 * reader's filtered record count as the total.
 *
 * <p>Takes the schema and key/value metadata from the file footer, lets the read support
 * derive the requested schema and the record converter, reads the strict-type-checking and
 * record-filtering flags from the configuration (both default to {@code true}), and tells
 * the file reader which schema is requested.
 *
 * @param reader the open file reader to read from
 * @param configuration the configuration passed to the read support and used for the flags
 * @throws IOException declared by the signature
 */
public void initialize(ParquetFileReader reader, Configuration configuration)
  throws IOException {
 this.reader = reader;

 final FileMetaData footer = reader.getFooter().getFileMetaData();
 this.fileSchema = footer.getSchema();
 final Map<String, String> footerKeyValues = footer.getKeyValueMetaData();

 // The read support sees the configuration, the footer key/values and the file schema.
 final InitContext ctx = new InitContext(configuration, toSetMultiMap(footerKeyValues), fileSchema);
 final ReadSupport.ReadContext readContext = readSupport.init(ctx);
 this.columnIOFactory = new ColumnIOFactory(footer.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(configuration, footerKeyValues, fileSchema, readContext);

 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 // Total comes from the filtered count reported by the reader.
 this.total = reader.getFilteredRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);

 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

// Build the column IO structure for `schema` (declared outside this excerpt).
final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
// Sorted outer map whose values are sorted maps of NodeData;
// presumably tree id -> (node id -> node), judging by the name — TODO confirm.
final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();

/**
 * Single-schema convenience overload: uses {@code schema} as both the requested schema and
 * the file schema of the two-argument {@code getColumnIO}.
 *
 * @param schema the schema we want to read/write
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType schema) {
 final MessageColumnIO columnIO = this.getColumnIO(schema, schema);
 return columnIO;
}

/**
 * Initializes this record reader for the given file and row groups.
 *
 * <p>Takes the file schema from the supplied footer metadata, lets the read support derive
 * the requested schema and the record converter, opens a {@code ParquetFileReader} over the
 * requested columns of the given blocks, and adds the row counts of those blocks to
 * {@code total}.
 *
 * @param parquetFileMetadata footer metadata of the file (schema, key/value metadata, created-by)
 * @param file path of the file to read
 * @param blocks row groups to read
 * @param configuration configuration passed to the read support and the file reader
 * @throws IOException declared by the signature
 */
public void initialize(FileMetaData parquetFileMetadata,
            Path file, List<BlockMetaData> blocks, Configuration configuration)
  throws IOException {
 // The file schema must be assigned before the InitContext is built: previously the
 // context received whatever the field held before this call (typically null), while
 // prepareForRead further down received the real footer schema.
 this.fileSchema = parquetFileMetadata.getSchema();
 this.file = file;
 // initialize a ReadContext for this file
 Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
 ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
   configuration, toSetMultiMap(fileMetadata), fileSchema));
 this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(
   configuration, fileMetadata, fileSchema, readContext);
 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 // Only the columns of the requested schema are handed to the file reader.
 List<ColumnDescriptor> columns = requestedSchema.getColumns();
 reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
 for (BlockMetaData block : blocks) {
  total += block.getRowCount();
 }
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 LOG.info("RecordReader initialized will read a total of " + total + " records.");
}

// NOTE(review): this excerpt appears to be two overlapping snippets pasted together —
// `schema` and `colIO` are each declared twice, and the body of the `while` loop is not
// part of the excerpt. `r` is a reader declared outside this excerpt.
PageReadStore pagesMetaData;
// File schema taken from the footer of the file behind `r`.
final MessageType schema = r.getFooter().getFileMetaData().getSchema();
final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
// Holds the row group currently being processed; assigned in the loop condition below.
PageReadStore pages;
final MessageType schema = r.getFooter().getFileMetaData().getSchema();
final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
// Sorted outer map whose values are sorted maps of NodeData;
// presumably tree id -> (node id -> node), judging by the name — TODO confirm.
final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
// Iterate row groups until readNextRowGroup() returns null.
while (null != (pages = r.readNextRowGroup())) {

/**
 * Builds the column IO for a single schema by passing it as both arguments of the
 * two-argument {@code getColumnIO}.
 *
 * @param schema the schema we want to read/write
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType schema) {
 final MessageType requested = schema;
 final MessageType inFile = schema;
 return this.getColumnIO(requested, inFile);
}

/**
 * Prepares this record reader to read from an already opened parquet file, driven by
 * {@code ParquetReadOptions}.
 *
 * <p>A Hadoop {@code Configuration} is obtained (from the options when they are
 * {@code HadoopReadOptions}, otherwise a fresh one) and every property of the options is
 * set on it before it is passed to the read support. The schema and key/value metadata come
 * from the file footer; the strict-type-checking and record-filter flags come from the
 * options.
 *
 * <p>NOTE(review): for {@code HadoopReadOptions} the properties are set on the object
 * returned by {@code getConf()}; whether that is a shared instance is not visible here.
 *
 * @param reader the open file reader to read from
 * @param options read options supplying properties and flags
 */
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
 // A fresh Configuration is always created; it is used unless the options carry their own.
 final Configuration freshConf = new Configuration();
 final Configuration conf = (options instanceof HadoopReadOptions)
   ? ((HadoopReadOptions) options).getConf()
   : freshConf;
 // copy custom configuration to the Configuration passed to the ReadSupport
 for (String propertyName : options.getPropertyNames()) {
  conf.set(propertyName, options.getProperty(propertyName));
 }

 this.reader = reader;
 final FileMetaData footerMetaData = reader.getFooter().getFileMetaData();
 this.fileSchema = footerMetaData.getSchema();
 final Map<String, String> keyValueMetaData = footerMetaData.getKeyValueMetaData();

 // initialize a ReadContext for this file
 final InitContext initContext = new InitContext(conf, toSetMultiMap(keyValueMetaData), fileSchema);
 final ReadSupport.ReadContext readContext = readSupport.init(initContext);
 this.columnIOFactory = new ColumnIOFactory(footerMetaData.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(conf, keyValueMetaData, fileSchema, readContext);

 this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
 this.total = reader.getRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
 this.filterRecords = options.useRecordFilter();

 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

/**
 * Reads a linear SVM model from a parquet file.
 *
 * <p>Every row of every row group is visited; the intercept and coefficients of the last
 * row read are the ones used. If an {@link IOException} occurs, a message and the stack
 * trace are printed and the model is built from whatever was read up to that point
 * (initially {@code null} coefficients and a zero intercept).
 *
 * @param pathToMdl Path to model.
 * @return SVM linear classification model built from the values read.
 */
private static Model loadLinearSVMModel(String pathToMdl) {
  double intercept = 0;
  Vector weights = null;
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    final MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(fileSchema);

    // Pull row groups until the reader has none left.
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      final long rowCnt = rowGroup.getRowCount();
      final RecordReader rowReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(fileSchema));
      int rowIdx = 0;
      while (rowIdx < rowCnt) {
        final SimpleGroup row = (SimpleGroup)rowReader.read();
        intercept = readSVMInterceptor(row);
        weights = readSVMCoefficients(row);
        rowIdx++;
      }
      rowGroup = reader.readNextRowGroup();
    }
  }
  catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new SVMLinearClassificationModel(weights, intercept);
}

// Report how long reading the block took and how many rows it holds (INFO level only).
if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
// Build the column IO for the requested schema against the file schema, passing the
// strict-type-checking flag through.
MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
// The configured filter is applied only when record filtering is on; otherwise the
// no-op filter is passed.
recordReader = columnIO.getRecordReader(pages, recordConverter,
  filterRecords ? filter : FilterCompat.NOOP);

/**
 * Prepares this record reader to read from an already opened parquet file, driven by
 * {@code ParquetReadOptions} and using the reader's filtered record count as the total.
 *
 * <p>A Hadoop {@code Configuration} is obtained (from the options when they are
 * {@code HadoopReadOptions}, otherwise a fresh one) and every property of the options is
 * set on it before it is passed to the read support. The schema and key/value metadata come
 * from the file footer; the strict-type-checking and record-filter flags come from the
 * options.
 *
 * <p>NOTE(review): for {@code HadoopReadOptions} the properties are set on the object
 * returned by {@code getConf()}; whether that is a shared instance is not visible here.
 *
 * @param reader the open file reader to read from
 * @param options read options supplying properties and flags
 */
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
 // A fresh Configuration is always created; the options' own one replaces it when present.
 final Configuration defaultConf = new Configuration();
 final boolean hadoopBacked = options instanceof HadoopReadOptions;
 final Configuration conf = hadoopBacked ? ((HadoopReadOptions) options).getConf() : defaultConf;
 // copy custom configuration to the Configuration passed to the ReadSupport
 for (String key : options.getPropertyNames()) {
  conf.set(key, options.getProperty(key));
 }

 this.reader = reader;
 final FileMetaData footer = reader.getFooter().getFileMetaData();
 this.fileSchema = footer.getSchema();
 final Map<String, String> footerKeyValues = footer.getKeyValueMetaData();

 // initialize a ReadContext for this file
 final InitContext ctx = new InitContext(conf, toSetMultiMap(footerKeyValues), fileSchema);
 final ReadSupport.ReadContext readContext = readSupport.init(ctx);
 this.columnIOFactory = new ColumnIOFactory(footer.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(conf, footerKeyValues, fileSchema, readContext);

 this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
 // Total comes from the filtered count reported by the reader.
 this.total = reader.getFilteredRecordCount();
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
 this.filterRecords = options.useRecordFilter();

 reader.setRequestedSchema(requestedSchema);
 LOG.info("RecordReader initialized will read a total of {} records.", total);
}

/**
 * Reads a linear regression model from a parquet file.
 *
 * <p>Every row of every row group is visited; the intercept and coefficients of the last
 * row read are the ones used. If an {@link IOException} occurs, a message and the stack
 * trace are printed and the model is built from whatever was read up to that point
 * (initially {@code null} coefficients and a zero intercept).
 *
 * @param pathToMdl Path to model.
 * @return Linear regression model built from the values read.
 */
private static Model loadLinRegModel(String pathToMdl) {
  double intercept = 0;
  Vector weights = null;
  try {
    final HadoopInputFile modelFile = HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(modelFile)) {
      final MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema();
      final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(fileSchema);

      for (PageReadStore rowGroup = reader.readNextRowGroup(); rowGroup != null; rowGroup = reader.readNextRowGroup()) {
        final long rowCnt = rowGroup.getRowCount();
        final RecordReader rowReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(fileSchema));
        for (int rowIdx = 0; rowIdx < rowCnt; rowIdx++) {
          final SimpleGroup row = (SimpleGroup)rowReader.read();
          intercept = readLinRegInterceptor(row);
          weights = readLinRegCoefficients(row);
        }
      }
    }
  }
  catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new LinearRegressionModel(weights, intercept);
}

// INFO-level timing report for the block that was just read, with its row count.
if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
// Column IO for the requested schema against the file schema, with the
// strict-type-checking flag passed through.
MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
// Use the configured filter only when record filtering is enabled; otherwise use the
// no-op filter.
recordReader = columnIO.getRecordReader(pages, recordConverter,
  filterRecords ? filter : FilterCompat.NOOP);

/**
 * Initializes this record reader for the given file and row groups.
 *
 * <p>Takes the file schema from the supplied footer metadata, lets the read support derive
 * the requested schema and the record converter, opens a {@code ParquetFileReader} over the
 * requested columns of the given blocks, and adds the row counts of those blocks to
 * {@code total}.
 *
 * @param parquetFileMetadata footer metadata of the file (schema, key/value metadata, created-by)
 * @param file path of the file to read
 * @param blocks row groups to read
 * @param configuration configuration passed to the read support and the file reader
 * @throws IOException declared by the signature
 */
public void initialize(FileMetaData parquetFileMetadata,
            Path file, List<BlockMetaData> blocks, Configuration configuration)
  throws IOException {
 // Assign the file schema first so the InitContext below sees the schema of this file
 // rather than whatever the field held before the call (typically null).
 this.fileSchema = parquetFileMetadata.getSchema();
 this.file = file;
 // initialize a ReadContext for this file
 Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
 ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
   configuration, toSetMultiMap(fileMetadata), fileSchema));
 this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
 this.requestedSchema = readContext.getRequestedSchema();
 this.columnCount = requestedSchema.getPaths().size();
 this.recordConverter = readSupport.prepareForRead(
   configuration, fileMetadata, fileSchema, readContext);
 this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
 // Only the columns of the requested schema are handed to the file reader.
 List<ColumnDescriptor> columns = requestedSchema.getColumns();
 reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
 for (BlockMetaData block : blocks) {
  total += block.getRowCount();
 }
 this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
 LOG.info("RecordReader initialized will read a total of " + total + " records.");
}

/**
 * Reads a logistic regression model from a parquet file.
 *
 * <p>Every row of every row group is visited; the intercept and coefficients of the last
 * row read are the ones used. If an {@link IOException} occurs, a message and the stack
 * trace are printed and the model is built from whatever was read up to that point
 * (initially {@code null} coefficients and a zero intercept).
 *
 * @param pathToMdl Path to model.
 * @return Logistic regression model built from the values read.
 */
private static Model loadLogRegModel(String pathToMdl) {
  double intercept = 0;
  Vector weights = null;
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    final MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(fileSchema);

    for (PageReadStore rowGroup = reader.readNextRowGroup(); rowGroup != null; rowGroup = reader.readNextRowGroup()) {
      final RecordReader rowReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(fileSchema));
      final long rowCnt = rowGroup.getRowCount();
      int rowIdx = 0;
      while (rowIdx < rowCnt) {
        final SimpleGroup row = (SimpleGroup)rowReader.read();
        intercept = readInterceptor(row);
        weights = readCoefficients(row);
        rowIdx++;
      }
    }
  }
  catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new LogisticRegressionModel(weights, intercept);
}

// The Log.INFO / Log.DEBUG flags guard the string concatenation in the two log calls.
if (Log.INFO) LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
// Column IO for the requested schema against the file schema, with the
// strict-type-checking flag passed through.
MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
// Here the filter is always passed as-is.
recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
// Remember when assembly of the current block started (wall-clock milliseconds).
startedAssemblingCurrentBlockAt = System.currentTimeMillis();

/**
 * Load K-Means model.
 *
 * <p>Each row of the parquet file contributes one cluster center. Centers are appended in
 * read order across all row groups, so a file with several row groups yields all of its
 * centers rather than only those of the last group. If an {@link IOException} occurs, a
 * message and the stack trace are printed and the model is built from whatever was read up
 * to that point.
 *
 * @param pathToMdl Path to model.
 * @return K-Means model over the centers read, using Euclidean distance; the centers array
 *     is {@code null} if no row group could be read.
 */
private static Model loadKMeansModel(String pathToMdl) {
  Vector[] centers = null;
  try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
    while (null != (pages = r.readNextRowGroup())) {
      final int rows = (int)pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

      // Grow the array instead of replacing it, so centers from earlier row groups are kept.
      final int offset = centers == null ? 0 : centers.length;
      final Vector[] grown = new DenseVector[offset + rows];
      if (centers != null) {
        System.arraycopy(centers, 0, grown, 0, offset);
      }
      centers = grown;

      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        // Field 0 of the row is presumably the cluster index (unused here; centers are kept
        // in read order) — TODO confirm against the writer of the file.
        Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);
        final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);
        final Vector center = new DenseVector(amountOfCoefficients);
        for (int j = 0; j < amountOfCoefficients; j++) {
          double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
          center.set(j, coefficient);
        }
        centers[offset + i] = center;
      }
    }
  }
  catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new KMeansModel(centers, new EuclideanDistance());
}

Javadoc

Factory constructing the ColumnIO structure from the schema

Most used methods

Popular in Java

Making http post requests using okhttp
scheduleAtFixedRate (ScheduledExecutorService)
notifyDataSetChanged (ArrayAdapter)
findViewById (Activity)
ConnectException (java.net)
A ConnectException is thrown if a connection cannot be established to a remote host on a specific port.
Arrays (java.util)
This class contains various methods for manipulating arrays (such as sorting and searching). This class also contains a static factory that allows arrays to be viewed as lists.
Deque (java.util)
A linear collection that supports element insertion and removal at both ends. The name deque is short for "double ended queue" and is usually pronounced "deck".
Pattern (java.util.regex)
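Patterns are compiled regular expressions. In many cases, convenience methods such as String#matches, String#replaceAll and String#split will be preferable, but if you need to do a lot of work with the same regular expression, it may be more efficient to compile it once and reuse it.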
ServletException (javax.servlet)
Defines a general exception a servlet can throw when it encounters difficulty.
JButton (javax.swing)
Top 12 Jupyter Notebook extensions

How to use ColumnIOFactory in org.apache.parquet.io

Best Java code snippets using org.apache.parquet.io.ColumnIOFactory (Showing top 20 results out of 315)

How to use
ColumnIOFactory
in
org.apache.parquet.io