    @Override
    int nextInt() {
        try {
            return delegate.readInt();
        }
        catch (IOException e) {
            throw new ParquetDecodingException(e);
        }
    }
}
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadDecisionTreeModel(String pathToMdl) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }

        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return null;
}
long footerLengthIndex = stat.getLen() - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length;

stream.seek(footerLengthIndex);

int footerLength = BytesUtils.readIntLittleEndian(stream);

stream.seek(footerLengthIndex - footerLength);

if (LOG.isInfoEnabled()) {
    LOG.info("Caching the footer of length " + footerLength + " for " + cacheKey);
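// Why the arithmetic above works: a Parquet file ends with
//   [file metadata (footer)] [4-byte little-endian footer length] ["PAR1" magic]
// so the length field sits at fileLen - 4 - MAGIC.length, and the footer itself starts
// that many bytes earlier. A minimal, self-contained sketch of the same offset math
// using plain java.io (not the Hadoop stream used above; the path is a placeholder):
import java.io.IOException;
import java.io.RandomAccessFile;

class FooterOffsets {
    /** Returns the offset of the first byte of the Parquet footer. */
    static long footerStart(String path) throws IOException {
        try (RandomAccessFile f = new RandomAccessFile(path, "r")) {
            long footerLengthIndex = f.length() - 4 /* length field */ - 4 /* "PAR1" */;
            f.seek(footerLengthIndex);

            // Read the 4-byte footer length, stored little-endian.
            int b0 = f.read(), b1 = f.read(), b2 = f.read(), b3 = f.read();
            int footerLength = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);

            return footerLengthIndex - footerLength;
        }
    }
}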
public RecordConsumer getRecordWriter(ColumnWriteStore columns) {
    RecordConsumer recordWriter = new MessageColumnIORecordConsumer(columns);

    if (DEBUG)
        recordWriter = new RecordConsumerLoggingWrapper(recordWriter);

    return validating ? new ValidatingRecordConsumer(recordWriter, getType()) : recordWriter;
}
/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLinearSVMModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readSVMInterceptor(g);
                coefficients = readSVMCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}
private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
    try {
        if (maxLevel == 0) {
            return new NullIntIterator();
        }
        return new RLEIntIterator(
            new RunLengthBitPackingHybridDecoder(
                BytesUtils.getWidthFromMaxInt(maxLevel),
                new ByteArrayInputStream(bytes.toByteArray())));
    }
    catch (IOException e) {
        throw new ParquetDecodingException("could not read levels in page for col " + descriptor, e);
    }
}
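// Shape of the RLEIntIterator constructed above, pieced together from the constructor
// argument here and the nextInt() body shown in the first snippet. The field name and
// constructor are assumptions; only nextInt() is taken verbatim from that fragment.
// Imports mirror the surrounding snippets; IntIterator is the nested base class of the
// enclosing column reader.
static class RLEIntIterator extends IntIterator {
    private final RunLengthBitPackingHybridDecoder delegate;

    RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) {
        this.delegate = delegate;
    }

    @Override
    int nextInt() {
        try {
            return delegate.readInt();
        }
        catch (IOException e) {
            throw new ParquetDecodingException(e);
        }
    }
}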
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLinRegModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
private void readPageV2(DataPageV2 page) {
    this.pageValueCount = page.getValueCount();
    this.repetitionLevelColumn = newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels());
    this.definitionLevelColumn = newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels());
    try {
        LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount + " records");
        initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount());
    }
    catch (IOException e) {
        throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
    }
}
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLogRegModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
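// The readCoefficients / readLinRegCoefficients / readSVMCoefficients helpers called in
// the loaders above are not shown in this section. A plausible sketch, assuming the
// coefficients are stored the same way as the cluster centers in the k-means loader
// below (a repeated group of doubles): the field indices and the whole body are
// assumptions, not the library's actual code. Imports as in the loaders above.
private static Vector readCoefficients(SimpleGroup g) {
    // Hypothetical field indices into the row group.
    Group coeffGroup = g.getGroup(0, 0).getGroup(3, 0);

    final int size = coeffGroup.getFieldRepetitionCount(0);
    Vector coefficients = new DenseVector(size);

    for (int i = 0; i < size; i++)
        coefficients.set(i, coeffGroup.getGroup(0, i).getDouble(0, 0));

    return coefficients;
}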
private void readPageV1(DataPageV1 page) {
    ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);
    ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL);
    this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
    this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
    try {
        BytesInput bytes = page.getBytes();
        LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records");

        ByteBufferInputStream in = bytes.toInputStream();

        LOG.debug("reading repetition levels at " + in.position());
        rlReader.initFromPage(pageValueCount, in);

        LOG.debug("reading definition levels at " + in.position());
        dlReader.initFromPage(pageValueCount, in);

        LOG.debug("reading data at " + in.position());
        initDataReader(page.getValueEncoding(), in, page.getValueCount());
    }
    catch (IOException e) {
        throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
    }
}
final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
final SimpleGroup g = (SimpleGroup)recordReader.read();
final int treeID = g.getInteger(0, 0);
final SimpleGroup nodeDataGroup = (SimpleGroup)g.getGroup(1, 0);
private ByteBuffer getBuffer(int length) {
    try {
        return in.slice(length).order(ByteOrder.LITTLE_ENDIAN);
    }
    catch (IOException e) {
        throw new ParquetDecodingException("Failed to read " + length + " bytes", e);
    }
}
// Fragment: reads per-tree metadata (tree id and weight) with a separate record reader
// over the ensemble's metadata pages.
PageReadStore pagesMetaData;
final MessageType schema = r.getFooter().getFileMetaData().getSchema();
final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));
for (int i = 0; i < rows; i++) {
    final SimpleGroup g = (SimpleGroup)recordReader.read();
    int treeId = g.getInteger(0, 0);
    double treeWeight = g.getDouble(2, 0);

// Fragment: iterates the row groups of the node data, reading the tree id and the node
// data group for each row (collected per tree via nodesByTreeId).
PageReadStore pages;
final MessageType schema = r.getFooter().getFileMetaData().getSchema();
final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
while (null != (pages = r.readNextRowGroup())) {
    final long rows = pages.getRowCount();
    final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
    for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        final int treeID = g.getInteger(0, 0);
        final SimpleGroup nodeDataGroup = (SimpleGroup)g.getGroup(1, 0);
@Override
public final boolean readBoolean() {
    // TODO: vectorize decoding and keep boolean[] instead of currentByte
    if (bitOffset == 0) {
        try {
            currentByte = (byte) in.read();
        }
        catch (IOException e) {
            throw new ParquetDecodingException("Failed to read a byte", e);
        }
    }

    boolean v = (currentByte & (1 << bitOffset)) != 0;
    bitOffset += 1;
    if (bitOffset == 8) {
        bitOffset = 0;
    }
    return v;
}
private static Model loadKMeansModel(String pathToMdl) {
    Vector[] centers = null;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int)pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            centers = new DenseVector[rows];

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);

                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);

                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);

                centers[i] = new DenseVector(amountOfCoefficients);

                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new KMeansModel(centers, new EuclideanDistance());
}
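// Illustrative usage sketch for the loader above. The path and feature values are
// placeholders, and predict(Vector) is assumed to be the prediction entry point of the
// returned model in the Ignite ML version at hand (older versions expose apply(Vector)
// instead). Imports as in the loader above.
KMeansModel mdl = (KMeansModel)loadKMeansModel("/path/to/spark/kmeans/model/data");

// Build an observation with the same number of features as the cluster centers.
Vector observation = new DenseVector(new double[] {1.0, 2.0});

// The k-means model predicts the index of the nearest cluster center.
double clusterIdx = mdl.predict(observation);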
throw new ParquetDecodingException("not a valid mode " + this.mode); throw new ParquetDecodingException("Failed to read from input stream", e);
throw new ParquetDecodingException("not a valid mode " + this.mode); throw new ParquetDecodingException("Failed to read from input stream", e);
        return;
    default:
        throw new ParquetDecodingException("not a valid mode " + this.mode);