private void flushRowGroupToStore() throws IOException {
  recordConsumer.flush();
  LOG.info("Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize());
  if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) {
    LOG.warn("Too much memory used: {}", columnStore.memUsageString());
  }

  if (recordCount > 0) {
    parquetFileWriter.startBlock(recordCount);
    columnStore.flush();
    pageStore.flushToFileWriter(parquetFileWriter);
    recordCount = 0;
    parquetFileWriter.endBlock();
    this.nextRowGroupSize = Math.min(
        parquetFileWriter.getNextRowGroupSize(),
        rowGroupSizeThreshold);
  }

  columnStore = null;
  pageStore = null;
}
/**
 * @return the total size of data written to the file and buffered in memory
 */
public long getDataSize() {
  return lastRowGroupEndPos + columnStore.getBufferedSize();
}
public void setColumnStore(ColumnWriteStore columnStore) {
  this.columnWriter = columnStore.getColumnWriter(desc);
}
private void flushAndClose() throws IOException {
  if (parquetFileWriter == null) {
    return;
  }

  if (recordCount > 0) {
    long memSize = store.getBufferedSize();
    parquetFileWriter.startBlock(recordCount);
    consumer.flush();
    store.flush();
    ColumnChunkPageWriteStoreExposer.flushPageStore(pageStore, parquetFileWriter);
    parquetFileWriter.endBlock();
    long recordsWritten = recordCount;

    // we are writing one single block per file
    parquetFileWriter.end(extraMetaData);
    byte[] metadata = this.trackingConverter == null ? null : trackingConverter.getMetadata();
    final long fileSize = parquetFileWriter.getPos();
    listener.recordsWritten(recordsWritten, fileSize, path.toString(),
        metadata /* TODO: add parquet footer */, partition.getBucketNumber());
    parquetFileWriter = null;

    updateStats(memSize, recordCount);
    recordCount = 0;
  }

  if (store != null) {
    store.close();
  }

  store = null;
  pageStore = null;
  index++;
}
private void flush() throws IOException {
  try {
    if (recordCount > 0) {
      parquetFileWriter.startBlock(recordCount);
      consumer.flush();
      store.flush();
      pageStore.flushToFileWriter(parquetFileWriter);
      recordCount = 0;
      parquetFileWriter.endBlock();

      // we are writing one single block per file
      parquetFileWriter.end(extraMetaData);
      parquetFileWriter = null;
    }
  } finally {
    store.close();
    pageStore.close();
    store = null;
    pageStore = null;
    index++;
  }
}
@Override
public void add(T value) {
  recordCount += 1;
  model.write(0, value);
  writeStore.endRecord();
  checkSize();
}
private void flushRowGroup(boolean finished) {
  try {
    if (recordCount > 0) {
      writer.startBlock(recordCount);
      writeStore.flush();
      flushPageStoreToWriter.invoke(writer);
      writer.endBlock();
      if (!finished) {
        startRowGroup();
      }
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to flush row group");
  }
}
@Override
public void close() throws IOException {
  flushRowGroup(true);
  writeStore.close();
  writer.end(metadata);
}
@Override
public void endMessage() {
  writeNullForMissingFieldsAtCurrentLevel();
  columns.endRecord();
  if (DEBUG) log("< MESSAGE END >");
  if (DEBUG) printState();
}
private void flushRowGroupToStore() throws IOException {
  recordConsumer.flush();
  LOG.debug("Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize());
  if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) {
    LOG.warn("Too much memory used: {}", columnStore.memUsageString());
  }

  if (recordCount > 0) {
    parquetFileWriter.startBlock(recordCount);
    columnStore.flush();
    pageStore.flushToFileWriter(parquetFileWriter);
    recordCount = 0;
    parquetFileWriter.endBlock();
    this.nextRowGroupSize = Math.min(
        parquetFileWriter.getNextRowGroupSize(),
        rowGroupSizeThreshold);
  }

  columnStore = null;
  pageStore = null;
}
public MessageColumnIORecordConsumer(ColumnWriteStore columns) {
  this.columns = columns;
  int maxDepth = 0;
  this.columnWriter = new ColumnWriter[MessageColumnIO.this.getLeaves().size()];

  for (PrimitiveColumnIO primitiveColumnIO : MessageColumnIO.this.getLeaves()) {
    ColumnWriter w = columns.getColumnWriter(primitiveColumnIO.getColumnDescriptor());
    maxDepth = Math.max(maxDepth, primitiveColumnIO.getFieldPath().length);
    columnWriter[primitiveColumnIO.getId()] = w;
    buildGroupToLeafWriterMap(primitiveColumnIO, w);
  }

  fieldsWritten = new FieldsMarker[maxDepth];
  for (int i = 0; i < maxDepth; i++) {
    fieldsWritten[i] = new FieldsMarker();
  }
  r = new int[maxDepth];
}
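A minimal, self-contained sketch of the per-leaf view this constructor iterates over (one ColumnWriter per leaf column, maxDepth derived from the longest field path), assuming the standard org.apache.parquet schema classes; the schema string and class name are made-up examples, not part of the snippet above.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import java.util.Arrays;

// Illustrative only: prints each leaf column path and tracks the deepest path,
// mirroring how the constructor above sizes its per-leaf writer arrays.
public class LeafColumnsSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int64 id; optional group name { optional binary first; optional binary last; } }");
    int maxDepth = 0;
    for (ColumnDescriptor column : schema.getColumns()) {
      maxDepth = Math.max(maxDepth, column.getPath().length);
      System.out.println(Arrays.toString(column.getPath())
          + " maxDef=" + column.getMaxDefinitionLevel()
          + " maxRep=" + column.getMaxRepetitionLevel());
    }
    System.out.println("maxDepth = " + maxDepth); // 2 for the nested "name" group
  }
}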
private void checkSize() {
  if (recordCount >= nextCheckRecordCount) {
    long bufferedSize = writeStore.getBufferedSize();
    double avgRecordSize = ((double) bufferedSize) / recordCount;

    if (bufferedSize > (nextRowGroupSize - 2 * avgRecordSize)) {
      flushRowGroup(false);
    } else {
      long remainingSpace = nextRowGroupSize - bufferedSize;
      long remainingRecords = (long) (remainingSpace / avgRecordSize);
      this.nextCheckRecordCount = recordCount + min(max(remainingRecords / 2, 100), 10000);
    }
  }
}
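A worked, self-contained sketch of the scheduling arithmetic in checkSize() above; the row-group size and record counts are illustrative assumptions, while the clamp constants (100 and 10000) come from the snippet.

// Illustrative numbers only: shows how the "check halfway" heuristic spaces out
// buffered-size checks instead of measuring after every record.
public class CheckSizeSketch {
  public static void main(String[] args) {
    long nextRowGroupSize = 128L * 1024 * 1024; // assumed 128 MB row-group target
    long recordCount = 10_000;
    long bufferedSize = 64L * 1024 * 1024;      // 64 MB buffered so far

    double avgRecordSize = ((double) bufferedSize) / recordCount;          // ~6.7 KB per record
    if (bufferedSize > (nextRowGroupSize - 2 * avgRecordSize)) {
      System.out.println("flush row group now");
    } else {
      long remainingSpace = nextRowGroupSize - bufferedSize;               // 64 MB left
      long remainingRecords = (long) (remainingSpace / avgRecordSize);     // ~10,000 more records fit
      long nextCheck = recordCount + Math.min(Math.max(remainingRecords / 2, 100), 10_000);
      System.out.println("next size check at record " + nextCheck);        // 15,000
    }
  }
}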
if (repetitionLevel == 0) {
  columnWriteStore.endRecord();
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = store.getBufferedSize();
    if (memSize > blockSize) {
      logger.debug("Reached block size " + blockSize);
      flush();
      newSchema();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
    } else {
      float recordSize = (float) memSize / recordCount;
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (blockSize / recordSize)) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.debug("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck && recordCount >= minRecordsForFlush) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = store.getBufferedSize();
    if (context.getAllocator().getHeadroom() < memoryThreshold || memSize >= blockSize) {
      logger.debug("Reached block size " + blockSize);
      flushAndClose();
      newSchema();
    } else {
      // Find the average record size for encoded records so far
      float recordSize = ((float) memSize) / recordCount;
      final long recordsCouldFitInRemainingSpace = (long) ((blockSize - memSize) / recordSize);
      // try to check again when reached half of the number of records that could potentially fit in remaining space.
      recordCountForNextMemCheck = recordCount +
          // Upper bound by the max count check. There is no lower bound, as it could cause files bigger than
          // blockSize if the remaining records that could fit is very few (usually when we are close to the goal).
          min(MAXIMUM_RECORD_COUNT_FOR_CHECK, recordsCouldFitInRemainingSpace / 2);
    }
  }
}
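A minimal sketch of the dual flush trigger in the variant above, restated with plain values; the headroom and threshold numbers are made up, and the original getHeadroom() call is an engine-specific allocator API not reproduced here.

// Illustrative only: flush either when free allocator memory drops below a
// threshold or when the buffered row group reaches the target block size.
public class FlushTriggerSketch {
  static boolean shouldFlush(long headroomBytes, long memoryThreshold,
                             long bufferedBytes, long blockSize) {
    return headroomBytes < memoryThreshold || bufferedBytes >= blockSize;
  }

  public static void main(String[] args) {
    long blockSize = 256L * 1024 * 1024;       // assumed 256 MB row-group target
    long memoryThreshold = 64L * 1024 * 1024;  // assumed 64 MB minimum headroom

    System.out.println(shouldFlush(512L << 20, memoryThreshold, 100L << 20, blockSize)); // false: plenty of room
    System.out.println(shouldFlush(32L << 20, memoryThreshold, 100L << 20, blockSize));  // true: low headroom
    System.out.println(shouldFlush(512L << 20, memoryThreshold, 256L << 20, blockSize)); // true: block size reached
  }
}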