private void flushRowGroupToStore() throws IOException {
  recordConsumer.flush();
  LOG.info("Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize());
  if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) {
    LOG.warn("Too much memory used: {}", columnStore.memUsageString());
  }

  if (recordCount > 0) {
    parquetFileWriter.startBlock(recordCount);
    columnStore.flush();
    pageStore.flushToFileWriter(parquetFileWriter);
    recordCount = 0;
    parquetFileWriter.endBlock();
    this.nextRowGroupSize = Math.min(
        parquetFileWriter.getNextRowGroupSize(),
        rowGroupSizeThreshold);
  }

  columnStore = null;
  pageStore = null;
}
/**
 * @return the total size of data written to the file and buffered in memory
 */
public long getDataSize() {
  return lastRowGroupEndPos + columnStore.getBufferedSize();
}
public void setColumnStore(ColumnWriteStore columnStore) {
  this.columnWriter = columnStore.getColumnWriter(desc);
}
private void flushAndClose() throws IOException {
  if (parquetFileWriter == null) {
    return;
  }

  if (recordCount > 0) {
    long memSize = store.getBufferedSize();
    parquetFileWriter.startBlock(recordCount);
    consumer.flush();
    store.flush();
    ColumnChunkPageWriteStoreExposer.flushPageStore(pageStore, parquetFileWriter);
    parquetFileWriter.endBlock();
    long recordsWritten = recordCount;

    // we are writing one single block per file
    parquetFileWriter.end(extraMetaData);
    byte[] metadata = this.trackingConverter == null ? null : trackingConverter.getMetadata();
    final long fileSize = parquetFileWriter.getPos();
    listener.recordsWritten(recordsWritten, fileSize, path.toString(),
        metadata /* TODO: add parquet footer */, partition.getBucketNumber());
    parquetFileWriter = null;

    updateStats(memSize, recordCount);
    recordCount = 0;
  }

  if (store != null) {
    store.close();
  }

  store = null;
  pageStore = null;
  index++;
}
private void flush() throws IOException {
  try {
    if (recordCount > 0) {
      parquetFileWriter.startBlock(recordCount);
      consumer.flush();
      store.flush();
      pageStore.flushToFileWriter(parquetFileWriter);
      recordCount = 0;
      parquetFileWriter.endBlock();

      // we are writing one single block per file
      parquetFileWriter.end(extraMetaData);
      parquetFileWriter = null;
    }
  } finally {
    store.close();
    pageStore.close();
    store = null;
    pageStore = null;
    index++;
  }
}
@Override
public void add(T value) {
  recordCount += 1;
  model.write(0, value);
  writeStore.endRecord();
  checkSize();
}
private void flushRowGroup(boolean finished) {
  try {
    if (recordCount > 0) {
      writer.startBlock(recordCount);
      writeStore.flush();
      flushPageStoreToWriter.invoke(writer);
      writer.endBlock();
      if (!finished) {
        startRowGroup();
      }
    }
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to flush row group");
  }
}
@Override
public void close() throws IOException {
  flushRowGroup(true);
  writeStore.close();
  writer.end(metadata);
}
@Override
public void endMessage() {
  writeNullForMissingFieldsAtCurrentLevel();
  columns.endRecord();
  if (DEBUG) log("< MESSAGE END >");
  if (DEBUG) printState();
}
private void flushRowGroupToStore() throws IOException {
  recordConsumer.flush();
  LOG.debug("Flushing mem columnStore to file. allocated memory: {}", columnStore.getAllocatedSize());
  if (columnStore.getAllocatedSize() > (3 * rowGroupSizeThreshold)) {
    LOG.warn("Too much memory used: {}", columnStore.memUsageString());
  }

  if (recordCount > 0) {
    parquetFileWriter.startBlock(recordCount);
    columnStore.flush();
    pageStore.flushToFileWriter(parquetFileWriter);
    recordCount = 0;
    parquetFileWriter.endBlock();
    this.nextRowGroupSize = Math.min(
        parquetFileWriter.getNextRowGroupSize(),
        rowGroupSizeThreshold);
  }

  columnStore = null;
  pageStore = null;
}
public MessageColumnIORecordConsumer(ColumnWriteStore columns) {
  this.columns = columns;
  int maxDepth = 0;
  this.columnWriter = new ColumnWriter[MessageColumnIO.this.getLeaves().size()];

  for (PrimitiveColumnIO primitiveColumnIO : MessageColumnIO.this.getLeaves()) {
    ColumnWriter w = columns.getColumnWriter(primitiveColumnIO.getColumnDescriptor());
    maxDepth = Math.max(maxDepth, primitiveColumnIO.getFieldPath().length);
    columnWriter[primitiveColumnIO.getId()] = w;
    buildGroupToLeafWriterMap(primitiveColumnIO, w);
  }

  fieldsWritten = new FieldsMarker[maxDepth];
  for (int i = 0; i < maxDepth; i++) {
    fieldsWritten[i] = new FieldsMarker();
  }
  r = new int[maxDepth];
}
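A minimal, self-contained sketch of the per-leaf view this constructor iterates over (one ColumnWriter per leaf column, maxDepth derived from the longest field path), assuming the standard org.apache.parquet schema classes; the schema string and class name are made-up examples, not part of the snippet above.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import java.util.Arrays;

// Illustrative only: prints each leaf column path and tracks the deepest path,
// mirroring how the constructor above sizes its per-leaf writer arrays.
public class LeafColumnsSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int64 id; optional group name { optional binary first; optional binary last; } }");
    int maxDepth = 0;
    for (ColumnDescriptor column : schema.getColumns()) {
      maxDepth = Math.max(maxDepth, column.getPath().length);
      System.out.println(Arrays.toString(column.getPath())
          + " maxDef=" + column.getMaxDefinitionLevel()
          + " maxRep=" + column.getMaxRepetitionLevel());
    }
    System.out.println("maxDepth = " + maxDepth); // 2 for the nested "name" group
  }
}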
private void checkSize() {
  if (recordCount >= nextCheckRecordCount) {
    long bufferedSize = writeStore.getBufferedSize();
    double avgRecordSize = ((double) bufferedSize) / recordCount;

    if (bufferedSize > (nextRowGroupSize - 2 * avgRecordSize)) {
      flushRowGroup(false);
    } else {
      long remainingSpace = nextRowGroupSize - bufferedSize;
      long remainingRecords = (long) (remainingSpace / avgRecordSize);
      this.nextCheckRecordCount = recordCount + min(max(remainingRecords / 2, 100), 10000);
    }
  }
}
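A worked, self-contained sketch of the scheduling arithmetic in checkSize() above; the row-group size and record counts are illustrative assumptions, while the clamp constants (100 and 10000) come from the snippet.

// Illustrative numbers only: shows how the "check halfway" heuristic spaces out
// buffered-size checks instead of measuring after every record.
public class CheckSizeSketch {
  public static void main(String[] args) {
    long nextRowGroupSize = 128L * 1024 * 1024; // assumed 128 MB row-group target
    long recordCount = 10_000;
    long bufferedSize = 64L * 1024 * 1024;      // 64 MB buffered so far

    double avgRecordSize = ((double) bufferedSize) / recordCount;          // ~6.7 KB per record
    if (bufferedSize > (nextRowGroupSize - 2 * avgRecordSize)) {
      System.out.println("flush row group now");
    } else {
      long remainingSpace = nextRowGroupSize - bufferedSize;               // 64 MB left
      long remainingRecords = (long) (remainingSpace / avgRecordSize);     // ~10,000 more records fit
      long nextCheck = recordCount + Math.min(Math.max(remainingRecords / 2, 100), 10_000);
      System.out.println("next size check at record " + nextCheck);        // 15,000
    }
  }
}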
if (repetitionLevel == 0) {
  columnWriteStore.endRecord();
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = store.getBufferedSize();
    if (memSize > blockSize) {
      logger.debug("Reached block size " + blockSize);
      flush();
      newSchema();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
    } else {
      float recordSize = (float) memSize / recordCount;
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (blockSize / recordSize)) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.debug("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck && recordCount >= minRecordsForFlush) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = store.getBufferedSize();
    if (context.getAllocator().getHeadroom() < memoryThreshold || memSize >= blockSize) {
      logger.debug("Reached block size " + blockSize);
      flushAndClose();
      newSchema();
    } else {
      // Find the average record size for encoded records so far
      float recordSize = ((float) memSize) / recordCount;
      final long recordsCouldFitInRemainingSpace = (long) ((blockSize - memSize) / recordSize);
      // try to check again when reached half of the number of records that could potentially fit in remaining space.
      recordCountForNextMemCheck = recordCount +
          // Upper bound by the max count check. There is no lower bound, as it could cause files bigger than
          // blockSize if the remaining records that could fit is very few (usually when we are close to the goal).
          min(MAXIMUM_RECORD_COUNT_FOR_CHECK, recordsCouldFitInRemainingSpace / 2);
    }
  }
}
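A minimal sketch of the dual flush trigger in the variant above, restated with plain values; the headroom and threshold numbers are made up, and the original getHeadroom() call is an engine-specific allocator API not reproduced here.

// Illustrative only: flush either when free allocator memory drops below a
// threshold or when the buffered row group reaches the target block size.
public class FlushTriggerSketch {
  static boolean shouldFlush(long headroomBytes, long memoryThreshold,
                             long bufferedBytes, long blockSize) {
    return headroomBytes < memoryThreshold || bufferedBytes >= blockSize;
  }

  public static void main(String[] args) {
    long blockSize = 256L * 1024 * 1024;       // assumed 256 MB row-group target
    long memoryThreshold = 64L * 1024 * 1024;  // assumed 64 MB minimum headroom

    System.out.println(shouldFlush(512L << 20, memoryThreshold, 100L << 20, blockSize)); // false: plenty of room
    System.out.println(shouldFlush(32L << 20, memoryThreshold, 100L << 20, blockSize));  // true: low headroom
    System.out.println(shouldFlush(512L << 20, memoryThreshold, 256L << 20, blockSize)); // true: block size reached
  }
}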