/**
 * @return the total size of data written to the file and buffered in memory
 */
public long getDataSize() {
  return lastRowGroupEndPos + columnStore.getBufferedSize();
}
private void checkSize() {
  if (recordCount >= nextCheckRecordCount) {
    long bufferedSize = writeStore.getBufferedSize();
    double avgRecordSize = ((double) bufferedSize) / recordCount;
    if (bufferedSize > (nextRowGroupSize - 2 * avgRecordSize)) {
      flushRowGroup(false);
    } else {
      long remainingSpace = nextRowGroupSize - bufferedSize;
      long remainingRecords = (long) (remainingSpace / avgRecordSize);
      this.nextCheckRecordCount = recordCount + min(max(remainingRecords / 2, 100), 10000);
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = store.getBufferedSize();
    if (memSize > blockSize) {
      logger.debug("Reached block size " + blockSize);
      flush();
      newSchema();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
    } else {
      float recordSize = (float) memSize / recordCount;
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (blockSize / recordSize)) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.debug("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = columnStore.getBufferedSize();
    long recordSize = memSize / recordCount;
    // flush the row group if it is within ~2 records of the limit
    // it is much better to be slightly under size than to be over at all
    if (memSize > (nextRowGroupSize - 2 * recordSize)) {
      LOG.info("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
      flushRowGroupToStore();
      initStore();
      recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCount / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK);
      this.lastRowGroupEndPos = parquetFileWriter.getPos();
    } else {
      recordCountForNextMemCheck = min(
          max(MINIMUM_RECORD_COUNT_FOR_CHECK, (recordCount + (long) (nextRowGroupSize / ((float) recordSize))) / 2), // will check halfway
          recordCount + MAXIMUM_RECORD_COUNT_FOR_CHECK // will not look more than max records ahead
      );
      LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
    }
  }
}
private void checkBlockSizeReached() throws IOException {
  if (recordCount >= recordCountForNextMemCheck && recordCount >= minRecordsForFlush) {
    // checking the memory size is relatively expensive, so let's not do it for every record.
    long memSize = store.getBufferedSize();
    if (context.getAllocator().getHeadroom() < memoryThreshold || memSize >= blockSize) {
      logger.debug("Reached block size " + blockSize);
      flushAndClose();
      newSchema();
    } else {
      // Find the average record size for encoded records so far
      float recordSize = ((float) memSize) / recordCount;
      final long recordsCouldFitInRemainingSpace = (long) ((blockSize - memSize) / recordSize);
      // try to check again once we reach half of the number of records that could potentially fit in the remaining space.
      recordCountForNextMemCheck = recordCount +
          // Upper bound by the max count check. There is no lower bound, as it could cause files bigger than
          // blockSize if the number of remaining records that could fit is very small (usually when we are close to the goal).
          min(MAXIMUM_RECORD_COUNT_FOR_CHECK, recordsCouldFitInRemainingSpace / 2);
    }
  }
}
private void flushAndClose() throws IOException {
  if (parquetFileWriter == null) {
    return;
  }
  if (recordCount > 0) {
    long memSize = store.getBufferedSize();
    parquetFileWriter.startBlock(recordCount);
    consumer.flush();
    store.flush();
    ColumnChunkPageWriteStoreExposer.flushPageStore(pageStore, parquetFileWriter);
    parquetFileWriter.endBlock();
    long recordsWritten = recordCount;

    // we are writing one single block per file
    parquetFileWriter.end(extraMetaData);
    byte[] metadata = this.trackingConverter == null ? null : trackingConverter.getMetadata();
    final long fileSize = parquetFileWriter.getPos();
    listener.recordsWritten(recordsWritten, fileSize, path.toString(),
        metadata /** TODO: add parquet footer **/, partition.getBucketNumber());
    parquetFileWriter = null;
    updateStats(memSize, recordCount);
    recordCount = 0;
  }
  if (store != null) {
    store.close();
  }
  store = null;
  pageStore = null;
  index++;
}
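All of the variants above share the same throttling heuristic: estimate the average encoded record size, flush when the buffer is within roughly two records of the row-group limit, and otherwise schedule the next (relatively expensive) buffered-size check about halfway to the projected limit, clamped between a minimum and maximum record count. The standalone sketch below illustrates that heuristic in isolation; the class, method, and constant names are hypothetical and are not taken from any of the writers quoted above.

```java
// Illustrative sketch only: hypothetical names, not from the writers above.
public class RowGroupSizeCheck {
  private static final long MIN_RECORDS_FOR_CHECK = 100;
  private static final long MAX_RECORDS_FOR_CHECK = 10_000;

  private final long rowGroupSizeBytes;
  private long nextCheckRecordCount = MIN_RECORDS_FOR_CHECK;

  public RowGroupSizeCheck(long rowGroupSizeBytes) {
    this.rowGroupSizeBytes = rowGroupSizeBytes;
  }

  /** Returns true when the caller should flush the current row group. */
  public boolean shouldFlush(long recordCount, long bufferedSizeBytes) {
    if (recordCount < nextCheckRecordCount) {
      return false; // skip the expensive buffered-size check for most records
    }
    double avgRecordSize = (double) bufferedSizeBytes / recordCount;
    // flush while still ~2 records under the limit rather than overshoot it
    if (bufferedSizeBytes > rowGroupSizeBytes - 2 * avgRecordSize) {
      nextCheckRecordCount = MIN_RECORDS_FOR_CHECK; // start over after the flush
      return true;
    }
    // otherwise check again about halfway to the projected limit,
    // clamped so we never wait fewer than MIN or more than MAX records
    long remainingRecords = (long) ((rowGroupSizeBytes - bufferedSizeBytes) / avgRecordSize);
    nextCheckRecordCount = recordCount
        + Math.min(Math.max(remainingRecords / 2, MIN_RECORDS_FOR_CHECK), MAX_RECORDS_FOR_CHECK);
    return false;
  }
}
```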