private ValuesWriter getBooleanValuesWriter() {
  // no dictionary encoding for boolean
  return new RunLengthBitPackingHybridValuesWriter(
      1, parquetProperties.getInitialSlabSize(), parquetProperties.getPageSizeThreshold(),
      parquetProperties.getAllocator());
}
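Bit width 1 is enough for booleans: the RLE/bit-packing hybrid can pack eight values into a byte. A standalone sketch of that packing, plain Java with an illustrative class name rather than Parquet's encoder:

// Standalone illustration (not Parquet code): packing booleans at bit width 1.
public final class BitWidthOneDemo {
  // Packs up to 8 booleans into a single byte, least significant bit first.
  static byte pack(boolean[] values) {
    byte packed = 0;
    for (int i = 0; i < values.length && i < 8; i++) {
      if (values[i]) {
        packed |= (1 << i);
      }
    }
    return packed;
  }

  public static void main(String[] args) {
    boolean[] run = {true, false, true, true, false, false, true, false};
    System.out.printf("packed = 0x%02x%n", pack(run)); // packed = 0x4d
  }
}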
private void sizeCheck() {
  long minRecordToWait = Long.MAX_VALUE;
  for (ColumnWriterV2 writer : columns.values()) {
    long usedMem = writer.getCurrentPageBufferedSize();
    long rows = rowCount - writer.getRowsWrittenSoFar();
    long remainingMem = props.getPageSizeThreshold() - usedMem;
    if (remainingMem <= thresholdTolerance) {
      writer.writePage(rowCount);
      remainingMem = props.getPageSizeThreshold();
    }
    long rowsToFillPage =
        usedMem == 0 ?
            props.getMaxRowCountForPageSizeCheck()
            : (long) rows * remainingMem / usedMem; // proportional estimate: rows scale with bytes
    if (rowsToFillPage < minRecordToWait) {
      minRecordToWait = rowsToFillPage;
    }
  }
  if (minRecordToWait == Long.MAX_VALUE) {
    minRecordToWait = props.getMinRowCountForPageSizeCheck();
  }

  if (props.estimateNextSizeCheck()) {
    // will check again halfway, bounded by the min and max row counts
    rowCountForNextSizeCheck = rowCount +
        min(
            max(minRecordToWait / 2, props.getMinRowCountForPageSizeCheck()),
            props.getMaxRowCountForPageSizeCheck());
  } else {
    rowCountForNextSizeCheck = rowCount + props.getMinRowCountForPageSizeCheck();
  }
}
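The estimate assumes page memory grows linearly with row count: if `rows` rows consumed `usedMem` bytes, roughly `rows * remainingMem / usedMem` further rows fit under the threshold. A standalone sketch with made-up numbers:

// Standalone illustration of the linear page-fill estimate (numbers are made up).
public final class PageFillEstimateDemo {
  public static void main(String[] args) {
    long pageSizeThreshold = 1_048_576;  // 1 MiB page target
    long usedMem = 262_144;              // bytes buffered so far (256 KiB)
    long rows = 10_000;                  // rows that produced usedMem
    long remainingMem = pageSizeThreshold - usedMem;

    // If 10,000 rows used 256 KiB, about 30,000 more rows fit in the remaining 768 KiB.
    long rowsToFillPage = rows * remainingMem / usedMem;
    System.out.println("estimated rows to fill page: " + rowsToFillPage); // 30000
  }
}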
static DictionaryValuesWriter dictionaryWriter(ColumnDescriptor path, ParquetProperties properties,
    Encoding dictPageEncoding, Encoding dataPageEncoding) {
  switch (path.getType()) {
  case BOOLEAN:
    throw new IllegalArgumentException("no dictionary encoding for BOOLEAN");
  case BINARY:
    return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(
        properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
  case INT32:
    return new DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter(
        properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
  case INT64:
    return new DictionaryValuesWriter.PlainLongDictionaryValuesWriter(
        properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
  case INT96:
    return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(
        properties.getDictionaryPageSizeThreshold(), 12, dataPageEncoding, dictPageEncoding, properties.getAllocator());
  case DOUBLE:
    return new DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter(
        properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
  case FLOAT:
    return new DictionaryValuesWriter.PlainFloatDictionaryValuesWriter(
        properties.getDictionaryPageSizeThreshold(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
  case FIXED_LEN_BYTE_ARRAY:
    return new DictionaryValuesWriter.PlainFixedLenArrayDictionaryValuesWriter(
        properties.getDictionaryPageSizeThreshold(), path.getTypeLength(), dataPageEncoding, dictPageEncoding, properties.getAllocator());
  default:
    throw new IllegalArgumentException("Unknown type " + path.getType());
  }
}
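Every writer above implements the same idea: distinct values go to a dictionary page, and the data page stores only integer ids. A standalone sketch of that encoding, plain Java rather than the Parquet implementation:

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Standalone illustration of dictionary encoding: distinct values get ids,
// the data stream stores only the ids.
public final class DictionaryEncodeDemo {
  public static void main(String[] args) {
    String[] values = {"us", "eu", "us", "us", "apac", "eu"};
    Map<String, Integer> dictionary = new LinkedHashMap<>();
    List<Integer> ids = new ArrayList<>();
    for (String v : values) {
      ids.add(dictionary.computeIfAbsent(v, k -> dictionary.size()));
    }
    System.out.println("dictionary page: " + dictionary.keySet()); // [us, eu, apac]
    System.out.println("data page ids:   " + ids);                 // [0, 1, 0, 0, 2, 1]
  }
}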
public ColumnWriterV1(ColumnDescriptor path, PageWriter pageWriter, ParquetProperties props) {
  this.path = path;
  this.pageWriter = pageWriter;
  this.props = props;

  // initial check of memory usage, so that we have enough data to make an initial prediction
  this.valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();

  resetStatistics();

  this.repetitionLevelColumn = props.newRepetitionLevelWriter(path);
  this.definitionLevelColumn = props.newDefinitionLevelWriter(path);
  this.dataColumn = props.newValuesWriter(path);
}
@Deprecated
ColumnWriteStoreBase(
    final PageWriteStore pageWriteStore,
    final ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);

  this.columns = new TreeMap<>();

  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      ColumnWriterBase column = columns.get(path);
      if (column == null) {
        column = createColumnWriter(path, pageWriteStore.getPageWriter(path), props);
        columns.put(path, column);
      }
      return column;
    }
  };
}
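The anonymous provider is a get-or-create cache keyed by column descriptor. A standalone sketch of the same pattern using Map.computeIfAbsent, with illustrative types rather than Parquet's:

import java.util.Map;
import java.util.TreeMap;
import java.util.function.Function;

// Standalone sketch of the lazy get-or-create pattern the provider implements
// (names and types are illustrative, not Parquet's).
public final class LazyProviderDemo {
  public static void main(String[] args) {
    Map<String, StringBuilder> writers = new TreeMap<>();
    Function<String, StringBuilder> provider =
        path -> writers.computeIfAbsent(path, p -> new StringBuilder("writer for " + p));

    provider.apply("a.b");              // created on first request
    provider.apply("a.b");              // same instance returned afterwards
    System.out.println(writers.size()); // 1
  }
}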
final WriteSupport<T> writeSupport = getWriteSupport(conf);

ParquetProperties props = ParquetProperties.builder()
    .withPageSize(getPageSize(conf))
    .withDictionaryPageSize(getDictionaryPageSize(conf))
    // ... remaining builder options elided in this excerpt ...
    .build();

LOG.info("Parquet page size to {}", props.getPageSizeThreshold());
LOG.info("Parquet dictionary page size to {}", props.getDictionaryPageSizeThreshold());
LOG.info("Dictionary is {}", (props.isEnableDictionary() ? "on" : "off"));
LOG.info("Validation is {}", (validating ? "on" : "off"));
LOG.info("Writer version is: {}", props.getWriterVersion());
LOG.info("Maximum row group padding size is {} bytes", maxPaddingSize);
LOG.info("Page size checking is: {}", (props.estimateNextSizeCheck() ? "estimated" : "constant"));
LOG.info("Min row count for page size check is: {}", props.getMinRowCountForPageSizeCheck());
LOG.info("Max row count for page size check is: {}", props.getMaxRowCountForPageSizeCheck());
LOG.info("Truncate length for column indexes is: {}", props.getColumnIndexTruncateLength());
LOG.info("Page row count limit to {}", props.getPageRowCountLimit());

WriteContext init = writeSupport.init(conf);
ParquetFileWriter w = new ParquetFileWriter( // (destination file argument elided in this excerpt)
    init.getSchema(), Mode.CREATE, blockSize, maxPaddingSize,
    props.getColumnIndexTruncateLength());
w.start();
// (from ColumnWriterV1: periodic size check after a value is written)
long memSize = repetitionLevelColumn.getBufferedSize()
    + definitionLevelColumn.getBufferedSize()
    + dataColumn.getBufferedSize();
if (memSize > props.getPageSizeThreshold()) {
  // threshold reached: schedule the next check, then flush the current page
  if (props.estimateNextSizeCheck()) {
    valueCountForNextSizeCheck = valueCount / 2;
  } else {
    valueCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();
  }
  writePage();
} else if (props.estimateNextSizeCheck()) {
  // under the threshold: check again roughly halfway to the predicted full page
  valueCountForNextSizeCheck =
      (int) (valueCount + ((float) valueCount * props.getPageSizeThreshold() / memSize)) / 2 + 1;
} else {
  valueCountForNextSizeCheck += props.getMinRowCountForPageSizeCheck();
}
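The halfway heuristic predicts the value count at which the page would be full (current count scaled by threshold over buffered size) and schedules the next check midway to that point. A standalone sketch with made-up numbers:

// Standalone illustration of the "check again halfway" heuristic (numbers made up).
public final class NextSizeCheckDemo {
  public static void main(String[] args) {
    int valueCount = 4_000;             // values written so far
    long memSize = 524_288;             // bytes currently buffered (512 KiB)
    long pageSizeThreshold = 1_048_576; // 1 MiB page target

    // Predicted count at which the page is full: 4000 * (1 MiB / 512 KiB) = 8000.
    // Next check halfway between 4000 and 8000, i.e. around 6000.
    int next = (int) (valueCount + ((float) valueCount * pageSizeThreshold / memSize)) / 2 + 1;
    System.out.println("next size check at value count: " + next); // 6001
  }
}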
private void startRowGroup() {
  try {
    this.nextRowGroupSize = min(writer.getNextRowGroupSize(), targetRowGroupSize);
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }
  this.nextCheckRecordCount = min(max(recordCount / 2, 100), 10000);
  this.recordCount = 0;

  PageWriteStore pageStore = pageStoreCtor.newInstance(
      compressor, parquetSchema, props.getAllocator());

  this.flushPageStoreToWriter = flushToWriter.bind(pageStore);
  this.writeStore = props.newColumnWriteStore(parquetSchema, pageStore);

  model.setColumnStore(writeStore);
}
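The next flush check lands at half the previous row group's record count, clamped to [100, 10000], so new groups are checked early while later checks stay bounded. A standalone sketch of the clamp:

// Standalone illustration of the clamped "next check" schedule.
public final class NextCheckClampDemo {
  static long nextCheck(long recordCount) {
    return Math.min(Math.max(recordCount / 2, 100), 10000);
  }

  public static void main(String[] args) {
    System.out.println(nextCheck(0));         // 100   (floor for brand-new groups)
    System.out.println(nextCheck(5_000));     // 2500  (half the previous count)
    System.out.println(nextCheck(1_000_000)); // 10000 (ceiling keeps checks frequent)
  }
}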
public ColumnWriteStoreV2(
    MessageType schema,
    PageWriteStore pageWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterV2> mcolumns = new TreeMap<ColumnDescriptor, ColumnWriterV2>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    mcolumns.put(path, new ColumnWriterV2(path, pageWriter, props));
  }
  this.columns = unmodifiableMap(mcolumns);
  this.writers = this.columns.values();

  this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();
}
private void initStore() {
  pageStore = new ColumnChunkPageWriteStore(compressor, schema, props.getAllocator(),
      props.getColumnIndexTruncateLength());
  columnStore = props.newColumnWriteStore(schema, pageStore);
  MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
  this.recordConsumer = columnIO.getRecordWriter(columnStore);
  writeSupport.prepareForWrite(recordConsumer);
}
// (tail of a deprecated ParquetWriter constructor, delegating to the primary one)
this(file, mode, writeSupport, compressionCodecName, blockSize,
    validating, conf, MAX_PADDING_SIZE_DEFAULT,
    ParquetProperties.builder()
        .withPageSize(pageSize)
        .withDictionaryPageSize(dictionaryPageSize)
        // ... remaining builder options elided in this excerpt ...
        .build());

// (primary ParquetWriter constructor: create and start the file writer, then wire up
// the codec factory and the internal record writer)
WriteSupport.WriteContext writeContext = writeSupport.init(conf);
MessageType schema = writeContext.getSchema();

ParquetFileWriter fileWriter = new ParquetFileWriter(
    file, schema, mode, rowGroupSize, maxPaddingSize,
    encodingProps.getColumnIndexTruncateLength());
fileWriter.start();

this.codecFactory = new CodecFactory(conf, encodingProps.getPageSizeThreshold());
CodecFactory.BytesCompressor compressor = codecFactory.getCompressor(compressionCodecName);
this.writer = new InternalParquetRecordWriter<T>(
    fileWriter, writeSupport, schema, writeContext.getExtraMetaData(),
    rowGroupSize, compressor, validating, encodingProps);
@Override
public void initialize(ParquetProperties properties) {
  if (properties.getWriterVersion() == WriterVersion.PARQUET_1_0) {
    delegateFactory = DEFAULT_V1_WRITER_FACTORY;
  } else {
    delegateFactory = DEFAULT_V2_WRITER_FACTORY;
  }
  delegateFactory.initialize(properties);
}
static ValuesWriter dictWriterWithFallBack(ColumnDescriptor path, ParquetProperties parquetProperties,
    Encoding dictPageEncoding, Encoding dataPageEncoding, ValuesWriter writerToFallBackTo) {
  if (parquetProperties.isEnableDictionary()) {
    return FallbackValuesWriter.of(
        dictionaryWriter(path, parquetProperties, dictPageEncoding, dataPageEncoding),
        writerToFallBackTo);
  } else {
    return writerToFallBackTo;
  }
}
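FallbackValuesWriter composes two writers: values go to the dictionary writer until the dictionary outgrows its threshold, after which the plain writer takes over (the real implementation also re-encodes what was already buffered). A simplified standalone sketch of the composition, not the Parquet classes:

import java.util.LinkedHashSet;
import java.util.Set;

// Simplified standalone illustration of the dictionary-with-fallback pattern:
// try dictionary encoding first, fall back once the dictionary grows too big.
public final class FallbackDemo {
  interface Writer { void write(String value); }

  static final class PlainWriter implements Writer {
    public void write(String value) { /* write the value verbatim */ }
  }

  static final class DictThenPlain implements Writer {
    private final Set<String> dictionary = new LinkedHashSet<>();
    private final Writer fallback;
    private final int maxDictSize;
    private boolean fellBack = false;

    DictThenPlain(Writer fallback, int maxDictSize) {
      this.fallback = fallback;
      this.maxDictSize = maxDictSize;
    }

    public void write(String value) {
      if (!fellBack) {
        dictionary.add(value);
        if (dictionary.size() > maxDictSize) {
          fellBack = true; // dictionary too large: switch to plain encoding
          System.out.println("falling back to plain encoding");
        }
      }
      if (fellBack) {
        fallback.write(value);
      }
    }
  }

  public static void main(String[] args) {
    Writer writer = new DictThenPlain(new PlainWriter(), 2);
    for (String v : new String[] {"a", "b", "a", "c", "d"}) {
      writer.write(v);
    }
  }
}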
@SuppressWarnings("unchecked") ParquetWriter(Configuration conf, OutputFile output, Schema schema, long rowGroupSize, Map<String, String> metadata, Function<MessageType, ParquetValueWriter<?>> createWriterFunc, CompressionCodecName codec) { this.output = output; this.targetRowGroupSize = rowGroupSize; this.metadata = ImmutableMap.copyOf(metadata); this.compressor = new CodecFactory(conf, props.getPageSizeThreshold()).getCompressor(codec); this.parquetSchema = convert(schema, "table"); this.model = (ParquetValueWriter<T>) createWriterFunc.apply(parquetSchema); try { this.writer = new ParquetFileWriter(ParquetIO.file(output, conf), parquetSchema, ParquetFileWriter.Mode.OVERWRITE, rowGroupSize, 0); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to create Parquet file"); } try { writer.start(); } catch (IOException e) { throw new RuntimeIOException(e, "Failed to start Parquet file writer"); } startRowGroup(); }
public ParquetProperties build() {
  ParquetProperties properties =
      new ParquetProperties(writerVersion, pageSize, dictPageSize,
          enableDict, minRowCountForPageSizeCheck, maxRowCountForPageSizeCheck,
          estimateNextSizeCheck, allocator, valuesWriterFactory, columnIndexTruncateLength,
          pageRowCountLimit);
  // we pass a constructed but uninitialized factory to ParquetProperties above as currently
  // creation of ValuesWriters is invoked from within ParquetProperties. In the future
  // we'd like to decouple that and won't need to pass an object to properties and then pass the
  // properties to the object.
  valuesWriterFactory.initialize(properties);
  return properties;
}
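A hedged usage sketch of the builder, restricted to builder methods that appear in these excerpts; the sizes are arbitrary:

// Usage sketch (values are arbitrary; only builder methods shown in these excerpts are used).
ParquetProperties props = ParquetProperties.builder()
    .withPageSize(1024 * 1024)          // 1 MiB data page threshold
    .withDictionaryPageSize(512 * 1024) // 512 KiB dictionary page threshold
    .withDictionaryEncoding(true)       // enable dictionary writers with fallback
    .build();

// The returned properties then drive writer construction, e.g.:
// ValuesWriter writer = props.newValuesWriter(path);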
ParquetProperties parquetProperties = ParquetProperties.builder()
    .withPageSize(pageSize)
    .withDictionaryEncoding(enableDictionary)
    // ... remaining builder options elided in this excerpt ...
    .build();
ColumnWriteStoreBase(
    MessageType schema,
    PageWriteStore pageWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);

  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    mcolumns.put(path, createColumnWriter(path, pageWriter, props));
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
// Tails of three ParquetRecordWriter constructors; the leading parameters are elided in this excerpt.

// Deprecated variant, signature ending "..., boolean validating, WriterVersion writerVersion":
ParquetProperties props = ParquetProperties.builder()
    .withPageSize(pageSize)
    .withDictionaryPageSize(dictionaryPageSize)
    // ... remaining builder options elided ...
    .build();

// Deprecated variant, signature ending "..., WriterVersion writerVersion, MemoryManager memoryManager":
// builds its ParquetProperties with the same builder calls shown above.

// Current variant, signature ending "..., MemoryManager memoryManager, Configuration conf":
this.codecFactory = new CodecFactory(conf, props.getPageSizeThreshold());
internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema, extraMetaData,
    blockSize, codecFactory.getCompressor(codec), validating, props);