private void readPageV2(DataPageV2 page) {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn = newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels());
  this.definitionLevelColumn = newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels());
  try {
    LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount + " records");
    initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
  try {
    if (maxLevel == 0) {
      return new NullIntIterator();
    }
    return new RLEIntIterator(
        new RunLengthBitPackingHybridDecoder(
            BytesUtils.getWidthFromMaxInt(maxLevel),
            new ByteArrayInputStream(bytes.toByteArray())));
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read levels in page for col " + descriptor, e);
  }
}
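For context, a minimal sketch of the same level decoding done by hand, using the RLE/bit-packing hybrid decoder that the iterator above wraps. The helper name decodeLevels and the levelBytes/valueCount parameters are hypothetical stand-ins for values a real reader would already hold:

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;

// Hypothetical helper: decode valueCount levels from the raw level bytes of a page.
static int[] decodeLevels(byte[] levelBytes, int maxLevel, int valueCount) throws IOException {
  int[] levels = new int[valueCount];
  if (maxLevel == 0) {
    return levels; // no levels are stored when the max level is 0; everything is 0
  }
  RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder(
      BytesUtils.getWidthFromMaxInt(maxLevel),
      new ByteArrayInputStream(levelBytes));
  for (int i = 0; i < valueCount; i++) {
    levels[i] = decoder.readInt();
  }
  return levels;
}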
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  int uncompressedSize = (int) dictionaryBytes.size();
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize,
      dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
private static DictionaryPage reusableCopy(DictionaryPage dict) {
  if (dict == null) {
    return null;
  }
  try {
    return new DictionaryPage(
        BytesInput.from(dict.getBytes().toByteArray()),
        dict.getDictionarySize(),
        dict.getEncoding());
  } catch (IOException e) {
    throw new ParquetDecodingException("Cannot read dictionary", e);
  }
}
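The two snippets above use the two DictionaryPage constructors: the three-argument form holds uncompressed bytes, while the four-argument form records the original uncompressed size next to the compressed bytes. A hedged sketch of both, where dictBytes, numValues, and compressor are stand-ins for values a real writer would supply:

import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.page.DictionaryPage;

// Uncompressed page: the size is implied by the bytes themselves.
DictionaryPage uncompressed =
    new DictionaryPage(BytesInput.from(dictBytes), numValues, Encoding.PLAIN);

// Compressed page: the original uncompressed size must be carried explicitly,
// as writeDictionaryPage above does before compressing.
DictionaryPage compressed = new DictionaryPage(
    compressor.compress(BytesInput.from(dictBytes)),
    dictBytes.length, // uncompressedSize
    numValues,
    Encoding.PLAIN);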
/**
 * writes a dictionary page
 * @param dictionaryPage the dictionary page
 * @throws IOException if there is an error while writing
 */
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  state = state.write();
  LOG.debug("{}: write dictionary page: {} values", out.getPos(), dictionaryPage.getDictionarySize());
  currentChunkDictionaryPageOffset = out.getPos();
  int uncompressedSize = dictionaryPage.getUncompressedSize();
  int compressedPageSize = (int) dictionaryPage.getBytes().size(); // TODO: fix casts
  metadataConverter.writeDictionaryPageHeader(
      uncompressedSize,
      compressedPageSize,
      dictionaryPage.getDictionarySize(),
      dictionaryPage.getEncoding(),
      out);
  long headerSize = out.getPos() - currentChunkDictionaryPageOffset;
  this.uncompressedLength += uncompressedSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write dictionary page content {}", out.getPos(), compressedPageSize);
  dictionaryPage.getBytes().writeAllTo(out);
  encodingStatsBuilder.addDictEncoding(dictionaryPage.getEncoding());
  currentEncodings.add(dictionaryPage.getEncoding());
}
/**
 * Creates a reader for definition and repetition levels, returning an optimized one if
 * the levels are not needed.
 */
protected static IntIterator createRLEIterator(
    int maxLevel, BytesInput bytes, ColumnDescriptor descriptor) throws IOException {
  try {
    if (maxLevel == 0) return new NullIntIterator();
    return new RLEIntIterator(
        new RunLengthBitPackingHybridDecoder(
            BytesUtils.getWidthFromMaxInt(maxLevel),
            bytes.toInputStream()));
  } catch (IOException e) {
    throw new IOException("could not read levels in page for col " + descriptor, e);
  }
}
@Override
public BytesInput getBytes() {
  int maxDicId = getDictionarySize() - 1;
  LOG.debug("max dic id {}", maxDicId);
  int bitWidth = BytesUtils.getWidthFromMaxInt(maxDicId);
  int initialSlabSize =
      CapacityByteArrayOutputStream.initialSlabSizeHeuristic(MIN_INITIAL_SLAB_SIZE, maxDictionaryByteSize, 10);
  RunLengthBitPackingHybridEncoder encoder =
      new RunLengthBitPackingHybridEncoder(bitWidth, initialSlabSize, maxDictionaryByteSize, this.allocator);
  encoders.add(encoder);
  IntIterator iterator = encodedValues.iterator();
  try {
    while (iterator.hasNext()) {
      encoder.writeInt(iterator.next());
    }
    // encodes the bit width
    byte[] bytesHeader = new byte[] { (byte) bitWidth };
    BytesInput rleEncodedBytes = encoder.toBytes();
    LOG.debug("rle encoded bytes {}", rleEncodedBytes.size());
    BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
    // remember size of dictionary when we last wrote a page
    lastUsedDictionarySize = getDictionarySize();
    lastUsedDictionaryByteSize = dictionaryByteSize;
    return bytes;
  } catch (IOException e) {
    throw new ParquetEncodingException("could not encode the values", e);
  }
}
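The page layout produced above is a single header byte holding the bit width, followed by the RLE/bit-packed dictionary ids. A sketch of the matching read path under that assumption; the helper name decodeDictionaryIds is hypothetical:

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;

// Hypothetical helper: invert the [bit width byte][RLE ids] layout written by getBytes().
static int[] decodeDictionaryIds(byte[] pageBytes, int valueCount) throws IOException {
  ByteArrayInputStream in = new ByteArrayInputStream(pageBytes);
  int bitWidth = in.read(); // the single header byte written as bytesHeader above
  RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder(bitWidth, in);
  int[] ids = new int[valueCount];
  for (int i = 0; i < valueCount; i++) {
    ids[i] = decoder.readInt();
  }
  return ids;
}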
@SuppressWarnings("unused")
@Override
public void writeAllTo(OutputStream out) throws IOException {
  for (BytesInput input : inputs) {
    LOG.debug("write {} bytes to out", input.size());
    if (input instanceof SequenceBytesIn) LOG.debug("{");
    input.writeAllTo(out);
    if (input instanceof SequenceBytesIn) LOG.debug("}");
  }
}
/**
 * @param dictionaryPage a dictionary page of encoded double values
 * @throws IOException if there is an exception while decoding the dictionary page
 */
public PlainDoubleDictionary(DictionaryPage dictionaryPage) throws IOException {
  super(dictionaryPage);
  ByteBufferInputStream in = dictionaryPage.getBytes().toInputStream();
  doubleDictionaryContent = new double[dictionaryPage.getDictionarySize()];
  DoublePlainValuesReader doubleReader = new DoublePlainValuesReader();
  doubleReader.initFromPage(dictionaryPage.getDictionarySize(), in);
  for (int i = 0; i < doubleDictionaryContent.length; i++) {
    doubleDictionaryContent[i] = doubleReader.readDouble();
  }
}
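Once built, a dictionary is consulted by id during value decoding. A brief hedged usage sketch, assuming `page` is a dictionary page of PLAIN-encoded doubles (in the real library this class is a nested type of PlainValuesDictionary):

// `page` is assumed to be a DictionaryPage of PLAIN-encoded doubles.
PlainDoubleDictionary dict = new PlainDoubleDictionary(page);
double first = dict.decodeToDouble(0); // look up the value stored behind dictionary id 0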
public static Dictionary readDictionary(FSDataInputStream in, ColumnDescriptor column,
    PageHeaderWithOffset pageHeader, BytesDecompressor decompressor) throws IOException {
  in.seek(pageHeader.getOffset());
  final byte[] data = new byte[pageHeader.getPageHeader().getCompressed_page_size()];
  int read = in.read(data);
  if (read != data.length) {
    throw new IOException(format("Failed to read dictionary page, read %d bytes, expected %d", read, data.length));
  }
  final DictionaryPage dictionaryPage = new DictionaryPage(
      decompressor.decompress(BytesInput.from(data), pageHeader.getPageHeader().getUncompressed_page_size()),
      pageHeader.getPageHeader().getDictionary_page_header().getNum_values(),
      CONVERTER.getEncoding(pageHeader.getPageHeader().getDictionary_page_header().getEncoding()));
  return dictionaryPage.getEncoding().initDictionary(column, dictionaryPage);
}
/**
 * copies the input into a new byte array
 * @param bytesInput a BytesInput
 * @return a copy of the BytesInput
 * @throws IOException if there is an exception when reading bytes from the BytesInput
 */
public static BytesInput copy(BytesInput bytesInput) throws IOException {
  return from(bytesInput.toByteArray());
}
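copy matters because some BytesInput instances are lazy views over streams or reusable buffers that are only safe to consume once; copying materializes them into a stable byte array. A small sketch using only calls that appear in the snippets here:

import org.apache.parquet.bytes.BytesInput;

BytesInput a = BytesInput.from(new byte[] { 1, 2 });
BytesInput b = BytesInput.from(new byte[] { 3 });
BytesInput joined = BytesInput.concat(a, b); // a lazy view, not yet materialized
BytesInput stable = BytesInput.copy(joined); // materialized into a fresh byte[]
assert stable.size() == 3;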
/**
 * @return the bytes representing the packed values
 * @throws IOException if there is an exception while creating the BytesInput
 */
public BytesInput toBytes() throws IOException {
  int packedByteLength = packedPosition + BytesUtils.paddedByteCountFromBits(inputSize * bitWidth);
  LOG.debug("writing {} bytes", (totalFullSlabSize + packedByteLength));
  if (inputSize > 0) {
    // zero out the unused tail of the current input slab before packing it
    for (int i = inputSize; i < input.length; i++) {
      input[i] = 0;
    }
    pack();
  }
  return concat(concat(slabs), BytesInput.from(packed, 0, packedByteLength));
}
final ByteBuffer dictionaryBytes = dictionaryPage.getBytes().toByteBuffer();
binaryDictionaryContent = new Binary[dictionaryPage.getDictionarySize()];
int len = readIntLittleEndian(dictionaryBytes, offset);
/**
 * Compress a given buffer of bytes
 * @param bytes the uncompressed input
 * @return the Snappy-compressed bytes
 * @throws IOException if compression fails
 */
@Override
public BytesInput compress(BytesInput bytes) throws IOException {
  int maxOutputSize = Snappy.maxCompressedLength((int) bytes.size());
  ByteBuffer bufferIn = bytes.toByteBuffer();
  outgoing = ensure(outgoing, maxOutputSize);
  final int size;
  if (bufferIn.isDirect()) {
    size = Snappy.compress(bufferIn, outgoing);
  } else {
    // Snappy library requires buffers be direct
    this.incoming = ensure(this.incoming, (int) bytes.size());
    this.incoming.put(bufferIn);
    this.incoming.flip();
    size = Snappy.compress(this.incoming, outgoing);
  }
  outgoing.limit(size);
  return BytesInput.from(outgoing);
}
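The decompress side mirrors this, with the same direct-buffer requirement. A hedged sketch using the xerial snappy-java calls, where `compressed` is assumed to be a direct buffer positioned at the compressed bytes:

import java.io.IOException;
import java.nio.ByteBuffer;

import org.xerial.snappy.Snappy;

static ByteBuffer snappyDecompress(ByteBuffer compressed) throws IOException {
  // Snappy records the uncompressed length inside the compressed stream.
  ByteBuffer out = ByteBuffer.allocateDirect(Snappy.uncompressedLength(compressed));
  Snappy.uncompress(compressed, out);
  return out;
}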
if (dictionaryPage == null) {
  dictionaryPage = new DictionaryPage(
      decompressor.decompress(BytesInput.from(in, pageHeader.compressed_page_size), pageHeader.getUncompressed_page_size()),
      pageHeader.uncompressed_page_size,
      parquetMetadataConverter.getEncoding(pageHeader.dictionary_page_header.encoding));
}

buffer.flip();
return new DataPageV1(
    decompressor.decompress(BytesInput.from(buffer), pageHeader.getUncompressed_page_size()),
    pageHeader.data_page_header.num_values,
    pageHeader.uncompressed_page_size,

BytesInput decompressedPageData = decompressor.decompress(
    BytesInput.from(buffer), pageHeader.uncompressed_page_size);
ByteBuffer byteBuffer = decompressedPageData.toByteBuffer();
int limit = byteBuffer.limit();
// slice the decompressed V2 page into its [rep levels][def levels][data] sections
byteBuffer.limit(dataHeaderV2.getRepetition_levels_byte_length());
BytesInput repetitionLevels = BytesInput.from(byteBuffer.slice());
byteBuffer.position(dataHeaderV2.getRepetition_levels_byte_length());
byteBuffer.limit(dataHeaderV2.getRepetition_levels_byte_length() + dataHeaderV2.getDefinition_levels_byte_length());
BytesInput definitionLevels = BytesInput.from(byteBuffer.slice());
byteBuffer.position(dataHeaderV2.getRepetition_levels_byte_length() + dataHeaderV2.getDefinition_levels_byte_length());
byteBuffer.limit(limit);
BytesInput data = BytesInput.from(byteBuffer.slice());
/**
 * getBytes will trigger flushing block buffer, DO NOT write after getBytes() is called without calling reset()
 *
 * @return a BytesInput that contains the encoded page data
 */
@Override
public BytesInput getBytes() {
  // The Page Header should include: blockSizeInValues, numberOfMiniBlocks, totalValueCount
  if (deltaValuesToFlush != 0) {
    flushBlockBuffer();
  }
  return BytesInput.concat(
      config.toBytesInput(),
      BytesInput.fromUnsignedVarInt(totalValueCount),
      BytesInput.fromZigZagVarLong(firstValue),
      BytesInput.from(baos));
}
/**
 * getBytes will trigger flushing block buffer, DO NOT write after getBytes() is called without calling reset()
 *
 * @return a BytesInput that contains the encoded page data
 */
@Override
public BytesInput getBytes() {
  // The Page Header should include: blockSizeInValues, numberOfMiniBlocks, totalValueCount
  if (deltaValuesToFlush != 0) {
    flushBlockBuffer();
  }
  return BytesInput.concat(
      config.toBytesInput(),
      BytesInput.fromUnsignedVarInt(totalValueCount),
      BytesInput.fromZigZagVarInt(firstValue),
      BytesInput.from(baos));
}
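The two getBytes methods above are the long and int flavors of the same delta-binary-packing writer; the only difference is whether the first value is written as a zig-zag var long or a zig-zag var int. Both share the contract in the javadoc: getBytes() flushes the pending block, so the writer must be reset before writing again. A minimal lifecycle sketch, where `writer` stands in for either variant (the long flavor would use writeLong):

writer.writeInteger(3);
writer.writeInteger(5);
BytesInput page = writer.getBytes();   // flushes the pending block; do not write again yet
byte[] pageBytes = page.toByteArray(); // materialize before reset, in case the bytes are a view
writer.reset();                        // now the writer may encode the next page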
private DictionaryPage readDictionaryPageHelper(PageHeader pageHeader) throws IOException {
  ByteBuffer data = uncompressPage(pageHeader, false);
  return new DictionaryPage(
      BytesInput.from(data, 0, pageHeader.uncompressed_page_size),
      pageHeader.getDictionary_page_header().getNum_values(),
      parquetMetadataConverter.getEncoding(pageHeader.dictionary_page_header.encoding));
}
@Override
public BytesInput decompress(BytesInput bytes, int uncompressedSize) throws IOException {
  final BytesInput decompressed;
  if (codec != null) {
    decompressor.reset();
    InputStream is = codec.createInputStream(bytes.toInputStream(), decompressor);
    decompressed = BytesInput.from(is, uncompressedSize);
  } else {
    decompressed = bytes;
  }
  return decompressed;
}
@Override
public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int uncompressedSize)
    throws IOException {
  ByteBuffer decompressed = decompress(BytesInput.from(input), uncompressedSize).toByteBuffer();
  output.put(decompressed);
}