private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
  try {
    if (maxLevel == 0) {
      return new NullIntIterator();
    }
    return new RLEIntIterator(
        new RunLengthBitPackingHybridDecoder(
            BytesUtils.getWidthFromMaxInt(maxLevel),
            new ByteArrayInputStream(bytes.toByteArray())));
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read levels in page for col " + descriptor, e);
  }
}
/**
 * Initializes the internal state for decoding ints of `bitWidth`.
 */
private void init(int bitWidth) {
  Preconditions.checkArgument(bitWidth >= 0 && bitWidth <= 32, "bitWidth must be >= 0 and <= 32");
  this.bitWidth = bitWidth;
  this.bytesWidth = BytesUtils.paddedByteCountFromBits(bitWidth);
  this.packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth);
}
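paddedByteCountFromBits rounds a bit width up to whole bytes, which is what lets the decoder above size its byte-level buffers. A minimal sketch of the equivalent arithmetic (a hypothetical standalone helper, not the Parquet source itself):

// Rounds a bit count up to the number of whole bytes needed to hold it.
// Assumed to match BytesUtils.paddedByteCountFromBits.
static int paddedByteCount(int bitLength) {
  return (bitLength + 7) / 8; // e.g. 3 bits -> 1 byte, 9 bits -> 2 bytes
}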
      - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length;
  stream.seek(footerLengthIndex);
  int footerLength = BytesUtils.readIntLittleEndian(stream);
  stream.seek(footerLengthIndex - footerLength);
  if (LOG.isInfoEnabled()) {
/**
 * Creates a reader for definition and repetition levels, returning an optimized one if
 * the levels are not needed.
 */
protected static IntIterator createRLEIterator(
    int maxLevel, BytesInput bytes, ColumnDescriptor descriptor) throws IOException {
  try {
    if (maxLevel == 0) {
      return new NullIntIterator();
    }
    return new RLEIntIterator(
        new RunLengthBitPackingHybridDecoder(
            BytesUtils.getWidthFromMaxInt(maxLevel),
            bytes.toInputStream()));
  } catch (IOException e) {
    throw new IOException("could not read levels in page for col " + descriptor, e);
  }
}
@Override
public BytesInput getBytes() {
  int maxDicId = getDictionarySize() - 1;
  LOG.debug("max dic id {}", maxDicId);
  int bitWidth = BytesUtils.getWidthFromMaxInt(maxDicId);
  int initialSlabSize =
      CapacityByteArrayOutputStream.initialSlabSizeHeuristic(MIN_INITIAL_SLAB_SIZE, maxDictionaryByteSize, 10);
  RunLengthBitPackingHybridEncoder encoder =
      new RunLengthBitPackingHybridEncoder(bitWidth, initialSlabSize, maxDictionaryByteSize, this.allocator);
  encoders.add(encoder);
  IntIterator iterator = encodedValues.iterator();
  try {
    while (iterator.hasNext()) {
      encoder.writeInt(iterator.next());
    }
    // encodes the bit width
    byte[] bytesHeader = new byte[] { (byte) bitWidth };
    BytesInput rleEncodedBytes = encoder.toBytes();
    LOG.debug("rle encoded bytes {}", rleEncodedBytes.size());
    BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
    // remember size of dictionary when we last wrote a page
    lastUsedDictionarySize = getDictionarySize();
    lastUsedDictionaryByteSize = dictionaryByteSize;
    return bytes;
  } catch (IOException e) {
    throw new ParquetEncodingException("could not encode the values", e);
  }
}
/**
 * @param bound the maximum value stored by this column
 */
public BitPackingValuesReader(int bound) {
  this.bitsPerValue = getWidthFromMaxInt(bound);
}
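getWidthFromMaxInt returns the minimum number of bits needed to hold any value up to the bound. A minimal sketch of the equivalent computation (hypothetical helper; assumed to behave like the BytesUtils method):

// Minimum bit width needed to represent values in [0, maxInt].
static int widthFromMax(int maxInt) {
  return 32 - Integer.numberOfLeadingZeros(maxInt); // e.g. 7 -> 3 bits, 8 -> 4 bits
}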
public static int readIntLittleEndianPaddedOnBitWidth(InputStream in, int bitWidth) throws IOException {
  int bytesWidth = paddedByteCountFromBits(bitWidth);
  switch (bytesWidth) {
    case 0:
      return 0;
    case 1:
      return BytesUtils.readIntLittleEndianOnOneByte(in);
    case 2:
      return BytesUtils.readIntLittleEndianOnTwoBytes(in);
    case 3:
      return BytesUtils.readIntLittleEndianOnThreeBytes(in);
    case 4:
      return BytesUtils.readIntLittleEndian(in);
    default:
      throw new IOException(
          String.format("Encountered bitWidth (%d) that requires more than 4 bytes", bitWidth));
  }
}
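The per-width readers assemble the int least-significant byte first. A minimal sketch of the two-byte case (a hypothetical standalone method, not the Parquet source itself):

// Reads a little-endian int stored on two bytes: low byte first, then high byte.
static int readOnTwoBytes(java.io.InputStream in) throws java.io.IOException {
  int ch1 = in.read(); // least significant byte
  int ch2 = in.read(); // most significant byte
  if ((ch1 | ch2) < 0) {
    throw new java.io.EOFException();
  }
  return (ch2 << 8) | ch1;
}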
/**
 * @return the bytes representing the packed values
 * @throws IOException if there is an exception while creating the BytesInput
 */
public BytesInput toBytes() throws IOException {
  int packedByteLength = packedPosition + BytesUtils.paddedByteCountFromBits(inputSize * bitWidth);
  LOG.debug("writing {} bytes", (totalFullSlabSize + packedByteLength));
  if (inputSize > 0) {
    for (int i = inputSize; i < input.length; i++) {
      input[i] = 0;
    }
    pack();
  }
  return concat(concat(slabs), BytesInput.from(packed, 0, packedByteLength));
}
    throws IOException {
  int bytesWidth = paddedByteCountFromBits(bitWidth);
  switch (bytesWidth) {
    case 0:
      break;
    case 1:
      writeIntLittleEndianOnOneByte(out, v);
      break;
    case 2:
      writeIntLittleEndianOnTwoBytes(out, v);
      break;
    case 3:
      writeIntLittleEndianOnThreeBytes(out, v);
      break;
    case 4:
      writeIntLittleEndian(out, v);
      break;
    default:
final ByteBuffer dictionaryBytes = dictionaryPage.getBytes().toByteBuffer();
binaryDictionaryContent = new Binary[dictionaryPage.getDictionarySize()];
int len = readIntLittleEndian(dictionaryBytes, offset);
@Override
public void initFromPage(int valueCount, byte[] page, int offset) throws IOException {
  checkArgument(page.length > offset, "Attempt to read offset not in the page");
  ByteArrayInputStream in = new ByteArrayInputStream(page, offset, page.length - offset);
  int bitWidth = BytesUtils.readIntLittleEndianOnOneByte(in);
  decoder = new RunLengthBitPackingHybridDecoder(bitWidth, in);
}
@Override
public void writeAllTo(OutputStream out) throws IOException {
  BytesUtils.writeUnsignedVarInt(intValue, out);
}
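writeUnsignedVarInt emits seven payload bits per byte, setting the high bit while more bytes follow. A minimal sketch of that encoding loop (an assumption about the exact implementation, shown for illustration):

// Writes `value` as an unsigned LEB128-style varint.
static void writeUnsignedVarInt(int value, java.io.OutputStream out) throws java.io.IOException {
  while ((value & 0xFFFFFF80) != 0) {
    out.write((value & 0x7F) | 0x80); // payload bits plus continuation flag
    value >>>= 7;
  }
  out.write(value & 0x7F); // final byte, continuation flag clear
}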
private void writeRleRun() throws IOException {
  // we may have been working on a bit-packed-run
  // so close that run if it exists before writing this
  // rle-run
  endPreviousBitPackedRun();

  // write the rle-header (lsb of 0 signifies a rle run)
  BytesUtils.writeUnsignedVarInt(repeatCount << 1, baos);
  // write the repeated-value
  BytesUtils.writeIntLittleEndianPaddedOnBitWidth(baos, previousValue, bitWidth);

  // reset the repeat count
  repeatCount = 0;

  // throw away all the buffered values, they were just repeats and they've been written
  numBufferedValues = 0;
}
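As a worked example of the layout this produces (values chosen for illustration): a run of 10 copies of the value 5 at bitWidth 3 becomes the varint header 10 << 1 = 20, followed by the value padded to one byte.

// Hypothetical illustration of an RLE run, built with the same BytesUtils calls:
ByteArrayOutputStream baos = new ByteArrayOutputStream();
BytesUtils.writeUnsignedVarInt(10 << 1, baos);               // header 20; lsb 0 marks an RLE run
BytesUtils.writeIntLittleEndianPaddedOnBitWidth(baos, 5, 3); // value 5 stored on 1 padded byte
// baos now holds the two bytes [0x14, 0x05]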
/**
 * Eagerly loads all the data into memory.
 */
@Override
public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException {
  this.in = stream;
  this.config = DeltaBinaryPackingConfig.readConfig(in);
  this.totalValueCount = BytesUtils.readUnsignedVarInt(in);
  allocateValuesBuffer();
  bitWidths = new int[config.miniBlockNumInABlock];

  // read first value from header
  valuesBuffer[valuesBuffered++] = BytesUtils.readZigZagVarLong(in);

  while (valuesBuffered < totalValueCount) {
    // valuesBuffered could be more than totalValueCount, since we flush on a mini block basis
    loadNewBlockToBuffer();
  }
}
private void readNext() throws IOException {
  Preconditions.checkArgument(in.available() > 0, "Reading past RLE/BitPacking stream.");
  final int header = BytesUtils.readUnsignedVarInt(in);
  mode = (header & 1) == 0 ? MODE.RLE : MODE.PACKED;
  switch (mode) {
    case RLE:
      currentCount = header >>> 1;
      LOG.debug("reading {} values RLE", currentCount);
      currentValue = BytesUtils.readIntLittleEndianPaddedOnBitWidth(in, bitWidth);
      break;
    case PACKED:
      int numGroups = header >>> 1;
      currentCount = numGroups * 8;
      LOG.debug("reading {} values BIT PACKED", currentCount);
      currentBuffer = new int[currentCount]; // TODO: reuse a buffer
      byte[] bytes = new byte[numGroups * bitWidth];
      // At the end of the file RLE data though, there might not be that many bytes left.
      int bytesToRead = (int) Math.ceil(currentCount * bitWidth / 8.0);
      bytesToRead = Math.min(bytesToRead, in.available());
      new DataInputStream(in).readFully(bytes, 0, bytesToRead);
      for (int valueIndex = 0, byteIndex = 0; valueIndex < currentCount; valueIndex += 8, byteIndex += bitWidth) {
        packer.unpack8Values(bytes, byteIndex, currentBuffer, valueIndex);
      }
      break;
    default:
      throw new ParquetDecodingException("not a valid mode " + mode);
  }
}
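The header varint therefore does double duty: its low bit selects the mode and the remaining bits carry the run length or group count. A small illustration with hypothetical header values:

// Decoded the same way readNext() does:
int header = 20;                 // lsb 0 => RLE: 20 >>> 1 = 10 repeated values follow
assert (header & 1) == 0 && (header >>> 1) == 10;

header = 5;                      // lsb 1 => bit-packed: 2 groups of 8 = 16 values follow
assert (header & 1) == 1 && (header >>> 1) * 8 == 16;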
/**
 * Uses a trick mentioned in https://developers.google.com/protocol-buffers/docs/encoding to read zigzag-encoded data.
 * @param in an input stream
 * @return the value of a zig-zag varint read from the current position in the stream
 * @throws IOException if there is an exception while reading
 */
public static int readZigZagVarInt(InputStream in) throws IOException {
  int raw = readUnsignedVarInt(in);
  int temp = (((raw << 31) >> 31) ^ raw) >> 1;
  return temp ^ (raw & (1 << 31));
}
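ZigZag interleaves negative and positive numbers so small magnitudes encode to small varints. A few worked decodings, using the common one-liner that is (by assumption) equivalent to the bit-twiddling above:

// ZigZag decoding maps raw 0, 1, 2, 3, 4, ... back to 0, -1, 1, -2, 2, ...
int[] raws    = { 0,  1, 2,  3, 4 };
int[] decoded = { 0, -1, 1, -2, 2 };
for (int i = 0; i < raws.length; i++) {
  assert ((raws[i] >>> 1) ^ -(raws[i] & 1)) == decoded[i];
}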
@Override
public void writeAllTo(OutputStream out) throws IOException {
  BytesUtils.writeIntLittleEndian(out, intValue);
}
protected void writeBitWidthForMiniBlock(int i) {
  try {
    BytesUtils.writeIntLittleEndianOnOneByte(baos, bitWidths[i]);
  } catch (IOException e) {
    throw new ParquetEncodingException("cannot write bitwidth for miniblock", e);
  }
}