/**
 * Attempts to decode a dictionary page from a raw byte buffer that starts with a page header.
 *
 * @param data bytes holding a serialized page header followed by the page payload
 * @param codecName compression codec passed to {@code decompress} for the payload
 * @return the decoded dictionary page, or {@link Optional#empty()} when the header is not of type
 *         {@code DICTIONARY_PAGE} or an {@link IOException} is raised while reading
 */
private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName) {
    try {
        ByteArrayInputStream headerStream = new ByteArrayInputStream(data);
        PageHeader header = Util.readPageHeader(headerStream);
        if (header.type == PageType.DICTIONARY_PAGE) {
            // Whatever the header parser did not consume is treated as the start of the payload.
            int payloadOffset = data.length - headerStream.available();
            Slice compressedPayload = wrappedBuffer(data, payloadOffset, header.getCompressed_page_size());

            DictionaryPageHeader dictionaryHeader = header.getDictionary_page_header();
            ParquetEncoding dictionaryEncoding =
                    getParquetEncoding(Encoding.valueOf(dictionaryHeader.getEncoding().name()));
            int entryCount = dictionaryHeader.getNum_values();

            return Optional.of(new DictionaryPage(
                    decompress(codecName, compressedPayload, header.getUncompressed_page_size()),
                    entryCount,
                    dictionaryEncoding));
        }
        return Optional.empty();
    }
    catch (IOException ignored) {
        // Best effort: a buffer that cannot be read is reported as "no dictionary".
        return Optional.empty();
    }
}
/**
 * Builds a {@code DataPageV1} from a V1 data page header and appends it to {@code pages}.
 *
 * @param pageHeader header whose {@code data_page_header} describes the page
 * @param uncompressedPageSize uncompressed size recorded on the created page
 * @param compressedPageSize length passed to {@code getSlice} to obtain the page bytes
 * @param pages list that receives the new page
 * @return the value count reported by the V1 data page header
 */
private long readDataPageV1(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize, List<DataPage> pages) {
    DataPageHeader v1Header = pageHeader.getData_page_header();

    // getSlice stays the first constructor argument so it is evaluated before the header lookups.
    DataPageV1 page = new DataPageV1(
            getSlice(compressedPageSize),
            v1Header.getNum_values(),
            uncompressedPageSize,
            MetadataReader.readStats(v1Header.getStatistics(), descriptor.getColumnDescriptor().getType()),
            getParquetEncoding(Encoding.valueOf(v1Header.getRepetition_level_encoding().name())),
            getParquetEncoding(Encoding.valueOf(v1Header.getDefinition_level_encoding().name())),
            getParquetEncoding(Encoding.valueOf(v1Header.getEncoding().name())));
    pages.add(page);

    return v1Header.getNum_values();
}
/**
 * Builds a {@code DataPageV2} from a V2 data page header and appends it to {@code pages}.
 *
 * <p>Three slices are requested from {@code getSlice}, in this order: repetition levels,
 * definition levels, then the remaining {@code compressedPageSize} bytes as values.
 *
 * @param pageHeader header whose {@code data_page_header_v2} describes the page
 * @param uncompressedPageSize uncompressed size recorded on the created page
 * @param compressedPageSize combined length of the level sections and the values section
 * @param pages list that receives the new page
 * @return the value count reported by the V2 data page header
 */
private long readDataPageV2(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize, List<DataPage> pages) {
    DataPageHeaderV2 v2Header = pageHeader.getData_page_header_v2();

    // Whatever is left after both level sections belongs to the values.
    int valuesLength = compressedPageSize
            - v2Header.getRepetition_levels_byte_length()
            - v2Header.getDefinition_levels_byte_length();

    DataPageV2 page = new DataPageV2(
            v2Header.getNum_rows(),
            v2Header.getNum_nulls(),
            v2Header.getNum_values(),
            getSlice(v2Header.getRepetition_levels_byte_length()),
            getSlice(v2Header.getDefinition_levels_byte_length()),
            getParquetEncoding(Encoding.valueOf(v2Header.getEncoding().name())),
            getSlice(valuesLength),
            uncompressedPageSize,
            MetadataReader.readStats(v2Header.getStatistics(), descriptor.getColumnDescriptor().getType()),
            v2Header.isIs_compressed());
    pages.add(page);

    return v2Header.getNum_values();
}
}
/**
 * Reads page headers until the data pages seen so far account for the column chunk's value count,
 * then wraps the collected pages in a {@code PageReader}.
 *
 * <p>Pages whose type is not a dictionary or data page are skipped by their compressed size.
 *
 * @return a reader over the collected data pages and the dictionary page ({@code null} if none was seen)
 * @throws ParquetCorruptionException if a second dictionary page is encountered
 * @throws IOException declared by the signature
 */
public PageReader readAllPages() throws IOException {
    List<DataPage> dataPages = new ArrayList<>();
    DictionaryPage dictionary = null;
    long valuesRead = 0;

    while (valuesRead < descriptor.getColumnChunkMetaData().getValueCount()) {
        PageHeader header = readPageHeader();
        int uncompressedSize = header.getUncompressed_page_size();
        int compressedSize = header.getCompressed_page_size();

        switch (header.type) {
            case DATA_PAGE:
                valuesRead += readDataPageV1(header, uncompressedSize, compressedSize, dataPages);
                break;
            case DATA_PAGE_V2:
                valuesRead += readDataPageV2(header, uncompressedSize, compressedSize, dataPages);
                break;
            case DICTIONARY_PAGE:
                // Only one dictionary page is accepted per column chunk.
                if (dictionary != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionary = readDictionaryPage(header, uncompressedSize, compressedSize);
                break;
            default:
                skip(compressedSize);
                break;
        }
    }

    return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), dataPages, dictionary);
}
/**
 * Returns the value of the requested field as an {@code Object}.
 *
 * <p>The integer-valued fields (page sizes and CRC) are boxed with {@link Integer#valueOf(int)}
 * rather than the deprecated {@code Integer(int)} constructor.
 *
 * @param field identifier of the field to read
 * @return the field's current value; boxed as an {@code Integer} for the page sizes and CRC
 * @throws IllegalStateException if {@code field} is not matched by any case below
 */
public Object getFieldValue(_Fields field) {
  switch (field) {
  case TYPE:
    return getType();

  case UNCOMPRESSED_PAGE_SIZE:
    return Integer.valueOf(getUncompressed_page_size());

  case COMPRESSED_PAGE_SIZE:
    return Integer.valueOf(getCompressed_page_size());

  case CRC:
    return Integer.valueOf(getCrc());

  case DATA_PAGE_HEADER:
    return getData_page_header();

  case INDEX_PAGE_HEADER:
    return getIndex_page_header();

  case DICTIONARY_PAGE_HEADER:
    return getDictionary_page_header();

  case DATA_PAGE_HEADER_V2:
    return getData_page_header_v2();

  }
  throw new IllegalStateException();
}
while (valuesCountReadSoFar < descriptor.metadata.getValueCount()) { PageHeader pageHeader = readPageHeader(); int uncompressedPageSize = pageHeader.getUncompressed_page_size(); int compressedPageSize = pageHeader.getCompressed_page_size(); switch (pageHeader.type) { case DICTIONARY_PAGE: DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); dictionaryPage = new DictionaryPage( break; case DATA_PAGE: DataPageHeader dataHeaderV1 = pageHeader.getData_page_header(); pagesInChunk.add( new DataPageV1( break; case DATA_PAGE_V2: DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2(); int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length(); pagesInChunk.add( this.readAsBytesInput(dataSize), uncompressedPageSize, fromParquetStatistics(dataHeaderV2.getStatistics(), descriptor.col.getType()), dataHeaderV2.isIs_compressed() ));
private PageHeader newDataPageHeader( int uncompressedSize, int compressedSize, int valueCount, parquet.column.statistics.Statistics statistics, parquet.column.Encoding rlEncoding, parquet.column.Encoding dlEncoding, parquet.column.Encoding valuesEncoding) { PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE, uncompressedSize, compressedSize); // TODO: pageHeader.crc = ...; pageHeader.setData_page_header(new DataPageHeader( valueCount, getEncoding(valuesEncoding), getEncoding(dlEncoding), getEncoding(rlEncoding))); if (!statistics.isEmpty()) { pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics)); } return pageHeader; }
/**
 * Builds a {@code DICTIONARY_PAGE} page header and hands it to {@code writePageHeader}.
 *
 * @param uncompressedSize uncompressed page size stored in the page header
 * @param compressedSize compressed page size stored in the page header
 * @param valueCount number of values stored in the dictionary page header
 * @param valuesEncoding encoding of the dictionary values
 * @param to stream passed to {@code writePageHeader}
 * @throws IOException declared by the signature
 */
public void writeDictionaryPageHeader(
    int uncompressedSize, int compressedSize, int valueCount,
    parquet.column.Encoding valuesEncoding, OutputStream to) throws IOException {
  PageHeader header = new PageHeader(PageType.DICTIONARY_PAGE, uncompressedSize, compressedSize);
  DictionaryPageHeader dictionaryHeader =
      new DictionaryPageHeader(valueCount, getEncoding(valuesEncoding));
  header.setDictionary_page_header(dictionaryHeader);
  writePageHeader(header, to);
}
public class PageContent extends .... { private PageHeader header; public PageContent() { this.header = new PageHeader() } // add setHeader/getHeader methods here. }
private PageHeader newDataPageV2Header( int uncompressedSize, int compressedSize, int valueCount, int nullCount, int rowCount, parquet.column.statistics.Statistics<?> statistics, parquet.column.Encoding dataEncoding, int rlByteLength, int dlByteLength) { // TODO: pageHeader.crc = ...; DataPageHeaderV2 dataPageHeaderV2 = new DataPageHeaderV2( valueCount, nullCount, rowCount, getEncoding(dataEncoding), dlByteLength, rlByteLength); if (!statistics.isEmpty()) { dataPageHeaderV2.setStatistics(toParquetStatistics(statistics)); } PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize); pageHeader.setData_page_header_v2(dataPageHeaderV2); return pageHeader; }
/**
 * Builds a {@code DictionaryPage} from a dictionary page header.
 *
 * @param pageHeader header whose {@code dictionary_page_header} describes the page
 * @param uncompressedPageSize uncompressed size recorded on the created page
 * @param compressedPageSize length passed to {@code getSlice} to obtain the page bytes
 * @return the dictionary page, carrying the header's value count and converted encoding
 */
private DictionaryPage readDictionaryPage(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize) {
    DictionaryPageHeader dictionaryHeader = pageHeader.getDictionary_page_header();

    // getSlice stays the first constructor argument so it is evaluated before the header lookups.
    DictionaryPage page = new DictionaryPage(
            getSlice(compressedPageSize),
            uncompressedPageSize,
            dictionaryHeader.getNum_values(),
            getParquetEncoding(Encoding.valueOf(dictionaryHeader.getEncoding().name())));
    return page;
}
/**
 * Compares this header with an arbitrary object.
 *
 * @param that object to compare against; may be {@code null}
 * @return {@code false} when {@code that} is {@code null} or not a {@code PageHeader}; otherwise
 *         the result of the typed {@code equals(PageHeader)} overload
 */
@Override
public boolean equals(Object that) {
  // instanceof is false for null, so no separate null check is required.
  return that instanceof PageHeader && this.equals((PageHeader) that);
}
/**
 * Returns the value of the requested field as an {@code Object}.
 *
 * <p>The integer-valued fields (page sizes and CRC) are boxed with {@link Integer#valueOf(int)}
 * rather than the deprecated {@code Integer(int)} constructor.
 *
 * @param field identifier of the field to read
 * @return the field's current value; boxed as an {@code Integer} for the page sizes and CRC
 * @throws IllegalStateException if {@code field} is not matched by any case below
 */
public Object getFieldValue(_Fields field) {
  switch (field) {
  case TYPE:
    return getType();

  case UNCOMPRESSED_PAGE_SIZE:
    return Integer.valueOf(getUncompressed_page_size());

  case COMPRESSED_PAGE_SIZE:
    return Integer.valueOf(getCompressed_page_size());

  case CRC:
    return Integer.valueOf(getCrc());

  case DATA_PAGE_HEADER:
    return getData_page_header();

  case INDEX_PAGE_HEADER:
    return getIndex_page_header();

  case DICTIONARY_PAGE_HEADER:
    return getDictionary_page_header();

  case DATA_PAGE_HEADER_V2:
    return getData_page_header_v2();

  }
  throw new IllegalStateException();
}
while (valueCount < descriptor.getColumnChunkMetaData().getValueCount()) { PageHeader pageHeader = readPageHeader(); int uncompressedPageSize = pageHeader.getUncompressed_page_size(); int compressedPageSize = pageHeader.getCompressed_page_size(); switch (pageHeader.type) { case DICTIONARY_PAGE:
/**
 * Creates a copy of this header through the {@code PageHeader(PageHeader)} copy constructor.
 *
 * @return a new {@code PageHeader} built from this instance
 */
public PageHeader deepCopy() {
  PageHeader copy = new PageHeader(this);
  return copy;
}
/**
 * Builds a {@code DictionaryPage} from a dictionary page header.
 *
 * @param pageHeader header whose {@code dictionary_page_header} describes the page
 * @param uncompressedPageSize uncompressed size recorded on the created page
 * @param compressedPageSize length passed to {@code getBytesInput} to obtain the page bytes
 * @return the dictionary page, carrying the header's value count and encoding
 * @throws IOException declared by the signature
 */
private DictionaryPage readDictionaryPage(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize)
        throws IOException {
    DictionaryPageHeader dictionaryHeader = pageHeader.getDictionary_page_header();

    // getBytesInput stays the first constructor argument so it is evaluated before the header lookups.
    DictionaryPage page = new DictionaryPage(
            getBytesInput(compressedPageSize),
            uncompressedPageSize,
            dictionaryHeader.getNum_values(),
            Encoding.valueOf(dictionaryHeader.getEncoding().name()));
    return page;
}
/**
 * Compares this header with an arbitrary object.
 *
 * @param that object to compare against; may be {@code null}
 * @return {@code false} when {@code that} is {@code null} or not a {@code PageHeader}; otherwise
 *         the result of the typed {@code equals(PageHeader)} overload
 */
@Override
public boolean equals(Object that) {
  // instanceof is false for null, so no separate null check is required.
  return that instanceof PageHeader && this.equals((PageHeader) that);
}
private static DictionaryPage readDictionaryPage(byte[] data, ParquetCodecFactory codecFactory, CompressionCodecName codecName) { try { ByteArrayInputStream inputStream = new ByteArrayInputStream(data); PageHeader pageHeader = Util.readPageHeader(inputStream); if (pageHeader.type != PageType.DICTIONARY_PAGE) { return null; } // todo this wrapper is not needed BytesInput compressedData = BytesInput.from(data, data.length - inputStream.available(), pageHeader.getCompressed_page_size()); BytesDecompressor decompressor = codecFactory.getDecompressor(codecName); BytesInput decompressed = decompressor.decompress(compressedData, pageHeader.getUncompressed_page_size()); DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header(); Encoding encoding = Encoding.valueOf(dicHeader.getEncoding().name()); int dictionarySize = dicHeader.getNum_values(); return new DictionaryPage(decompressed, dictionarySize, encoding); } catch (IOException ignored) { return null; } }
/**
 * Creates a copy of this header through the {@code PageHeader(PageHeader)} copy constructor.
 *
 * @return a new {@code PageHeader} built from this instance
 */
public PageHeader deepCopy() {
  PageHeader copy = new PageHeader(this);
  return copy;
}
/**
 * Builds a {@code DataPageV1} from a V1 data page header and appends it to {@code pages}.
 *
 * @param pageHeader header whose {@code data_page_header} describes the page
 * @param uncompressedPageSize uncompressed size recorded on the created page
 * @param compressedPageSize length passed to {@code getBytesInput} to obtain the page bytes
 * @param pages list that receives the new page
 * @return the value count reported by the V1 data page header
 * @throws IOException declared by the signature
 */
private long readDataPageV1(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize, List<DataPage> pages)
        throws IOException {
    DataPageHeader v1Header = pageHeader.getData_page_header();

    // getBytesInput stays the first constructor argument so it is evaluated before the header lookups.
    DataPageV1 page = new DataPageV1(
            getBytesInput(compressedPageSize),
            v1Header.getNum_values(),
            uncompressedPageSize,
            ParquetMetadataReader.readStats(v1Header.getStatistics(), descriptor.getColumnDescriptor().getType()),
            Encoding.valueOf(v1Header.getRepetition_level_encoding().name()),
            Encoding.valueOf(v1Header.getDefinition_level_encoding().name()),
            Encoding.valueOf(v1Header.getEncoding().name()));
    pages.add(page);

    return v1Header.getNum_values();
}