/**
 * Creates a column reader and eagerly loads the column's dictionary page, if any.
 *
 * <p>When the page reader yields a dictionary page, the decoded dictionary is wrapped through
 * {@code ParquetDataColumnReaderFactory.getDataColumnReaderByTypeOnDictionary} and the reader
 * starts out flagged as dictionary encoded; otherwise no dictionary is kept and the flag is
 * cleared.
 *
 * @param descriptor the column being read; also supplies the maximum definition level
 * @param pageReader source of the dictionary page
 * @param skipTimestampConversion forwarded unchanged to the data column reader factory
 * @param parquetType Parquet type of the column; used as a primitive type for the dictionary
 * @param hiveType Hive type information forwarded to the data column reader factory
 * @throws IOException if the dictionary page cannot be decoded
 */
public BaseVectorizedColumnReader(
    ColumnDescriptor descriptor,
    PageReader pageReader,
    boolean skipTimestampConversion,
    Type parquetType,
    TypeInfo hiveType) throws IOException {
  this.descriptor = descriptor;
  this.type = parquetType;
  this.pageReader = pageReader;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();
  this.skipTimestampConversion = skipTimestampConversion;
  this.hiveType = hiveType;

  DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    // No dictionary page for this column.
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  } else {
    try {
      this.dictionary = ParquetDataColumnReaderFactory.getDataColumnReaderByTypeOnDictionary(
          parquetType.asPrimitiveType(),
          hiveType,
          dictPage.getEncoding().initDictionary(descriptor, dictPage),
          skipTimestampConversion);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException cause) {
      throw new IOException("could not decode the dictionary for " + descriptor, cause);
    }
  }
}
/**
 * Selects and initializes the data reader for the page that is about to be consumed.
 *
 * <p>Records the page's value count, advances {@code endOfPageValueCount}, builds either a
 * dictionary-based or a plain values reader depending on {@code dataEncoding}, and finally
 * initializes it from {@code in}.
 *
 * @param dataEncoding encoding of the page's values
 * @param in stream positioned at the start of the page's value data
 * @param valueCount number of values in the page
 * @throws IOException if a dictionary encoding is used but no dictionary was loaded, or if the
 *     reader cannot be initialized from the page
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount)
    throws IOException {
  this.pageValueCount = valueCount;
  this.endOfPageValueCount = valuesRead + pageValueCount;

  final boolean dictionaryEncoded = dataEncoding.usesDictionary();
  if (dictionaryEncoded) {
    // Drop the previous reader before validating, so a failure leaves no stale reader behind.
    this.dataColumn = null;
    if (dictionary == null) {
      throw new IOException(
          "could not read page in col " + descriptor
              + " as the dictionary was missing for encoding " + dataEncoding);
    }
    dataColumn = ParquetDataColumnReaderFactory.getDataColumnReaderByType(
        type.asPrimitiveType(),
        hiveType,
        dataEncoding.getDictionaryBasedValuesReader(descriptor, VALUES, dictionary.getDictionary()),
        skipTimestampConversion);
  } else {
    dataColumn = ParquetDataColumnReaderFactory.getDataColumnReaderByType(
        type.asPrimitiveType(),
        hiveType,
        dataEncoding.getValuesReader(descriptor, VALUES),
        skipTimestampConversion);
  }
  // Only reached once a reader was built successfully.
  this.isCurrentPageDictionaryEncoded = dictionaryEncoded;

  try {
    dataColumn.initFromPage(pageValueCount, in);
  } catch (IOException cause) {
    throw new IOException("could not read page in col " + descriptor, cause);
  }
}
/**
 * Sets up the repetition-level, definition-level and data readers for a v1 data page.
 *
 * <p>The level readers are created from the page's declared level encodings, exposed through
 * {@code ValuesReaderIntIterator}, and then initialized in order (repetition levels, definition
 * levels, values) from the page's byte stream.
 *
 * @param page the v1 data page to read
 * @throws ParquetDecodingException if any of the readers cannot be initialized from the page
 */
private void readPageV1(DataPageV1 page) {
  // Use the count carried by this page. The pageValueCount field is only refreshed inside
  // initDataReader(...), so at this point it still describes the previous page (or nothing at
  // all); level readers that size their slice of the stream from the value count would
  // otherwise be initialized with the wrong number of values.
  final int valueCount = page.getValueCount();

  ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);
  ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
  try {
    BytesInput bytes = page.getBytes();
    LOG.debug("page size " + bytes.size() + " bytes and " + valueCount + " records");
    ByteBufferInputStream in = bytes.toInputStream();
    LOG.debug("reading repetition levels at " + in.position());
    rlReader.initFromPage(valueCount, in);
    LOG.debug("reading definition levels at " + in.position());
    dlReader.initFromPage(valueCount, in);
    LOG.debug("reading data at " + in.position());
    initDataReader(page.getValueEncoding(), in, valueCount);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
/**
 * Reads a dictionary page into a buffer and decodes it into {@code this.dictionary}.
 *
 * <p>The buffer returned by {@code readPage} is registered in
 * {@code allocatedDictionaryBuffers} before decoding starts.
 *
 * @param pageHeader header of the dictionary page; supplies sizes, value count and encoding
 * @param parentStatus column reader whose column descriptor is used to build the dictionary
 * @throws IOException if the page cannot be read or the dictionary cannot be initialized
 */
private void readDictionaryPage(final PageHeader pageHeader,
                                final ColumnReader<?> parentStatus) throws IOException {
  final int compressedBytes = pageHeader.getCompressed_page_size();
  final int uncompressedBytes = pageHeader.getUncompressed_page_size();

  final DrillBuf pageBuffer = readPage(pageHeader, compressedBytes, uncompressedBytes);
  allocatedDictionaryBuffers.add(pageBuffer);

  final DictionaryPage dictionaryPage = new DictionaryPage(
      asBytesInput(pageBuffer, 0, uncompressedBytes),
      pageHeader.uncompressed_page_size,
      pageHeader.dictionary_page_header.num_values,
      valueOf(pageHeader.dictionary_page_header.encoding.name()));

  this.dictionary =
      dictionaryPage.getEncoding().initDictionary(parentStatus.columnDescriptor, dictionaryPage);
}
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in) throws IOException { this.endOfPageValueCount = valuesRead + pageValueCount; if (dataEncoding.usesDictionary()) { this.dataColumn = null; if (dictionary == null) { throw new IOException( "could not read page in col " + descriptor + " as the dictionary was missing for encoding " + dataEncoding); } @SuppressWarnings("deprecation") Encoding plainDict = Encoding.PLAIN_DICTIONARY; // var to allow warning suppression if (dataEncoding != plainDict && dataEncoding != Encoding.RLE_DICTIONARY) { throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); } this.dataColumn = new VectorizedRleValuesReader(); this.isCurrentPageDictionaryEncoded = true; } else { if (dataEncoding != Encoding.PLAIN) { throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); } this.dataColumn = new VectorizedPlainValuesReader(); this.isCurrentPageDictionaryEncoded = false; } try { dataColumn.initFromPage(pageValueCount, in); } catch (IOException e) { throw new IOException("could not read page in col " + descriptor, e); } }
repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL); repetitionLevels.initFromPage(currentPageCount, in); definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL); definitionLevels.initFromPage(currentPageCount, in); readPosInBytes = in.position(); if (!valueEncoding.usesDictionary()) { valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, in); valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, in); if (valueEncoding.usesDictionary()) {
/**
 * Converts a list of encodings into an unmodifiable set of
 * {@code org.apache.parquet.column.Encoding} constants, matching them by enum name.
 *
 * @param encodings the encodings to convert
 * @return an unmodifiable set holding the column encoding with the same name as each input
 * @throws IllegalArgumentException if a name has no matching column encoding constant
 */
private static Set<org.apache.parquet.column.Encoding> readEncodings(List<Encoding> encodings) {
  final Set<org.apache.parquet.column.Encoding> converted = new HashSet<>();
  encodings.forEach(
      source -> converted.add(org.apache.parquet.column.Encoding.valueOf(source.name())));
  return Collections.unmodifiableSet(converted);
}
/**
 * Creates a reader for values that are decoded without a dictionary.
 *
 * <p>This implementation always rejects the call, reporting this encoding as dictionary based.
 *
 * @param descriptor the column to read
 * @param valuesType the type of values
 * @return the proper values reader for the given column
 * @throws UnsupportedOperationException if the encoding is dictionary based
 */
public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
  final String reason =
      "Error decoding " + descriptor + ". " + this.name() + " is dictionary based";
  throw new UnsupportedOperationException(reason);
}
/**
 * Returns a dictionary-backed values reader by delegating to {@code RLE_DICTIONARY}.
 *
 * @param descriptor the column to read
 * @param valuesType the type of values
 * @param dictionary the dictionary used to resolve values
 * @return the reader produced by {@code RLE_DICTIONARY} for the same arguments
 */
@Override
public ValuesReader getDictionaryBasedValuesReader(
    ColumnDescriptor descriptor, ValuesType valuesType, Dictionary dictionary) {
  final ValuesReader delegated =
      RLE_DICTIONARY.getDictionaryBasedValuesReader(descriptor, valuesType, dictionary);
  return delegated;
}
private void readPageV1(DataPageV1 page) throws IOException { this.pageValueCount = page.getValueCount(); ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); ValuesReader dlReader; // Initialize the decoders. if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) { throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding()); } int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); this.defColumn = new VectorizedRleValuesReader(bitWidth); dlReader = this.defColumn; this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); try { BytesInput bytes = page.getBytes(); ByteBufferInputStream in = bytes.toInputStream(); rlReader.initFromPage(pageValueCount, in); dlReader.initFromPage(pageValueCount, in); initDataReader(page.getValueEncoding(), in); } catch (IOException e) { throw new IOException("could not read page " + page + " in col " + descriptor, e); } }
/**
 * Reads a dictionary page into a freshly allocated buffer and decodes it into
 * {@code this.dictionary}.
 *
 * @param pageHeader header of the dictionary page; supplies sizes, value count and encoding
 * @param parentStatus column reader whose column descriptor is used to build the dictionary
 * @throws IOException if the page cannot be read or the dictionary cannot be initialized
 */
private void readDictionaryPage(final PageHeader pageHeader,
                                final ColumnReader<?> parentStatus) throws IOException {
  final int compressedBytes = pageHeader.getCompressed_page_size();
  final int uncompressedBytes = pageHeader.getUncompressed_page_size();

  // The buffer is sized for the uncompressed page and filled by readPage(...).
  final ArrowBuf pageBuffer = allocateDictionaryBuffer(uncompressedBytes);
  readPage(pageHeader, compressedBytes, uncompressedBytes, pageBuffer);

  final DictionaryPage dictionaryPage = new DictionaryPage(
      asBytesInput(pageBuffer, 0, uncompressedBytes),
      pageHeader.uncompressed_page_size,
      pageHeader.dictionary_page_header.num_values,
      valueOf(pageHeader.dictionary_page_header.encoding.name()));

  this.dictionary =
      dictionaryPage.getEncoding().initDictionary(parentStatus.columnDescriptor, dictionaryPage);
}
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in) throws IOException { this.endOfPageValueCount = valuesRead + pageValueCount; if (dataEncoding.usesDictionary()) { this.dataColumn = null; if (dictionary == null) { throw new IOException( "could not read page in col " + descriptor + " as the dictionary was missing for encoding " + dataEncoding); } @SuppressWarnings("deprecation") Encoding plainDict = Encoding.PLAIN_DICTIONARY; // var to allow warning suppression if (dataEncoding != plainDict && dataEncoding != Encoding.RLE_DICTIONARY) { throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); } this.dataColumn = new VectorizedRleValuesReader(); this.isCurrentPageDictionaryEncoded = true; } else { if (dataEncoding != Encoding.PLAIN) { throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding); } this.dataColumn = new VectorizedPlainValuesReader(); this.isCurrentPageDictionaryEncoded = false; } try { dataColumn.initFromPage(pageValueCount, in); } catch (IOException e) { throw new IOException("could not read page in col " + descriptor, e); } }
repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL); repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL); definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); readPosInBytes = definitionLevels.getNextOffset(); if (!valueEncoding.usesDictionary()) { valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES); valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes); if (valueEncoding.usesDictionary()) {
/**
 * Maps each given encoding to the {@code org.apache.parquet.column.Encoding} constant with the
 * same name and returns the results as an unmodifiable set.
 *
 * @param encodings the encodings to translate
 * @return an unmodifiable set of the corresponding column encodings
 * @throws IllegalArgumentException if a name has no matching column encoding constant
 */
private static Set<org.apache.parquet.column.Encoding> readEncodings(List<Encoding> encodings) {
  final Set<org.apache.parquet.column.Encoding> result = new HashSet<>();
  for (final Encoding source : encodings) {
    final String encodingName = source.name();
    result.add(org.apache.parquet.column.Encoding.valueOf(encodingName));
  }
  return Collections.unmodifiableSet(result);
}
/**
 * Creates a reader for values that are decoded through a dictionary.
 *
 * <p>This implementation always rejects the call, reporting this encoding as not dictionary
 * based.
 *
 * @param descriptor the column to read
 * @param valuesType the type of values
 * @param dictionary the dictionary
 * @return the proper values reader for the given column
 * @throws UnsupportedOperationException if the encoding is not dictionary based
 */
public ValuesReader getDictionaryBasedValuesReader(
    ColumnDescriptor descriptor, ValuesType valuesType, Dictionary dictionary) {
  final String reason = this.name() + " is not dictionary based";
  throw new UnsupportedOperationException(reason);
}
/**
 * Builds the dictionary-backed values reader by forwarding the request to
 * {@code RLE_DICTIONARY}.
 *
 * @param descriptor the column to read
 * @param valuesType the type of values
 * @param dictionary the dictionary used to resolve values
 * @return whatever {@code RLE_DICTIONARY} returns for the same arguments
 */
@Override
public ValuesReader getDictionaryBasedValuesReader(
    ColumnDescriptor descriptor, ValuesType valuesType, Dictionary dictionary) {
  final ValuesReader reader =
      RLE_DICTIONARY.getDictionaryBasedValuesReader(descriptor, valuesType, dictionary);
  return reader;
}
if (dataEncoding.usesDictionary()) { if (dict == null) { throw new ParquetDecodingException( "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding); this.values = dataEncoding.getDictionaryBasedValuesReader(desc, VALUES, dict); } else { this.values = dataEncoding.getValuesReader(desc, VALUES);
/**
 * Creates a column reader, decoding the column's dictionary page when one exists and
 * recording the total number of values the page reader reports.
 *
 * @param descriptor the column being read; also supplies the maximum definition level
 * @param pageReader source of the dictionary page and of the total value count
 * @throws IOException if the dictionary cannot be decoded, or the total value count is 0
 */
public VectorizedColumnReader(ColumnDescriptor descriptor, PageReader pageReader)
    throws IOException {
  this.descriptor = descriptor;
  this.pageReader = pageReader;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();

  DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    // No dictionary page for this column.
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  } else {
    try {
      this.dictionary = dictPage.getEncoding().initDictionary(descriptor, dictPage);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException cause) {
      throw new IOException("could not decode the dictionary for " + descriptor, cause);
    }
  }

  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount == 0) {
    throw new IOException("totalValueCount == 0");
  }
}
private void readPageV1(DataPageV1 page) throws IOException { this.pageValueCount = page.getValueCount(); ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); ValuesReader dlReader; // Initialize the decoders. if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) { throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding()); } int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); this.defColumn = new VectorizedRleValuesReader(bitWidth); dlReader = this.defColumn; this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader); this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader); try { BytesInput bytes = page.getBytes(); ByteBufferInputStream in = bytes.toInputStream(); rlReader.initFromPage(pageValueCount, in); dlReader.initFromPage(pageValueCount, in); initDataReader(page.getValueEncoding(), in); } catch (IOException e) { throw new IOException("could not read page " + page + " in col " + descriptor, e); } }
/**
 * Decodes the dictionary page described by {@code readStatus} into {@code this.dictionary}
 * and adds the decode time to {@code stats.timeDictPageDecode}.
 *
 * <p>The timer starts after the decompressed page data has been obtained, so the recorded
 * time covers buffer registration and dictionary initialization only. Any exception is
 * passed to {@code handleAndThrowException}.
 *
 * @param readStatus supplies the page header and the decompressed page data
 * @param parentStatus column reader whose column descriptor is used to build the dictionary
 * @throws UserException declared for failures reported through {@code handleAndThrowException}
 */
private void readDictionaryPageData(final ReadStatus readStatus,
                                    final ColumnReader<?> parentStatus) throws UserException {
  try {
    pageHeader = readStatus.getPageHeader();
    final int uncompressedBytes = pageHeader.getUncompressed_page_size();
    final DrillBuf pageBuffer = getDecompressedPageData(readStatus);

    final Stopwatch decodeTimer = Stopwatch.createStarted();
    allocatedDictionaryBuffers.add(pageBuffer);
    final DictionaryPage dictionaryPage = new DictionaryPage(
        asBytesInput(pageBuffer, 0, uncompressedBytes),
        pageHeader.uncompressed_page_size,
        pageHeader.dictionary_page_header.num_values,
        valueOf(pageHeader.dictionary_page_header.encoding.name()));
    this.dictionary =
        dictionaryPage.getEncoding().initDictionary(parentStatus.columnDescriptor, dictionaryPage);

    stats.timeDictPageDecode.addAndGet(decodeTimer.elapsed(TimeUnit.NANOSECONDS));
  } catch (Exception e) {
    handleAndThrowException(e, "Error decoding dictionary page.");
  }
}