protected void readPage() throws IOException {
  DataPage page = pageReader.readPage();
  if (page == null) {
    return;
  }
  page.accept(new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 dataPageV1) {
      readPageV1(dataPageV1);
      return null;
    }

    @Override
    public Void visit(DataPageV2 dataPageV2) {
      readPageV2(dataPageV2);
      return null;
    }
  });
}
private void readPageV2(DataPageV2 page) {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn = newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels());
  this.definitionLevelColumn = newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels());
  try {
    LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount + " records");
    initDataReader(page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
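/*
 * readPageV2 above calls a newRLEIterator helper that is not part of this excerpt. The
 * sketch below shows how that helper is typically written in the parquet-mr-derived
 * readers: it wraps the level bytes in a RunLengthBitPackingHybridDecoder, or returns a
 * no-op iterator when the max level is 0. The IntIterator base class and the two wrapper
 * classes are assumed to be defined alongside the reader (as in the Spark/Hive readers);
 * treat this as an illustrative sketch rather than the exact implementation.
 */
private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
  try {
    if (maxLevel == 0) {
      // No levels stored for this column: every value is at level 0.
      return new NullIntIterator();
    }
    return new RLEIntIterator(
        new RunLengthBitPackingHybridDecoder(
            BytesUtils.getWidthFromMaxInt(maxLevel),
            bytes.toInputStream()));
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read levels in page for col " + descriptor, e);
  }
}

protected abstract static class IntIterator {
  abstract int nextInt();
}

protected static final class RLEIntIterator extends IntIterator {
  RunLengthBitPackingHybridDecoder delegate;

  public RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) {
    this.delegate = delegate;
  }

  @Override
  int nextInt() {
    try {
      return delegate.readInt();
    } catch (IOException e) {
      throw new ParquetDecodingException(e);
    }
  }
}

protected static final class NullIntIterator extends IntIterator {
  @Override
  int nextInt() {
    return 0;
  }
}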
public BaseVectorizedColumnReader(
    ColumnDescriptor descriptor,
    PageReader pageReader,
    boolean skipTimestampConversion,
    Type parquetType,
    TypeInfo hiveType) throws IOException {
  this.descriptor = descriptor;
  this.type = parquetType;
  this.pageReader = pageReader;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();
  this.skipTimestampConversion = skipTimestampConversion;
  this.hiveType = hiveType;

  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = ParquetDataColumnReaderFactory
          .getDataColumnReaderByTypeOnDictionary(parquetType.asPrimitiveType(), hiveType,
              dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage),
              skipTimestampConversion);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException e) {
      throw new IOException("could not decode the dictionary for " + descriptor, e);
    }
  } else {
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  }
}
private void readPageV1(DataPageV1 page) {
  ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);
  ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
  try {
    BytesInput bytes = page.getBytes();
    LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records");
    ByteBufferInputStream in = bytes.toInputStream();
    LOG.debug("reading repetition levels at " + in.position());
    rlReader.initFromPage(pageValueCount, in);
    LOG.debug("reading definition levels at " + in.position());
    dlReader.initFromPage(pageValueCount, in);
    LOG.debug("reading data at " + in.position());
    initDataReader(page.getValueEncoding(), in, page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
  }
}
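/*
 * readPageV1 wraps the repetition/definition level readers in ValuesReaderIntIterator,
 * which is not shown in this excerpt. A minimal sketch, assuming the IntIterator base
 * class outlined earlier: it simply delegates each call to ValuesReader.readInteger().
 */
protected static final class ValuesReaderIntIterator extends IntIterator {
  ValuesReader delegate;

  public ValuesReaderIntIterator(ValuesReader delegate) {
    this.delegate = delegate;
  }

  @Override
  int nextInt() {
    return delegate.readInteger();
  }
}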
public VectorizedColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
  this.descriptor = descriptor;
  this.pageReader = pageReader;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();

  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException e) {
      throw new IOException("could not decode the dictionary for " + descriptor, e);
    }
  } else {
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  }
  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount == 0) {
    throw new IOException("totalValueCount == 0");
  }
}
private void checkEndOfRowGroup() throws IOException {
  if (rowsReturned != totalCountLoadedSoFar) return;
  PageReadStore pages = reader.readNextRowGroup();
  if (pages == null) {
    throw new IOException("expecting more rows but reached last block. Read "
        + rowsReturned + " out of " + totalRowCount);
  }
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  columnReaders = new VectorizedColumnReader[columns.size()];
  for (int i = 0; i < columns.size(); ++i) {
    if (missingColumns[i]) continue;
    columnReaders[i] = new VectorizedColumnReader(columns.get(i), pages.getPageReader(columns.get(i)));
  }
  totalCountLoadedSoFar += pages.getRowCount();
}
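/*
 * For context, a sketch of how checkEndOfRowGroup is typically driven: a nextBatch-style
 * loop advances to the next row group once the current one is exhausted, then asks each
 * per-column reader for up to one batch of values. Names such as columnarBatch and
 * readBatch are illustrative of the Spark-style vectorized reader and are assumptions,
 * not part of this excerpt.
 */
public boolean nextBatch() throws IOException {
  columnarBatch.reset();
  if (rowsReturned >= totalRowCount) return false;
  checkEndOfRowGroup();

  // Read at most the remaining rows of the current row group into this batch.
  int num = (int) Math.min((long) columnarBatch.capacity(), totalCountLoadedSoFar - rowsReturned);
  for (int i = 0; i < columnReaders.length; ++i) {
    if (columnReaders[i] == null) continue;
    columnReaders[i].readBatch(num, columnarBatch.column(i));
  }
  rowsReturned += num;
  columnarBatch.setNumRows(num);
  return true;
}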
// Fragment: per-type construction of vectorized column readers (surrounding lines elided).
      // ...
      pages.getPageReader(descriptors.get(0)), skipTimestampConversion, type, typeInfo);
} else {
      // ...
      pages.getPageReader(descriptors.get(0)), skipTimestampConversion, getElementType(type), typeInfo);
// ...
case MAP:
  VectorizedListColumnReader keyListColumnReader = new VectorizedListColumnReader(
      descriptors.get(0), pages.getPageReader(descriptors.get(0)), skipTimestampConversion,
      kvTypes.get(0), typeInfo);
  VectorizedListColumnReader valueListColumnReader = new VectorizedListColumnReader(
      descriptors.get(1), pages.getPageReader(descriptors.get(1)), skipTimestampConversion,
      kvTypes.get(1), typeInfo);
  return new VectorizedMapColumnReader(keyListColumnReader, valueListColumnReader);
public VectorizedColumnReader(
    ColumnDescriptor descriptor,
    OriginalType originalType,
    PageReader pageReader,
    TimeZone convertTz) throws IOException {
  this.descriptor = descriptor;
  this.pageReader = pageReader;
  this.convertTz = convertTz;
  this.originalType = originalType;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();

  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException e) {
      throw new IOException("could not decode the dictionary for " + descriptor, e);
    }
  } else {
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  }
  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount == 0) {
    throw new IOException("totalValueCount == 0");
  }
}
private void readPage() throws IOException {
  DataPage page = pageReader.readPage();
  // TODO: Why is this a visitor?
  page.accept(new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 dataPageV1) {
      try {
        readPageV1(dataPageV1);
        return null;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public Void visit(DataPageV2 dataPageV2) {
      try {
        readPageV2(dataPageV2);
        return null;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  });
}
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadDecisionTreeModel(String pathToMdl) {
  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
    final Map<Integer, NodeData> nodes = new TreeMap<>();

    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        NodeData nodeData = extractNodeDataFromParquetRow(g);
        nodes.put(nodeData.id, nodeData);
      }
    }
    return buildDecisionTreeModel(nodes);
  } catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return null;
}
private void readPage() {
  DataPage page = pageReader.readPage();
  // TODO: Why is this a visitor?
  page.accept(new DataPage.Visitor<Void>() {
    @Override
    public Void visit(DataPageV1 dataPageV1) {
      try {
        readPageV1(dataPageV1);
        return null;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    @Override
    public Void visit(DataPageV2 dataPageV2) {
      try {
        readPageV2(dataPageV2);
        return null;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  });
}
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLogRegModel(String pathToMdl) {
  Vector coefficients = null;
  double interceptor = 0;

  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        interceptor = readInterceptor(g);
        coefficients = readCoefficients(g);
      }
    }
  } catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new LogisticRegressionModel(coefficients, interceptor);
}
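/*
 * readInterceptor and readCoefficients are not part of this excerpt. The sketch below is
 * purely hypothetical: it mirrors the SimpleGroup navigation used in loadKMeansModel
 * further down, but the field indices are placeholders and depend on the parquet schema
 * Spark used when saving the model.
 */
private static Vector readCoefficients(SimpleGroup g) {
  // Hypothetical layout: a repeated group of doubles holding the coefficient values.
  Group coeffGroup = g.getGroup(0, 0).getGroup(3, 0); // placeholder indices
  int size = coeffGroup.getFieldRepetitionCount(0);
  DenseVector coefficients = new DenseVector(size);
  for (int i = 0; i < size; i++)
    coefficients.set(i, coeffGroup.getGroup(0, i).getDouble(0, 0));
  return coefficients;
}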
/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLinearSVMModel(String pathToMdl) {
  Vector coefficients = null;
  double interceptor = 0;

  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        interceptor = readSVMInterceptor(g);
        coefficients = readSVMCoefficients(g);
      }
    }
  } catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new SVMLinearClassificationModel(coefficients, interceptor);
}
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLinRegModel(String pathToMdl) {
  Vector coefficients = null;
  double interceptor = 0;

  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

    while (null != (pages = r.readNextRowGroup())) {
      final long rows = pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        interceptor = readLinRegInterceptor(g);
        coefficients = readLinRegCoefficients(g);
      }
    }
  } catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new LinearRegressionModel(coefficients, interceptor);
}
// Fragment: reading the tree metadata row group (surrounding lines elided).
final long rows = pagesMetaData.getRowCount();
final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));
for (int i = 0; i < rows; i++) {
  // ...

// Fragment: reading per-tree node data, keyed by tree id (surrounding lines elided).
final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
while (null != (pages = r.readNextRowGroup())) {
  final long rows = pages.getRowCount();
  final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
  for (int i = 0; i < rows; i++) {
    // ...
private static Model loadKMeansModel(String pathToMdl) {
  Vector[] centers = null;

  try (ParquetFileReader r = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
    PageReadStore pages;
    final MessageType schema = r.getFooter().getFileMetaData().getSchema();
    final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

    while (null != (pages = r.readNextRowGroup())) {
      final int rows = (int)pages.getRowCount();
      final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
      centers = new DenseVector[rows];

      for (int i = 0; i < rows; i++) {
        final SimpleGroup g = (SimpleGroup)recordReader.read();
        // final int clusterIdx = g.getInteger(0, 0);
        Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);
        final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);

        centers[i] = new DenseVector(amountOfCoefficients);
        for (int j = 0; j < amountOfCoefficients; j++) {
          double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
          centers[i].set(j, coefficient);
        }
      }
    }
  } catch (IOException e) {
    System.out.println("Error reading parquet file.");
    e.printStackTrace();
  }
  return new KMeansModel(centers, new EuclideanDistance());
}
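/*
 * Hypothetical call site: each loader takes the path of the parquet "data" file that
 * Spark produces when the model is saved; the path below is only a placeholder.
 */
Model mdl = loadKMeansModel("/tmp/spark-models/kmeans/data/part-00000.snappy.parquet");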