PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
/**
 * Loads a Spark MLlib decision tree model stored as a Parquet file and rebuilds it
 * as an Ignite {@link Model}.
 *
 * @param pathToMdl Path to the Parquet file containing the serialized model.
 * @return The reconstructed decision tree model, or {@code null} if the file could not be read.
 */
private static Model loadDecisionTreeModel(String pathToMdl) {
    try (ParquetFileReader r = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        // TreeMap keeps nodes ordered by id, which buildDecisionTreeModel relies on below.
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        // Each Parquet row describes one tree node.
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            // Parameterized RecordReader<Group> instead of the raw type: the
            // GroupRecordConverter materializes Group records.
            final RecordReader<Group> recordReader =
                colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            // long counter: getRowCount() is a long, an int counter could overflow.
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }

        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        // Best-effort loader: report and fall through to the null return below.
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return null;
}
final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
/**
 * Loads a Spark MLlib logistic regression model stored as a Parquet file.
 *
 * @param pathToMdl Path to the Parquet file containing the serialized model.
 * @return The reconstructed logistic regression model; if the file could not be read,
 *         a model with {@code null} coefficients and a zero intercept.
 */
private static Model loadLogRegModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            // Parameterized RecordReader<Group> instead of the raw type: the
            // GroupRecordConverter materializes Group records.
            final RecordReader<Group> recordReader =
                colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            // The model is expected in a single row; if several rows are present,
            // the last one wins. long counter: getRowCount() is a long, an int
            // counter could overflow.
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        // Best-effort loader: report and fall through to building a model from defaults.
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
/**
 * Loads a Spark MLlib linear SVM model stored as a Parquet file.
 *
 * @param pathToMdl Path to the Parquet file containing the serialized model.
 * @return The reconstructed linear SVM classification model; if the file could not be
 *         read, a model with {@code null} coefficients and a zero intercept.
 */
private static Model loadLinearSVMModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            // Parameterized RecordReader<Group> instead of the raw type: the
            // GroupRecordConverter materializes Group records.
            final RecordReader<Group> recordReader =
                colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            // The model is expected in a single row; if several rows are present,
            // the last one wins. long counter: getRowCount() is a long, an int
            // counter could overflow.
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readSVMInterceptor(g);
                coefficients = readSVMCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        // Best-effort loader: report and fall through to building a model from defaults.
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}
/**
 * Loads a Spark MLlib linear regression model stored as a Parquet file.
 *
 * @param pathToMdl Path to the Parquet file containing the serialized model.
 * @return The reconstructed linear regression model; if the file could not be read,
 *         a model with {@code null} coefficients and a zero intercept.
 */
private static Model loadLinRegModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(
        HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            // Parameterized RecordReader<Group> instead of the raw type: the
            // GroupRecordConverter materializes Group records.
            final RecordReader<Group> recordReader =
                colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            // The model is expected in a single row; if several rows are present,
            // the last one wins. long counter: getRowCount() is a long, an int
            // counter could overflow.
            for (long i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        // Best-effort loader: report and fall through to building a model from defaults.
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration); blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); this.totalRowCount += block.getRowCount(); this.fileSchema = footer.getFileMetaData().getSchema(); configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
private static Model loadKMeansModel(String pathToMdl) { Vector[] centers = null; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final int rows = (int)pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); centers = new DenseVector[rows]; for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); // final int clusterIdx = g.getInteger(0, 0); Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0); final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0); centers[i] = new DenseVector(amountOfCoefficients); for (int j = 0; j < amountOfCoefficients; j++) { double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0); centers[i].set(j, coefficient); } } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new KMeansModel(centers, new EuclideanDistance()); }
this.fileSchema = footer.getFileMetaData().getSchema(); config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns()); for (BlockMetaData block : blocks) { this.totalRowCount += block.getRowCount();
this.fileSchema = footer.getFileMetaData().getSchema(); config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
this.fileSchema = footer.getFileMetaData().getSchema(); config, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration); blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); this.fileSchema = footer.getFileMetaData().getSchema(); Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData(); ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration)); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString); this.reader = new ParquetFileReader( configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns()); for (BlockMetaData block : blocks) { this.totalRowCount += block.getRowCount();
MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration); blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); this.fileSchema = footer.getFileMetaData().getSchema(); Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData(); ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration)); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString); this.reader = new ParquetFileReader( configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
MessageType fileSchema = footer.getFileMetaData().getSchema(); FilterCompat.Filter filter = getFilter(configuration); blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema); this.fileSchema = footer.getFileMetaData().getSchema(); Map<String, String> fileMetadata = footer.getFileMetaData().getKeyValueMetaData(); ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration)); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString); this.reader = new ParquetFileReader( configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
/**
 * Decodes the Arrow schema that Dremio serialized into the Parquet footer's
 * key/value metadata.
 *
 * @return the decoded Arrow schema, or {@code null} if the metadata could not be parsed.
 * @throws ExecutionSetupException declared for callers; not raised by the parse itself here.
 */
private Schema parseArrowSchema() throws ExecutionSetupException {
    try {
        return DremioArrowSchema.fromMetaData(footer.getFileMetaData().getKeyValueMetaData());
    }
    catch (IOException e) {
        logger.warn("Invalid Arrow Schema", e);
        return null;
    }
}
/**
 * Merges the file metadata of all given footers into a single {@link GlobalMetaData}.
 *
 * @param footers footers of the Parquet files to merge.
 * @param strict strictness flag passed through to {@code mergeInto}.
 * @return the merged global metadata, or {@code null} when {@code footers} is empty.
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
    GlobalMetaData merged = null;
    for (Footer footer : footers) {
        merged = mergeInto(footer.getParquetMetadata().getFileMetaData(), merged, strict);
    }
    return merged;
}
/**
 * Reads the footer of the given Parquet file and returns the message schema it declares.
 *
 * @param configuration Hadoop configuration used to access the file.
 * @param parquetFilePath path of the Parquet file to inspect.
 * @return the schema recorded in the file's footer metadata.
 */
public static MessageType readSchema(Configuration configuration, Path parquetFilePath) {
    final FileMetaData fileMetaData = readMetadata(configuration, parquetFilePath).getFileMetaData();
    return fileMetaData.getSchema();
}
/**
 * Opens the given input file, reads its footer, and prepares the reader's state.
 *
 * Initialization order matters: the stream must be opened before the footer is read,
 * and the footer must be read before metadata, row-group filtering, and column
 * indexing can happen.
 *
 * @param file the Parquet input file to read.
 * @param options read options controlling footer decoding and row-group filtering.
 * @throws IOException if the stream cannot be opened or the footer cannot be read.
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException { this.converter = new ParquetMetadataConverter(options); this.file = file; this.f = file.newStream(); this.options = options; this.footer = readFooter(file, options, f, converter); this.fileMetaData = footer.getFileMetaData(); this.blocks = filterRowGroups(footer.getBlocks()); // Index every leaf column by its path so later reads can look columns up directly. for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) { paths.put(ColumnPath.get(col.getPath()), col); } }
/**
 * Returns the Parquet message schema from the file's metadata, releasing the
 * underlying resources afterwards.
 *
 * @return the schema declared in the Parquet footer.
 */
public MessageType readParquetSchema() {
    try {
        return getParquetMetadata().getFileMetaData().getSchema();
    }
    finally {
        // Always release resources, even if metadata retrieval throws.
        close();
    }
}