final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>();
/** * Load Decision Tree model. * * @param pathToMdl Path to model. */ private static Model loadDecisionTreeModel(String pathToMdl) { try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); final Map<Integer, NodeData> nodes = new TreeMap<>(); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); NodeData nodeData = extractNodeDataFromParquetRow(g); nodes.put(nodeData.id, nodeData); } } return buildDecisionTreeModel(nodes); } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return null; }
PageReadStore pagesMetaData; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); final Map<Integer, TreeMap<Integer, NodeData>> nodesByTreeId = new TreeMap<>(); while (null != (pages = r.readNextRowGroup())) {
/** * Load SVM model. * * @param pathToMdl Path to model. */ private static Model loadLinearSVMModel(String pathToMdl) { Vector coefficients = null; double interceptor = 0; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); interceptor = readSVMInterceptor(g); coefficients = readSVMCoefficients(g); } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new SVMLinearClassificationModel(coefficients, interceptor); }
/** * Load linear regression model. * * @param pathToMdl Path to model. */ private static Model loadLinRegModel(String pathToMdl) { Vector coefficients = null; double interceptor = 0; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); interceptor = readLinRegInterceptor(g); coefficients = readLinRegCoefficients(g); } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new LinearRegressionModel(coefficients, interceptor); }
/** * Load logistic regression model. * * @param pathToMdl Path to model. */ private static Model loadLogRegModel(String pathToMdl) { Vector coefficients = null; double interceptor = 0; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final long rows = pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); interceptor = readInterceptor(g); coefficients = readCoefficients(g); } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new LogisticRegressionModel(coefficients, interceptor); }
private static Model loadKMeansModel(String pathToMdl) { Vector[] centers = null; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final int rows = (int)pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); centers = new DenseVector[rows]; for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); // final int clusterIdx = g.getInteger(0, 0); Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0); final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0); centers[i] = new DenseVector(amountOfCoefficients); for (int j = 0; j < amountOfCoefficients; j++) { double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0); centers[i].set(j, coefficient); } } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new KMeansModel(centers, new EuclideanDistance()); }
public static MessageColumnIO getColumnIO(MessageType fileSchema, MessageType requestedSchema) { return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true); }
public static MessageColumnIO getColumnIO(MessageType fileSchema, MessageType requestedSchema) { return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true); }
@Override public void prepareForWrite(RecordConsumer recordConsumer) { final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct); }
public static List<PrimitiveColumnIO> getColumns(MessageType fileSchema, MessageType requestedSchema) { return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true).getLeaves(); }
public static List<PrimitiveColumnIO> getColumns(MessageType fileSchema, MessageType requestedSchema) { return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true).getLeaves(); }
@Override public void prepareForWrite(RecordConsumer recordConsumer) { final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct); thriftWriteSupport.prepareForWrite(recordConsumer); }
private void initStore() { pageStore = new ColumnChunkPageWriteStore(compressor, schema, props.getAllocator()); columnStore = props.newColumnWriteStore(schema, pageStore); MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema); this.recordConsumer = columnIO.getRecordWriter(columnStore); writeSupport.prepareForWrite(recordConsumer); }
private void newSchema() throws IOException { // Reset it to half of current number and bound it within the limits recordCountForNextMemCheck = min(max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCountForNextMemCheck / 2), MAXIMUM_RECORD_COUNT_FOR_CHECK); String json = new Schema(batchSchema).toJson(); extraMetaData.put(DREMIO_ARROW_SCHEMA_2_1, json); List<Type> types = Lists.newArrayList(); for (Field field : batchSchema) { if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) { continue; } Type childType = getType(field); if (childType != null) { types.add(childType); } } Preconditions.checkState(types.size() > 0, "No types for parquet schema"); schema = new MessageType("root", types); int dictionarySize = (int)context.getOptions().getOption(ExecConstants.PARQUET_DICT_PAGE_SIZE_VALIDATOR); final ParquetProperties parquetProperties = new ParquetProperties(dictionarySize, writerVersion, enableDictionary, new ParquetDirectByteBufferAllocator(columnEncoderAllocator), pageSize, true, enableDictionaryForBinary); pageStore = ColumnChunkPageWriteStoreExposer.newColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, parquetProperties); store = new ColumnWriteStoreV1(pageStore, pageSize, parquetProperties); MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema); consumer = columnIO.getRecordWriter(store); setUp(schema, consumer); }
private void initStore() { pageStore = new ColumnChunkPageWriteStore(compressor, schema, props.getAllocator(), props.getColumnIndexTruncateLength()); columnStore = props.newColumnWriteStore(schema, pageStore); MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema); this.recordConsumer = columnIO.getRecordWriter(columnStore); writeSupport.prepareForWrite(recordConsumer); }
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException { // initialize a ReadContext for this file this.reader = reader; FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); this.fileSchema = parquetFileMetadata.getSchema(); Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData(); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( configuration, toSetMultiMap(fileMetadata), fileSchema)); this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); this.requestedSchema = readContext.getRequestedSchema(); this.columnCount = requestedSchema.getPaths().size(); this.recordConverter = readSupport.prepareForRead( configuration, fileMetadata, fileSchema, readContext); this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true); this.total = reader.getRecordCount(); this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total); this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true); reader.setRequestedSchema(requestedSchema); LOG.info("RecordReader initialized will read a total of {} records.", total); }
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException { // initialize a ReadContext for this file this.reader = reader; FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData(); this.fileSchema = parquetFileMetadata.getSchema(); Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData(); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( configuration, toSetMultiMap(fileMetadata), fileSchema)); this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); this.requestedSchema = readContext.getRequestedSchema(); this.columnCount = requestedSchema.getPaths().size(); this.recordConverter = readSupport.prepareForRead( configuration, fileMetadata, fileSchema, readContext); this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true); this.total = reader.getFilteredRecordCount(); this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total); this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true); reader.setRequestedSchema(requestedSchema); LOG.info("RecordReader initialized will read a total of {} records.", total); }
public void initialize(FileMetaData parquetFileMetadata, Path file, List<BlockMetaData> blocks, Configuration configuration) throws IOException { // initialize a ReadContext for this file Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData(); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( configuration, toSetMultiMap(fileMetadata), fileSchema)); this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); this.requestedSchema = readContext.getRequestedSchema(); this.fileSchema = parquetFileMetadata.getSchema(); this.file = file; this.columnCount = requestedSchema.getPaths().size(); this.recordConverter = readSupport.prepareForRead( configuration, fileMetadata, fileSchema, readContext); this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true); List<ColumnDescriptor> columns = requestedSchema.getColumns(); reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns); for (BlockMetaData block : blocks) { total += block.getRowCount(); } this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total); LOG.info("RecordReader initialized will read a total of " + total + " records."); }
public void initialize(FileMetaData parquetFileMetadata, Path file, List<BlockMetaData> blocks, Configuration configuration) throws IOException { // initialize a ReadContext for this file Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData(); ReadSupport.ReadContext readContext = readSupport.init(new InitContext( configuration, toSetMultiMap(fileMetadata), fileSchema)); this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy()); this.requestedSchema = readContext.getRequestedSchema(); this.fileSchema = parquetFileMetadata.getSchema(); this.file = file; this.columnCount = requestedSchema.getPaths().size(); this.recordConverter = readSupport.prepareForRead( configuration, fileMetadata, fileSchema, readContext); this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true); List<ColumnDescriptor> columns = requestedSchema.getColumns(); reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns); for (BlockMetaData block : blocks) { total += block.getRowCount(); } this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total); LOG.info("RecordReader initialized will read a total of " + total + " records."); }