/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadDecisionTreeModel(String pathToMdl) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }

        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return null;
}
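`NodeData` and `extractNodeDataFromParquetRow` are helpers of the surrounding parser class and are not shown above. A hypothetical sketch of their shape, purely for orientation: the field names, Parquet column layout, and the -1 leaf convention below are assumptions, not the actual implementation.

/** Hypothetical holder for one row of the serialized tree (layout assumed). */
private static class NodeData {
    int id;
    int featureIdx;
    double threshold;
    int leftChildId;
    int rightChildId;
    double prediction;
    boolean isLeafNode;
}

/** Assumed mapping from a Parquet row to NodeData; real column names may differ. */
private static NodeData extractNodeDataFromParquetRow(SimpleGroup g) {
    NodeData nodeData = new NodeData();

    nodeData.id = g.getInteger("id", 0);                 // node id (assumed field name)
    nodeData.prediction = g.getDouble("prediction", 0);  // leaf prediction (assumed)
    nodeData.leftChildId = g.getInteger("leftChild", 0); // assumed -1 for leaves
    nodeData.rightChildId = g.getInteger("rightChild", 0);
    nodeData.isLeafNode = nodeData.leftChildId == -1 && nodeData.rightChildId == -1;

    if (!nodeData.isLeafNode) {
        // Split info assumed to live in a nested "split" struct.
        Group splitGrp = g.getGroup("split", 0);
        nodeData.featureIdx = splitGrp.getInteger("featureIndex", 0);
        nodeData.threshold = splitGrp.getGroup("leftCategoriesOrThreshold", 0).getDouble(0, 0);
    }

    return nodeData;
}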
/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLinearSVMModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readSVMInterceptor(g);
                coefficients = readSVMCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLinRegModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadLogRegModel(String pathToMdl) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
/**
 * Load KMeans model.
 *
 * @param pathToMdl Path to model.
 */
private static Model loadKMeansModel(String pathToMdl) {
    Vector[] centers = null;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int)pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            centers = new DenseVector[rows];

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);

                // Cluster center coordinates sit in a nested group of the row.
                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);

                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);

                centers[i] = new DenseVector(amountOfCoefficients);

                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }
    }
    catch (IOException e) {
        System.out.println("Error reading parquet file.");
        e.printStackTrace();
    }

    return new KMeansModel(centers, new EuclideanDistance());
}
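All five loaders repeat the same skeleton: open the file, derive a MessageColumnIO from the footer schema, then iterate row groups and materialize each row as a SimpleGroup; only the per-row extraction differs. A minimal sketch of that shared pattern, factored into a visitor-style helper (the name forEachRow is ours, not part of any of the quoted codebases):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;
import java.util.function.Consumer;

/** Visit every row of a Parquet file as a SimpleGroup. */
static void forEachRow(String pathToFile, Consumer<SimpleGroup> rowVisitor) throws IOException {
    try (ParquetFileReader r = ParquetFileReader.open(
            HadoopInputFile.fromPath(new Path(pathToFile), new Configuration()))) {
        // The footer carries the file schema; the ColumnIOFactory turns it
        // into the column-striping structure the record reader needs.
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        PageReadStore pages;
        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader<Group> recordReader =
                colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (long i = 0; i < rows; i++)
                rowVisitor.accept((SimpleGroup)recordReader.read());
        }
    }
}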
/**
 * @param requestedSchema the requestedSchema we want to read/write
 * @param fileSchema the file schema (when reading it can be different from the requested schema)
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType requestedSchema, MessageType fileSchema) {
    return getColumnIO(requestedSchema, fileSchema, true);
}
/**
 * @param schema the schema we want to read/write
 * @return the corresponding serializing/deserializing structure
 */
public MessageColumnIO getColumnIO(MessageType schema) {
    return this.getColumnIO(schema, schema);
}
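The two-argument overload matters when reading only a subset of columns: the projection goes in as requestedSchema and the footer schema as fileSchema. A minimal sketch, assuming `reader` is an open ParquetFileReader over a file that actually contains an int32 "id" column:

import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Projection: read only the "id" column of a wider file schema.
MessageType fileSchema = reader.getFooter().getFileMetaData().getSchema();
MessageType requestedSchema = MessageTypeParser.parseMessageType(
    "message projection { required int32 id; }");

// Delegates to getColumnIO(requestedSchema, fileSchema, true); the boolean
// third argument seen in the static helpers below enables strict type checking.
MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(requestedSchema, fileSchema);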
public static MessageColumnIO getColumnIO(MessageType fileSchema, MessageType requestedSchema) {
    return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true);
}
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct);
}
public static List<PrimitiveColumnIO> getColumns(MessageType fileSchema, MessageType requestedSchema) {
    return (new ColumnIOFactory()).getColumnIO(requestedSchema, fileSchema, true).getLeaves();
}
@Override
public void prepareForWrite(RecordConsumer recordConsumer) {
    final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    this.parquetWriteProtocol = new ParquetWriteProtocol(recordConsumer, columnIO, thriftStruct);
    thriftWriteSupport.prepareForWrite(recordConsumer);
}
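Both prepareForWrite overrides above plug into Parquet's WriteSupport lifecycle: init supplies the schema, then prepareForWrite hands over the RecordConsumer that write drives. A minimal sketch of a custom WriteSupport under that contract (the record type and single-column layout are our own example, not from the snippets):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

import java.util.Collections;

/** Example WriteSupport for a single required int32 column named "id". */
public class IdWriteSupport extends WriteSupport<Integer> {
    private static final MessageType SCHEMA =
        MessageTypeParser.parseMessageType("message example { required int32 id; }");

    private RecordConsumer recordConsumer;

    @Override
    public WriteContext init(Configuration configuration) {
        // Declares the schema (and optional extra metadata) to the writer.
        return new WriteContext(SCHEMA, Collections.emptyMap());
    }

    @Override
    public void prepareForWrite(RecordConsumer recordConsumer) {
        // Same hook as the snippets above: keep the consumer for use in write().
        this.recordConsumer = recordConsumer;
    }

    @Override
    public void write(Integer record) {
        recordConsumer.startMessage();
        recordConsumer.startField("id", 0);
        recordConsumer.addInteger(record);
        recordConsumer.endField("id", 0);
        recordConsumer.endMessage();
    }
}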
private void newSchema() throws IOException {
    // Reset it to half of current number and bound it within the limits
    recordCountForNextMemCheck = min(
        max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCountForNextMemCheck / 2),
        MAXIMUM_RECORD_COUNT_FOR_CHECK);

    String json = new Schema(batchSchema).toJson();
    extraMetaData.put(DREMIO_ARROW_SCHEMA_2_1, json);

    List<Type> types = Lists.newArrayList();
    for (Field field : batchSchema) {
        if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
            continue;
        }
        Type childType = getType(field);
        if (childType != null) {
            types.add(childType);
        }
    }
    Preconditions.checkState(types.size() > 0, "No types for parquet schema");
    schema = new MessageType("root", types);

    int dictionarySize = (int)context.getOptions().getOption(ExecConstants.PARQUET_DICT_PAGE_SIZE_VALIDATOR);
    final ParquetProperties parquetProperties = new ParquetProperties(dictionarySize, writerVersion, enableDictionary,
        new ParquetDirectByteBufferAllocator(columnEncoderAllocator), pageSize, true, enableDictionaryForBinary);
    pageStore = ColumnChunkPageWriteStoreExposer.newColumnChunkPageWriteStore(
        codecFactory.getCompressor(codec), schema, parquetProperties);
    store = new ColumnWriteStoreV1(pageStore, pageSize, parquetProperties);
    MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
    consumer = columnIO.getRecordWriter(store);
    setUp(schema, consumer);
}
private void initStore() {
    pageStore = new ColumnChunkPageWriteStore(compressor, schema, props.getAllocator());
    columnStore = props.newColumnWriteStore(schema, pageStore);
    MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
    this.recordConsumer = columnIO.getRecordWriter(columnStore);
    writeSupport.prepareForWrite(recordConsumer);
}
private void initStore() {
    pageStore = new ColumnChunkPageWriteStore(compressor, schema, props.getAllocator(),
        props.getColumnIndexTruncateLength());
    columnStore = props.newColumnWriteStore(schema, pageStore);
    MessageColumnIO columnIO = new ColumnIOFactory(validating).getColumnIO(schema);
    this.recordConsumer = columnIO.getRecordWriter(columnStore);
    writeSupport.prepareForWrite(recordConsumer);
}
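initStore is internal to the writer; user code reaches it through a ParquetWriter builder, which wires the WriteSupport, page store, and column store together. A minimal sketch using the bundled example API (the output path and schema are placeholders):

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

MessageType schema = MessageTypeParser.parseMessageType(
    "message example { required int32 id; required double score; }");

// Building the writer constructs the internal record writer, whose initStore()
// (shown above) sets up the page store, column store, and MessageColumnIO.
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path("/tmp/example.parquet"))
        .withType(schema)
        .build()) {
    SimpleGroupFactory factory = new SimpleGroupFactory(schema);
    writer.write(factory.newGroup().append("id", 1).append("score", 0.5));
}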