/**
 * Reads the SVM intercept value from a parquet row.
 *
 * @param g Intercept group.
 * @return The intercept stored at field 1, index 0.
 */
private static double readSVMInterceptor(SimpleGroup g) {
    final double intercept = g.getDouble(1, 0);
    return intercept;
}
/**
 * Reads the intercept value from a parquet row.
 *
 * @param g Intercept group.
 * @return The scalar intercept extracted from the nested intercept vector.
 */
private static double readInterceptor(SimpleGroup g) {
    // Unwrap the nesting: field 2 -> field 3 -> first element -> scalar value.
    return g.getGroup(2, 0)
        .getGroup(3, 0)
        .getGroup(0, 0)
        .getDouble(0, 0);
}
/**
 * Forms the node data from a parquet row of a Spark decision-tree model.
 *
 * @param g The group presenting the node data from the Spark DT model.
 * @return Parsed node data; leaf nodes get featureIdx/threshold of -1.
 */
@NotNull private static SparkModelParser.NodeData extractNodeDataFromParquetRow(SimpleGroup g) {
    final NodeData nodeData = new NodeData();

    nodeData.id = g.getInteger(0, 0);
    nodeData.prediction = g.getDouble(1, 0);
    nodeData.leftChildId = g.getInteger(5, 0);
    nodeData.rightChildId = g.getInteger(6, 0);

    // A node with no children on either side is a leaf.
    final boolean isLeaf = nodeData.leftChildId == -1 && nodeData.rightChildId == -1;

    if (isLeaf) {
        nodeData.featureIdx = -1;
        nodeData.threshold = -1;
        nodeData.isLeafNode = true;
    }
    else {
        // Interior node: split description lives in the nested group at field 7.
        final SimpleGroup splitGrp = (SimpleGroup)g.getGroup(7, 0);

        nodeData.featureIdx = splitGrp.getInteger(0, 0);
        nodeData.threshold = splitGrp.getGroup(1, 0).getGroup(0, 0).getDouble(0, 0);
    }

    return nodeData;
}
/**
 * Writes {@code elementNum} rows of repeated int32 list test data.
 *
 * @param writer Parquet writer; always closed by this method, even on failure.
 * @param elementNum Number of rows to write.
 * @param isNull If {@code true}, the repeated field is left empty in every row.
 * @throws IOException On write or close failure.
 */
protected static void writeRepeateListData(ParquetWriter<Group> writer, int elementNum, boolean isNull)
    throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    int listMaxSize = 4;
    try {
        for (int i = 0; i < elementNum; i++) {
            Group group = f.newGroup();
            if (!isNull) {
                for (int j = 0; j < listMaxSize; j++) {
                    group.append("list_int32_field_for_repeat_test", j);
                }
            }
            writer.write(group);
        }
    } finally {
        // Fix: previously the writer leaked when write() threw; close unconditionally.
        writer.close();
    }
}
// NOTE(review): fragment — this span is syntactically unbalanced and appears to be
// two overlapping chunks of a tree-ensemble parquet reader merged together:
// `recordReader`, `i` and `g` are redeclared inside the nested loop, which would
// not compile as-is. Kept byte-identical pending access to the full method.
// First chunk: reads per-tree weights keyed by tree id.
final RecordReader recordReader = colIO.getRecordReader(pagesMetaData, new GroupRecordConverter(schema));
for (int i = 0; i < rows; i++) {
    final SimpleGroup g = (SimpleGroup)recordReader.read();
    int treeId = g.getInteger(0, 0);
    double treeWeight = g.getDouble(2, 0);
    treeWeightsByTreeID.put(treeId, treeWeight);
    // Second chunk: iterates row groups and reads node data per tree.
    while (null != (pages = r.readNextRowGroup())) {
        final long rows = pages.getRowCount();
        final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
        for (int i = 0; i < rows; i++) {
            final SimpleGroup g = (SimpleGroup)recordReader.read();
            final int treeID = g.getInteger(0, 0);
            final SimpleGroup nodeDataGroup = (SimpleGroup)g.getGroup(1, 0);
            NodeData nodeData = extractNodeDataFromParquetRow(nodeDataGroup);
// NOTE(review): fragment — three orphan statements cut from a larger row-group
// reading loop (`g` is used before any visible declaration). Kept byte-identical.
final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
// Tree id is at field 0; the node-data group is nested at field 1.
final int treeID = g.getInteger(0, 0);
final SimpleGroup nodeDataGroup = (SimpleGroup)g.getGroup(1, 0);
private static Model loadKMeansModel(String pathToMdl) { Vector[] centers = null; try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) { PageReadStore pages; final MessageType schema = r.getFooter().getFileMetaData().getSchema(); final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema); while (null != (pages = r.readNextRowGroup())) { final int rows = (int)pages.getRowCount(); final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema)); centers = new DenseVector[rows]; for (int i = 0; i < rows; i++) { final SimpleGroup g = (SimpleGroup)recordReader.read(); // final int clusterIdx = g.getInteger(0, 0); Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0); final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0); centers[i] = new DenseVector(amountOfCoefficients); for (int j = 0; j < amountOfCoefficients; j++) { double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0); centers[i].set(j, coefficient); } } } } catch (IOException e) { System.out.println("Error reading parquet file."); e.printStackTrace(); } return new KMeansModel(centers, new EuclideanDistance()); }
/** {@inheritDoc} */
@Override public int getInteger(int fieldIndex, int index) {
    final IntegerValue val = (IntegerValue)getValue(fieldIndex, index);
    return val.getInteger();
}
/** {@inheritDoc} */
@Override public void add(int fieldIndex, boolean value) {
    // Box the primitive into its parquet value wrapper before delegating.
    final BooleanValue wrapped = new BooleanValue(value);
    add(fieldIndex, wrapped);
}
/** {@inheritDoc} */
@Override public void add(int fieldIndex, float value) {
    // Box the primitive into its parquet value wrapper before delegating.
    final FloatValue wrapped = new FloatValue(value);
    add(fieldIndex, wrapped);
}
/**
 * Reads the coefficient vector from parquet.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readCoefficients(SimpleGroup g) {
    // Hoist the loop-invariant nested lookup out of the loop; the original
    // re-evaluated g.getGroup(3, 0).getGroup(5, 0) on every iteration. This also
    // matches the style of readLinRegCoefficients/readSVMCoefficients.
    final Group coeffGroup = g.getGroup(3, 0).getGroup(5, 0);
    final int amountOfCoefficients = coeffGroup.getFieldRepetitionCount(0);

    Vector coefficients = new DenseVector(amountOfCoefficients);

    for (int j = 0; j < amountOfCoefficients; j++) {
        double coefficient = coeffGroup.getGroup(0, j).getDouble(0, 0);
        coefficients.set(j, coefficient);
    }

    return coefficients;
}
/**
 * Writes {@code elementNum} rows of repeated int32 key/value map test data.
 *
 * @param writer Parquet writer; always closed by this method, even on failure.
 * @param elementNum Number of rows to write.
 * @param isNull If {@code true}, the map field is left empty in every row.
 * @throws IOException On write or close failure.
 */
protected static void writeRepeateMapData(
    ParquetWriter<Group> writer, int elementNum, boolean isNull) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    int mapMaxSize = 4;
    try {
        for (int i = 0; i < elementNum; i++) {
            Group group = f.newGroup();
            if (!isNull) {
                for (int j = 0; j < mapMaxSize; j++) {
                    group.addGroup("map_int32_for_repeat_test").append("key", j).append("value", j);
                }
            }
            writer.write(group);
        }
    } finally {
        // Fix: previously the writer leaked when write() threw; close unconditionally.
        writer.close();
    }
}
/**
 * Reads the linear-regression coefficient vector from parquet.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readLinRegCoefficients(SimpleGroup g) {
    // Coefficients are nested under field 1 -> field 3 as a repeated group.
    final Group coeffGrp = g.getGroup(1, 0).getGroup(3, 0);
    final int cnt = coeffGrp.getFieldRepetitionCount(0);

    final Vector res = new DenseVector(cnt);

    int idx = 0;
    while (idx < cnt) {
        res.set(idx, coeffGrp.getGroup(0, idx).getDouble(0, 0));
        idx++;
    }

    return res;
}
/**
 * Reads the linear-regression intercept value from a parquet row.
 *
 * @param g Intercept group.
 * @return The intercept stored at field 0, index 0.
 */
private static double readLinRegInterceptor(SimpleGroup g) {
    final double intercept = g.getDouble(0, 0);
    return intercept;
}
/**
 * Writes {@code elementNum} rows of list test data covering every supported
 * element type, plus a constant-per-row binary list for repeat tests.
 *
 * @param writer Parquet writer; always closed by this method, even on failure.
 * @param isDictionaryEncoding Whether generated values should be dictionary-friendly.
 * @param elementNum Number of rows to write.
 * @throws IOException On write or close failure.
 */
protected static void writeListData(ParquetWriter<Group> writer, boolean isDictionaryEncoding,
    int elementNum) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    int listMaxSize = 4;
    int listElementIndex = 0;
    try {
        for (int i = 0; i < elementNum; i++) {
            boolean isNull = isNull(i);
            Group group = f.newGroup();
            // Vary list length per row; null rows get no list elements at all.
            int listSize = i % listMaxSize + 1;
            if (!isNull) {
                for (int j = 0; j < listSize; j++) {
                    group.append("list_int32_field", getIntValue(isDictionaryEncoding, listElementIndex));
                    group.append("list_int64_field", getLongValue(isDictionaryEncoding, listElementIndex));
                    group.append("list_double_field", getDoubleValue(isDictionaryEncoding, listElementIndex));
                    group.append("list_float_field", getFloatValue(isDictionaryEncoding, listElementIndex));
                    group.append("list_boolean_field", getBooleanValue(listElementIndex));
                    group.append("list_binary_field", getBinaryValue(isDictionaryEncoding, listElementIndex));
                    // Decimals are stored via their unscaled internal byte representation.
                    HiveDecimal hd = getDecimal(isDictionaryEncoding, listElementIndex).setScale(2);
                    HiveDecimalWritable hdw = new HiveDecimalWritable(hd);
                    group.append("list_decimal_field", Binary.fromConstantByteArray(hdw.getInternalStorage()));
                    listElementIndex++;
                }
            }
            // The repeat-test list always has listMaxSize copies of the same value.
            for (int j = 0; j < listMaxSize; j++) {
                group.append("list_binary_field_for_repeat_test", getBinaryValue(isDictionaryEncoding, i));
            }
            writer.write(group);
        }
    } finally {
        // Fix: previously the writer leaked when write() threw; close unconditionally.
        writer.close();
    }
}
/**
 * Reads the SVM coefficient vector from parquet.
 *
 * @param g Coefficient group.
 * @return Vector of coefficients.
 */
private static Vector readSVMCoefficients(SimpleGroup g) {
    // Coefficients are nested under field 0 -> field 3 as a repeated group.
    final Group coeffGrp = g.getGroup(0, 0).getGroup(3, 0);
    final int cnt = coeffGrp.getFieldRepetitionCount(0);

    final Vector res = new DenseVector(cnt);

    int idx = 0;
    while (idx < cnt) {
        res.set(idx, coeffGrp.getGroup(0, idx).getDouble(0, 0));
        idx++;
    }

    return res;
}
// NOTE(review): fragment — the method body is cut off right after newGroup();
// the map-populating loop and writer calls are outside this view. Kept byte-identical.
protected static void writeMapData(ParquetWriter<Group> writer, boolean isDictionaryEncoding,
    int elementNum) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    int mapMaxSize = 4;
    int mapElementIndex = 0;
    for (int i = 0; i < elementNum; i++) {
        boolean isNull = isNull(i);
        Group group = f.newGroup();
// NOTE(review): fragment — cut off mid-builder; `intVal`/`longVal` are referenced
// before any visible declaration, so the missing middle presumably computes them
// (TODO confirm against the full file). Kept byte-identical.
protected static void writeData(ParquetWriter<Group> writer, boolean isDictionaryEncoding)
    throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    for (int i = 0; i < nElements; i++) {
        boolean isNull = isNull(i);
        boolean booleanVal = getBooleanValue(i);
        Binary binary = getBinaryValue(isDictionaryEncoding, i);
        Group group = f.newGroup()
            .append("int32_field", intVal)
            .append("int64_field", longVal)