/** * Scale based on min,max * * @param dataFrame the dataframe to scale * @param min the minimum value * @param max the maximum value * @return the normalized dataframe per column */ public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(dataFrame.get().columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is min second row is max, each column in a row is for a particular column List<Row> minMax = minMaxColumns(dataFrame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double dMin = ((Number) minMax.get(0).get(i)).doubleValue(); double dMax = ((Number) minMax.get(1).get(i)).doubleValue(); double maxSubMin = (dMax - dMin); if (maxSubMin == 0) maxSubMin = 1; Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min); dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol)); } return dataFrame; }
/** * Scale based on min,max * * @param dataFrame the dataframe to scale * @param min the minimum value * @param max the maximum value * @return the normalized dataframe per column */ public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(dataFrame.get().columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is min second row is max, each column in a row is for a particular column List<Row> minMax = minMaxColumns(dataFrame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double dMin = ((Number) minMax.get(0).get(i)).doubleValue(); double dMax = ((Number) minMax.get(1).get(i)).doubleValue(); double maxSubMin = (dMax - dMin); if (maxSubMin == 0) maxSubMin = 1; Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min); dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol)); } return dataFrame; }
// Projects two columns from df -- "name" as-is and "age" with 1 added to it --
// then calls show() on the resulting selection.
// NOTE(review): df and col(...) are declared outside this snippet; presumably a
// Spark DataFrame/Dataset and org.apache.spark.sql.functions.col -- confirm.
df.select(col("name"), col("age").plus(1)).show();