/** * Normalize by zero mean unit variance * * @param frame the data to normalize * @return a zero mean unit variance centered * rdd */ public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(frame.get().columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is std second row is mean, each column in a row is for a particular column List<Row> stdDevMean = stdDevMeanColumns(frame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double std = ((Number) stdDevMean.get(0).get(i)).doubleValue(); double mean = ((Number) stdDevMean.get(1).get(i)).doubleValue(); if (std == 0.0) std = 1; //All same value -> (x-x)/1 = 0 frame = dataRows(frame.get().withColumn(columnName, frame.get().col(columnName).minus(mean).divide(std))); } return frame; }
/** * Normalize by zero mean unit variance * * @param frame the data to normalize * @return a zero mean unit variance centered * rdd */ public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(frame.get().columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is std second row is mean, each column in a row is for a particular column List<Row> stdDevMean = stdDevMeanColumns(frame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double std = ((Number) stdDevMean.get(0).get(i)).doubleValue(); double mean = ((Number) stdDevMean.get(1).get(i)).doubleValue(); if (std == 0.0) std = 1; //All same value -> (x-x)/1 = 0 frame = dataRows(frame.get().withColumn(columnName, frame.get().col(columnName).minus(mean).divide(std))); } return frame; }
/** * Scale based on min,max * * @param dataFrame the dataframe to scale * @param min the minimum value * @param max the maximum value * @return the normalized dataframe per column */ public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(dataFrame.get().columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is min second row is max, each column in a row is for a particular column List<Row> minMax = minMaxColumns(dataFrame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double dMin = ((Number) minMax.get(0).get(i)).doubleValue(); double dMax = ((Number) minMax.get(1).get(i)).doubleValue(); double maxSubMin = (dMax - dMin); if (maxSubMin == 0) maxSubMin = 1; Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min); dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol)); } return dataFrame; }
/** * Scale based on min,max * * @param dataFrame the dataframe to scale * @param min the minimum value * @param max the maximum value * @return the normalized dataframe per column */ public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) { List<String> columnsList = DataFrames.toList(dataFrame.get().columns()); columnsList.removeAll(skipColumns); String[] columnNames = DataFrames.toArray(columnsList); //first row is min second row is max, each column in a row is for a particular column List<Row> minMax = minMaxColumns(dataFrame, columnNames); for (int i = 0; i < columnNames.length; i++) { String columnName = columnNames[i]; double dMin = ((Number) minMax.get(0).get(i)).doubleValue(); double dMax = ((Number) minMax.get(1).get(i)).doubleValue(); double maxSubMin = (dMax - dMin); if (maxSubMin == 0) maxSubMin = 1; Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin).multiply(max - min).plus(min); dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol)); } return dataFrame; }