public long getNumRows() throws DDFException {
  return this.getMetaDataHandler().getNumRows();
}
public DDF binning(String column, String binningType, int numBins, double[] breaks, boolean includeLowest,
    boolean right) throws DDFException {
  DDF newddf = binningImpl(column, binningType, numBins, breaks, includeLowest, right);
  newddf.getMetaDataHandler().copyFactor(this.getDDF());
  return newddf;
}
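// A hedged usage sketch for the binning API above. The binningType string and the
// calling convention are assumptions for illustration, not confirmed constants:
//
//   // bin "age" into 5 buckets; factor metadata is carried over from the source DDF
//   DDF binned = ddf.binning("age", "EQUALINTERVAL", 5, null, true, false);
//
// Passing an explicit breaks array (with a binningType selecting custom breaks)
// would bin against those boundaries instead of numBins.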
@Override
public DDF transformScaleMinMax() throws DDFException {
  Summary[] summaryArr = this.getDDF().getSummary();
  List<Column> columns = this.getDDF().getSchema().getColumns();

  // Compose a transformation query
  StringBuffer sqlCmdBuffer = new StringBuffer("SELECT ");
  for (int i = 0; i < columns.size(); i++) {
    Column col = columns.get(i);
    if (!col.isNumeric() || col.getColumnClass() == ColumnClass.FACTOR) {
      sqlCmdBuffer.append(col.getName()).append(" ");
    } else {
      // subtract min, divide by (max - min)
      sqlCmdBuffer.append(String.format("((%s - %s) / %s) as %s ", col.getName(), summaryArr[i].min(),
          (summaryArr[i].max() - summaryArr[i].min()), col.getName()));
    }
    sqlCmdBuffer.append(",");
  }
  sqlCmdBuffer.setLength(sqlCmdBuffer.length() - 1);
  sqlCmdBuffer.append("FROM ").append(this.getDDF().getTableName());

  DDF newddf = this.getManager().sql2ddf(sqlCmdBuffer.toString(), false);
  newddf.getMetaDataHandler().copyFactor(this.getDDF());
  return newddf;
}
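// A minimal sketch of the per-value transform the min-max query above encodes,
// assuming plain doubles; the helper name is illustrative, not part of the DDF API:
public static double scaleMinMax(double x, double min, double max) {
  // Maps x from [min, max] onto [0, 1]; callers must ensure max > min,
  // mirroring the SQL expression ((col - min) / (max - min)).
  return (x - min) / (max - min);
}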
@Override
public DDF subset(List<Column> columnExpr, Expression filter) throws DDFException {
  DDF subset = _subset(columnExpr, filter);
  subset.getMetaDataHandler().copyFactor(this.getDDF(), this.getDDF().getColumnNames());
  return subset;
}
@Override
public DDF transformScaleStandard() throws DDFException {
  Summary[] summaryArr = this.getDDF().getSummary();
  List<Column> columns = this.getDDF().getSchema().getColumns();

  // Compose a transformation query
  StringBuffer sqlCmdBuffer = new StringBuffer("SELECT ");
  for (int i = 0; i < columns.size(); i++) {
    Column col = columns.get(i);
    if (!col.isNumeric() || col.getColumnClass() == ColumnClass.FACTOR) {
      // The trailing space keeps the column name separated from "FROM" once the
      // final comma is stripped below.
      sqlCmdBuffer.append(col.getName()).append(" ");
    } else {
      // subtract mean, divide by stdev
      sqlCmdBuffer.append(String.format("((%s - %s) / %s) as %s ", col.getName(), summaryArr[i].mean(),
          summaryArr[i].stdev(), col.getName()));
    }
    sqlCmdBuffer.append(",");
  }
  sqlCmdBuffer.setLength(sqlCmdBuffer.length() - 1);
  sqlCmdBuffer.append("FROM ").append(this.getDDF().getTableName());

  DDF newddf = this.getManager().sql2ddf(sqlCmdBuffer.toString(), false);
  newddf.getMetaDataHandler().copyFactor(this.getDDF());
  return newddf;
}
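// The standard-scaling counterpart of the sketch above, again illustrative only:
public static double scaleStandard(double x, double mean, double stdev) {
  // Produces a z-score (zero mean, unit variance); stdev must be non-zero,
  // mirroring the SQL expression ((col - mean) / stdev).
  return (x - mean) / stdev;
}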
@Override
public DDF removeColumns(List<String> columnNames) throws DDFException {
  if (columnNames == null || columnNames.isEmpty()) throw new DDFException("columnNames must be specified");

  List<String> columns = this.getDDF().getColumnNames();
  for (String columnName : columnNames) {
    if (!columns.contains(columnName)) {
      throw new DDFException(String.format("Column %s does not exist", columnName));
    }
  }

  // Keep every current column not scheduled for removal, then project onto those.
  columns.removeAll(columnNames);

  DDF newddf = this.project(columns);
  if (this.getDDF().isMutable()) {
    this.getDDF().updateInplace(newddf);
    return this.getDDF();
  } else {
    newddf.getMetaDataHandler().copyFactor(this.getDDF());
    return newddf;
  }
}
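// Hedged usage sketch (the receiver name is an assumption for illustration):
//
//   DDF trimmed = handler.removeColumns(Arrays.asList("b", "c"));
//
// On a mutable DDF the removal happens in place and the same DDF is returned;
// on an immutable DDF a new projected DDF is returned with factor metadata copied over.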
@Override
public DDF loadFromJDBC(JDBCDataSourceDescriptor dataSource) throws DDFException {
  SparkDDFManager sparkDDFManager = (SparkDDFManager) mDDFManager;
  HiveContext sqlContext = sparkDDFManager.getHiveContext();
  JDBCDataSourceCredentials cred = (JDBCDataSourceCredentials) dataSource.getDataSourceCredentials();

  String fullURL = dataSource.getDataSourceUri().getUri().toString();
  if (cred.getUsername() != null && !cred.getUsername().equals("")) {
    fullURL += String.format("?user=%s&password=%s", cred.getUsername(), cred.getPassword());
  }

  Map<String, String> options = new HashMap<String, String>();
  options.put("url", fullURL);
  options.put("dbtable", dataSource.getDbTable());
  DataFrame df = sqlContext.load("jdbc", options);

  DDF ddf = sparkDDFManager.newDDF(sparkDDFManager, df, new Class<?>[] { DataFrame.class }, null,
      SparkUtils.schemaFromDataFrame(df));

  // TODO?
  ddf.getRepresentationHandler().get(RDD.class, Row.class);
  ddf.getMetaDataHandler().setDataSourceDescriptor(dataSource);
  return ddf;
}
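// For illustration, the full URL composed above would take this shape for a
// hypothetical MySQL source (credentials appended as query parameters):
//
//   jdbc:mysql://host:3306/db?user=alice&password=secret
//
// Note the assumptions baked into the code: the base URI carries no query string
// of its own, and the password is not URL-encoded.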
@Override
public DDF copy() throws DDFException {
  // "@this" is resolved by sql2ddf to the current DDF's own table.
  DDF newDDF = this.sql2ddf("select * from @this");
  newDDF.getMetaDataHandler().copy(this.getMetaDataHandler());
  newDDF.getPersistenceHandler().setPersistable(this.getPersistenceHandler().isPersistable());
  return newDDF;
}
/**
 * Fills NAs with given values. The default mode fills every NA with a single scalar,
 * e.g. fillNA(value, null, 0, null, null, null).
 *
 * @param value a scalar value to fill all NAs
 * @param method 'ffill' for forward fill or 'bfill' for backward fill
 * @param limit maximum size gap for forward or backward fill
 * @param function aggregate function to generate the filled value for a column
 * @param columnsToValues a map providing different fill values for different columns
 * @param columns only consider NA filling on the given columns; null means all columns of the DDF
 * @return a DDF with NAs filled
 */
@Override
public DDF fillNA(String value, FillMethod method, long limit, AggregateFunction function,
    Map<String, String> columnsToValues, List<String> columns) throws DDFException {
  DDF newddf = null;
  if (columns == null) {
    columns = this.getDDF().getColumnNames();
  }

  if (method == null) {
    String sqlCmd = fillNAWithValueSQL(value, function, columnsToValues, columns);
    mLog.info("FillNA sql command: " + sqlCmd);
    newddf = this.getManager().sql2ddf(String.format(sqlCmd, this.getDDF().getTableName()), false);
  } else {
    // Interpolation methods ('ffill'/'bfill') are not implemented yet; fail fast
    // rather than falling through to a NullPointerException below.
    // TODO: implement forward/backward fill
    throw new DDFException("fillNA with method " + method + " is not supported yet");
  }

  newddf.getMetaDataHandler().copyFactor(this.getDDF());
  return newddf;
}
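// Hedged usage sketches for fillNA, following the signature above; the column
// names and the AggregateFunction constant are assumptions for illustration:
//
//   // fill every NA in all columns with the scalar 0
//   ddf.fillNA("0", null, 0, null, null, null);
//
//   // fill NAs per column using an aggregate (e.g. a mean-like function)
//   ddf.fillNA(null, null, 0, AggregateFunction.MEAN, null, Arrays.asList("age", "salary"));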
public DDF transformUDF(List<String> RExps, List<String> columns) throws DDFException {
  String sqlCmd = String.format("SELECT %s FROM %s",
      RToSqlUdf(RExps, columns, this.getDDF().getSchema().getColumns()), "{1}");
  DDF newddf = this.getManager().sql2ddf(sqlCmd,
      new SQLDataSourceDescriptor(sqlCmd, null, null, null, this.getDDF().getUUID().toString()));

  if (this.getDDF().isMutable()) {
    return this.getDDF().updateInplace(newddf);
  } else {
    newddf.getMetaDataHandler().copyFactor(this.getDDF());
    return newddf;
  }
}
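// Hedged example of what the composed command might look like. Assuming RToSqlUdf
// turns an R-style expression such as "z = x + y" into a SQL select expression
// (this exact output shape is an assumption, not confirmed from the source):
//
//   SELECT *, (x + y) as z FROM {1}
//
// The "{1}" placeholder is later resolved to this DDF's table through the
// SQLDataSourceDescriptor that carries the DDF's UUID.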
@Override
public DDF updateInplace(DDF newddf) throws DDFException {
  // Copy the content of newddf into this DDF: representations, factor metadata, schema.
  DDF curDDF = this.getDDF();
  curDDF.getRepresentationHandler().reset();
  curDDF.getRepresentationHandler().setRepresentations(newddf.getRepresentationHandler().getAllRepresentations());
  newddf.getMetaDataHandler().copyFactor(this.getDDF());
  curDDF.getSchemaHandler().setSchema(newddf.getSchema());
  return curDDF;
}