+ " ORDER BY col1+col2, col4"); List<Row> rows = dataset.collectAsList(); ResultSet rs = new SparkResultSet(rows, dataset.columns()); assertTrue(rs.next()); assertEquals("a", rs.getString(3));
Dataset<Row> dataset = sqlContext.sql(query);
List<Row> rows = dataset.collectAsList();
ResultSet rs = new SparkResultSet(rows, dataset.columns());
assertTrue(rs.next());
assertEquals("bb", rs.getString(1));
private String getRefFieldName(Map<String, Dataset<Row>> dependencies) {
    return refFieldName == null ? dependencies.get(refStepName).columns()[0] : refFieldName;
}
private String getFieldName(Map<String, Dataset<Row>> dependencies) {
    return fieldName == null ? dependencies.get(getStepName(dependencies)).columns()[0] : fieldName;
}
public static ResultSet executeQuery(Connection conn, QueryBuilder queryBuilder, String url,
        Configuration config) throws SQLException {
    SQLContext sqlContext = getSparkSession().sqlContext();

    boolean forceRowKeyOrder = conn.unwrap(PhoenixConnection.class).getQueryServices().getProps()
            .getBoolean(QueryServices.FORCE_ROW_KEY_ORDER_ATTRIB, false);
    // If we are forcing row key order we have to add an ORDER BY.
    // Here we assume that the required columns are in the primary key column order.
    String prevOrderBy = queryBuilder.getOrderByClause();
    if (forceRowKeyOrder
            && (queryBuilder.getOrderByClause() == null || queryBuilder.getOrderByClause().isEmpty())) {
        queryBuilder.setOrderByClause(Joiner.on(", ").join(queryBuilder.getRequiredColumns()));
    }

    // Create the Phoenix Dataset using the table name and the columns required by the query.
    // Since we don't set the predicate, filtering is done after rows are returned from Spark.
    Dataset phoenixDataSet = getSparkSession().read().format("phoenix")
            .option(DataSourceOptions.TABLE_KEY, queryBuilder.getFullTableName())
            .option(PhoenixDataSource.ZOOKEEPER_URL, url).load();

    phoenixDataSet.createOrReplaceTempView(queryBuilder.getFullTableName());
    Dataset<Row> dataset = sqlContext.sql(queryBuilder.build());
    SparkPlan plan = dataset.queryExecution().executedPlan();
    List<Row> rows = dataset.collectAsList();
    queryBuilder.setOrderByClause(prevOrderBy);
    ResultSet rs = new SparkResultSet(rows, dataset.columns());
    return rs;
}
}
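// A minimal sketch of the forced-row-key-order fallback above: when no ORDER BY clause is
// set, the required columns are joined into a single clause with Guava's Joiner. The column
// names below are illustrative, not taken from any real table.
List<String> requiredColumns = Arrays.asList("ORGANIZATION_ID", "ENTITY_ID", "CREATED_DATE");
String orderByClause = Joiner.on(", ").join(requiredColumns);
// orderByClause is now "ORGANIZATION_ID, ENTITY_ID, CREATED_DATE"
System.out.println("ORDER BY " + orderByClause);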
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
    dependencyCheck(dependencies);
    Dataset<Row> sourceStep = dependencies.get(stepName);

    if (useIncludeFields) {
        if (!Arrays.asList(sourceStep.columns()).containsAll(includeFields)) {
            throw new RuntimeException("Columns specified in " + INCLUDE_FIELDS
                + " are not found in input dependency schema \n"
                + "Available columns: " + Arrays.toString(sourceStep.columns()));
        }
        String firstCol = includeFields.get(0);
        includeFields.remove(0);
        return sourceStep.select(firstCol, includeFields.toArray(new String[0]));
    } else {
        if (!Arrays.asList(sourceStep.columns()).containsAll(excludeFields)) {
            throw new RuntimeException("Columns specified in " + EXCLUDE_FIELDS
                + " are not found in input dependency schema \n"
                + "Available columns: " + Arrays.toString(sourceStep.columns()));
        }
        return sourceStep.drop(JavaConverters.collectionAsScalaIterableConverter(excludeFields).asScala().toSeq());
    }
}
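// A minimal sketch of the same include/exclude behavior with plain Spark APIs, assuming a
// running SparkSession named spark; the column names are made up for illustration.
Dataset<Row> source = spark.sql("SELECT 1 AS id, 'alice' AS name, true AS active");

// "Include" mode: keep only the listed columns.
Dataset<Row> included = source.select("id", "name");

// "Exclude" mode: drop the listed columns and keep everything else.
Dataset<Row> excluded = source.drop("active");

included.show();
excluded.show();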
/**
 * Carry out a union of two {@link Dataset}s where the input Datasets may contain a
 * different number of columns. The resulting Dataset will contain entries for all of
 * the columns found in either input Dataset, with null entries used as placeholders.
 *
 * @param ds1 the first Dataset
 * @param ds2 the second Dataset
 * @return the combined Dataset
 */
public static Dataset<Row> union(final Dataset<Row> ds1, final Dataset<Row> ds2) {
    Set<String> ds1Cols = Sets.newHashSet(ds1.columns());
    Set<String> ds2Cols = Sets.newHashSet(ds2.columns());
    final Set<String> total = Sets.newHashSet(ds1Cols);
    total.addAll(ds2Cols);
    return ds1.select(expr(ds1Cols, total)).union(ds2.select(expr(ds2Cols, total)));
}
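// A sketch of the same null-padding idea using only built-in Spark operations: each side
// adds the columns it is missing as typed nulls before unionByName (Spark 2.3+). Assumes a
// SparkSession named spark and static imports from org.apache.spark.sql.functions; the data
// and column names are illustrative.
Dataset<Row> left = spark.sql("SELECT 1 AS id, 'x' AS name");
Dataset<Row> right = spark.sql("SELECT 2 AS id, 3.5 AS score");

Dataset<Row> leftPadded = left.withColumn("score", lit(null).cast("double"));
Dataset<Row> rightPadded = right.withColumn("name", lit(null).cast("string"));

// unionByName matches columns by name rather than by position.
Dataset<Row> combined = leftPadded.unionByName(rightPadded);
combined.show();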
/**
 * Normalize by zero mean and unit variance.
 *
 * @param frame       the data to normalize
 * @param skipColumns columns to leave untouched
 * @return the data with each remaining column centered to zero mean and scaled to unit variance
 */
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, List<String> skipColumns) {
    List<String> columnsList = DataFrames.toList(frame.get().columns());
    columnsList.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(columnsList);
    // First row is the standard deviation, second row is the mean;
    // each column in a row corresponds to one data column.
    List<Row> stdDevMean = stdDevMeanColumns(frame, columnNames);
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double std = ((Number) stdDevMean.get(0).get(i)).doubleValue();
        double mean = ((Number) stdDevMean.get(1).get(i)).doubleValue();
        if (std == 0.0)
            std = 1; // all values identical -> (x - x) / 1 = 0
        frame = dataRows(frame.get().withColumn(columnName,
                frame.get().col(columnName).minus(mean).divide(std)));
    }
    return frame;
}
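// A sketch of the same standardization for a single column using plain Spark aggregations,
// assuming a Dataset<Row> named df with a numeric column "value" and static imports from
// org.apache.spark.sql.functions; names are illustrative.
Row stats = df.agg(avg("value"), stddev_pop("value")).first();
double mean = stats.getDouble(0);
double std = stats.getDouble(1);
if (std == 0.0) std = 1.0; // constant column -> avoid division by zero, same guard as above
Dataset<Row> standardized = df.withColumn("value", df.col("value").minus(mean).divide(std));
standardized.show();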
/**
 * Scale each column to the range [min, max].
 *
 * @param dataFrame   the dataframe to scale
 * @param min         the minimum value of the target range
 * @param max         the maximum value of the target range
 * @param skipColumns columns to leave untouched
 * @return the dataframe normalized per column
 */
public static DataRowsFacade normalize(DataRowsFacade dataFrame, double min, double max, List<String> skipColumns) {
    List<String> columnsList = DataFrames.toList(dataFrame.get().columns());
    columnsList.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(columnsList);
    // First row is the minimum, second row is the maximum;
    // each column in a row corresponds to one data column.
    List<Row> minMax = minMaxColumns(dataFrame, columnNames);
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double dMin = ((Number) minMax.get(0).get(i)).doubleValue();
        double dMax = ((Number) minMax.get(1).get(i)).doubleValue();
        double maxSubMin = dMax - dMin;
        if (maxSubMin == 0)
            maxSubMin = 1; // constant column -> avoid division by zero
        Column newCol = dataFrame.get().col(columnName).minus(dMin).divide(maxSubMin)
                .multiply(max - min).plus(min);
        dataFrame = dataRows(dataFrame.get().withColumn(columnName, newCol));
    }
    return dataFrame;
}
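// A sketch of the same min-max rescaling for one column into a [targetMin, targetMax] range
// with plain Spark, assuming a Dataset<Row> named df with a numeric column "value" and static
// imports from org.apache.spark.sql.functions; names are illustrative.
double targetMin = 0.0;
double targetMax = 1.0;
Row bounds = df.agg(min("value"), max("value")).first();
double dMin = ((Number) bounds.get(0)).doubleValue();
double dMax = ((Number) bounds.get(1)).doubleValue();
double range = dMax - dMin;
if (range == 0) range = 1; // constant column, same guard as above
Column scaled = df.col("value").minus(dMin).divide(range).multiply(targetMax - targetMin).plus(targetMin);
Dataset<Row> rescaled = df.withColumn("value", scaled);
rescaled.show();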
@Test
public void fitAndTransform() {
    KMeans kmeans = new KMeans().setK(k).setSeed(1);
    KMeansModel model = kmeans.fit(dataset);
    Vector[] centers = model.clusterCenters();
    assertEquals(k, centers.length);

    Dataset<Row> transformed = model.transform(dataset);
    List<String> columns = Arrays.asList(transformed.columns());
    List<String> expectedColumns = Arrays.asList("features", "prediction");
    for (String column : expectedColumns) {
        assertTrue(columns.contains(column));
    }
}
}
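// A sketch of how a dataset with the "features" column expected by the test above could be
// assembled; the raw column names and the VectorAssembler usage are assumptions for
// illustration, not part of the original test fixture.
Dataset<Row> raw = spark.sql(
    "SELECT 0.0 AS x, 0.1 AS y UNION ALL SELECT 9.0 AS x, 9.1 AS y");
VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"x", "y"})
    .setOutputCol("features");
Dataset<Row> dataset = assembler.transform(raw);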
@Test
public void verifyLibSVMDF() {
    Dataset<Row> dataset = spark.read().format("libsvm").option("vectorType", "dense")
        .load(path);
    Assert.assertEquals("label", dataset.columns()[0]);
    Assert.assertEquals("features", dataset.columns()[1]);
    Row r = dataset.first();
    Assert.assertEquals(1.0, r.getDouble(0), 1e-15);
    DenseVector v = r.getAs(1);
    Assert.assertEquals(Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0), v);
}
}
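// For reference, a LIBSVM line consistent with the assertions above (label followed by
// 1-based index:value pairs, with zeros implicit) would be:
//   1 1:1.0 3:2.0 5:3.0
// This is an assumed sample, not the actual contents of the file at `path`.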
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(filename);
    df.show();

    // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
        String oldColName = "_c" + i;
        String newColName = "C" + i;
        df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
    }
    df.show();
}
}
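// An alternative sketch of the renaming in the previous snippet using withColumnRenamed,
// which exists in both Spark 1.6.x and 2.x; behaviorally equivalent to the withColumn/drop loop.
for (int i = 0; i < df.columns().length; i++) {
    df = df.withColumnRenamed("_c" + i, "C" + i);
}
df.show();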