public static void main(String[] args) throws Exception {
  // The usage message names two arguments, so check for 2, not 3.
  if (args.length != 2) {
    throw new Exception("Usage: LoadHive sparkMaster tbl");
  }
  String master = args[0];
  String tbl = args[1];
  JavaSparkContext sc = new JavaSparkContext(
    master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  // Querying a Hive table requires a HiveContext (a SQLContext subclass) in Spark 1.x.
  SQLContext sqlCtx = new HiveContext(sc);
  // Query the table passed on the command line instead of a hardcoded name.
  DataFrame rows = sqlCtx.sql("SELECT key, value FROM " + tbl);
  JavaRDD<Integer> squaredKeys = rows.toJavaRDD().map(new SquareKey());
  List<Integer> result = squaredKeys.collect();
  for (Integer elem : result) {
    System.out.println(elem);
  }
}
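// SquareKey is referenced above but not defined in this snippet; a minimal
// sketch, assuming the key column is an int in position 0 of each Row and
// using org.apache.spark.api.java.function.Function:
public static class SquareKey implements Function<Row, Integer> {
  @Override
  public Integer call(Row row) throws Exception {
    int key = row.getInt(0);
    return key * key;
  }
}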
DataFrame topTweets = sqlCtx.sql(
  "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
Row[] result = topTweets.collect();
for (Row row : result) {
  System.out.println(row.get(0));
}
DataFrame tweetLength = sqlCtx.sql(
  "SELECT stringLengthJava('text') FROM tweets LIMIT 10");
Row[] lengths = tweetLength.collect();
// Iterate over the UDF results, not the earlier result array.
for (Row row : lengths) {
  System.out.println(row.get(0));
}
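// The stringLengthJava UDF used above must be registered before the query runs;
// a sketch using the Spark 1.x Java UDF API (only the UDF name comes from the
// snippet, the body is an assumption):
sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {
  @Override
  public Integer call(String str) throws Exception {
    return str.length();
  }
}, DataTypes.IntegerType);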
@Override
public void saveDataFrame(DataFrame dataFrame, Class<?> entityClazz, Map<String, Object> properties) {
  // Switch to the target keyspace/database before writing into the table.
  dataFrame.sqlContext().sql("use " + (String) properties.get(KEYSPACE));
  dataFrame.write().insertInto((String) properties.get(TABLE));
}
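// A hypothetical call site for the method above; the saver instance, the
// entity class, and the property values are illustrative assumptions:
Map<String, Object> properties = new HashMap<>();
properties.put(KEYSPACE, "analytics");
properties.put(TABLE, "events");
saver.saveDataFrame(dataFrame, Event.class, properties);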
/**
 * Executes a SparkSQL query over the configured SQLContext.
 *
 * @param query SparkSQL query.
 * @return A DataFrame containing the result of the executed query.
 */
public DataFrame sql(String query) {
  return sqlContext.sql(query);
}
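// A hypothetical call site for the wrapper above; "executor" and the table
// name are assumptions, not from the original:
DataFrame top = executor.sql("SELECT key, value FROM src LIMIT 10");
top.show();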
phoenixDataSet.createOrReplaceTempView(tableName);
Dataset<Row> dataset = sqlContext.sql(
  "SELECT col1+col2, col4, a_string FROM " + tableName + " ORDER BY col1+col2, col4");
List<Row> rows = dataset.collectAsList();
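// phoenixDataSet is not built in this snippet; presumably it is loaded through
// the Phoenix data source as in the executeQuery helper below (sparkSession and
// zkUrl are assumed names):
Dataset<Row> phoenixDataSet = sparkSession.read().format("phoenix")
  .option(DataSourceOptions.TABLE_KEY, tableName)
  .option(PhoenixDataSource.ZOOKEEPER_URL, zkUrl)
  .load();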
Dataset<Row> teenagers = sqlContext.sql(
  "SELECT name FROM people WHERE country = 'USA' AND age >= 13 AND age <= 19");
// Assign the second query so teenagers2 is defined before it is mapped.
Dataset<Row> teenagers2 = sqlContext.sql(
  "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
List<String> teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {
  @Override
  public String call(Row row) {
    return "Name: " + row.getString(0);
  }
}).collect();
Dataset<Row> teenagers3 = sqlContext.sql(
  "SELECT name FROM people WHERE age >= 13 AND age <= 19");
Dataset<Row> peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2");
List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {
  @Override
  public String call(Row row) {
    return "Name: " + row.getString(0) + ", City: " + row.getString(1);
  }
}).collect();
Dataset<Row> teenagers = sqlContext.sql(
  "SELECT name FROM people WHERE country = 'USA' AND age >= 13 AND age <= 19");
// Assign the second query so teenagers2 is defined before it is mapped.
Dataset<Row> teenagers2 = sqlContext.sql(
  "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
List<String> teenagerNames = teenagers2.toJavaRDD()
  .map((Row row) -> "Name: " + row.getString(0)).collect();
Dataset<Row> teenagers3 = sqlContext.sql(
  "SELECT name FROM people WHERE age >= 13 AND age <= 19");
Dataset<Row> peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2");
List<String> nameAndCity = peopleWithCity.toJavaRDD()
  .map((Row row) -> "Name: " + row.getString(0) + ", City: " + row.getString(1)).collect();
// The start of this query string is truncated in the source.
    + tableName2 + " order by `cf2.d`";
Dataset<Row> dataset = sqlContext.sql(query);
List<Row> rows = dataset.collectAsList();
ResultSet rs = new SparkResultSet(rows, dataset.columns());
+ " T2 ON T1.A_STRING = T2.A_STRING ORDER BY T1.`CF1.B`"; Dataset<Row> dataset = sqlContext.sql(query); List<Row> rows = dataset.collectAsList(); ResultSet rs = new SparkResultSet(rows, dataset.columns()); "SELECT T1.A_STRING, T2.COL1 FROM " + tableName1 + " T1 JOIN " + tableName2 + " T2 ON T1.A_STRING = T2.A_STRING ORDER BY T2.COL1"; dataset = sqlContext.sql(query); rows = dataset.collectAsList(); rs = new SparkResultSet(rows, dataset.columns());
phoenixDataSet.createOrReplaceTempView(tableName);
Dataset<Row> dataset = sqlContext.sql(
  "SELECT A_STRING, `CF1.A`, `CF1.B`, COL1, `CF2.C`, `CF2.D`, COL2 FROM "
  + tableName + " ORDER BY `CF1.A`,`CF2.C`");
List<Row> rows = dataset.collectAsList();
// Reassign dataset so the second query's results are the ones collected.
dataset = sqlContext.sql(
  "SELECT A_STRING, `CF1.A`, `CF1.B`, COL1, `CF2.C`, `CF2.D`, COL2 FROM "
  + tableName + " ORDER BY COL2");
rows = dataset.collectAsList();
public static ResultSet executeQuery(Connection conn, QueryBuilder queryBuilder, String url,
    Configuration config) throws SQLException {
  SQLContext sqlContext = getSparkSession().sqlContext();
  boolean forceRowKeyOrder = conn.unwrap(PhoenixConnection.class).getQueryServices().getProps()
    .getBoolean(QueryServices.FORCE_ROW_KEY_ORDER_ATTRIB, false);
  // If we are forcing row key order we have to add an ORDER BY;
  // here we assume that the required columns are in the primary key column order.
  String prevOrderBy = queryBuilder.getOrderByClause();
  if (forceRowKeyOrder
      && (queryBuilder.getOrderByClause() == null || queryBuilder.getOrderByClause().isEmpty())) {
    queryBuilder.setOrderByClause(Joiner.on(", ").join(queryBuilder.getRequiredColumns()));
  }
  // Create a PhoenixRDD using the table name and the columns required by the query.
  // Since we don't set a predicate, filtering is done after rows are returned from Spark.
  Dataset<Row> phoenixDataSet = getSparkSession().read().format("phoenix")
    .option(DataSourceOptions.TABLE_KEY, queryBuilder.getFullTableName())
    .option(PhoenixDataSource.ZOOKEEPER_URL, url).load();
  phoenixDataSet.createOrReplaceTempView(queryBuilder.getFullTableName());
  Dataset<Row> dataset = sqlContext.sql(queryBuilder.build());
  SparkPlan plan = dataset.queryExecution().executedPlan();
  List<Row> rows = dataset.collectAsList();
  queryBuilder.setOrderByClause(prevOrderBy);
  ResultSet rs = new SparkResultSet(rows, dataset.columns());
  return rs;
}
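// Two design details of the helper above are worth noting. The forced ORDER BY
// simply joins getRequiredColumns(), which is only correct under the stated
// assumption that those columns appear in primary key order. And the original
// ORDER BY clause is restored after execution, so the caller's QueryBuilder is
// left unchanged and can be reused for further queries.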
@After
public void tearDown() throws IOException {
  // Clean up tables.
  if (hc != null) {
    hc.sql("DROP TABLE IF EXISTS window_table");
  }
}
@Test
public void deepSparkContextSQL() {
  deepSparkContext = createDeepSparkContext();
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  String query = "SELECT * FROM input";
  deepSparkContextSpy.sql(query);
  // The context should delegate the query string to its SQLContext untouched.
  verify(sqlContext).sql(query);
}
@Test
public void saveTableAndQueryIt() {
  Map<String, String> options = new HashMap<>();
  df.write()
    .format("org.apache.spark.sql.json")
    .mode(SaveMode.Append)
    .options(options)
    .saveAsTable("javaSavedTable");
  checkAnswer(
    sqlContext.sql("SELECT * FROM javaSavedTable"),
    df.collectAsList());
}
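// Equivalently, the saved table can be read back through the table() API rather
// than a SQL string; a minimal sketch, assuming the metastore registration done
// by saveAsTable above:
DataFrame saved = sqlContext.table("javaSavedTable");
checkAnswer(saved, df.collectAsList());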
@Test
public void testWindowFunction() {
  // rowsBetween(-1, 1) corresponds to "1 PRECEDING AND 1 FOLLOWING" in SQL,
  // so both sides of the check compute the same moving average.
  checkAnswer(
    df.select(avg("key").over(
      Window.partitionBy("value").orderBy("key").rowsBetween(-1, 1))),
    hc.sql("SELECT avg(key) " +
      "OVER (PARTITION BY value " +
      "      ORDER BY key " +
      "      ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING) " +
      "FROM window_table").collectAsList());
}
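// window_table is assumed to be registered by the test setup; a plausible
// @Before sketch, assuming a Spark 1.x HiveContext hc, a JavaSparkContext sc,
// and the key/value columns inferred from the query above:
@Before
public void setUp() throws IOException {
  List<String> jsonObjects = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    jsonObjects.add("{\"key\":" + i + ", \"value\":\"str" + i + "\"}");
  }
  df = hc.read().json(sc.parallelize(jsonObjects));
  df.registerTempTable("window_table");
}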