SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQL");
JavaSparkContext ctx = new JavaSparkContext(sparkConf);
SQLContext sqlContext = new SQLContext(ctx);

// Apply a schema to an RDD of JavaBeans and register it as a table.
Dataset<Row> schemaPeople = sqlContext.createDataFrame(people, Person.class);
schemaPeople.registerTempTable("people");

// SQL can be run over RDDs that have been registered as tables.
Dataset<Row> teenagers = sqlContext.sql(
    "SELECT name FROM people WHERE country = 'USA' AND age >= 13 AND age <= 19");

// DataFrames can be saved as Parquet files, preserving the schema information.
schemaPeople.write().parquet("people.parquet");

// Read the Parquet file back in; the result is also a DataFrame.
Dataset<Row> parquetFile = sqlContext.read().parquet("people.parquet");
parquetFile.registerTempTable("parquetFile");
Dataset<Row> teenagers2 = sqlContext.sql(
    "SELECT name FROM parquetFile WHERE age >= 13 AND age <= 19");
List<String> teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() {
  @Override
  public String call(Row row) {
    return "Name: " + row.getString(0);
  }
}).collect();

// A JSON dataset is pointed to by path; the schema is inferred automatically.
Dataset<Row> peopleFromJsonFile = sqlContext.read().json(path);
peopleFromJsonFile.registerTempTable("people");
Dataset<Row> teenagers3 = sqlContext.sql(
    "SELECT name FROM people WHERE age >= 13 AND age <= 19");

// A DataFrame can also be created from an RDD of JSON strings.
JavaRDD<String> anotherPeopleRDD = ctx.parallelize(jsonData);
Dataset<Row> peopleFromJsonRDD = sqlContext.read().json(anotherPeopleRDD.rdd());
peopleFromJsonRDD.registerTempTable("people2");
Dataset<Row> peopleWithCity = sqlContext.sql("SELECT name, address.city FROM people2");
List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() {
  @Override
  public String call(Row row) {
    return "Name: " + row.getString(0) + ", City: " + row.getString(1);
  }
}).collect();
public static void main(String[] args) {
  String inputFile = args[0];
  SparkConf conf = new SparkConf();
  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlCtx = new SQLContext(sc);

  // Load tweets stored as JSON and register them as a table.
  DataFrame input = sqlCtx.jsonFile(inputFile);
  input.printSchema();
  input.registerTempTable("tweets");

  // Query the most-retweeted tweets.
  DataFrame topTweets = sqlCtx.sql(
      "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
  Row[] result = topTweets.collect();
  for (Row row : result) {
    System.out.println(row.get(0));
  }

  // Create a DataFrame from a list of JavaBeans and register it as a table.
  // peopleList is a List<HappyPerson> assembled earlier (omitted from this excerpt).
  JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList);
  DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class);
  happyPeopleSchemaRDD.registerTempTable("happy_people");

  // Register a simple UDF and use it from SQL.
  sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {
    @Override
    public Integer call(String str) throws Exception {
      return str.length();
    }
  }, DataTypes.IntegerType);
  DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava('text') FROM tweets LIMIT 10");
  Row[] lengths = tweetLength.collect();
  for (Row row : lengths) {
    System.out.println(row.get(0));
  }

  sc.stop();
}
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage LoadJsonWithSparkSQL sparkMaster jsonFile");
  }
  String master = args[0];
  String jsonFile = args[1];
  JavaSparkContext sc = new JavaSparkContext(master, "loadJsonwithsparksql");
  SQLContext sqlCtx = new SQLContext(sc);
  DataFrame input = sqlCtx.jsonFile(jsonFile);
  input.printSchema();
}
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage LoadHive sparkMaster tbl");
  }
  String master = args[0];
  String tbl = args[1];
  JavaSparkContext sc = new JavaSparkContext(
      master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  // Hive tables are only visible through a HiveContext, not a plain SQLContext.
  HiveContext sqlCtx = new HiveContext(sc);
  DataFrame rdd = sqlCtx.sql("SELECT key, value FROM src");
  JavaRDD<Integer> squaredKeys = rdd.toJavaRDD().map(new SquareKey());
  List<Integer> result = squaredKeys.collect();
  for (Integer elem : result) {
    System.out.println(elem);
  }
}
public static void main(String[] args) throws Exception {
  String arffFile = "datasets/simulated/syntheticData.arff";
  String sparkFile = "datasets/simulated/syntheticData.json";
  SparkConf conf = new SparkConf().setAppName("SparkLink!").setMaster("local");
  SparkContext sc = new SparkContext(conf);
  SQLContext sqlContext = new SQLContext(sc);
  JavaSparkContext jsc = new JavaSparkContext(sc);
  ARFFtoSparkFormat(arffFile, sparkFile, "json", sqlContext, jsc);
}
public void compact(String inputPath, String outputPath) throws IOException {
  this.setCompressionAndSerializationOptions(inputPath, outputPath);
  this.outputCompressionProperties(this.outputCompression);

  // Defining Spark Context with a generic Spark Configuration.
  SparkConf sparkConf = new SparkConf().setAppName("Spark Compaction");
  JavaSparkContext sc = new JavaSparkContext(sparkConf);

  if (this.outputSerialization.equals(TEXT)) {
    JavaRDD<String> textFile = sc.textFile(this.concatInputPath(inputPath));
    textFile.coalesce(this.splitSize).saveAsTextFile(outputPath);
  } else if (this.outputSerialization.equals(PARQUET)) {
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame parquetFile = sqlContext.read().parquet(this.concatInputPath(inputPath));
    parquetFile.coalesce(this.splitSize).write().parquet(outputPath);
  } else if (this.outputSerialization.equals(AVRO)) {
    // For this to work the files must end in .avro.
    // Another issue: when compression is used, the compression codec extension
    // is not added to the output file name.
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame avroFile = sqlContext.read().format("com.databricks.spark.avro")
        .load(this.concatInputPath(inputPath));
    avroFile.coalesce(this.splitSize).write().format("com.databricks.spark.avro").save(outputPath);
  } else {
    System.out.println("Did not match any serialization type: text, parquet, or avro. Received: "
        + this.outputSerialization);
  }
}
@Override
public ExperimentResults runCrossValidation(DataMap data, int folds, long randSeed) {
  JavaSparkContext context = new JavaSparkContext(
      new SparkConf().setAppName("JStylo Spark Classifier").setMaster(sparkmaster));
  SQLContext sql = new SQLContext(context);
  Map<String, Double> labels = SparkUtils.getLabelMap(data);
  documentTitles = null;
  // df (the feature DataFrame) and splitArray (the per-fold split weights) are
  // built from `data` earlier in the original source; their construction is
  // omitted from this excerpt.
  DataFrame[] splits = df.randomSplit(splitArray, randSeed);
  for (int i = 0; i < folds; i++) {
    DataFrame test = splits[i];
    DataFrame train = df.except(test);
    ExperimentResults foldIResults = classify(sql, labels, train, test);
    // The per-fold results are merged into the overall result `er` here
    // (aggregation code omitted from this excerpt).
  }
  context.close();
  return er;
}
/**
 * Creates a data frame from an RDD of writable records, given a schema.
 *
 * @param schema the schema to use
 * @param data   the data to convert
 * @return the dataframe object
 */
public static DataRowsFacade toDataFrame(Schema schema, JavaRDD<List<Writable>> data) {
  JavaSparkContext sc = new JavaSparkContext(data.context());
  SQLContext sqlContext = new SQLContext(sc);
  JavaRDD<Row> rows = data.map(new ToRow(schema));
  return dataRows(sqlContext.createDataFrame(rows, fromSchema(schema)));
}
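Since only the conversion helper is shown, here is a minimal usage sketch. It assumes the DataVec Schema and Writable types (org.datavec.api.*); the column names, sample values, and local SparkContext setup are illustrative, and the helper is assumed to be callable from this scope.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

// Hypothetical usage of toDataFrame(...): column names and values are illustrative.
JavaSparkContext sc = new JavaSparkContext("local[*]", "toDataFrame-sketch");

// A two-column schema: one string column and one integer column.
Schema schema = new Schema.Builder()
    .addColumnString("name")
    .addColumnInteger("age")
    .build();

// Each record is a List<Writable> whose order matches the schema's columns.
List<List<Writable>> records = Arrays.asList(
    Arrays.<Writable>asList(new Text("alice"), new IntWritable(30)),
    Arrays.<Writable>asList(new Text("bob"), new IntWritable(25)));
JavaRDD<List<Writable>> rdd = sc.parallelize(records);

// Convert to a DataFrame-backed facade using the helper shown above.
DataRowsFacade facade = toDataFrame(schema, rdd);

sc.stop();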
@Before
public void setUp() throws IOException {
  sqlContext = TestHive$.MODULE$;
  sc = new JavaSparkContext(sqlContext.sparkContext());

  path = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile();
  if (path.exists()) {
    path.delete();
  }

  HiveSessionCatalog catalog = (HiveSessionCatalog) sqlContext.sessionState().catalog();
  hiveManagedPath = new Path(catalog.defaultTablePath(new TableIdentifier("javaSavedTable")));
  fs = hiveManagedPath.getFileSystem(sc.hadoopConfiguration());
  fs.delete(hiveManagedPath, true);

  List<String> jsonObjects = new ArrayList<>(10);
  for (int i = 0; i < 10; i++) {
    jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}");
  }
  Dataset<String> ds = sqlContext.createDataset(jsonObjects, Encoders.STRING());
  df = sqlContext.read().json(ds);
  df.createOrReplaceTempView("jsonTable");
}
Dataset phoenixDataSet = SparkUtil.getSparkSession().read().format("phoenix")
    .option(DataSourceOptions.TABLE_KEY, tableName)
    .option(PhoenixDataSource.ZOOKEEPER_URL, getUrl())
    .load();
phoenixDataSet.createOrReplaceTempView(tableName);

Dataset<Row> dataset = sqlContext.sql(
    "SELECT col1+col2, col4, a_string FROM " + tableName + " ORDER BY col1+col2, col4");
List<Row> rows = dataset.collectAsList();
ResultSet rs = new SparkResultSet(rows, dataset.columns());
assertTrue(rs.next());
assertEquals("a", rs.getString(3));
@Before
public void init() throws IOException {
  // Initialize a local spark env
  jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));

  // Create a temp folder as the base path
  TemporaryFolder folder = new TemporaryFolder();
  folder.create();
  basePath = folder.getRoot().getAbsolutePath();
  jsc.hadoopConfiguration().addResource(dfs.getConf());
  dfs.mkdirs(new Path(basePath));
  HoodieTestUtils.initTableType(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);

  // SQLContext used to read data back in the tests
  sqlContext = new SQLContext(jsc);
}
SparkConf sf = new SparkConf().setAppName("name").setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(sf);
SQLContext sqlCon = new SQLContext(sc);

// Build the inner map first, then put it into the outer map
// (the original inserted the entry before the inner map was declared).
HashMap<String, String> putMap = new HashMap<String, String>();
putMap.put("1", "test");
Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
map.put("test1", putMap);

List<Tuple2<String, HashMap>> list = new ArrayList<Tuple2<String, HashMap>>();
Set<String> allKeys = map.keySet();
for (String key : allKeys) {
  list.add(new Tuple2<String, HashMap>(key, (HashMap) map.get(key)));
}
JavaRDD<Tuple2<String, HashMap>> rdd = sc.parallelize(list);
System.out.println(rdd.first());

List<StructField> fields = new ArrayList<>();
StructField field1 = DataTypes.createStructField("String", DataTypes.StringType, true);
StructField field2 = DataTypes.createStructField("Map",
    DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), true);
fields.add(field1);
fields.add(field2);
StructType struct = DataTypes.createStructType(fields);

JavaRDD<Row> rowRDD = rdd.map(new Function<Tuple2<String, HashMap>, Row>() {
  @Override
  public Row call(Tuple2<String, HashMap> arg0) throws Exception {
    return RowFactory.create(arg0._1, arg0._2);
  }
});
DataFrame df = sqlCon.createDataFrame(rowRDD, struct);
df.show();
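As a small follow-up, and assuming the df built in the snippet above, the map column can be read back per key with Column.getItem; the key used here is just the one inserted above.

// Hedged follow-up sketch: select the string column plus the value stored
// under key "1" in the map column of the df created above.
df.select(df.col("String"), df.col("Map").getItem("1")).show();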
import org.apache.spark.api.java.*;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import java.util.*;
import org.apache.spark.sql.DataFrame;
import static org.apache.spark.sql.functions.*;

public class App {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf());
    SQLContext sqlContext = new HiveContext(sc);
    List<String> data = Arrays.asList(
        "{\"id\": 1, \"vs\": [\"a\", \"b\"]}",
        "{\"id\": 1, \"vs\": [\"c\", \"d\"]}",
        "{\"id\": 2, \"vs\": [\"e\", \"f\"]}",
        "{\"id\": 2, \"vs\": [\"g\", \"h\"]}"
    );
    DataFrame df = sqlContext.read().json(sc.parallelize(data));
    // Explode the array column, then collect the flattened values back per id.
    df.withColumn("vs", explode(col("vs")))
        .groupBy(col("id"))
        .agg(collect_list(col("vs")))
        .show();
  }
}
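For comparison, the same flatten-and-regroup can be expressed in SQL through a temp table. This is a hedged sketch that assumes the df and the HiveContext-backed sqlContext from the example above; the temp-table name is arbitrary.

// Hedged SQL sketch: LATERAL VIEW explode flattens the array, collect_list regroups it per id.
df.registerTempTable("records");
sqlContext.sql(
    "SELECT id, collect_list(v) AS vs "
    + "FROM records LATERAL VIEW explode(vs) t AS v "
    + "GROUP BY id"
).show();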
@Override
public RDD<Row> buildScan() {
  log.debug("-> buildScan()");

  // I have isolated the work to a method to keep the plumbing code as simple
  // as possible.
  List<List<Integer>> table = collectData();

  @SuppressWarnings("resource") // cannot be closed here, done elsewhere
  JavaSparkContext sparkContext = new JavaSparkContext(sqlContext.sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(table)
      .map(row -> RowFactory.create(row.toArray()));

  return rowRDD.rdd();
}
import org.apache.spark.api.java.*;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SQLContext;
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.DataFrame;

public class App {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setMaster("local");
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(sc);
    DataFrame df = sqlContext.sql(
        "SELECT CAST('2012-01-01' AS DATE), CAST('2013-08-02' AS DATE)").toDF("first", "second");
    // datediff(end, start) returns the number of days between the two dates
    // (negative here, since `first` precedes `second`).
    df.select(datediff(df.col("first"), df.col("second"))).show();
  }
}
public class TestClass {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[*]", "test spark-mongodb java");
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);

    Map<String, String> options = new HashMap<String, String>();
    options.put("host", "localhost:27017");
    options.put("database", "test");
    options.put("collection", "mycol");

    DataFrame df = sqlContext.read().format("com.stratio.datasource.mongodb").options(options).load();
    df.registerTempTable("mycol");
    sqlContext.sql("SELECT * FROM mycol");
    df.show();
  }
}
/**
 * Converts a dependency graph to a data frame.
 *
 * @param jsc a Java Spark context
 * @param graph a dependency graph
 * @param featureFrame a feature frame
 * @return a data frame
 */
public DataFrame toDataFrame(JavaSparkContext jsc, DependencyGraph graph, FeatureFrame featureFrame) {
  List<ParsingContext> list = TransitionDecoder.decode(graph, featureFrame);
  JavaRDD<ParsingContext> javaRDD = jsc.parallelize(list);
  return sqlContext.createDataFrame(javaRDD, ParsingContext.class);
}
@Test
public void saveTableAndQueryIt() {
  checkAnswer(
      df.select(avg("key").over(
          Window.partitionBy("value").orderBy("key").rowsBetween(-1, 1))),
      hc.sql("SELECT avg(key) "
          + "OVER (PARTITION BY value "
          + "      ORDER BY key "
          + "      ROWS BETWEEN 1 preceding and 1 following) "
          + "FROM window_table").collectAsList());
}
@Test
public void saveTableAndQueryIt() {
  Map<String, String> options = new HashMap<>();
  df.write()
      .format("org.apache.spark.sql.json")
      .mode(SaveMode.Append)
      .options(options)
      .saveAsTable("javaSavedTable");
  checkAnswer(
      sqlContext.sql("SELECT * FROM javaSavedTable"),
      df.collectAsList());
}
/**
 * Given a bunch of hoodie keys, fetches all the individual records out as a data frame.
 *
 * @return a dataframe
 */
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism) throws Exception {
  assertSqlContext();
  JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD =
      index.fetchRecordLocation(hoodieKeys, jsc, hoodieTable);
  List<String> paths = keyToFileRDD.filter(keyFileTuple -> keyFileTuple._2().isPresent())
      .map(keyFileTuple -> keyFileTuple._2().get()).collect();

  // Record locations might be the same for multiple keys, so we need a unique list of paths.
  Set<String> uniquePaths = new HashSet<>(paths);
  Dataset<Row> originalDF = sqlContextOpt.get().read()
      .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
  StructType schema = originalDF.schema();
  JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD().mapToPair(row -> {
    HoodieKey key = new HoodieKey(row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
        row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
    return new Tuple2<>(key, row);
  });

  // Now, we need to further filter out, for only rows that match the supplied hoodie keys.
  JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism).map(tuple -> tuple._2()._1());
  return sqlContextOpt.get().createDataFrame(rowRDD, schema);
}
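To show how the lookup above would be called, here is a brief hedged sketch; `readClient` stands for an already-constructed instance of the class that defines read(...), and the record keys, partition paths, and parallelism value are purely illustrative.

// Hypothetical call site: look up two known keys and display the matching rows.
List<HoodieKey> keys = Arrays.asList(
    new HoodieKey("uuid-0001", "2019/01/01"),
    new HoodieKey("uuid-0002", "2019/01/02"));
JavaRDD<HoodieKey> hoodieKeys = jsc.parallelize(keys);
Dataset<Row> matched = readClient.read(hoodieKeys, 2);
matched.show();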