/**
 * Persists the given entities into the Hive table mapped by {@code m}.
 * <p>
 * The Java list is converted to a Scala Seq, parallelized into a single-partition
 * RDD, turned into a DataFrame using the entity class for the schema, and appended
 * to the table via {@code insertInto}.
 *
 * @param listEntity  entities to persist (raw type preserved from the interface)
 * @param m           metadata describing the entity class, schema and table name
 * @param sparkClient holder of the SparkContext and SQLContext used for the write
 * @return {@code true} on success
 * @throws KunderaException wrapping any failure, with the original cause preserved
 */
@Override
public boolean persist(List listEntity, EntityMetadata m, SparkClient sparkClient)
{
    try
    {
        // Bridge the Java collection into an immutable Scala Seq for parallelize().
        Seq s = scala.collection.JavaConversions.asScalaBuffer(listEntity).toList();
        ClassTag tag = scala.reflect.ClassTag$.MODULE$.apply(m.getEntityClazz());
        // Single partition is sufficient for a per-call entity batch.
        JavaRDD personRDD = sparkClient.sparkContext.parallelize(s, 1, tag).toJavaRDD();
        DataFrame df = sparkClient.sqlContext.createDataFrame(personRDD, m.getEntityClazz());
        sparkClient.sqlContext.sql("use " + m.getSchema());
        if (logger.isDebugEnabled())
        {
            // BUGFIX: the message was logged at INFO inside an isDebugEnabled() guard;
            // log at DEBUG so the guard and the emitted level agree.
            logger.debug("Below are the registered table with hive context: ");
            sparkClient.sqlContext.sql("show tables").show();
        }
        df.write().insertInto(m.getTableName());
        return true;
    }
    catch (Exception e)
    {
        throw new KunderaException("Cannot persist object(s)", e);
    }
}
// Aggregate: count rows for each distinct (Column_one, Column_two) pair, then print the result.
// NOTE(review): assumes `df` is an in-scope DataFrame containing both columns — confirm against caller.
DataFrame df2 = df.groupBy("Column_one", "Column_two").count(); df2.show();
// Build a one-row DataFrame with a single non-nullable string column "fizz"
// and display it; the single cell holds the value "buzz".
List<String> values = new ArrayList<String>();
values.add("buzz");

// Wrap each raw string in a Row so it can be paired with an explicit schema.
JavaRDD<Row> rowRDD = sparkContext.parallelize(values).map((String value) -> RowFactory.create(value));

StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("fizz", DataTypes.StringType, false)
});

DataFrame df = sqlContext.createDataFrame(rowRDD, schema).toDF();
df.show();
// Expected output:
// +----+
// |fizz|
// +----+
// |buzz|
// Print the `results` DataFrame to stdout (default: first 20 rows, long cells truncated).
// NOTE(review): `results` is defined outside this snippet — verify it is populated before this call.
results.show();
// Demo: build a DataFrame whose schema is (String, Map<String,String>) from an
// in-memory map of maps, print the first RDD element, then show the table.
SparkConf sf = new SparkConf().setAppName("name").setMaster("local[*]");
JavaSparkContext sc = new JavaSparkContext(sf);
SQLContext sqlCon = new SQLContext(sc);

// BUGFIX: the original inserted `putMap` into `map` before `putMap` was declared,
// which does not compile; declare and populate the inner map first.
HashMap<String, String> putMap = new HashMap<String, String>();
putMap.put("1", "test");

// Typed instead of the original raw `Map` — avoids unchecked access below.
Map<String, HashMap<String, String>> map = new HashMap<String, HashMap<String, String>>();
map.put("test1", putMap);

// Flatten the outer map into (key, innerMap) tuples so it can be parallelized.
List<Tuple2<String, HashMap>> list = new ArrayList<Tuple2<String, HashMap>>();
for (String key : map.keySet())
{
    list.add(new Tuple2<String, HashMap>(key, map.get(key)));
}

JavaRDD<Tuple2<String, HashMap>> rdd = sc.parallelize(list);
System.out.println(rdd.first());

// Schema: one string column and one map<string,string> column, both nullable.
List<StructField> fields = new ArrayList<>();
fields.add(DataTypes.createStructField("String", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("Map",
        DataTypes.createMapType(DataTypes.StringType, DataTypes.StringType), true));
StructType struct = DataTypes.createStructType(fields);

// Convert each tuple into a Row matching the schema above.
JavaRDD<Row> rowRDD = rdd.map(new Function<Tuple2<String, HashMap>, Row>()
{
    @Override
    public Row call(Tuple2<String, HashMap> tuple) throws Exception
    {
        return RowFactory.create(tuple._1, tuple._2);
    }
});

DataFrame df = sqlCon.createDataFrame(rowRDD, struct);
df.show();
public static void main( String[] args ) { // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("local[2]"); // SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://Kavithas-MBP.home:7077"); SparkConf conf = new SparkConf().setAppName("App-mt").setMaster("spark://kavithas-mbp.watson.ibm.com:7077"); JavaSparkContext sc = new JavaSparkContext(conf); HiveContext sqlContext = new HiveContext(sc.sc()); DataFrame urls = sqlContext.read().json("/tmp/urls.json"); urls.registerTempTable("urls"); DataFrame temp = sqlContext.sql("select * from urls"); temp.show(); sqlContext.sql("add jar /tmp/quetzal.jar"); sqlContext.sql("create temporary function webservice as 'com.ibm.research.rdf.store.utilities.WebServiceGetUDTF'"); DataFrame drugs = sqlContext.sql("select webservice(\"drug,id,action\", \"url\", \"\", \"GET\", \"xs=http://www.w3.org/2001/XMLSchema\", \"//row\",\"drug\",\"./drug\"," + " \"<string>\", \"id\", \"./id\",\"<string>\", \"action\", \"./action\", \"<string>\", url) as (drug, drug_typ, id, id_typ, action, action_typ) from urls"); drugs.show(); System.out.println("Num rows:" + drugs.count()); }
// Print `trainingData` with truncate=false so long cell values are shown in full.
// NOTE(review): `trainingData` is defined outside this snippet — confirm it is built before this call.
trainingData.show(false);
/** * Creates a n-gram data frame from text lines. * @param lines * @return a n-gram data frame. */ DataFrame createNGramDataFrame(JavaRDD<String> lines) { JavaRDD<Row> rows = lines.map(new Function<String, Row>(){ private static final long serialVersionUID = -4332903997027358601L; @Override public Row call(String line) throws Exception { return RowFactory.create(Arrays.asList(line.split("\\s+"))); } }); StructType schema = new StructType(new StructField[] { new StructField("words", DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) }); DataFrame wordDF = new SQLContext(jsc).createDataFrame(rows, schema); // build a bigram language model NGram transformer = new NGram().setInputCol("words") .setOutputCol("ngrams").setN(2); DataFrame ngramDF = transformer.transform(wordDF); ngramDF.show(10, false); return ngramDF; }