private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
    SparkSession sparkSession = SparkSession.builder()
            .config(sc.getConf())
            .enableHiveSupport()
            .getOrCreate();
    final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
    return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
        @Override
        public String[] call(Row row) throws Exception {
            // Render every column as a String; keep nulls as nulls.
            String[] result = new String[row.size()];
            for (int i = 0; i < row.size(); i++) {
                final Object o = row.get(i);
                result[i] = (o != null) ? o.toString() : null;
            }
            return result;
        }
    });
}
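// A minimal usage sketch for getOtherFormatHiveInput (the table name
// "default.sample_table" and this helper are hypothetical placeholders, not
// part of the original code): each RDD element is one Hive row with every
// column rendered as a String, so a small sample can be dumped with take().
private static void printHiveTableSample(JavaSparkContext sc) {
    JavaRDD<String[]> rows = getOtherFormatHiveInput(sc, "default.sample_table");
    for (String[] row : rows.take(10)) {
        System.out.println(String.join("\t", row));
    }
}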
@Override
public String getSparkUiWebUrl() {
    // uiWebUrl() returns an Option; get() assumes the Spark UI is running.
    return getOrCreate().sparkContext().uiWebUrl().get();
}
@Override
public TryResult configure(KernelFunctionality kernel, SparkUIApi sparkUI, Message parentMessage) {
    SparkConf sparkConf = createSparkConf(sparkUI.getAdvancedOptions(), getSparkConfBasedOn(this.sparkSessionBuilder));
    sparkConf = configureSparkConf(sparkConf, sparkUI);
    this.sparkSessionBuilder = SparkSession.builder().config(sparkConf);
    if (sparkUI.getHiveSupport()) {
        this.sparkSessionBuilder.enableHiveSupport();
    }
    TryResult sparkSessionTry = createSparkSession(sparkUI, parentMessage);
    if (sparkSessionTry.isError()) {
        return sparkSessionTry;
    }
    addListener(getOrCreate().sparkContext(), sparkUI);
    SparkVariable.putSparkSession(getOrCreate());
    TryResult tryResultSparkContext = initSparkContextInShell(kernel, parentMessage);
    if (!tryResultSparkContext.isError()) {
        kernel.registerCancelHook(SparkVariable::cancelAllJobs);
    }
    return tryResultSparkContext;
}
.builder() .appName("SparkSQLRelativeFrequency") .config(sparkConf) .getOrCreate(); JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); int neighborWindow = Integer.parseInt(args[0]); String input = args[1]; String output = args[2]; final Broadcast<Integer> brodcastWindow = sc.broadcast(neighborWindow); JavaRDD<String> rawData = sc.textFile(input); Dataset<Row> rfDataset = spark.createDataFrame(rowRDD, rfSchema); rfDataset.createOrReplaceTempView("rfTable"); Dataset<Row> sqlResult = spark.sql(query); sqlResult.show(); // print first 20 records on the console sqlResult.write().parquet(output + "/parquetFormat"); // saves output in compressed Parquet format, recommended for large projects. sqlResult.rdd().saveAsTextFile(output + "/textFormat"); // to see output via cat command sc.close(); spark.stop();
@Test
public void saveAndLoad() {
    Map<String, String> options = new HashMap<>();
    options.put("path", path.toString());
    df.write().mode(SaveMode.ErrorIfExists).format("json").options(options).save();
    Dataset<Row> loadedDF = spark.read().format("json").options(options).load();
    checkAnswer(loadedDF, df.collectAsList());
}
@Test
public void dataFrameRDDOperations() {
    List<Person> personList = new ArrayList<>(2);
    Person person1 = new Person();
    person1.setName("Michael");
    person1.setAge(29);
    personList.add(person1);
    Person person2 = new Person();
    person2.setName("Yin");
    person2.setAge(28);
    personList.add(person2);

    JavaRDD<Row> rowRDD = jsc.parallelize(personList).map(
        person -> RowFactory.create(person.getName(), person.getAge()));

    List<StructField> fields = new ArrayList<>(2);
    // The empty field name is legal; this test only reads columns by position.
    fields.add(DataTypes.createStructField("", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
    StructType schema = DataTypes.createStructType(fields);

    Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
    df.createOrReplaceTempView("people");
    List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
        .map(row -> row.getString(0) + "_" + row.get(1)).collect();

    List<String> expected = new ArrayList<>(2);
    expected.add("Michael_29");
    expected.add("Yin_28");
    Assert.assertEquals(expected, actual);
}
@Before
public void setUp() {
    spark = SparkSession.builder()
        .master("local[*]")
        .appName("testing")
        .getOrCreate();
    jsc = new JavaSparkContext(spark.sparkContext());
}
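// A matching cleanup hook for the setUp above (a sketch, assuming the JUnit 4
// lifecycle already implied by @Before/@Test):
@After
public void tearDown() {
    if (spark != null) {
        spark.stop();  // also stops the SparkContext wrapped by jsc
        spark = null;
    }
    jsc = null;
}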
@Test
public void applySchema() {
    List<Person> personList = new ArrayList<>(2);
    Person person1 = new Person();
    person1.setName("Michael");
    person1.setAge(29);
    personList.add(person1);
    Person person2 = new Person();
    person2.setName("Yin");
    person2.setAge(28);
    personList.add(person2);

    JavaRDD<Row> rowRDD = jsc.parallelize(personList).map(
        person -> RowFactory.create(person.getName(), person.getAge()));

    List<StructField> fields = new ArrayList<>(2);
    fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
    fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
    StructType schema = DataTypes.createStructType(fields);

    Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
    df.createOrReplaceTempView("people");
    List<Row> actual = spark.sql("SELECT * FROM people").collectAsList();

    List<Row> expected = new ArrayList<>(2);
    expected.add(RowFactory.create("Michael", 29));
    expected.add(RowFactory.create("Yin", 28));
    Assert.assertEquals(expected, actual);
}
@Test
public void applySchemaToJSON() {
    Dataset<String> jsonDS = spark.createDataset(Arrays.asList(
        "{\"string\":\"this is a simple string.\", \"integer\":10, \"long\":21474836470, "
            + "\"bigInteger\":92233720368547758070, \"double\":1.7976931348623157E308, "
            + "\"boolean\":true, \"null\":null}",
        "{\"string\":\"this is another simple string.\", \"integer\":11, \"long\":21474836469, "
            + "\"bigInteger\":92233720368547758069, \"double\":1.7976931348623157E305, "
            + "\"boolean\":false, \"null\":null}"),
        Encoders.STRING());

    // The JSON reader infers fields alphabetically; integers too large for a
    // long are inferred as decimal(20, 0).
    List<StructField> fields = new ArrayList<>(7);
    fields.add(DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(20, 0), true));
    fields.add(DataTypes.createStructField("boolean", DataTypes.BooleanType, true));
    fields.add(DataTypes.createStructField("double", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("integer", DataTypes.LongType, true));
    fields.add(DataTypes.createStructField("long", DataTypes.LongType, true));
    fields.add(DataTypes.createStructField("null", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("string", DataTypes.StringType, true));
    StructType expectedSchema = DataTypes.createStructType(fields);

    List<Row> expectedResult = new ArrayList<>(2);
    expectedResult.add(RowFactory.create(
        new BigDecimal("92233720368547758070"), true, 1.7976931348623157E308,
        10L, 21474836470L, null, "this is a simple string."));
    expectedResult.add(RowFactory.create(
        new BigDecimal("92233720368547758069"), false, 1.7976931348623157E305,
        11L, 21474836469L, null, "this is another simple string."));

    Dataset<Row> df1 = spark.read().json(jsonDS);
    StructType actualSchema1 = df1.schema();
    Assert.assertEquals(expectedSchema, actualSchema1);
    df1.createOrReplaceTempView("jsonTable1");
    List<Row> actual1 = spark.sql("select * from jsonTable1").collectAsList();
    Assert.assertEquals(expectedResult, actual1);

    Dataset<Row> df2 = spark.read().schema(expectedSchema).json(jsonDS);
    StructType actualSchema2 = df2.schema();
    Assert.assertEquals(expectedSchema, actualSchema2);
    df2.createOrReplaceTempView("jsonTable2");
    List<Row> actual2 = spark.sql("select * from jsonTable2").collectAsList();
    Assert.assertEquals(expectedResult, actual2);
}
@Test
public void javaCompatibilityTest() {
    double[] input = new double[]{1D, 2D, 3D, 4D};
    Dataset<Row> dataset = spark.createDataFrame(
        Arrays.asList(RowFactory.create(Vectors.dense(input))),
        new StructType(new StructField[]{
            new StructField("vec", new VectorUDT(), false, Metadata.empty())
        }));

    double[] expectedResult = input.clone();
    new DoubleDCT_1D(input.length).forward(expectedResult, true);

    DCT dct = new DCT()
        .setInputCol("vec")
        .setOutputCol("resultVec");

    List<Row> result = dct.transform(dataset).select("resultVec").collectAsList();
    Vector resultVec = result.get(0).getAs("resultVec");
    Assert.assertArrayEquals(expectedResult, resultVec.toArray(), 1e-6);
}
.builder() .appName("knn") .getOrCreate(); JavaSparkContext context = JavaSparkContext.fromSparkContext(session.sparkContext()); final Broadcast<Integer> broadcastK = context.broadcast(k); final Broadcast<Integer> broadcastD = context.broadcast(d); JavaRDD<String> R = session.read().textFile(datasetR).javaRDD(); R.saveAsTextFile(outputPath+"/R"); JavaRDD<String> S = session.read().textFile(datasetS).javaRDD(); S.saveAsTextFile(outputPath+"/S"); session.stop(); System.exit(0);
@Test
public void bucketizerTest() {
    double[] splits = {-0.5, 0.0, 0.5};
    StructType schema = new StructType(new StructField[]{
        new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
    });
    Dataset<Row> dataset = spark.createDataFrame(
        Arrays.asList(
            RowFactory.create(-0.5),
            RowFactory.create(-0.3),
            RowFactory.create(0.0),
            RowFactory.create(0.2)),
        schema);

    Bucketizer bucketizer = new Bucketizer()
        .setInputCol("feature")
        .setOutputCol("result")
        .setSplits(splits);

    List<Row> result = bucketizer.transform(dataset).select("result").collectAsList();
    for (Row r : result) {
        // Three split points define two buckets, so every index falls in [0, 1].
        double index = r.getDouble(0);
        Assert.assertTrue((index >= 0) && (index <= 1));
    }
}
@Test
public void testNaiveBayes() {
    List<Row> data = Arrays.asList(
        RowFactory.create(0.0, Vectors.dense(1.0, 0.0, 0.0)),
        RowFactory.create(0.0, Vectors.dense(2.0, 0.0, 0.0)),
        RowFactory.create(1.0, Vectors.dense(0.0, 1.0, 0.0)),
        RowFactory.create(1.0, Vectors.dense(0.0, 2.0, 0.0)),
        RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 1.0)),
        RowFactory.create(2.0, Vectors.dense(0.0, 0.0, 2.0)));
    StructType schema = new StructType(new StructField[]{
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), false, Metadata.empty())
    });
    Dataset<Row> dataset = spark.createDataFrame(data, schema);

    NaiveBayes nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial");
    NaiveBayesModel model = nb.fit(dataset);

    Dataset<Row> predictionAndLabels = model.transform(dataset).select("prediction", "label");
    validatePrediction(predictionAndLabels);
}
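// validatePrediction is referenced above but not shown. A minimal sketch,
// assuming it simply checks that every predicted label matches the true label
// (the exact assertion logic is an assumption):
private static void validatePrediction(Dataset<Row> predictionAndLabels) {
    for (Row r : predictionAndLabels.collectAsList()) {
        double prediction = r.getDouble(0);
        double label = r.getDouble(1);
        Assert.assertEquals(label, prediction, 1e-8);
    }
}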
@SuppressWarnings("unchecked") @Test public void udf3Test() { spark.udf().registerJava("stringLengthTest", StringLengthTest.class.getName(), DataTypes.IntegerType); Row result = spark.sql("SELECT stringLengthTest('test', 'test2')").head(); Assert.assertEquals(9, result.getInt(0)); // returnType is not provided spark.udf().registerJava("stringLengthTest2", StringLengthTest.class.getName(), null); result = spark.sql("SELECT stringLengthTest('test', 'test2')").head(); Assert.assertEquals(9, result.getInt(0)); }
@Override
public void setUp() throws IOException {
    super.setUp();
    List<LabeledPoint> points = generateLogisticInputAsList(1.0, 1.0, 100, 42);
    datasetRDD = jsc.parallelize(points, 2);
    dataset = spark.createDataFrame(datasetRDD, LabeledPoint.class);
    dataset.createOrReplaceTempView("dataset");
}
@Before
public void setUp() throws IOException {
    spark = SparkSession.builder()
        .master("local[*]")
        .appName("testing")
        .getOrCreate();
    path = Utils.createTempDir(System.getProperty("java.io.tmpdir"), "datasource").getCanonicalFile();
    if (path.exists()) {
        path.delete();
    }
    List<String> jsonObjects = new ArrayList<>(10);
    for (int i = 0; i < 10; i++) {
        jsonObjects.add("{\"a\":" + i + ", \"b\":\"str" + i + "\"}");
    }
    Dataset<String> ds = spark.createDataset(jsonObjects, Encoders.STRING());
    df = spark.read().json(ds);
    df.createOrReplaceTempView("jsonTable");
}
@Test
public void javaCompatibilityTest() {
    StopWordsRemover remover = new StopWordsRemover()
        .setInputCol("raw")
        .setOutputCol("filtered");

    List<Row> data = Arrays.asList(
        RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
        RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
    );
    StructType schema = new StructType(new StructField[]{
        new StructField("raw", DataTypes.createArrayType(DataTypes.StringType), false,
            Metadata.empty())
    });
    Dataset<Row> dataset = spark.createDataFrame(data, schema);
    remover.transform(dataset).collect();
}
@Override
public void setUp() throws IOException {
    super.setUp();
    List<LabeledPoint> points = generateLogisticInputAsList(1.0, 1.0, 100, 42);
    dataset = spark.createDataFrame(jsc.parallelize(points, 2), LabeledPoint.class);
}
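// generateLogisticInputAsList (used by both setUp methods above) comes from
// Spark's own test utilities. A rough sketch of what it produces: labeled
// points whose binary label is drawn from a logistic model with the given
// offset and scale. This is a simplification for illustration, not the exact
// upstream generator:
private static List<LabeledPoint> generateLogisticInputAsList(
        double offset, double scale, int nPoints, int seed) {
    Random rng = new Random(seed);
    List<LabeledPoint> points = new ArrayList<>(nPoints);
    for (int i = 0; i < nPoints; i++) {
        double x = rng.nextGaussian();
        double p = 1.0 / (1.0 + Math.exp(-(offset + scale * x)));
        double label = rng.nextDouble() < p ? 1.0 : 0.0;
        points.add(new LabeledPoint(label, Vectors.dense(x)));
    }
    return points;
}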
@Test
public void testConvertMatrixColumnsToAndFromML() {
    Matrix x = Matrices.dense(2, 1, new double[]{1.0, 2.0});
    StructType schema = new StructType(new StructField[]{
        new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new MatrixUDT(), false, Metadata.empty())
    });
    Dataset<Row> dataset = spark.createDataFrame(
        Arrays.asList(RowFactory.create(1.0, x)), schema);

    Dataset<Row> newDataset1 = MLUtils.convertMatrixColumnsToML(dataset);
    Row new1 = newDataset1.first();
    Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);

    Row new2 = MLUtils.convertMatrixColumnsToML(dataset, "features").first();
    Assert.assertEquals(new1, new2);

    Row old1 = MLUtils.convertMatrixColumnsFromML(newDataset1).first();
    Assert.assertEquals(RowFactory.create(1.0, x), old1);
}
@SuppressWarnings("unchecked") @Test public void udf1Test() { spark.range(1, 10).toDF("value").createOrReplaceTempView("df"); spark.udf().registerJavaUDAF("myDoubleAvg", MyDoubleAvg.class.getName()); Row result = spark.sql("SELECT myDoubleAvg(value) as my_avg from df").head(); Assert.assertEquals(105.0, result.getDouble(0), 1.0e-6); }