@SuppressWarnings("unchecked") @Test public void udf1Test() { spark.range(1, 10).toDF("value").createOrReplaceTempView("df"); spark.udf().registerJavaUDAF("myDoubleAvg", MyDoubleAvg.class.getName()); Row result = spark.sql("SELECT myDoubleAvg(value) as my_avg from df").head(); Assert.assertEquals(105.0, result.getDouble(0), 1.0e-6); }
@SuppressWarnings("unchecked") @Test public void udf1Test() { spark.range(1, 10).toDF("value").createOrReplaceTempView("df"); spark.udf().registerJavaUDAF("myDoubleAvg", MyDoubleAvg.class.getName()); Row result = spark.sql("SELECT myDoubleAvg(value) as my_avg from df").head(); Assert.assertEquals(105.0, result.getDouble(0), 1.0e-6); }
@SuppressWarnings("unchecked") @Test public void udf4Test() { spark.udf().register("inc", (Long i) -> i + 1, DataTypes.LongType); spark.range(10).toDF("x").createOrReplaceTempView("tmp"); // This tests when Java UDFs are required to be the semantically same (See SPARK-9435). List<Row> results = spark.sql("SELECT inc(x) FROM tmp GROUP BY inc(x)").collectAsList(); Assert.assertEquals(10, results.size()); long sum = 0; for (Row result : results) { sum += result.getLong(0); } Assert.assertEquals(55, sum); }
@SuppressWarnings("unchecked") @Test public void udf4Test() { spark.udf().register("inc", (Long i) -> i + 1, DataTypes.LongType); spark.range(10).toDF("x").createOrReplaceTempView("tmp"); // This tests when Java UDFs are required to be the semantically same (See SPARK-9435). List<Row> results = spark.sql("SELECT inc(x) FROM tmp GROUP BY inc(x)").collectAsList(); Assert.assertEquals(10, results.size()); long sum = 0; for (Row result : results) { sum += result.getLong(0); } Assert.assertEquals(55, sum); }
@SuppressWarnings("unchecked") @Test public void udf4Test() { spark.udf().register("inc", (Long i) -> i + 1, DataTypes.LongType); spark.range(10).toDF("x").createOrReplaceTempView("tmp"); // This tests when Java UDFs are required to be the semantically same (See SPARK-9435). List<Row> results = spark.sql("SELECT inc(x) FROM tmp GROUP BY inc(x)").collectAsList(); Assert.assertEquals(10, results.size()); long sum = 0; for (Row result : results) { sum += result.getLong(0); } Assert.assertEquals(55, sum); } }
@Test
public void testTupleEncoderSchema() {
  Encoder<Tuple2<String, Tuple2<String, String>>> encoder =
      Encoders.tuple(Encoders.STRING(), Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
  List<Tuple2<String, Tuple2<String, String>>> data =
      Arrays.asList(tuple2("1", tuple2("a", "b")), tuple2("2", tuple2("c", "d")));
  Dataset<Row> ds1 = spark.createDataset(data, encoder).toDF("value1", "value2");

  JavaPairRDD<String, Tuple2<String, String>> pairRDD = jsc.parallelizePairs(data);
  Dataset<Row> ds2 = spark.createDataset(JavaPairRDD.toRDD(pairRDD), encoder)
      .toDF("value1", "value2");

  Assert.assertEquals(ds1.schema(), ds2.schema());
  Assert.assertEquals(ds1.select(expr("value2._1")).collectAsList(),
      ds2.select(expr("value2._1")).collectAsList());
}
// Each toDF overload is wrapped so that user-trigger tracking is preserved
// around the underlying conversion.
@Override
public Dataset<Row> toDF() {
  final boolean userTriggered = initializeFunction();
  final Dataset<Row> result = from(super.toDF());
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> toDF(final String... colNames) {
  final boolean userTriggered = initializeFunction(colNames);
  final Dataset<Row> result = from(super.toDF(colNames));
  this.setIsUserTriggered(userTriggered);
  return result;
}

@Override
public Dataset<Row> toDF(final scala.collection.Seq<String> colNames) {
  final boolean userTriggered = initializeFunction(colNames);
  final Dataset<Row> result = from(super.toDF(colNames));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public void setUp() throws IOException {
  super.setUp();
  List<java.lang.Double> points = Arrays.asList(0.1, 1.1, 10.1, -1.1);
  dataset = spark.createDataset(points, Encoders.DOUBLE()).toDF("sample");
}
private void start() { SparkSession spark = SparkSession.builder() .appName("Array to Dataframe (Dataset<Row>)") .master("local") .getOrCreate(); String[] l = new String[] { "a", "b", "c", "d" }; List<String> data = Arrays.asList(l); Dataset<String> ds = spark.createDataset(data, Encoders.STRING()); Dataset<Row> df = ds.toDF(); df.show(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("JSON array to Dataset") .master("local").getOrCreate(); String filename = "data/array.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Turns the "one liner" into a real column df = df.select(explode(df.col("valsInArrays"))).toDF("vals"); df.show(); df.printSchema(); } }
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Build a DataFrame from Scratch")
      .master("local[*]")
      .getOrCreate();

  List<String> stringAsList = new ArrayList<>();
  stringAsList.add("bar");

  JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
      .map((String row) -> RowFactory.create(row));

  // Creates the schema: a single non-nullable string column
  StructType schema = DataTypes.createStructType(
      new StructField[] {
        DataTypes.createStructField("foe", DataTypes.StringType, false)
      });

  Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

  log.debug("** Schema: ");
  df.printSchema();
  log.debug("** Data: ");
  df.show();

  sparkContext.close();
}
}
@Override
public Dataset<Row> transform(Dataset<?> dataset) {
  StructType schema = dataset.schema();
  StructType structSchema = getStructSchema(schema);

  Column structColumn = dataset.apply(DatasetUtil.escapeColumnName(getStructCol()));

  // Promote every field of the configured struct column to a top-level column.
  Dataset<Row> result = dataset.toDF();
  for (StructField field : structSchema.fields()) {
    String name = field.name();
    Column fieldColumn = structColumn.getField(DatasetUtil.escapeColumnName(name));
    result = result.withColumn(DatasetUtil.escapeColumnName(name), fieldColumn);
  }
  return result;
}
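A minimal, self-contained sketch of the same flattening, assuming a DataFrame with a struct column named "s" built via functions.struct; the helper methods (getStructSchema, DatasetUtil escaping) from the original are omitted here for brevity:

// imports assumed: org.apache.spark.sql.{Dataset, Row},
// org.apache.spark.sql.types.{StructField, StructType},
// static org.apache.spark.sql.functions.*
Dataset<Row> input = spark.range(1)
    .select(struct(lit("a").as("x"), lit("b").as("y")).as("s"));

Dataset<Row> flat = input.toDF();
StructType structType = (StructType) input.schema().apply("s").dataType();
for (StructField field : structType.fields()) {
  flat = flat.withColumn(field.name(), col("s").getField(field.name()));
}
flat.show(); // columns: s, x, y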
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<Book> bookDs = df.map(new BookMapper(), Encoders.bean(Book.class)); bookDs.show(); bookDs.printSchema(); Dataset<Row> df2 = bookDs.toDF(); df2.show(); df2.printSchema(); } }
Dataset<Row> df = spark.table("testData"); df.toDF("key1", "value1");
Dataset<Row> df = spark.table("testData"); df.toDF("key1", "value1");
Dataset<Row> df = spark.table("testData"); df.toDF("key1", "value1");