// Inspect the schema before the transformation.
df.printSchema();
// Build the "features" column from the raw "valuefeatures" column via the
// "vectorBuilder" UDF (presumably registered earlier — registration is not
// visible in this chunk; TODO confirm).
df = df.withColumn("features", callUDF("vectorBuilder", df.col( "valuefeatures")));
// Confirm the new "features" column now appears in the schema.
df.printSchema();
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) { File location = new File(parent, desc); Table byId = TABLES.create(SCHEMA, spec, location.toString()); // do not combine splits because the tests expect a split per partition byId.updateProperties().set("read.split.target-size", "1").commit(); // copy the unpartitioned table into the partitioned table to produce the partitioned data Dataset<Row> allRows = spark.read() .format("iceberg") .load(unpartitioned.toString()); allRows .coalesce(1) // ensure only 1 file per partition is written .withColumn("part", callUDF(udf, column(partitionColumn))) .sortWithinPartitions("part") .drop("part") .write() .format("iceberg") .mode("append") .save(byId.location()); return location; }
/**
 * Loads a two-column CSV file, renames its inferred columns to "label" and "value",
 * and derives an "x2" column by running "value" through the registered
 * "x2Multiplier" UDF (backed by the Multiplier2 class).
 */
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();

  // Expose the Multiplier2 UDF to Spark SQL under the name "x2Multiplier".
  spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "false")
      .load(filename);

  // Give the auto-named _c0/_c1 columns meaningful names.
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");

  // Apply the UDF; the cast matches the UDF's declared Integer input type.
  df = df.withColumn("x2",
      callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));

  df.show();
}
}
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); // registers a new internal UDF spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() { private static final long serialVersionUID = -5372447039252716846L; @Override public Integer call(Integer x) { return x * 2; } }, DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }
@Test public void testUDAF() { Dataset<Row> df = hc.range(0, 100).union(hc.range(0, 100)).select(col("id").as("value")); UserDefinedAggregateFunction udaf = new MyDoubleSum(); UserDefinedAggregateFunction registeredUDAF = hc.udf().register("mydoublesum", udaf); // Create Columns for the UDAF. For now, callUDF does not take an argument to specific if // we want to use distinct aggregation. Dataset<Row> aggregatedDF = df.groupBy() .agg( udaf.distinct(col("value")), udaf.apply(col("value")), registeredUDAF.apply(col("value")), callUDF("mydoublesum", col("value"))); List<Row> expectedResult = new ArrayList<>(); expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0)); checkAnswer( aggregatedDF, expectedResult); } }
@Test public void testUDAF() { Dataset<Row> df = hc.range(0, 100).union(hc.range(0, 100)).select(col("id").as("value")); UserDefinedAggregateFunction udaf = new MyDoubleSum(); UserDefinedAggregateFunction registeredUDAF = hc.udf().register("mydoublesum", udaf); // Create Columns for the UDAF. For now, callUDF does not take an argument to specific if // we want to use distinct aggregation. Dataset<Row> aggregatedDF = df.groupBy() .agg( udaf.distinct(col("value")), udaf.apply(col("value")), registeredUDAF.apply(col("value")), callUDF("mydoublesum", col("value"))); List<Row> expectedResult = new ArrayList<>(); expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0)); checkAnswer( aggregatedDF, expectedResult); } }