@Test
public void testCommonOperation() {
  List<String> data = Arrays.asList("hello", "world");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
  Assert.assertEquals("hello", ds.first());

  // The explicit functional-interface casts (FilterFunction, MapFunction, ...)
  // disambiguate the Java lambda overloads from the Scala ones.
  Dataset<String> filtered = ds.filter((FilterFunction<String>) v -> v.startsWith("h"));
  Assert.assertEquals(Arrays.asList("hello"), filtered.collectAsList());

  Dataset<Integer> mapped =
    ds.map((MapFunction<String, Integer>) String::length, Encoders.INT());
  Assert.assertEquals(Arrays.asList(5, 5), mapped.collectAsList());

  Dataset<String> parMapped = ds.mapPartitions((MapPartitionsFunction<String, String>) it -> {
    List<String> ls = new LinkedList<>();
    while (it.hasNext()) {
      ls.add(it.next().toUpperCase(Locale.ROOT));
    }
    return ls.iterator();
  }, Encoders.STRING());
  Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), parMapped.collectAsList());

  Dataset<String> flatMapped = ds.flatMap((FlatMapFunction<String, String>) s -> {
    List<String> ls = new LinkedList<>();
    for (char c : s.toCharArray()) {
      ls.add(String.valueOf(c));
    }
    return ls.iterator();
  }, Encoders.STRING());
  Assert.assertEquals(
    Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"),
    flatMapped.collectAsList());
}
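// The tests in this file assume a shared SparkSession field named "spark".
// A minimal setup/teardown sketch; the master and appName values are
// assumptions, not taken from the original suite:
private transient SparkSession spark;

@Before
public void setUp() {
  spark = SparkSession.builder()
    .master("local[*]")
    .appName("JavaDatasetSuite")  // hypothetical name
    .getOrCreate();
}

@After
public void tearDown() {
  spark.stop();
  spark = null;
}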
Assert.assertEquals(
  new StructField("e", DataTypes.createDecimalType(38, 0), true, Metadata.empty()),
  schema.apply("e"));
Row first = df.select("a", "b", "c", "d", "e").first();
Assert.assertEquals(bean.getA(), first.getDouble(0), 0.0);
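// Decimal(38, 0) is the catalyst type Spark infers for java.math.BigInteger
// bean fields, which is presumably how column "e" got that schema. A minimal
// hypothetical bean consistent with the fragment ("a" is a double because the
// assert reads getDouble(0); fields b, c, d are omitted here):
public static class SampleBean implements Serializable {
  private double a;
  private java.math.BigInteger e;
  public double getA() { return a; }
  public void setA(double a) { this.a = a; }
  public java.math.BigInteger getE() { return e; }
  public void setE(java.math.BigInteger e) { this.e = e; }
}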
@Test
public void testConvertVectorColumnsToAndFromML() {
  Vector x = Vectors.dense(2.0);
  Dataset<Row> dataset = spark.createDataFrame(
    Collections.singletonList(new LabeledPoint(1.0, x)), LabeledPoint.class
  ).select("label", "features");
  Dataset<Row> newDataset1 = MLUtils.convertVectorColumnsToML(dataset);
  Row new1 = newDataset1.first();
  Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);
  Row new2 = MLUtils.convertVectorColumnsToML(dataset, "features").first();
  Assert.assertEquals(new1, new2);
  Row old1 = MLUtils.convertVectorColumnsFromML(newDataset1).first();
  Assert.assertEquals(RowFactory.create(1.0, x), old1);
}
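// convertVectorColumnsToML swaps the column's UDT from the old
// org.apache.spark.mllib.linalg vector type to the new org.apache.spark.ml
// one, which is what asML() mirrors on the value side. A check one could add
// inside the test to make the schema change visible (an assumed addition,
// not part of the original):
Assert.assertTrue(newDataset1.schema().apply("features").dataType()
    instanceof org.apache.spark.ml.linalg.VectorUDT);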
@Test
public void testConvertMatrixColumnsToAndFromML() {
  Matrix x = Matrices.dense(2, 1, new double[]{1.0, 2.0});
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("features", new MatrixUDT(), false, Metadata.empty())
  });
  Dataset<Row> dataset = spark.createDataFrame(
    Arrays.asList(RowFactory.create(1.0, x)), schema);
  Dataset<Row> newDataset1 = MLUtils.convertMatrixColumnsToML(dataset);
  Row new1 = newDataset1.first();
  Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);
  Row new2 = MLUtils.convertMatrixColumnsToML(dataset, "features").first();
  Assert.assertEquals(new1, new2);
  Row old1 = MLUtils.convertMatrixColumnsFromML(newDataset1).first();
  Assert.assertEquals(RowFactory.create(1.0, x), old1);
}
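// Matrices.dense fills values in column-major order, so the 2x1 matrix above
// is the single column (1.0, 2.0). A small sketch illustrating the ordering
// with a 2x2 matrix (not part of the original test):
Matrix m = Matrices.dense(2, 2, new double[]{1.0, 2.0, 3.0, 4.0});
// m lays out as:
//   1.0  3.0
//   2.0  4.0
Assert.assertEquals(3.0, m.apply(0, 1), 0.0);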
@Test
public void verifyLibSVMDF() {
  Dataset<Row> dataset = spark.read().format("libsvm").option("vectorType", "dense")
    .load(path);
  Assert.assertEquals("label", dataset.columns()[0]);
  Assert.assertEquals("features", dataset.columns()[1]);
  Row r = dataset.first();
  Assert.assertEquals(1.0, r.getDouble(0), 1e-15);
  DenseVector v = r.getAs(1);
  Assert.assertEquals(Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0), v);
}
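// The dense vector (1.0, 0.0, 2.0, 0.0, 3.0, 0.0) implies the file at "path"
// holds a record like "1 1:1.0 3:2.0 5:3.0" (LIBSVM indices are 1-based).
// The exact file contents are reconstructed from the asserts; a setup sketch
// that would produce such a file:
String path = java.nio.file.Files.createTempFile("datasource", ".libsvm").toString();
java.nio.file.Files.write(java.nio.file.Paths.get(path),
    "1 1:1.0 3:2.0 5:3.0\n".getBytes(java.nio.charset.StandardCharsets.UTF_8));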
@Test
public void testSummarizer() {
  // (A dangling no-op "dataset.select(col("features"));" was dropped here;
  // its result was never used.)
  Row result = dataset
    .select(Summarizer.metrics("mean", "max", "count").summary(col("features")))
    .first().getStruct(0);
  Vector meanVec = result.getAs("mean");
  Vector maxVec = result.getAs("max");
  long count = result.getAs("count");
  assertEquals(2L, count);
  assertArrayEquals(new double[]{2.0, 3.0}, meanVec.toArray(), 0.0);
  assertArrayEquals(new double[]{3.0, 4.0}, maxVec.toArray(), 0.0);
}
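// The asserts (count 2, mean (2.0, 3.0), max (3.0, 4.0)) pin down the input:
// two rows with feature vectors (1.0, 2.0) and (3.0, 4.0). A setup sketch
// consistent with that; the single-column layout is an assumption:
List<Row> rows = Arrays.asList(
    RowFactory.create(Vectors.dense(1.0, 2.0)),
    RowFactory.create(Vectors.dense(3.0, 4.0)));
StructType schema = new StructType(new StructField[]{
    new StructField("features", new VectorUDT(), false, Metadata.empty())});
dataset = spark.createDataFrame(rows, schema);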
@Test
public void testVectorAssembler() {
  StructType schema = createStructType(new StructField[]{
    createStructField("id", IntegerType, false),
    createStructField("x", DoubleType, false),
    createStructField("y", new VectorUDT(), false),
    createStructField("name", StringType, false),
    createStructField("z", new VectorUDT(), false),
    createStructField("n", LongType, false)
  });
  Row row = RowFactory.create(
    0, 0.0, Vectors.dense(1.0, 2.0), "a",
    Vectors.sparse(2, new int[]{1}, new double[]{3.0}), 10L);
  Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row), schema);
  VectorAssembler assembler = new VectorAssembler()
    .setInputCols(new String[]{"x", "y", "z", "n"})
    .setOutputCol("features");
  Dataset<Row> output = assembler.transform(dataset);
  Assert.assertEquals(
    Vectors.sparse(6, new int[]{1, 2, 4, 5}, new double[]{1.0, 2.0, 3.0, 10.0}),
    output.select("features").first().<Vector>getAs(0));
}
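// Slot layout of the assembled 6-element vector: "x" fills slot 0 (0.0, hence
// absent from the sparse indices), "y" fills slots 1-2, "z" fills slots 3-4
// (its only nonzero, 3.0, sits at offset 1 and so lands in slot 4), and "n"
// fills slot 5. "id" and "name" are excluded because they are not listed in
// setInputCols. Equivalently, a dense-form check one could add inside the
// test (an assumed addition, not in the original):
Assert.assertEquals(
    Vectors.dense(0.0, 1.0, 2.0, 0.0, 3.0, 10.0),
    output.select("features").first().<Vector>getAs(0).toDense());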