@Override
public String[] call(Row row) throws Exception {
  // Convert every column of the row to its String form, preserving nulls.
  final int width = row.size();
  String[] converted = new String[width];
  for (int i = 0; i < width; i++) {
    Object value = row.get(i);
    converted[i] = (value == null) ? null : value.toString();
  }
  return converted;
} });
@Test
public void testUDF() {
  // A two-argument UDF that concatenates the int key with the string value.
  UserDefinedFunction concat =
      udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType);
  Dataset<Row> applied =
      spark.table("testData").select(concat.apply(col("key"), col("value")));
  String[] actual = applied.collectAsList().stream()
      .map(r -> r.getString(0))
      .toArray(String[]::new);
  // Recompute the expected concatenation directly from the source table.
  String[] expected = spark.table("testData").collectAsList().stream()
      .map(r -> r.get(0).toString() + r.getString(1))
      .toArray(String[]::new);
  Assert.assertArrayEquals(expected, actual);
} }
/**
 * Recursively appends the value at {@code fieldIdx} of {@code src} into {@code dst},
 * dispatching on the declared {@code DataType}: arrays recurse element-by-element into
 * the vector's array data, structs recurse field-by-field into the child columns, and
 * anything else is delegated to the scalar overload of {@code appendValue}.
 */
private static void appendValue(ColumnVector dst, DataType t, Row src, int fieldIdx) {
  if (t instanceof ArrayType) {
    ArrayType arrayType = (ArrayType) t;
    if (src.isNullAt(fieldIdx)) {
      dst.appendNull();
    } else {
      List<Object> elements = src.getList(fieldIdx);
      dst.appendArray(elements.size());
      for (Object element : elements) {
        appendValue(dst.arrayData(), arrayType.elementType(), element);
      }
    }
  } else if (t instanceof StructType) {
    StructType structType = (StructType) t;
    if (src.isNullAt(fieldIdx)) {
      // Mark the whole struct slot as null.
      dst.appendStruct(true);
    } else {
      dst.appendStruct(false);
      Row nested = src.getStruct(fieldIdx);
      for (int i = 0; i < structType.fields().length; i++) {
        appendValue(dst.getChildColumn(i), structType.fields()[i].dataType(), nested, i);
      }
    }
  } else {
    // Leaf value: hand off to the scalar overload.
    appendValue(dst, t, src.get(fieldIdx));
  }
}
@Test
public void testCrosstab() {
  Dataset<Row> df = spark.table("testData2");
  Dataset<Row> crosstab = df.stat().crosstab("a", "b");
  String[] columnNames = crosstab.schema().fieldNames();
  Assert.assertEquals("a_b", columnNames[0]);
  Assert.assertEquals("1", columnNames[1]);
  Assert.assertEquals("2", columnNames[2]);
  List<Row> rows = crosstab.collectAsList();
  rows.sort(crosstabRowComparator);
  // Primitive counter: the boxed Integer only forced autoboxing each iteration.
  int count = 1;
  for (Row row : rows) {
    // JUnit's contract is assertEquals(expected, actual); the expected counter
    // must come first so that failure messages report the values correctly.
    Assert.assertEquals(String.valueOf(count), row.get(0).toString());
    Assert.assertEquals(1L, row.getLong(1));
    Assert.assertEquals(1L, row.getLong(2));
    count++;
  }
}
@Test
public void testUDF() {
  // UDF under test: string concatenation of the integer key and the value.
  UserDefinedFunction foo =
      udf((Integer i, String s) -> i.toString() + s, DataTypes.StringType);
  Dataset<Row> projected =
      spark.table("testData").select(foo.apply(col("key"), col("value")));
  String[] observed = projected.collectAsList().stream()
      .map(row -> row.getString(0))
      .toArray(String[]::new);
  String[] reference = spark.table("testData").collectAsList().stream()
      .map(row -> row.get(0).toString() + row.getString(1))
      .toArray(String[]::new);
  Assert.assertArrayEquals(reference, observed);
} }
@Test
public void testCrosstab() {
  Dataset<Row> df = spark.table("testData2");
  Dataset<Row> crosstab = df.stat().crosstab("a", "b");
  String[] columnNames = crosstab.schema().fieldNames();
  Assert.assertEquals("a_b", columnNames[0]);
  Assert.assertEquals("1", columnNames[1]);
  Assert.assertEquals("2", columnNames[2]);
  List<Row> rows = crosstab.collectAsList();
  rows.sort(crosstabRowComparator);
  // Primitive counter: the boxed Integer only forced autoboxing each iteration.
  int count = 1;
  for (Row row : rows) {
    // JUnit's contract is assertEquals(expected, actual); the expected counter
    // must come first so that failure messages report the values correctly.
    Assert.assertEquals(String.valueOf(count), row.get(0).toString());
    Assert.assertEquals(1L, row.getLong(1));
    Assert.assertEquals(1L, row.getLong(2));
    count++;
  }
}
/**
 * Recursively appends the value at {@code fieldIdx} of {@code src} into {@code dst},
 * dispatching on the declared {@code DataType}: arrays recurse per element into the
 * vector's array data, structs recurse per field into the child vectors, and all other
 * types are delegated to the scalar overload of {@code appendValue}.
 */
private static void appendValue(WritableColumnVector dst, DataType t, Row src, int fieldIdx) {
  if (t instanceof ArrayType) {
    ArrayType arrayType = (ArrayType) t;
    if (src.isNullAt(fieldIdx)) {
      dst.appendNull();
    } else {
      List<Object> elements = src.getList(fieldIdx);
      dst.appendArray(elements.size());
      for (Object element : elements) {
        appendValue(dst.arrayData(), arrayType.elementType(), element);
      }
    }
  } else if (t instanceof StructType) {
    StructType structType = (StructType) t;
    if (src.isNullAt(fieldIdx)) {
      // Mark the whole struct slot as null.
      dst.appendStruct(true);
    } else {
      dst.appendStruct(false);
      Row nested = src.getStruct(fieldIdx);
      for (int i = 0; i < structType.fields().length; i++) {
        appendValue(dst.getChild(i), structType.fields()[i].dataType(), nested, i);
      }
    }
  } else {
    // Leaf value: hand off to the scalar overload.
    appendValue(dst, t, src.get(fieldIdx));
  }
}
@Test
public void dataFrameRDDOperations() {
  // Build two Person beans, convert them to Rows, and round-trip through SQL.
  Person person1 = new Person();
  person1.setName("Michael");
  person1.setAge(29);
  Person person2 = new Person();
  person2.setName("Yin");
  person2.setAge(28);
  List<Person> people = new ArrayList<>(2);
  people.add(person1);
  people.add(person2);

  JavaRDD<Row> rowRDD = jsc.parallelize(people)
      .map(p -> RowFactory.create(p.getName(), p.getAge()));

  // NOTE(review): the first field name is deliberately "" in the original test;
  // preserved as-is — SELECT * and positional access do not depend on the name.
  List<StructField> fields = new ArrayList<>(2);
  fields.add(DataTypes.createStructField("", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType schema = DataTypes.createStructType(fields);

  Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
  df.createOrReplaceTempView("people");
  List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
      .map(r -> r.getString(0) + "_" + r.get(1))
      .collect();

  List<String> expected = new ArrayList<>(2);
  expected.add("Michael_29");
  expected.add("Yin_28");
  Assert.assertEquals(expected, actual);
}
@Override
public Object getObject(int columnIndex) throws SQLException {
  // JDBC column indexes are 1-based; the backing row is 0-based.
  final int rowIndex = columnIndex - 1;
  return getCurrentRow().get(rowIndex);
}
@Test
public void testCrosstab() {
  Dataset<Row> df = spark.table("testData2");
  Dataset<Row> crosstab = df.stat().crosstab("a", "b");
  String[] columnNames = crosstab.schema().fieldNames();
  Assert.assertEquals("a_b", columnNames[0]);
  Assert.assertEquals("1", columnNames[1]);
  Assert.assertEquals("2", columnNames[2]);
  List<Row> rows = crosstab.collectAsList();
  rows.sort(crosstabRowComparator);
  // Primitive counter: the boxed Integer only forced autoboxing each iteration.
  int count = 1;
  for (Row row : rows) {
    // JUnit's contract is assertEquals(expected, actual); the expected counter
    // must come first so that failure messages report the values correctly.
    Assert.assertEquals(String.valueOf(count), row.get(0).toString());
    Assert.assertEquals(1L, row.getLong(1));
    Assert.assertEquals(1L, row.getLong(2));
    count++;
  }
}
/**
 * Appends the field at {@code fieldIdx} of {@code src} to {@code dst}, recursing into
 * array elements and struct children as dictated by {@code t}; non-container types are
 * forwarded to the scalar overload of {@code appendValue}.
 */
private static void appendValue(WritableColumnVector dst, DataType t, Row src, int fieldIdx) {
  if (t instanceof ArrayType) {
    ArrayType asArray = (ArrayType) t;
    if (src.isNullAt(fieldIdx)) {
      dst.appendNull();
    } else {
      List<Object> items = src.getList(fieldIdx);
      dst.appendArray(items.size());
      for (Object item : items) {
        appendValue(dst.arrayData(), asArray.elementType(), item);
      }
    }
  } else if (t instanceof StructType) {
    StructType asStruct = (StructType) t;
    if (src.isNullAt(fieldIdx)) {
      // A null struct is recorded as a single null slot.
      dst.appendStruct(true);
    } else {
      dst.appendStruct(false);
      Row inner = src.getStruct(fieldIdx);
      for (int i = 0; i < asStruct.fields().length; i++) {
        appendValue(dst.getChild(i), asStruct.fields()[i].dataType(), inner, i);
      }
    }
  } else {
    // Scalar leaf: delegate to the value-based overload.
    appendValue(dst, t, src.get(fieldIdx));
  }
}
@Test
public void dataFrameRDDOperations() {
  // Two Person beans are mapped to Rows, registered as a temp view, and read back.
  Person michael = new Person();
  michael.setName("Michael");
  michael.setAge(29);
  Person yin = new Person();
  yin.setName("Yin");
  yin.setAge(28);
  List<Person> personList = new ArrayList<>(2);
  personList.add(michael);
  personList.add(yin);

  JavaRDD<Row> rowRDD = jsc.parallelize(personList)
      .map(person -> RowFactory.create(person.getName(), person.getAge()));

  // NOTE(review): the empty field name "" is kept byte-for-byte from the original;
  // positional access below never refers to the column by name.
  List<StructField> fields = new ArrayList<>(2);
  fields.add(DataTypes.createStructField("", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType schema = DataTypes.createStructType(fields);

  Dataset<Row> frame = spark.createDataFrame(rowRDD, schema);
  frame.createOrReplaceTempView("people");
  List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
      .map(row -> row.getString(0) + "_" + row.get(1))
      .collect();

  List<String> expected = new ArrayList<>(2);
  expected.add("Michael_29");
  expected.add("Yin_28");
  Assert.assertEquals(expected, actual);
}
complexMap, null); Assert.assertEquals(simpleStringArray, complexStruct.get(0)); Assert.assertEquals(simpleMap, complexStruct.get(1)); Assert.assertEquals(simpleStruct, complexStruct.get(2)); Assert.assertEquals(arrayOfMaps, complexStruct.get(3)); Assert.assertEquals(arrayOfRows, complexStruct.get(4)); Assert.assertEquals(complexMap, complexStruct.get(5)); Assert.assertNull(complexStruct.get(6)); Assert.assertEquals(arrayOfMaps, complexRow.get(0)); Assert.assertEquals(arrayOfRows, complexRow.get(1)); Assert.assertEquals(complexMap, complexRow.get(2)); Assert.assertEquals(complexStruct, complexRow.get(3));
@Test
public void dataFrameRDDOperations() {
  // Round-trip: beans -> Rows -> DataFrame -> SQL view -> collected strings.
  List<Person> persons = new ArrayList<>(2);
  Person first = new Person();
  first.setName("Michael");
  first.setAge(29);
  persons.add(first);
  Person second = new Person();
  second.setName("Yin");
  second.setAge(28);
  persons.add(second);

  JavaRDD<Row> rows = jsc.parallelize(persons)
      .map(p -> RowFactory.create(p.getName(), p.getAge()));

  // NOTE(review): field name "" reproduced exactly from the original schema;
  // all downstream access is positional, so the name is never consulted.
  List<StructField> schemaFields = new ArrayList<>(2);
  schemaFields.add(DataTypes.createStructField("", DataTypes.StringType, false));
  schemaFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType schema = DataTypes.createStructType(schemaFields);

  Dataset<Row> dataFrame = spark.createDataFrame(rows, schema);
  dataFrame.createOrReplaceTempView("people");
  List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
      .map(row -> row.getString(0) + "_" + row.get(1))
      .collect();

  List<String> expected = new ArrayList<>(2);
  expected.add("Michael_29");
  expected.add("Yin_28");
  Assert.assertEquals(expected, actual);
}
complexMap, null); Assert.assertEquals(simpleStringArray, complexStruct.get(0)); Assert.assertEquals(simpleMap, complexStruct.get(1)); Assert.assertEquals(simpleStruct, complexStruct.get(2)); Assert.assertEquals(arrayOfMaps, complexStruct.get(3)); Assert.assertEquals(arrayOfRows, complexStruct.get(4)); Assert.assertEquals(complexMap, complexStruct.get(5)); Assert.assertNull(complexStruct.get(6)); Assert.assertEquals(arrayOfMaps, complexRow.get(0)); Assert.assertEquals(arrayOfRows, complexRow.get(1)); Assert.assertEquals(complexMap, complexRow.get(2)); Assert.assertEquals(complexStruct, complexRow.get(3));
complexMap, null); Assert.assertEquals(simpleStringArray, complexStruct.get(0)); Assert.assertEquals(simpleMap, complexStruct.get(1)); Assert.assertEquals(simpleStruct, complexStruct.get(2)); Assert.assertEquals(arrayOfMaps, complexStruct.get(3)); Assert.assertEquals(arrayOfRows, complexStruct.get(4)); Assert.assertEquals(complexMap, complexStruct.get(5)); Assert.assertNull(complexStruct.get(6)); Assert.assertEquals(arrayOfMaps, complexRow.get(0)); Assert.assertEquals(arrayOfRows, complexRow.get(1)); Assert.assertEquals(complexMap, complexRow.get(2)); Assert.assertEquals(complexStruct, complexRow.get(3));
@Test
public void testExecution() {
  // Filtering testData on key = 1 must yield exactly that key back.
  Dataset<Row> filtered = spark.table("testData").filter("key = 1");
  Row firstRow = filtered.select("key").collectAsList().get(0);
  Assert.assertEquals(1, firstRow.get(0));
}
@Test
public void testExecution() {
  // A simple predicate push-through: select the key of the single matching row.
  Dataset<Row> keyOne = spark.table("testData").filter("key = 1").select("key");
  Assert.assertEquals(1, keyOne.collectAsList().get(0).get(0));
}
@Test
public void testExecution() {
  // Sanity check that a filtered scan of testData executes and returns key 1.
  Dataset<Row> result = spark.table("testData").filter("key = 1");
  List<Row> collected = result.select("key").collectAsList();
  Assert.assertEquals(1, collected.get(0).get(0));
}