/**
 * Stratified sampling: samples ~10% of key 0 and ~20% of key 1 and checks the
 * per-key counts land in the expected ranges for seed 0.
 */
@Test
public void testSampleBy() {
  // Two-partition frame whose "key" column cycles 0,1,2 over ids 0..99.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));

  // Keep ~10% of key 0 and ~20% of key 1; key 2 has no fraction and is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);

  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
/**
 * Pivots course sales by course name with an explicit value list, summing
 * earnings per year, and checks the two result rows.
 */
@Test
public void pivot() {
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  // Column layout after the pivot: year, dotNET earnings, Java earnings.
  Row y2012 = rows.get(0);
  Assert.assertEquals(2012, y2012.getInt(0));
  Assert.assertEquals(15000.0, y2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, y2012.getDouble(2), 0.01);

  Row y2013 = rows.get(1);
  Assert.assertEquals(2013, y2013.getInt(0));
  Assert.assertEquals(48000.0, y2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, y2013.getDouble(2), 0.01);
}
/**
 * Pivots course sales by course name with an explicit value list, summing
 * earnings per year, and checks the two result rows.
 */
@Test
public void pivot() {
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  // Column layout after the pivot: year, dotNET earnings, Java earnings.
  Row y2012 = rows.get(0);
  Assert.assertEquals(2012, y2012.getInt(0));
  Assert.assertEquals(15000.0, y2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, y2012.getDouble(2), 0.01);

  Row y2013 = rows.get(1);
  Assert.assertEquals(2013, y2013.getInt(0));
  Assert.assertEquals(48000.0, y2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, y2013.getDouble(2), 0.01);
}
/**
 * Pivots course sales by course name with an explicit value list, summing
 * earnings per year, and checks the two result rows.
 */
@Test
public void pivot() {
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  // Column layout after the pivot: year, dotNET earnings, Java earnings.
  Row y2012 = rows.get(0);
  Assert.assertEquals(2012, y2012.getInt(0));
  Assert.assertEquals(15000.0, y2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, y2012.getDouble(2), 0.01);

  Row y2013 = rows.get(1);
  Assert.assertEquals(2013, y2013.getInt(0));
  Assert.assertEquals(48000.0, y2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, y2013.getDouble(2), 0.01);
}
/**
 * Stratified sampling: samples ~10% of key 0 and ~20% of key 1 and checks the
 * per-key counts land in the expected ranges for seed 0.
 */
@Test
public void testSampleBy() {
  // Two-partition frame whose "key" column cycles 0,1,2 over ids 0..99.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));

  // Keep ~10% of key 0 and ~20% of key 1; key 2 has no fraction and is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);

  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
/**
 * Stratified sampling: samples ~10% of key 0 and ~20% of key 1 and checks the
 * per-key counts land in the expected ranges for seed 0.
 */
@Test
public void testSampleBy() {
  // Two-partition frame whose "key" column cycles 0,1,2 over ids 0..99.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));

  // Keep ~10% of key 0 and ~20% of key 1; key 2 has no fraction and is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);

  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
// NOTE(review): every return value below is discarded — presumably these calls
// only exercise each sort/group-by/aggregate overload; confirm against the test name.
df.orderBy(col("key"), col("value"));
// groupBy by column name vs. by Column object, both followed by agg(...).
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
// Global (no-group) aggregation via agg(...) and the shorthand helpers.
df.agg(first("key"), sum("value"));
df.groupBy().avg("key");
df.groupBy().mean("key");
df.groupBy().max("key");
df.groupBy().min("key");
df.groupBy().sum("key");
// Distinct-count overloads: by column name and by Column object.
df.groupBy().agg(countDistinct("key", "value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// NOTE(review): every return value below is discarded — presumably these calls
// only exercise each sort/group-by/aggregate overload; confirm against the test name.
df.orderBy(col("key"), col("value"));
// groupBy by column name vs. by Column object, both followed by agg(...).
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
// Global (no-group) aggregation via agg(...) and the shorthand helpers.
df.agg(first("key"), sum("value"));
df.groupBy().avg("key");
df.groupBy().mean("key");
df.groupBy().max("key");
df.groupBy().min("key");
df.groupBy().sum("key");
// Distinct-count overloads: by column name and by Column object.
df.groupBy().agg(countDistinct("key", "value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// NOTE(review): every return value below is discarded — presumably these calls
// only exercise each sort/group-by/aggregate overload; confirm against the test name.
df.orderBy(col("key"), col("value"));
// groupBy by column name vs. by Column object, both followed by agg(...).
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
// Global (no-group) aggregation via agg(...) and the shorthand helpers.
df.agg(first("key"), sum("value"));
df.groupBy().avg("key");
df.groupBy().mean("key");
df.groupBy().max("key");
df.groupBy().min("key");
df.groupBy().sum("key");
// Distinct-count overloads: by column name and by Column object.
df.groupBy().agg(countDistinct("key", "value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
public List<Count> count() { String input = "hello world hello hello hello"; String[] _words = input.split(" "); List<Word> words = Arrays.stream(_words).map(Word::new).collect(Collectors.toList()); Dataset<Row> dataFrame = sparkSession.createDataFrame(words, Word.class); dataFrame.show(); //StructType structType = dataFrame.schema(); RelationalGroupedDataset groupedDataset = dataFrame.groupBy(col("word")); groupedDataset.count().show(); List<Row> rows = groupedDataset.count().collectAsList();//JavaConversions.asScalaBuffer(words)).count(); return rows.stream().map(new Function<Row, Count>() { @Override public Count apply(Row row) { return new Count(row.getString(0), row.getLong(1)); } }).collect(Collectors.toList()); } }
/**
 * Variance for a column.
 *
 * NOTE(review): the original Javadoc said "standard deviation", but the
 * aggregate applied is {@code functions.variance} — documented as variance here.
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the variance result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to compute the variance for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column var(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.variance(columnName)).col(columnName); }
/**
 * Min for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the min result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the min for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column min(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.min(columnName)).col(columnName); }
/**
 * Mean for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the mean result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the mean for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column mean(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(avg(columnName)).col(columnName); }
/**
 * Max for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the max result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the max for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column max(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.max(columnName)).col(columnName); }
/**
 * Variance for a column.
 *
 * NOTE(review): the original Javadoc said "standard deviation", but the
 * aggregate applied is {@code functions.variance} — documented as variance here.
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the variance result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to compute the variance for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column var(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.variance(columnName)).col(columnName); }
/**
 * Max for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the max result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the max for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column max(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.max(columnName)).col(columnName); }
/**
 * Min for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the min result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the min for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column min(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.min(columnName)).col(columnName); }
/**
 * Mean for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the mean result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the mean for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column mean(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(avg(columnName)).col(columnName); }
/**
 * Resolves the expected row count (from a step dependency if configured) and
 * returns a single-row frame comparing it against the actual count of
 * {@code dataset} via {@code CheckCount}.
 *
 * @param dataset          the dataset whose rows are counted
 * @param stepDependencies upstream step results, keyed by step name
 * @return a one-row dataset produced by {@code CheckCount} over the global count
 * @throws IllegalStateException if the dependency is not a single long value,
 *         or no expected count could be determined
 */
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
    if (isDependency()) {
        // NOTE(review): a missing key yields null here and an NPE below — confirm
        // upstream guarantees the dependency is always present.
        Dataset<Row> expectedDependency = stepDependencies.get(dependency);
        // The dependency must be a 1x1 frame holding a single long value.
        if (expectedDependency.count() == 1
                && expectedDependency.schema().fields().length == 1
                // equals() rather than == — robust even if the DataType is not the singleton.
                && DataTypes.LongType.equals(expectedDependency.schema().apply(0).dataType())) {
            expected = expectedDependency.collectAsList().get(0).getLong(0);
        } else {
            throw new IllegalStateException("Step dependency for count rule must have one row with a single field of long type");
        }
    }
    if (expected < 0) {
        throw new IllegalStateException("Failed to determine expected count: must be specified either as literal or step dependency");
    }
    return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
/**
 * Exercises a user-defined aggregate function (MyDoubleSum) through four call
 * paths — distinct, direct apply, the registered instance, and callUDF — and
 * checks the aggregated results against the expected sums.
 */
@Test
public void testUDAF() {
    // 0..99 unioned with itself: every value appears exactly twice.
    Dataset<Row> df = hc.range(0, 100).union(hc.range(0, 100)).select(col("id").as("value"));
    UserDefinedAggregateFunction udaf = new MyDoubleSum();
    UserDefinedAggregateFunction registeredUDAF = hc.udf().register("mydoublesum", udaf);
    // Create Columns for the UDAF. For now, callUDF does not take an argument to
    // specify whether we want to use distinct aggregation.
    Dataset<Row> aggregatedDF = df.groupBy()
        .agg(
            udaf.distinct(col("value")),
            udaf.apply(col("value")),
            registeredUDAF.apply(col("value")),
            callUDF("mydoublesum", col("value")));
    // distinct sum = 0+..+99 = 4950; the other three sum the duplicated data = 9900.
    List<Row> expectedResult = new ArrayList<>();
    expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0));
    checkAnswer(
        aggregatedDF,
        expectedResult);
}
}