/**
 * Stratified sampling: samples ~10% of key 0 and ~20% of key 1 and checks the
 * per-key counts land in the expected ranges for seed 0.
 */
@Test
public void testSampleBy() {
  // Two-partition frame whose "key" column cycles 0,1,2 over ids 0..99.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));

  // Keep ~10% of key 0 and ~20% of key 1; key 2 has no fraction and is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);

  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
/**
 * Pivots course sales by course name with an explicit value list, summing
 * earnings per year, and checks the two result rows.
 */
@Test
public void pivot() {
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  // Column layout after the pivot: year, dotNET earnings, Java earnings.
  Row y2012 = rows.get(0);
  Assert.assertEquals(2012, y2012.getInt(0));
  Assert.assertEquals(15000.0, y2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, y2012.getDouble(2), 0.01);

  Row y2013 = rows.get(1);
  Assert.assertEquals(2013, y2013.getInt(0));
  Assert.assertEquals(48000.0, y2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, y2013.getDouble(2), 0.01);
}
/**
 * Pivots course sales by course name with an explicit value list, summing
 * earnings per year, and checks the two result rows.
 */
@Test
public void pivot() {
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  // Column layout after the pivot: year, dotNET earnings, Java earnings.
  Row y2012 = rows.get(0);
  Assert.assertEquals(2012, y2012.getInt(0));
  Assert.assertEquals(15000.0, y2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, y2012.getDouble(2), 0.01);

  Row y2013 = rows.get(1);
  Assert.assertEquals(2013, y2013.getInt(0));
  Assert.assertEquals(48000.0, y2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, y2013.getDouble(2), 0.01);
}
/**
 * Pivots course sales by course name with an explicit value list, summing
 * earnings per year, and checks the two result rows.
 */
@Test
public void pivot() {
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  // Column layout after the pivot: year, dotNET earnings, Java earnings.
  Row y2012 = rows.get(0);
  Assert.assertEquals(2012, y2012.getInt(0));
  Assert.assertEquals(15000.0, y2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, y2012.getDouble(2), 0.01);

  Row y2013 = rows.get(1);
  Assert.assertEquals(2013, y2013.getInt(0));
  Assert.assertEquals(48000.0, y2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, y2013.getDouble(2), 0.01);
}
/**
 * Stratified sampling: samples ~10% of key 0 and ~20% of key 1 and checks the
 * per-key counts land in the expected ranges for seed 0.
 */
@Test
public void testSampleBy() {
  // Two-partition frame whose "key" column cycles 0,1,2 over ids 0..99.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));

  // Keep ~10% of key 0 and ~20% of key 1; key 2 has no fraction and is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);

  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
/**
 * Stratified sampling: samples ~10% of key 0 and ~20% of key 1 and checks the
 * per-key counts land in the expected ranges for seed 0.
 */
@Test
public void testSampleBy() {
  // Two-partition frame whose "key" column cycles 0,1,2 over ids 0..99.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));

  // Keep ~10% of key 0 and ~20% of key 1; key 2 has no fraction and is dropped.
  Dataset<Row> sampled = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);

  List<Row> counts = sampled.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
// NOTE(review): every return value below is discarded — presumably these calls
// only exercise each sort/group-by/aggregate overload; confirm against the test name.
df.orderBy(col("key"), col("value"));
// groupBy by column name vs. by Column object, both followed by agg(...).
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
// Global (no-group) aggregation via agg(...) and the shorthand helpers.
df.agg(first("key"), sum("value"));
df.groupBy().avg("key");
df.groupBy().mean("key");
df.groupBy().max("key");
df.groupBy().min("key");
df.groupBy().sum("key");
// Distinct-count overloads: by column name and by Column object.
df.groupBy().agg(countDistinct("key", "value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// NOTE(review): every return value below is discarded — presumably these calls
// only exercise each sort/group-by/aggregate overload; confirm against the test name.
df.orderBy(col("key"), col("value"));
// groupBy by column name vs. by Column object, both followed by agg(...).
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
// Global (no-group) aggregation via agg(...) and the shorthand helpers.
df.agg(first("key"), sum("value"));
df.groupBy().avg("key");
df.groupBy().mean("key");
df.groupBy().max("key");
df.groupBy().min("key");
df.groupBy().sum("key");
// Distinct-count overloads: by column name and by Column object.
df.groupBy().agg(countDistinct("key", "value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
// NOTE(review): every return value below is discarded — presumably these calls
// only exercise each sort/group-by/aggregate overload; confirm against the test name.
df.orderBy(col("key"), col("value"));
// groupBy by column name vs. by Column object, both followed by agg(...).
df.groupBy("key", "value").agg(col("key"), col("value"), sum("value"));
df.groupBy(col("key"), col("value")).agg(col("key"), col("value"), sum("value"));
// Global (no-group) aggregation via agg(...) and the shorthand helpers.
df.agg(first("key"), sum("value"));
df.groupBy().avg("key");
df.groupBy().mean("key");
df.groupBy().max("key");
df.groupBy().min("key");
df.groupBy().sum("key");
// Distinct-count overloads: by column name and by Column object.
df.groupBy().agg(countDistinct("key", "value"));
df.groupBy().agg(countDistinct(col("key"), col("value")));
df.select(coalesce(col("key")));
public List<Count> count() { String input = "hello world hello hello hello"; String[] _words = input.split(" "); List<Word> words = Arrays.stream(_words).map(Word::new).collect(Collectors.toList()); Dataset<Row> dataFrame = sparkSession.createDataFrame(words, Word.class); dataFrame.show(); //StructType structType = dataFrame.schema(); RelationalGroupedDataset groupedDataset = dataFrame.groupBy(col("word")); groupedDataset.count().show(); List<Row> rows = groupedDataset.count().collectAsList();//JavaConversions.asScalaBuffer(words)).count(); return rows.stream().map(new Function<Row, Count>() { @Override public Count apply(Row row) { return new Count(row.getString(0), row.getLong(1)); } }).collect(Collectors.toList()); } }
/**
 * Variance for a column.
 *
 * NOTE(review): the original Javadoc said "standard deviation", but the
 * aggregate applied is {@code functions.variance} — documented as variance here.
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the variance result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to compute the variance for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column var(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.variance(columnName)).col(columnName); }
/**
 * Min for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the min result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the min for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column min(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.min(columnName)).col(columnName); }
/**
 * Mean for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the mean result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the mean for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column mean(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(avg(columnName)).col(columnName); }
/**
 * Max for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the max result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the max for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column max(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.max(columnName)).col(columnName); }
/**
 * Variance for a column.
 *
 * NOTE(review): the original Javadoc said "standard deviation", but the
 * aggregate applied is {@code functions.variance} — documented as variance here.
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the variance result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to compute the variance for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column var(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.variance(columnName)).col(columnName); }
/**
 * Max for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the max result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the max for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column max(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.max(columnName)).col(columnName); }
/**
 * Min for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the min result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the min for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column min(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(functions.min(columnName)).col(columnName); }
/**
 * Mean for a column.
 *
 * NOTE(review): {@code col(columnName)} selects the grouping column of the
 * aggregated frame, not the mean result column — verify callers expect that.
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the mean for
 * @return the column named {@code columnName} from the grouped/aggregated frame
 */
public static Column mean(DataRowsFacade dataFrame, String columnName) { return dataFrame.get().groupBy(columnName).agg(avg(columnName)).col(columnName); }
/**
 * Resolves the expected row count (from a step dependency if configured) and
 * returns a single-row frame comparing it against the actual count of
 * {@code dataset} via {@code CheckCount}.
 *
 * @param dataset          the dataset whose rows are counted
 * @param stepDependencies upstream step results, keyed by step name
 * @return a one-row dataset produced by {@code CheckCount} over the global count
 * @throws IllegalStateException if the dependency is not a single long value,
 *         or no expected count could be determined
 */
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
    if (isDependency()) {
        // NOTE(review): a missing key yields null here and an NPE below — confirm
        // upstream guarantees the dependency is always present.
        Dataset<Row> expectedDependency = stepDependencies.get(dependency);
        // The dependency must be a 1x1 frame holding a single long value.
        if (expectedDependency.count() == 1
                && expectedDependency.schema().fields().length == 1
                // equals() rather than == — robust even if the DataType is not the singleton.
                && DataTypes.LongType.equals(expectedDependency.schema().apply(0).dataType())) {
            expected = expectedDependency.collectAsList().get(0).getLong(0);
        } else {
            throw new IllegalStateException("Step dependency for count rule must have one row with a single field of long type");
        }
    }
    if (expected < 0) {
        throw new IllegalStateException("Failed to determine expected count: must be specified either as literal or step dependency");
    }
    return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
/**
 * Exercises a user-defined aggregate function (MyDoubleSum) through four call
 * paths — distinct, direct apply, the registered instance, and callUDF — and
 * checks the aggregated results against the expected sums.
 */
@Test
public void testUDAF() {
    // 0..99 unioned with itself: every value appears exactly twice.
    Dataset<Row> df = hc.range(0, 100).union(hc.range(0, 100)).select(col("id").as("value"));
    UserDefinedAggregateFunction udaf = new MyDoubleSum();
    UserDefinedAggregateFunction registeredUDAF = hc.udf().register("mydoublesum", udaf);
    // Create Columns for the UDAF. For now, callUDF does not take an argument to
    // specify whether we want to use distinct aggregation.
    Dataset<Row> aggregatedDF = df.groupBy()
        .agg(
            udaf.distinct(col("value")),
            udaf.apply(col("value")),
            registeredUDAF.apply(col("value")),
            callUDF("mydoublesum", col("value")));
    // distinct sum = 0+..+99 = 4950; the other three sum the duplicated data = 9900.
    List<Row> expectedResult = new ArrayList<>();
    expectedResult.add(RowFactory.create(4950.0, 9900.0, 9900.0, 9900.0));
    checkAnswer(
        aggregatedDF,
        expectedResult);
}
}