@Test public void testCorrelation() { Dataset<Row> df = spark.table("testData2"); Double pearsonCorr = df.stat().corr("a", "b", "pearson"); Assert.assertTrue(Math.abs(pearsonCorr) < 1.0e-6); }
@Test public void testCovariance() { Dataset<Row> df = spark.table("testData2"); Double result = df.stat().cov("a", "b"); Assert.assertTrue(Math.abs(result) < 1.0e-6); }
@Test public void testFrequentItems() { Dataset<Row> df = spark.table("testData2"); String[] cols = {"a"}; Dataset<Row> results = df.stat().freqItems(cols, 0.2); Assert.assertTrue(results.collectAsList().get(0).getSeq(0).contains(1)); }
@Test public void testSampleBy() { Dataset<Row> df = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key")); Dataset<Row> sampled = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L); List<Row> actual = sampled.groupBy("key").count().orderBy("key").collectAsList(); Assert.assertEquals(0, actual.get(0).getLong(0)); Assert.assertTrue(0 <= actual.get(0).getLong(1) && actual.get(0).getLong(1) <= 8); Assert.assertEquals(1, actual.get(1).getLong(0)); Assert.assertTrue(2 <= actual.get(1).getLong(1) && actual.get(1).getLong(1) <= 13); }
@Test public void testCrosstab() { Dataset<Row> df = spark.table("testData2"); Dataset<Row> crosstab = df.stat().crosstab("a", "b"); String[] columnNames = crosstab.schema().fieldNames(); Assert.assertEquals("a_b", columnNames[0]); Assert.assertEquals("1", columnNames[1]); Assert.assertEquals("2", columnNames[2]); List<Row> rows = crosstab.collectAsList(); rows.sort(crosstabRowComparator); Integer count = 1; for (Row row : rows) { Assert.assertEquals(row.get(0).toString(), count.toString()); Assert.assertEquals(1L, row.getLong(1)); Assert.assertEquals(1L, row.getLong(2)); count++; } }
@Test public void testCountMinSketch() { Dataset<Long> df = spark.range(1000); CountMinSketch sketch1 = df.stat().countMinSketch("id", 10, 20, 42); Assert.assertEquals(1000, sketch1.totalCount()); Assert.assertEquals(10, sketch1.depth()); Assert.assertEquals(20, sketch1.width()); CountMinSketch sketch2 = df.stat().countMinSketch(col("id"), 10, 20, 42); Assert.assertEquals(1000, sketch2.totalCount()); Assert.assertEquals(10, sketch2.depth()); Assert.assertEquals(20, sketch2.width()); CountMinSketch sketch3 = df.stat().countMinSketch("id", 0.001, 0.99, 42); Assert.assertEquals(1000, sketch3.totalCount()); Assert.assertEquals(0.001, sketch3.relativeError(), 1.0e-4); Assert.assertEquals(0.99, sketch3.confidence(), 5.0e-3); CountMinSketch sketch4 = df.stat().countMinSketch(col("id"), 0.001, 0.99, 42); Assert.assertEquals(1000, sketch4.totalCount()); Assert.assertEquals(0.001, sketch4.relativeError(), 1.0e-4); Assert.assertEquals(0.99, sketch4.confidence(), 5.0e-3); }
@Test public void testBloomFilter() { Dataset<Long> df = spark.range(1000); BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03); Assert.assertTrue(filter1.expectedFpp() - 0.03 < 1e-3); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter1.mightContain(i)); } BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03); Assert.assertTrue(filter2.expectedFpp() - 0.03 < 1e-3); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter2.mightContain(i * 3)); } BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5); Assert.assertEquals(64 * 5, filter3.bitSize()); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter3.mightContain(i)); } BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5); Assert.assertEquals(64 * 5, filter4.bitSize()); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter4.mightContain(i * 3)); } }
@Test public void testSampleBy() { Dataset<Row> df = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key")); Dataset<Row> sampled = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L); List<Row> actual = sampled.groupBy("key").count().orderBy("key").collectAsList(); Assert.assertEquals(0, actual.get(0).getLong(0)); Assert.assertTrue(0 <= actual.get(0).getLong(1) && actual.get(0).getLong(1) <= 8); Assert.assertEquals(1, actual.get(1).getLong(0)); Assert.assertTrue(2 <= actual.get(1).getLong(1) && actual.get(1).getLong(1) <= 13); }
@Test public void testCrosstab() { Dataset<Row> df = spark.table("testData2"); Dataset<Row> crosstab = df.stat().crosstab("a", "b"); String[] columnNames = crosstab.schema().fieldNames(); Assert.assertEquals("a_b", columnNames[0]); Assert.assertEquals("1", columnNames[1]); Assert.assertEquals("2", columnNames[2]); List<Row> rows = crosstab.collectAsList(); rows.sort(crosstabRowComparator); Integer count = 1; for (Row row : rows) { Assert.assertEquals(row.get(0).toString(), count.toString()); Assert.assertEquals(1L, row.getLong(1)); Assert.assertEquals(1L, row.getLong(2)); count++; } }
@Test public void testCountMinSketch() { Dataset<Long> df = spark.range(1000); CountMinSketch sketch1 = df.stat().countMinSketch("id", 10, 20, 42); Assert.assertEquals(1000, sketch1.totalCount()); Assert.assertEquals(10, sketch1.depth()); Assert.assertEquals(20, sketch1.width()); CountMinSketch sketch2 = df.stat().countMinSketch(col("id"), 10, 20, 42); Assert.assertEquals(1000, sketch2.totalCount()); Assert.assertEquals(10, sketch2.depth()); Assert.assertEquals(20, sketch2.width()); CountMinSketch sketch3 = df.stat().countMinSketch("id", 0.001, 0.99, 42); Assert.assertEquals(1000, sketch3.totalCount()); Assert.assertEquals(0.001, sketch3.relativeError(), 1.0e-4); Assert.assertEquals(0.99, sketch3.confidence(), 5.0e-3); CountMinSketch sketch4 = df.stat().countMinSketch(col("id"), 0.001, 0.99, 42); Assert.assertEquals(1000, sketch4.totalCount()); Assert.assertEquals(0.001, sketch4.relativeError(), 1.0e-4); Assert.assertEquals(0.99, sketch4.confidence(), 5.0e-3); }
@Test public void testBloomFilter() { Dataset<Long> df = spark.range(1000); BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03); Assert.assertTrue(filter1.expectedFpp() - 0.03 < 1e-3); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter1.mightContain(i)); } BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03); Assert.assertTrue(filter2.expectedFpp() - 0.03 < 1e-3); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter2.mightContain(i * 3)); } BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5); Assert.assertEquals(64 * 5, filter3.bitSize()); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter3.mightContain(i)); } BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5); Assert.assertEquals(64 * 5, filter4.bitSize()); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter4.mightContain(i * 3)); } }
@Test public void testFrequentItems() { Dataset<Row> df = spark.table("testData2"); String[] cols = {"a"}; Dataset<Row> results = df.stat().freqItems(cols, 0.2); Assert.assertTrue(results.collectAsList().get(0).getSeq(0).contains(1)); }
@Test public void testCovariance() { Dataset<Row> df = spark.table("testData2"); Double result = df.stat().cov("a", "b"); Assert.assertTrue(Math.abs(result) < 1.0e-6); }
@Test public void testSampleBy() { Dataset<Row> df = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key")); Dataset<Row> sampled = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L); List<Row> actual = sampled.groupBy("key").count().orderBy("key").collectAsList(); Assert.assertEquals(0, actual.get(0).getLong(0)); Assert.assertTrue(0 <= actual.get(0).getLong(1) && actual.get(0).getLong(1) <= 8); Assert.assertEquals(1, actual.get(1).getLong(0)); Assert.assertTrue(2 <= actual.get(1).getLong(1) && actual.get(1).getLong(1) <= 13); }
@Test public void testCorrelation() { Dataset<Row> df = spark.table("testData2"); Double pearsonCorr = df.stat().corr("a", "b", "pearson"); Assert.assertTrue(Math.abs(pearsonCorr) < 1.0e-6); }
@Test public void testCrosstab() { Dataset<Row> df = spark.table("testData2"); Dataset<Row> crosstab = df.stat().crosstab("a", "b"); String[] columnNames = crosstab.schema().fieldNames(); Assert.assertEquals("a_b", columnNames[0]); Assert.assertEquals("1", columnNames[1]); Assert.assertEquals("2", columnNames[2]); List<Row> rows = crosstab.collectAsList(); rows.sort(crosstabRowComparator); Integer count = 1; for (Row row : rows) { Assert.assertEquals(row.get(0).toString(), count.toString()); Assert.assertEquals(1L, row.getLong(1)); Assert.assertEquals(1L, row.getLong(2)); count++; } }
@Test public void testCountMinSketch() { Dataset<Long> df = spark.range(1000); CountMinSketch sketch1 = df.stat().countMinSketch("id", 10, 20, 42); Assert.assertEquals(1000, sketch1.totalCount()); Assert.assertEquals(10, sketch1.depth()); Assert.assertEquals(20, sketch1.width()); CountMinSketch sketch2 = df.stat().countMinSketch(col("id"), 10, 20, 42); Assert.assertEquals(1000, sketch2.totalCount()); Assert.assertEquals(10, sketch2.depth()); Assert.assertEquals(20, sketch2.width()); CountMinSketch sketch3 = df.stat().countMinSketch("id", 0.001, 0.99, 42); Assert.assertEquals(1000, sketch3.totalCount()); Assert.assertEquals(0.001, sketch3.relativeError(), 1.0e-4); Assert.assertEquals(0.99, sketch3.confidence(), 5.0e-3); CountMinSketch sketch4 = df.stat().countMinSketch(col("id"), 0.001, 0.99, 42); Assert.assertEquals(1000, sketch4.totalCount()); Assert.assertEquals(0.001, sketch4.relativeError(), 1.0e-4); Assert.assertEquals(0.99, sketch4.confidence(), 5.0e-3); }
@Test public void testBloomFilter() { Dataset<Long> df = spark.range(1000); BloomFilter filter1 = df.stat().bloomFilter("id", 1000, 0.03); Assert.assertTrue(filter1.expectedFpp() - 0.03 < 1e-3); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter1.mightContain(i)); } BloomFilter filter2 = df.stat().bloomFilter(col("id").multiply(3), 1000, 0.03); Assert.assertTrue(filter2.expectedFpp() - 0.03 < 1e-3); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter2.mightContain(i * 3)); } BloomFilter filter3 = df.stat().bloomFilter("id", 1000, 64 * 5); Assert.assertEquals(64 * 5, filter3.bitSize()); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter3.mightContain(i)); } BloomFilter filter4 = df.stat().bloomFilter(col("id").multiply(3), 1000, 64 * 5); Assert.assertEquals(64 * 5, filter4.bitSize()); for (int i = 0; i < 1000; i++) { Assert.assertTrue(filter4.mightContain(i * 3)); } }
@Test public void testFrequentItems() { Dataset<Row> df = spark.table("testData2"); String[] cols = {"a"}; Dataset<Row> results = df.stat().freqItems(cols, 0.2); Assert.assertTrue(results.collectAsList().get(0).getSeq(0).contains(1)); }
@Test public void testCovariance() { Dataset<Row> df = spark.table("testData2"); Double result = df.stat().cov("a", "b"); Assert.assertTrue(Math.abs(result) < 1.0e-6); }