protected KeyValueGroupedDataset<String, Tuple2<String, Integer>> generateGroupedDataset() {
  // Builds a small (key, value) dataset and groups it by the tuple's first element.
  Encoder<Tuple2<String, Integer>> tupleEncoder =
      Encoders.tuple(Encoders.STRING(), Encoders.INT());
  List<Tuple2<String, Integer>> rows = Arrays.asList(
      new Tuple2<>("a", 1),
      new Tuple2<>("a", 2),
      new Tuple2<>("b", 3));
  Dataset<Tuple2<String, Integer>> dataset = spark.createDataset(rows, tupleEncoder);
  return dataset.groupByKey(
      (MapFunction<Tuple2<String, Integer>, String>) pair -> pair._1(),
      Encoders.STRING());
}
}
protected KeyValueGroupedDataset<String, Tuple2<String, Integer>> generateGroupedDataset() {
  // Creates a three-row dataset of string/int pairs, then keys it on the string field.
  List<Tuple2<String, Integer>> rows =
      Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("a", 2), new Tuple2<>("b", 3));
  Encoder<Tuple2<String, Integer>> tupleEncoder =
      Encoders.tuple(Encoders.STRING(), Encoders.INT());
  Dataset<Tuple2<String, Integer>> dataset = spark.createDataset(rows, tupleEncoder);
  MapFunction<Tuple2<String, Integer>, String> keyExtractor = pair -> pair._1();
  return dataset.groupByKey(keyExtractor, Encoders.STRING());
}
}
protected KeyValueGroupedDataset<String, Tuple2<String, Integer>> generateGroupedDataset() {
  // Groups a fixed sample of (string, int) tuples by their string component.
  Dataset<Tuple2<String, Integer>> dataset = spark.createDataset(
      Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("a", 2), new Tuple2<>("b", 3)),
      Encoders.tuple(Encoders.STRING(), Encoders.INT()));
  return dataset.groupByKey(
      (MapFunction<Tuple2<String, Integer>, String>) pair -> pair._1(),
      Encoders.STRING());
}
}
@Test
public void testRandomSplit() {
  // randomSplitAsList must produce exactly one split per supplied weight.
  List<String> data = Arrays.asList("hello", "world", "from", "spark");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
  double[] arraySplit = {1, 2, 3};
  List<Dataset<String>> randomSplit = ds.randomSplitAsList(arraySplit, 1);
  // Fix: JUnit's assertEquals(message, expected, actual) takes the expected value
  // first; the original passed the actual size as "expected", which produces a
  // misleading failure message ("expected <size> but was <3>").
  Assert.assertEquals("wrong number of splits", 3, randomSplit.size());
}
@Test
public void testCollect() {
  // collectAsList should return every row, in order, as a Java List.
  List<String> input = Arrays.asList("hello", "world");
  Dataset<String> dataset = spark.createDataset(input, Encoders.STRING());
  Assert.assertEquals(input, dataset.collectAsList());
}
@Test
public void testTake() {
  // takeAsList(1) should return only the first row of the dataset.
  Dataset<String> dataset =
      spark.createDataset(Arrays.asList("hello", "world"), Encoders.STRING());
  List<String> firstRow = dataset.takeAsList(1);
  Assert.assertEquals(Arrays.asList("hello"), firstRow);
}
@Test
public void testTake() {
  // Asking for a single row via takeAsList must yield just the leading element.
  List<String> source = Arrays.asList("hello", "world");
  Dataset<String> dataset = spark.createDataset(source, Encoders.STRING());
  Assert.assertEquals(source.subList(0, 1), dataset.takeAsList(1));
}
@Test
public void testRandomSplit() {
  // randomSplitAsList must produce exactly one split per supplied weight.
  List<String> data = Arrays.asList("hello", "world", "from", "spark");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
  double[] arraySplit = {1, 2, 3};
  List<Dataset<String>> randomSplit = ds.randomSplitAsList(arraySplit, 1);
  // Fix: JUnit's assertEquals(message, expected, actual) takes the expected value
  // first; the original swapped the arguments, yielding a misleading failure message.
  Assert.assertEquals("wrong number of splits", 3, randomSplit.size());
}
@Test
public void testCollect() {
  // Collecting the dataset back to the driver should round-trip the input list.
  Dataset<String> dataset =
      spark.createDataset(Arrays.asList("hello", "world"), Encoders.STRING());
  List<String> collected = dataset.collectAsList();
  Assert.assertEquals(Arrays.asList("hello", "world"), collected);
}
@Test
public void testRandomSplit() {
  // Splitting with three weights must return three datasets.
  List<String> data = Arrays.asList("hello", "world", "from", "spark");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
  double[] arraySplit = {1, 2, 3};
  List<Dataset<String>> randomSplit = ds.randomSplitAsList(arraySplit, 1);
  // Fix: expected value (3) goes before the actual value in JUnit's
  // assertEquals(message, expected, actual); the original had them reversed.
  Assert.assertEquals("wrong number of splits", 3, randomSplit.size());
}
@Test
public void testTake() {
  // takeAsList with a limit of one should surface only the first element.
  Dataset<String> dataset =
      spark.createDataset(Arrays.asList("hello", "world"), Encoders.STRING());
  Assert.assertEquals(Arrays.asList("hello"), dataset.takeAsList(1));
}
@Test
public void testToLocalIterator() {
  // toLocalIterator should yield rows one at a time, in order, then be exhausted.
  Dataset<String> dataset =
      spark.createDataset(Arrays.asList("hello", "world"), Encoders.STRING());
  Iterator<String> rows = dataset.toLocalIterator();
  Assert.assertEquals("hello", rows.next());
  Assert.assertEquals("world", rows.next());
  Assert.assertFalse(rows.hasNext());
}
@Test
public void testToLocalIterator() {
  // Drains the local iterator and checks each element against the source list.
  List<String> source = Arrays.asList("hello", "world");
  Dataset<String> dataset = spark.createDataset(source, Encoders.STRING());
  Iterator<String> rows = dataset.toLocalIterator();
  for (String expected : source) {
    Assert.assertEquals(expected, rows.next());
  }
  Assert.assertFalse(rows.hasNext());
}
@Test
public void testToLocalIterator() {
  // The iterator view of the dataset must produce both rows and then report empty.
  Dataset<String> dataset =
      spark.createDataset(Arrays.asList("hello", "world"), Encoders.STRING());
  Iterator<String> it = dataset.toLocalIterator();
  Assert.assertEquals("hello", it.next());
  Assert.assertEquals("world", it.next());
  Assert.assertFalse(it.hasNext());
}
@Test
public void testForeach() {
  // foreach should visit every row exactly once; count visits with an accumulator.
  LongAccumulator counter = jsc.sc().longAccumulator();
  Dataset<String> dataset =
      spark.createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING());
  dataset.foreach((ForeachFunction<String>) row -> counter.add(1));
  Assert.assertEquals(3, counter.value().intValue());
}
@Test
public void testForeach() {
  // Each of the three rows should bump the accumulator once.
  List<String> rows = Arrays.asList("a", "b", "c");
  LongAccumulator visits = jsc.sc().longAccumulator();
  Dataset<String> dataset = spark.createDataset(rows, Encoders.STRING());
  dataset.foreach((ForeachFunction<String>) ignored -> visits.add(1));
  Assert.assertEquals(rows.size(), visits.value().intValue());
}
@Test
public void testForeach() {
  // Verifies foreach executes the action once per row via a driver-side accumulator.
  LongAccumulator tally = jsc.sc().longAccumulator();
  Dataset<String> dataset =
      spark.createDataset(Arrays.asList("a", "b", "c"), Encoders.STRING());
  ForeachFunction<String> bump = s -> tally.add(1);
  dataset.foreach(bump);
  Assert.assertEquals(3, tally.value().intValue());
}
@Test
public void testSelect() {
  // select with an expression and a cast, typed via a tuple encoder, should
  // produce (value + 1, value as string) pairs.
  Dataset<Integer> dataset = spark.createDataset(Arrays.asList(2, 6), Encoders.INT());
  Dataset<Tuple2<Integer, String>> selected = dataset
      .select(expr("value + 1"), col("value").cast("string"))
      .as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));
  Assert.assertEquals(
      Arrays.asList(tuple2(3, "2"), tuple2(7, "6")),
      selected.collectAsList());
}
@Test
public void testSelect() {
  // Projects each int into (int + 1, int as string) and checks the typed result.
  List<Integer> input = Arrays.asList(2, 6);
  Dataset<Integer> dataset = spark.createDataset(input, Encoders.INT());
  Encoder<Tuple2<Integer, String>> pairEncoder =
      Encoders.tuple(Encoders.INT(), Encoders.STRING());
  Dataset<Tuple2<Integer, String>> projected =
      dataset.select(expr("value + 1"), col("value").cast("string")).as(pairEncoder);
  List<Tuple2<Integer, String>> expected = Arrays.asList(tuple2(3, "2"), tuple2(7, "6"));
  Assert.assertEquals(expected, projected.collectAsList());
}
@Test
public void testSelect() {
  // Applies two column expressions to an int dataset and decodes them as tuples.
  Dataset<Integer> dataset = spark.createDataset(Arrays.asList(2, 6), Encoders.INT());
  Dataset<Tuple2<Integer, String>> typed =
      dataset.select(expr("value + 1"), col("value").cast("string"))
          .as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));
  List<Tuple2<Integer, String>> actual = typed.collectAsList();
  Assert.assertEquals(Arrays.asList(tuple2(3, "2"), tuple2(7, "6")), actual);
}