@Test
public void keyBy() {
  JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2));
  List<Tuple2<String, Integer>> s = rdd.keyBy(Object::toString).collect();
  assertEquals(new Tuple2<>("1", 1), s.get(0));
  assertEquals(new Tuple2<>("2", 2), s.get(1));
}
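keyBy only pairs each element with its computed key; it does not group or deduplicate. A minimal standalone sketch (hypothetical data, assuming the same JavaSparkContext sc as the test above) shows colliding keys and the grouping step that typically follows:

// Hypothetical sketch: keyBy with colliding keys, then an explicit groupByKey.
JavaRDD<String> words = sc.parallelize(Arrays.asList("ant", "bee", "cat", "cow"));
JavaPairRDD<String, Iterable<String>> byFirstLetter = words
    .keyBy(w -> w.substring(0, 1))   // ("a","ant"), ("b","bee"), ("c","cat"), ("c","cow")
    .groupByKey();                   // "c" -> ["cat", "cow"]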
@Test
public void combineByKey() {
  JavaRDD<Integer> originalRDD = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6));
  Function<Integer, Integer> keyFunction = v1 -> v1 % 3;
  Function<Integer, Integer> createCombinerFunction = v1 -> v1;
  Function2<Integer, Integer, Integer> mergeValueFunction = (v1, v2) -> v1 + v2;

  JavaPairRDD<Integer, Integer> combinedRDD = originalRDD.keyBy(keyFunction)
      .combineByKey(createCombinerFunction, mergeValueFunction, mergeValueFunction);
  Map<Integer, Integer> results = combinedRDD.collectAsMap();
  ImmutableMap<Integer, Integer> expected = ImmutableMap.of(0, 9, 1, 5, 2, 7);
  assertEquals(expected, results);

  Partitioner defaultPartitioner = Partitioner.defaultPartitioner(
      combinedRDD.rdd(),
      JavaConverters.collectionAsScalaIterableConverter(
          Collections.<RDD<?>>emptyList()).asScala().toSeq());
  combinedRDD = originalRDD.keyBy(keyFunction)
      .combineByKey(
          createCombinerFunction,
          mergeValueFunction,
          mergeValueFunction,
          defaultPartitioner,
          false,
          new KryoSerializer(new SparkConf()));
  results = combinedRDD.collectAsMap();
  assertEquals(expected, results);
}
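With keyFunction v1 -> v1 % 3, the six inputs split into key 0 = {3, 6}, key 1 = {1, 4}, and key 2 = {2, 5}; summing each group gives 9, 5, and 7, which is exactly the expected ImmutableMap.of(0, 9, 1, 5, 2, 7).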
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  if (!dependencies.containsKey(intoDependency)) {
    throw new RuntimeException("Nest deriver points to non-existent nest-into dependency");
  }
  Dataset<Row> into = dependencies.get(intoDependency);

  if (!dependencies.containsKey(fromDependency)) {
    throw new RuntimeException("Nest deriver points to non-existent nest-from dependency");
  }
  Dataset<Row> from = dependencies.get(fromDependency);

  ExtractFieldsFunction extractFieldsFunction = new ExtractFieldsFunction(keyFieldNames);
  JavaPairRDD<List<Object>, Row> keyedIntoRDD = into.javaRDD().keyBy(extractFieldsFunction);
  JavaPairRDD<List<Object>, Row> keyedFromRDD = from.javaRDD().keyBy(extractFieldsFunction);

  NestFunction nestFunction = new NestFunction();
  JavaRDD<Row> nestedRDD = keyedIntoRDD.cogroup(keyedFromRDD).values().map(nestFunction);

  StructType nestedSchema = into.schema().add(nestedFieldName, DataTypes.createArrayType(from.schema()));
  Dataset<Row> nested = into.sqlContext().createDataFrame(nestedRDD, nestedSchema);

  return nested;
}
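NestFunction itself is not shown in this snippet. A minimal sketch of what such a cogroup-to-row mapper could look like (an assumption, not the project's implementation) presumes exactly one "into" row per key and appends all matching "from" rows as the array column that the nestedSchema above declares:

// Hypothetical sketch only: maps each cogrouped (into-rows, from-rows) pair to a single
// output Row. Assumes one "into" row per key; the "from" rows become the nested array field.
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import scala.Tuple2;

class SketchNestFunction implements Function<Tuple2<Iterable<Row>, Iterable<Row>>, Row> {
  @Override
  public Row call(Tuple2<Iterable<Row>, Iterable<Row>> cogrouped) {
    Row intoRow = cogrouped._1().iterator().next();
    List<Row> fromRows = new ArrayList<>();
    cogrouped._2().forEach(fromRows::add);

    // Copy the original "into" columns, then append the nested array as the last column.
    Object[] values = new Object[intoRow.length() + 1];
    for (int i = 0; i < intoRow.length(); i++) {
      values[i] = intoRow.get(i);
    }
    values[intoRow.length()] = fromRows.toArray(new Row[0]);
    return RowFactory.create(values);
  }
}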
// Fragment: keyExtractor is constructed earlier from
// (isEntity, groupByColumns, columnToPaths, aggregatorJson, gafferGroupObjectConverter).
final JavaRDD<Row> aggregatedRDD = data.javaRDD()
    .keyBy(keyExtractor)
    .reduceByKey(aggregator)
    .values();
private JavaRDD<Row> planMutationsByKey(Dataset<Row> arriving, List<String> keyFieldNames,
    Config plannerConfig, Config outputConfig) {
  JavaPairRDD<Row, Row> keyedArriving =
      arriving.javaRDD().keyBy(new ExtractKeyFunction(keyFieldNames, accumulators));

  JavaPairRDD<Row, Iterable<Row>> arrivingByKey =
      keyedArriving.groupByKey(getPartitioner(keyedArriving));

  JavaPairRDD<Row, Tuple2<Iterable<Row>, Iterable<Row>>> arrivingAndExistingByKey =
      arrivingByKey.mapPartitionsToPair(
          new JoinExistingForKeysFunction(outputConfig, keyFieldNames, accumulators));

  JavaRDD<Row> planned =
      arrivingAndExistingByKey.flatMap(new PlanForKeyFunction(plannerConfig, accumulators));

  return planned;
}
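getPartitioner is not shown in this snippet. A plausible sketch (an assumption, not necessarily the project's implementation) reuses the keyed RDD's existing partitioner when one is set and otherwise hash-partitions with the same number of partitions, so the groupByKey does not change the parallelism:

// Hypothetical sketch of the getPartitioner helper used above.
private static Partitioner getPartitioner(JavaPairRDD<Row, Row> keyedRDD) {
  if (keyedRDD.partitioner().isPresent()) {
    return keyedRDD.partitioner().get();
  }
  return new HashPartitioner(keyedRDD.getNumPartitions());
}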
private JavaPairRDD<Row, Row> getDummyRDD(int numPartitions) {
  return Contexts.getSparkSession().range(numPartitions).javaRDD()
      .map(new LongToRowFunction())
      .keyBy(new ItselfFunction<Row>())
      .repartition(numPartitions);
}
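The two helper functions are not shown here. Plausible sketches (assumptions about their shape, not the project's code, using org.apache.spark.api.java.function.Function) would be a Long-to-single-column-Row mapper and an identity function, so keyBy simply keys every dummy Row by itself:

// Hypothetical sketches of the helpers used by getDummyRDD above.
private static class LongToRowFunction implements Function<Long, Row> {
  @Override
  public Row call(Long value) {
    return RowFactory.create(value);  // one-column Row wrapping the Long
  }
}

private static class ItselfFunction<T> implements Function<T, T> {
  @Override
  public T call(T value) {
    return value;  // identity: keyBy(new ItselfFunction<Row>()) keys each Row by itself
  }
}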
// (Left-hand sides of the *1 statements restored by symmetry with their *2 counterparts.)
JavaPairRDD<Data<Type0, Type1>, Tuple2<Long, Input>> keyedDataRDD1 = inputRDD1UID
    .keyBy(new extractData<Type0, Type1, Input>(get0Pivot_, get0Ref_))
    .sortByKey(new Data.Comparator<Type0, Type1>(list1ASC, list1ASCSec));
JavaPairRDD<Data<Type0, Type1>, Tuple2<Long, Input>> keyedDataRDD2 = inputRDD2UID
    .keyBy(new extractData<Type0, Type1, Input>(get1Pivot_, get1Ref_))
    .sortByKey(new Data.Comparator<Type0, Type1>(list2ASC, list2ASCSec));

JavaPairRDD<Long, List2AttributesObjectSkinny<Type0, Type1>> listObjectDataRDD1Indexd = listObjectDataRDD1
    .keyBy(in -> in.getPartitionID());
JavaPairRDD<Long, List2AttributesObjectSkinny<Type0, Type1>> listObjectDataRDD2Indexd = listObjectDataRDD2
    .keyBy(in -> in.getPartitionID());

r1RowIDS = inputRDD1UID.keyBy(in -> in._1());
r2RowIDS = inputRDD2UID.keyBy(in -> in._1());