private static Map<String,Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD, boolean user) {
  int offset = user ? 0 : 1;
  Map<String,Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
      .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
      .zipWithIndex().mapValues(Long::intValue)
      .collectAsMap();
  // Clone, due to some serialization problems with the result of collectAsMap?
  return new HashMap<>(reverseIDLookup);
}
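A minimal usage sketch, assuming ratings-style input where column 0 holds the user ID and column 1 the item ID (the file name, field layout, and `sc` context are assumptions, not part of the original snippet):

JavaRDD<String[]> parsedRDD = sc.textFile("ratings.csv")  // hypothetical input path
    .map(line -> line.split(","));                        // assumed comma-separated layout
Map<String,Integer> userIDToIndex = buildIDIndexMapping(parsedRDD, true);   // column 0: user IDs
Map<String,Integer> itemIDToIndex = buildIDIndexMapping(parsedRDD, false);  // column 1: item IDs

Note that the `sortBy` before `zipWithIndex` makes the ID-to-index assignment deterministic across runs, independent of how the data happens to be partitioned.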
@Test
public void getNumPartitions() {
  JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8), 3);
  JavaDoubleRDD rdd2 = sc.parallelizeDoubles(Arrays.asList(1.0, 2.0, 3.0, 4.0), 2);
  JavaPairRDD<String, Integer> rdd3 = sc.parallelizePairs(
      Arrays.asList(
          new Tuple2<>("a", 1),
          new Tuple2<>("aa", 2),
          new Tuple2<>("aaa", 3)),
      2);
  assertEquals(3, rdd1.getNumPartitions());
  assertEquals(2, rdd2.getNumPartitions());
  assertEquals(2, rdd3.getNumPartitions());
}
private static <U extends Geometry, T extends Geometry> void verifyPartitioningMatch(SpatialRDD<T> spatialRDD, SpatialRDD<U> queryRDD)
    throws Exception {
  Objects.requireNonNull(spatialRDD.spatialPartitionedRDD, "[JoinQuery] spatialRDD SpatialPartitionedRDD is null. Please do spatial partitioning.");
  Objects.requireNonNull(queryRDD.spatialPartitionedRDD, "[JoinQuery] queryRDD SpatialPartitionedRDD is null. Please use the spatialRDD's grids to do spatial partitioning.");

  final SpatialPartitioner spatialPartitioner = spatialRDD.getPartitioner();
  final SpatialPartitioner queryPartitioner = queryRDD.getPartitioner();
  if (!queryPartitioner.equals(spatialPartitioner)) {
    throw new IllegalArgumentException("[JoinQuery] queryRDD is not partitioned by the same grids as spatialRDD. Please make sure they both use the same grids, otherwise wrong results will appear.");
  }

  final int spatialNumPart = spatialRDD.spatialPartitionedRDD.getNumPartitions();
  final int queryNumPart = queryRDD.spatialPartitionedRDD.getNumPartitions();
  if (spatialNumPart != queryNumPart) {
    throw new IllegalArgumentException("[JoinQuery] numbers of partitions in queryRDD and spatialRDD don't match: " + queryNumPart + " vs. " + spatialNumPart + ". Please make sure they both use the same partitioning, otherwise wrong results will appear.");
  }
}
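A hedged sketch of how a caller would satisfy these preconditions before invoking the join, assuming a GeoSpark/Sedona-style API in which `SpatialRDD` exposes `spatialPartitioning(GridType)` and `spatialPartitioning(SpatialPartitioner)` (exact signatures vary between versions):

// Partition the data RDD first, then reuse its partitioner for the query RDD,
// so both sides share the same grids and the same number of partitions.
spatialRDD.spatialPartitioning(GridType.QUADTREE);
queryRDD.spatialPartitioning(spatialRDD.getPartitioner());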
@Override
public SparkStream<T> shuffle(@NonNull Random random) {
  return new SparkStream<>(rdd.sortBy(t -> random.nextDouble(), true, rdd.getNumPartitions()));
}
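The design choice here is to shuffle by giving every element a random sort key: `sortBy` forces a full shuffle while keeping the partition count unchanged. The same trick on a plain `JavaRDD`, as a standalone sketch (the `sc` context and sample data are assumptions):

// java.util.Random is Serializable, so it can be captured by the sortBy lambda.
Random random = new Random(42L);
JavaRDD<Integer> data = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6), 3);
JavaRDD<Integer> shuffled = data.sortBy(x -> random.nextDouble(), true, data.getNumPartitions());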
@Test
public void testInputRepartitionColumnsAndPartitionCount() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(BatchStep.REPARTITION_COLUMNS_PROPERTY, Lists.newArrayList("modulo"));
  configMap.put(BatchStep.REPARTITION_NUM_PARTITIONS_PROPERTY, 5);
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());
  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  assertEquals(5, numPartitions);
}
@Test
public void testInputCoalesce() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  configMap.put(BatchStep.COALESCE_NUM_PARTITIONS_PROPERTY, 5);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());
  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  assertEquals(5, numPartitions);
}
@Test
public void testInputRepartition() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 5);
  configMap.put(BatchStep.REPARTITION_NUM_PARTITIONS_PROPERTY, 10);
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());
  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  assertEquals(10, numPartitions);
}
@Test
public void testInputRepartitionColumns() throws Exception {
  Map<String, Object> configMap = Maps.newHashMap();
  configMap.put(DataStep.INPUT_TYPE + "." + InputFactory.TYPE_CONFIG_NAME, DummyInput.class.getName());
  configMap.put(DataStep.INPUT_TYPE + "." + "starting.partitions", 10);
  configMap.put(BatchStep.REPARTITION_COLUMNS_PROPERTY, Lists.newArrayList("modulo"));
  Config config = ConfigFactory.parseMap(configMap);

  BatchStep batchStep = new BatchStep("test");
  batchStep.configure(config);
  batchStep.submit(Sets.<Step>newHashSet());
  Dataset<Row> df = batchStep.getData();
  int numPartitions = df.javaRDD().getNumPartitions();

  // Repartitioning by columns without an explicit partition count falls back to
  // spark.sql.shuffle.partitions, so the resulting count must match that setting.
  assertEquals(Contexts.getSparkSession().sqlContext().getConf("spark.sql.shuffle.partitions"),
      Integer.toString(numPartitions));
}
@Test
public void testsHBasePutAccessParallelism() {
  HoodieWriteConfig config = getConfig();
  HBaseIndex index = new HBaseIndex(config);
  final JavaRDD<WriteStatus> writeStatusRDD = jsc.parallelize(
      Arrays.asList(
          getSampleWriteStatus(1, 2),
          getSampleWriteStatus(0, 3),
          getSampleWriteStatus(10, 0)),
      10);
  final int hbasePutAccessParallelism = index.getHBasePutAccessParallelism(writeStatusRDD);
  Assert.assertEquals(10, writeStatusRDD.getNumPartitions());
  Assert.assertEquals(2, hbasePutAccessParallelism);
}
/** An implementation of {@link Reshuffle} for the Spark runner. */
public static <K, V> JavaRDD<WindowedValue<KV<K, V>>> reshuffle(
    JavaRDD<WindowedValue<KV<K, V>>> rdd, Coder<K> keyCoder, WindowedValueCoder<V> wvCoder) {
  // Use coders to convert objects in the PCollection to byte arrays, so they
  // can be transferred over the network for the shuffle.
  return rdd.map(new ReifyTimestampsAndWindowsFunction<>())
      .map(WindowingHelpers.unwindowFunction())
      .mapToPair(TranslationUtils.toPairFunction())
      .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder))
      .repartition(rdd.getNumPartitions())
      .mapToPair(CoderHelpers.fromByteFunction(keyCoder, wvCoder))
      .map(TranslationUtils.fromPairFunction())
      .map(TranslationUtils.toKVByWindowInValue());
}
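Stripped of Beam's helper classes, the pattern is: encode each record to a byte array, shuffle the bytes, then decode on the other side, so only coder-defined byte payloads cross the network. A minimal sketch of that round-trip on plain Spark (`MyRecord` and the `serialize`/`deserialize` helpers are hypothetical stand-ins for the coder calls):

// Hypothetical: serialize/deserialize stand in for CoderHelpers' byte functions.
JavaRDD<MyRecord> reshuffled = records
    .map(r -> serialize(r))                   // MyRecord -> byte[]
    .repartition(records.getNumPartitions())  // full shuffle at the same partition count
    .map(bytes -> deserialize(bytes));        // byte[] -> MyRecord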
final JavaRDD<SparkElement> right = (JavaRDD<SparkElement>) inputs.get(1);
final int numPartitions = right.getNumPartitions();
final Partitioner partitioner = new HashPartitioner(input.getNumPartitions());