/**
 * Registers all configured {@link SparkListener}s on the given {@link SparkContext}
 * and adds the user-supplied Hadoop configuration as a resource of its Hadoop configuration.
 */
private void updateSparkContext(@NonNull final SparkArgs sparkArgs, @NonNull final SparkContext sc) {
    for (SparkListener sparkListener : getSparkEventListeners()) {
        sc.addSparkListener(sparkListener);
    }
    sc.hadoopConfiguration().addResource(sparkArgs.getHadoopConfiguration());
}
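// A hedged sketch of the kind of listener getSparkEventListeners() could supply;
// the class name and log message are hypothetical, but SparkListener and
// SparkListenerApplicationEnd are real Spark APIs (org.apache.spark.scheduler).
@Slf4j // assumes Lombok, consistent with the log.info calls elsewhere in this class
public class JobEndLoggingListener extends SparkListener {
    @Override
    public void onApplicationEnd(final SparkListenerApplicationEnd applicationEnd) {
        // applicationEnd.time() is the application end timestamp in epoch milliseconds.
        log.info("Spark application ended at {}", applicationEnd.time());
    }
}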
@Test
public void testSparkSessionAndSparkContext() {
    final SparkArgs sparkArgs = getSampleMarmaraySparkArgs();
    final SparkSession sparkSession = sparkFactory.get()
        .getSparkSession(sparkArgs, false);
    assertExpectationsOnSparkContext(sparkArgs, sparkSession.sparkContext());
    // Should reuse the existing SparkContext rather than fail.
    final SparkContext sc2 = sparkFactory.get().getSparkContext(sparkArgs).sc();
    assertExpectationsOnSparkContext(sparkArgs, sc2);
}
/**
 * Uses the {@link SparkSession} returned from {@link SparkFactory#getSparkSession}
 * to create a {@link JavaSparkContext}. See {@link SparkFactory#getSparkSession}
 * for how the {@link SparkSession} is retrieved.
 */
public synchronized JavaSparkContext getSparkContext(
        @NonNull final SparkArgs sparkArgs) {
    return new JavaSparkContext(getSparkSession(sparkArgs, false).sparkContext());
}
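// A minimal usage sketch (hypothetical driver code, not from the source; assumes
// a sparkArgs in scope): fetch a JavaSparkContext from the factory and run a
// trivial job. Because getSparkContext delegates to getSparkSession, repeated
// calls share a single underlying SparkContext.
final SparkFactory sparkFactory = new SparkFactory();
final JavaSparkContext jsc = sparkFactory.getSparkContext(sparkArgs);
final long count = jsc.parallelize(Arrays.asList(1, 2, 3)).count(); // evaluates to 3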
/**
 * Creates a {@link SparkConf} with {@link org.apache.spark.serializer.KryoSerializer}, registering
 * both default and user-supplied serializable classes as well as user-supplied Avro schemas.
 * Once the {@link SparkContext} is created, serialization classes and Avro schemas can no longer
 * be registered.
 */
public SparkConf createSparkConf(@NonNull final SparkArgs sparkArgs) {
    /*
     * By registering classes explicitly, the full class name of each object
     * is not stored during serialization, which reduces storage space.
     */
    final SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

    final List<Class> serializableClasses = getDefaultSerializableClasses();
    serializableClasses.addAll(sparkArgs.getUserSerializationClasses());
    sparkConf.registerKryoClasses(serializableClasses.toArray(new Class[0]));

    if (sparkArgs.getAvroSchemas().isPresent()) {
        sparkConf.registerAvroSchemas(
            JavaConverters
                .iterableAsScalaIterableConverter(sparkArgs.getAvroSchemas().get())
                .asScala()
                .toSeq());
    }

    // Apply user overrides last so they take precedence over the defaults set above.
    final Map<String, String> sparkProps = sparkArgs.getOverrideSparkProperties();
    for (Entry<String, String> entry : sparkProps.entrySet()) {
        log.info("Setting spark key:val {} : {}", entry.getKey(), entry.getValue());
        sparkConf.set(entry.getKey(), entry.getValue());
    }
    return sparkConf;
}
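// A hedged sketch of the override mechanism above: because override properties are
// applied after the Kryo defaults, a user-supplied value wins. The property name is
// a real Spark setting; the SparkArgs values and the factory instance are illustrative.
final Map<String, String> overrides = new HashMap<>();
overrides.put("spark.kryoserializer.buffer.max", "256m"); // e.g. raise the Kryo buffer cap
final SparkConf conf = new SparkFactory().createSparkConf(
    new SparkArgs(Optional.<List<Schema>>absent(), Arrays.asList(), overrides, new Configuration()));
// conf now carries KryoSerializer plus the override applied on top:
// conf.get("spark.kryoserializer.buffer.max") returns "256m".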
@After
public void tearDown() {
    final SparkArgs sparkArgs = getSampleMarmaraySparkArgs();
    // Gets the existing SparkContext and stops it between tests.
    this.sparkFactory.get().getSparkContext(sparkArgs).sc().stop();
    this.sparkFactory = Optional.absent();
}
/**
 * Returns the existing {@link SparkSession} if present; otherwise creates a new one.
 */
public synchronized SparkSession getSparkSession(
        @NonNull final SparkArgs sparkArgs,
        final boolean enableHiveSupport) {
    if (this.sparkSession.isPresent()) {
        // Note: enableHiveSupport has no effect once a cached session exists.
        return this.sparkSession.get();
    }
    final Builder sparkSessionBuilder = SparkSession.builder();
    if (enableHiveSupport) {
        sparkSessionBuilder.enableHiveSupport();
    }
    this.sparkSession = Optional.of(sparkSessionBuilder
        .config(createSparkConf(sparkArgs)).getOrCreate());
    log.info("Created new SparkSession using {}", sparkArgs);
    updateSparkContext(sparkArgs, this.sparkSession.get().sparkContext());
    return this.sparkSession.get();
}
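// A minimal sketch (hypothetical, not from the source; assumes a sparkFactory and
// sparkArgs in scope) of the caching behavior: once a session exists, later calls
// return it and their enableHiveSupport flag is ignored, so Hive support must be
// requested on the first call.
final SparkSession first = sparkFactory.getSparkSession(sparkArgs, true);   // creates the session
final SparkSession second = sparkFactory.getSparkSession(sparkArgs, false); // returns the cached one
assert first == second;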
private void assertExpectationsOnSparkContext(
        @NonNull final SparkArgs sparkArgs,
        @NonNull final SparkContext sc) {
    final String registeredAvroSchemaStr = sc.conf().getAvroSchema().head()._2();
    final Schema expectedAvroSchema = sparkArgs.getAvroSchemas().get().get(0);
    Assert.assertEquals(expectedAvroSchema.toString(), registeredAvroSchemaStr);
    Assert.assertEquals("foo_bar", sc.appName());
    Assert.assertEquals("512", sc.hadoopConfiguration().get("mapreduce.map.memory.mb"));
}
@Before
public void setup() {
    // Re-create the factory for each test, since it caches the SparkSession internally.
    this.sparkFactory = Optional.of(new SparkFactory());
}
private SparkArgs getSampleMarmaraySparkArgs() {
    final Schema recordSchema = SchemaBuilder.record("fooRecord").fields()
        .name("abc").type().intType().intDefault(0)
        .endRecord();
    final Optional<List<Schema>> schemas = Optional.of(Arrays.asList(recordSchema));

    final Map<String, String> overrideSparkProperties = new HashMap<>();
    overrideSparkProperties.put("spark.master", "local[2]");
    overrideSparkProperties.put("spark.app.name", "foo_bar");

    final Configuration hadoopConfiguration = new Configuration();
    hadoopConfiguration.set("mapreduce.map.memory.mb", "512");

    return new SparkArgs(schemas, Arrays.asList(), overrideSparkProperties, hadoopConfiguration);
}
}
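// A hedged variant of the sample above: the second constructor argument (empty in
// getSampleMarmaraySparkArgs) carries user classes for explicit Kryo registration
// via createSparkConf. FooRecordKey is a hypothetical class, and schemas,
// overrideSparkProperties, and hadoopConfiguration are assumed built as above.
final SparkArgs argsWithKryoClasses = new SparkArgs(
    schemas,
    Arrays.asList(FooRecordKey.class),
    overrideSparkProperties,
    hadoopConfiguration);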