// Lazily creates the shared Kryo serializer used for shuffle RDDs via
// double-checked locking on ShuffleKryoSerializer.class.
// NOTE(review): this pattern is only safe if the INSTANCE field is declared
// volatile — the field declaration is not visible here; confirm it.
// NOTE(review): the 'conf' parameter is unused by the visible code.
static org.apache.spark.serializer.KryoSerializer getInstance(JavaSparkContext sc, Configuration conf) {
  if (INSTANCE == null) {
    synchronized (ShuffleKryoSerializer.class) {
      if (INSTANCE == null) {
        try {
          // Reflectively instantiate so the shuffle-specific serializer class is
          // resolved through the thread context class loader at runtime.
          INSTANCE = (org.apache.spark.serializer.KryoSerializer) Thread.currentThread().getContextClassLoader().loadClass(
              HIVE_SHUFFLE_KRYO_SERIALIZER).getConstructor(SparkConf.class).newInstance(
              sc.getConf());
          return INSTANCE;
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException
            | NoSuchMethodException | ClassNotFoundException e) {
          // Wrap with the original reflective failure preserved as the cause.
          throw new IllegalStateException(
              "Unable to create kryo serializer for shuffle RDDs using " + "class "
                  + HIVE_SHUFFLE_KRYO_SERIALIZER, e);
        }
      } else {
        // Another thread won the race while we waited for the lock.
        return INSTANCE;
      }
    }
  }
  return INSTANCE;
}
}
/** * Implements Spark with Alluxio integration checker. * * @param sc current JavaSparkContext * @param reportWriter save user-facing messages to a generated file * @return performIntegrationChecks results */ private Status run(JavaSparkContext sc, PrintWriter reportWriter, AlluxioConfiguration conf) { // Check whether Spark driver can recognize Alluxio classes and filesystem Status driverStatus = CheckerUtils.performIntegrationChecks(); String driverAddress = sc.getConf().get("spark.driver.host"); switch (driverStatus) { case FAIL_TO_FIND_CLASS: reportWriter.printf("Spark driver: %s failed to recognize Alluxio classes.%n%n", driverAddress); return driverStatus; case FAIL_TO_FIND_FS: reportWriter.printf("Spark driver: %s failed to recognize Alluxio filesystem.%n%n", driverAddress); return driverStatus; default: reportWriter.printf("Spark driver: %s can recognize Alluxio filesystem.%n%n", driverAddress); break; } if (!CheckerUtils.supportAlluxioHA(reportWriter, conf)) { return Status.FAIL_TO_SUPPORT_HA; } return runSparkJob(sc, reportWriter); }
/**
 * Reads a non-default-format Hive table through a SparkSession and converts
 * each row into a String[] — SQL NULL cells stay null, every other cell is
 * rendered with toString().
 *
 * @param sc current JavaSparkContext whose SparkConf seeds the SparkSession
 * @param hiveTable Hive table name to read
 * @return RDD of rows, each materialized as an array of (nullable) strings
 */
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
  SparkSession sparkSession =
      SparkSession.builder().config(sc.getConf()).enableHiveSupport().getOrCreate();
  // Fix: use the typed Dataset<Row> instead of the raw Dataset type so the
  // javaRDD() element type is checked at compile time.
  final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
  return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
    @Override
    public String[] call(Row row) throws Exception {
      String[] result = new String[row.size()];
      for (int i = 0; i < row.size(); i++) {
        final Object cell = row.get(i);
        // Preserve SQL NULLs as Java nulls rather than the string "null".
        result[i] = (cell == null) ? null : cell.toString();
      }
      return result;
    }
  });
}
// Obtain the SparkConf from the surrounding Spark context.
// NOTE(review): fragment — the enclosing method is not visible in this view.
SparkConf sparkConf = context.getConf();
// Obtain the SparkConf from the surrounding Spark context.
// NOTE(review): fragment — the enclosing method is not visible in this view.
SparkConf sparkConf = context.getConf();
// Obtain the SparkConf from the surrounding Spark context.
// NOTE(review): fragment — the enclosing method is not visible in this view.
SparkConf sparkConf = context.getConf();
/**
 * Returns the Hadoop configuration of the Spark context when running against a
 * cluster (e.g. YARN), or {@code null} in local mode.
 *
 * @param sparkContext the active JavaSparkContext
 * @return the context's Hadoop configuration, or null when the master is unset or local
 */
private static Configuration getHadoopConfigurationIfYarnMode(final JavaSparkContext sparkContext) {
  // Fix: SparkConf.get(key) throws NoSuchElementException when the key is
  // absent, which made the isNullOrEmpty() check below unreachable. Use the
  // defaulting overload so a missing "spark.master" is treated as local mode.
  final String sparkMaster = sparkContext.getConf().get("spark.master", null);
  // NOTE(review): masters such as "local[*]" are not matched by the exact
  // "local" comparison — confirm whether that is intended.
  if (Strings.isNullOrEmpty(sparkMaster) || "local".equals(sparkMaster)) {
    return null;
  }
  return sparkContext.hadoopConfiguration();
}
/**
 * Convenience overload: reduces the JSON RDD using the context's configured
 * default parallelism ("spark.default.parallelism", falling back to 1).
 *
 * @param jsc current JavaSparkContext
 * @param input RDD of JSON strings to reduce
 * @param karmaSettings Karma configuration properties
 * @return the reduced RDD from the parallelism-aware overload
 */
public static JavaRDD<String> reduceJSON(JavaSparkContext jsc, JavaRDD<String> input,
    final Properties karmaSettings) {
  final int partitions = jsc.getConf().getInt("spark.default.parallelism", 1);
  return reduceJSON(jsc, input, partitions, karmaSettings);
}
/**
 * Convenience overload: reduces the paired JSON RDD using the context's
 * configured default parallelism ("spark.default.parallelism", falling back to 1).
 *
 * @param sc current JavaSparkContext
 * @param input pair RDD of JSON entries to reduce
 * @param karmaSettings Karma configuration properties
 * @return the reduced pair RDD from the parallelism-aware overload
 */
public static JavaPairRDD<String, String> reduceJSON(JavaSparkContext sc,
    JavaPairRDD<String, String> input, final Properties karmaSettings) {
  return reduceJSON(sc, input, sc.getConf().getInt("spark.default.parallelism", 1), karmaSettings);
}
// NOTE(review): the following overload's signature is truncated in this view;
// its body is not visible here.
public static JavaPairRDD<String, String> reduceJSON(JavaSparkContext sc,
/**
 * Runs the Hudi compaction admin tool in VALIDATE mode against the given base
 * path, applying optional Spark master and executor-memory overrides to the
 * existing context's SparkConf before launching.
 */
private static void doCompactValidate(JavaSparkContext jsc, String basePath, String compactionInstant,
    String outputPath, int parallelism, String sparkMaster, String sparkMemory) throws Exception {
  final HoodieCompactionAdminTool.Config config = new HoodieCompactionAdminTool.Config();
  config.operation = Operation.VALIDATE;
  config.basePath = basePath;
  config.compactionInstantTime = compactionInstant;
  config.outputPath = outputPath;
  config.parallelism = parallelism;
  if (sparkMaster != null && !sparkMaster.isEmpty()) {
    jsc.getConf().setMaster(sparkMaster);
  }
  jsc.getConf().set("spark.executor.memory", sparkMemory);
  new HoodieCompactionAdminTool(config).run(jsc);
}
/**
 * Runs the Hudi compaction admin tool in REPAIR mode against the given base
 * path, applying optional Spark master and executor-memory overrides to the
 * existing context's SparkConf before launching.
 */
private static void doCompactRepair(JavaSparkContext jsc, String basePath, String compactionInstant,
    String outputPath, int parallelism, String sparkMaster, String sparkMemory, boolean dryRun)
    throws Exception {
  final HoodieCompactionAdminTool.Config config = new HoodieCompactionAdminTool.Config();
  config.operation = Operation.REPAIR;
  config.basePath = basePath;
  config.compactionInstantTime = compactionInstant;
  config.outputPath = outputPath;
  config.parallelism = parallelism;
  config.dryRun = dryRun;
  if (sparkMaster != null && !sparkMaster.isEmpty()) {
    jsc.getConf().setMaster(sparkMaster);
  }
  jsc.getConf().set("spark.executor.memory", sparkMemory);
  new HoodieCompactionAdminTool(config).run(jsc);
}
/**
 * Runs the Hudi compaction admin tool in UNSCHEDULE_PLAN mode for the given
 * compaction instant, applying optional Spark master and executor-memory
 * overrides to the existing context's SparkConf before launching.
 */
private static void doCompactUnschedule(JavaSparkContext jsc, String basePath, String compactionInstant,
    String outputPath, int parallelism, String sparkMaster, String sparkMemory,
    boolean skipValidation, boolean dryRun) throws Exception {
  final HoodieCompactionAdminTool.Config config = new HoodieCompactionAdminTool.Config();
  config.operation = Operation.UNSCHEDULE_PLAN;
  config.basePath = basePath;
  config.compactionInstantTime = compactionInstant;
  config.outputPath = outputPath;
  config.parallelism = parallelism;
  config.dryRun = dryRun;
  config.skipValidation = skipValidation;
  if (sparkMaster != null && !sparkMaster.isEmpty()) {
    jsc.getConf().setMaster(sparkMaster);
  }
  jsc.getConf().set("spark.executor.memory", sparkMemory);
  new HoodieCompactionAdminTool(config).run(jsc);
}
/**
 * Runs the Hudi compaction admin tool in UNSCHEDULE_FILE mode for the given
 * file id, applying optional Spark master and executor-memory overrides to
 * the existing context's SparkConf before launching.
 */
private static void doCompactUnscheduleFile(JavaSparkContext jsc, String basePath, String fileId,
    String outputPath, int parallelism, String sparkMaster, String sparkMemory,
    boolean skipValidation, boolean dryRun) throws Exception {
  final HoodieCompactionAdminTool.Config config = new HoodieCompactionAdminTool.Config();
  config.operation = Operation.UNSCHEDULE_FILE;
  config.basePath = basePath;
  config.fileId = fileId;
  config.outputPath = outputPath;
  config.parallelism = parallelism;
  config.dryRun = dryRun;
  config.skipValidation = skipValidation;
  if (sparkMaster != null && !sparkMaster.isEmpty()) {
    jsc.getConf().setMaster(sparkMaster);
  }
  jsc.getConf().set("spark.executor.memory", sparkMemory);
  new HoodieCompactionAdminTool(config).run(jsc);
}
private static int compact(JavaSparkContext jsc, String basePath, String tableName, String compactionInstant, int parallelism, String schemaFile, String sparkMemory, int retry, boolean schedule) throws Exception { HoodieCompactor.Config cfg = new HoodieCompactor.Config(); cfg.basePath = basePath; cfg.tableName = tableName; cfg.compactionInstantTime = compactionInstant; // TODO: Make this configurable along with strategy specific config - For now, this is a generic enough strategy cfg.strategyClassName = UnBoundedCompactionStrategy.class.getCanonicalName(); cfg.parallelism = parallelism; cfg.schemaFile = schemaFile; cfg.runSchedule = schedule; jsc.getConf().set("spark.executor.memory", sparkMemory); return new HoodieCompactor(cfg).compact(jsc, retry); }
// Returns the application id from the Spark context's configuration.
// NOTE(review): 'jsc' appears to be a holder/reference whose get() yields the
// live JavaSparkContext — confirm against the field declaration (not visible).
protected String getAppId() {
  return jsc.get().getConf().getAppId();
}
}
/**
 * Imports HDFS parquet data into a Hudi table using the given import command
 * and table parameters, applying the executor-memory override to the existing
 * context's SparkConf before launch.
 *
 * @return the importer's exit code
 */
private static int dataLoad(JavaSparkContext jsc, String command, String srcPath, String targetPath,
    String tableName, String tableType, String rowKey, String partitionKey, int parallelism,
    String schemaFile, String sparkMaster, String sparkMemory, int retry) throws Exception {
  final HDFSParquetImporter.Config config = new HDFSParquetImporter.Config();
  config.command = command;
  config.srcPath = srcPath;
  config.targetPath = targetPath;
  config.tableName = tableName;
  config.tableType = tableType;
  config.rowKey = rowKey;
  config.partitionKey = partitionKey;
  config.parallelism = parallelism;
  config.schemaFile = schemaFile;
  jsc.getConf().set("spark.executor.memory", sparkMemory);
  return new HDFSParquetImporter(config).dataImport(jsc, retry);
}
// Creates a Spark executor for the given job: obtains (and reference-counts)
// the platform's shared Spark context, then derives a default partition count.
public SparkExecutor(SparkPlatform platform, Job job) {
  super(job);
  this.platform = platform;
  this.sparkContextReference = this.platform.getSparkContext(job);
  // Track that this executor holds a reference so the shared context is not
  // released while in use.
  this.sparkContextReference.noteObtainedReference();
  this.sc = this.sparkContextReference.get();
  if (this.sc.getConf().contains("spark.executor.cores")) {
    // Twice the configured executor cores; the -1 default is unreachable here
    // because the key's presence was just checked.
    this.numDefaultPartitions = 2 * this.sc.getConf().getInt("spark.executor.cores", -1);
  } else {
    // Fall back to 2 x machines x cores-per-machine from the Rheem configuration.
    this.numDefaultPartitions =
        (int) (2 * this.getConfiguration().getLongProperty("rheem.spark.machines")
            * this.getConfiguration().getLongProperty("rheem.spark.cores-per-machine"));
  }
}
// Auto-computes the HBase multi-put batch size when enabled in the config:
// derives the effective executor count from the Spark configuration (honoring
// dynamic allocation) and feeds it, with the number of put-issuing tasks, into
// the batch-size calculator.
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD, final JavaSparkContext jsc) {
  if (config.getHbaseIndexPutBatchSizeAutoCompute()) {
    SparkConf conf = jsc.getConf();
    int maxExecutors = conf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
    if (conf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
      // With dynamic allocation, the instance count may grow up to the
      // configured maximum — use the larger of the two.
      maxExecutors = Math.max(maxExecutors, conf.getInt(
          DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
    }
    /*
     * Each writeStatus represents status information from a write done in one of
     * the IOHandles. If a writeStatus has any insert, it implies that the
     * corresponding task contacts HBase for doing puts, since we only do puts
     * for inserts from HBaseIndex.
     */
    int hbasePutAccessParallelism = getHBasePutAccessParallelism(writeStatusRDD);
    multiPutBatchSize = putBatchSizeCalculator
        .getBatchSize(
            getNumRegionServersAliveForTable(),
            maxQpsPerRegionServer,
            hbasePutAccessParallelism,
            maxExecutors,
            SLEEP_TIME_MILLISECONDS,
            qpsFraction);
  }
}
/**
 * Auto-computes the HBase multi-put batch size when enabled in the config.
 * The effective executor count is derived from the Spark configuration
 * (honoring dynamic allocation) and fed, together with the number of tasks
 * that will actually issue puts, into the batch-size calculator.
 */
private void setPutBatchSize(JavaRDD<WriteStatus> writeStatusRDD, final JavaSparkContext jsc) {
  if (!config.getHbaseIndexPutBatchSizeAutoCompute()) {
    return;
  }
  final SparkConf sparkConf = jsc.getConf();
  int executorCount = sparkConf.getInt(DEFAULT_SPARK_EXECUTOR_INSTANCES_CONFIG_NAME, 1);
  if (sparkConf.getBoolean(DEFAULT_SPARK_DYNAMIC_ALLOCATION_ENABLED_CONFIG_NAME, false)) {
    // With dynamic allocation the instance count may grow to the configured max.
    executorCount = Math.max(executorCount,
        sparkConf.getInt(DEFAULT_SPARK_DYNAMIC_ALLOCATION_MAX_EXECUTORS_CONFIG_NAME, 1));
  }
  // A writeStatus with any insert means its task will contact HBase for puts,
  // since only inserts from HBaseIndex trigger puts.
  final int hbasePutAccessParallelism = getHBasePutAccessParallelism(writeStatusRDD);
  multiPutBatchSize = putBatchSizeCalculator.getBatchSize(
      getNumRegionServersAliveForTable(),
      maxQpsPerRegionServer,
      hbasePutAccessParallelism,
      executorCount,
      SLEEP_TIME_MILLISECONDS,
      qpsFraction);
}
/**
 * Reads a non-default-format Hive table through a SparkSession and converts
 * each row into a String[] — SQL NULL cells stay null, every other cell is
 * rendered with toString().
 *
 * @param sc current JavaSparkContext whose SparkConf seeds the SparkSession
 * @param hiveTable Hive table name to read
 * @return RDD of rows, each materialized as an array of (nullable) strings
 */
private static JavaRDD<String[]> getOtherFormatHiveInput(JavaSparkContext sc, String hiveTable) {
  SparkSession sparkSession =
      SparkSession.builder().config(sc.getConf()).enableHiveSupport().getOrCreate();
  // Fix: use the typed Dataset<Row> instead of the raw Dataset type so the
  // javaRDD() element type is checked at compile time.
  final Dataset<Row> intermediateTable = sparkSession.table(hiveTable);
  return intermediateTable.javaRDD().map(new Function<Row, String[]>() {
    @Override
    public String[] call(Row row) throws Exception {
      String[] result = new String[row.size()];
      for (int i = 0; i < row.size(); i++) {
        final Object cell = row.get(i);
        // Preserve SQL NULLs as Java nulls rather than the string "null".
        result[i] = (cell == null) ? null : cell.toString();
      }
      return result;
    }
  });
}