public static HiveSparkClient createHiveSparkClient(HiveConf hiveconf, String sparkSessionId,
    String hiveSessionId) throws Exception {
  Map<String, String> sparkConf = initiateSparkConf(hiveconf, hiveSessionId);
  // Submit the Spark job through the local Spark context when the Spark master is in local mode;
  // otherwise submit it through the remote Spark context.
  String master = sparkConf.get("spark.master");
  if (master.equals("local") || master.startsWith("local[")) {
    // With a local Spark context, all user sessions share the same Spark context.
    return LocalHiveSparkClient.getInstance(generateSparkConf(sparkConf), hiveconf);
  } else {
    return new RemoteHiveSparkClient(hiveconf, sparkConf, sparkSessionId);
  }
}
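// Hypothetical usage sketch (not part of this excerpt): how a session-level caller might obtain
// a client, assuming the factory methods above live on HiveSparkClientFactory. The configuration
// values and session ids below are illustrative.
HiveConf conf = new HiveConf();
conf.set("spark.master", "yarn");
HiveSparkClient client =
    HiveSparkClientFactory.createHiveSparkClient(conf, "spark-session-0", "hive-session-0");
// A "local" or "local[*]" master would instead have returned the shared LocalHiveSparkClient.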
@Override
public SparkJobRef execute(final DriverContext driverContext, final SparkWork sparkWork)
    throws Exception {
  if (SparkClientUtilities.isYarnMaster(hiveConf.get("spark.master"))
      && !remoteClient.isActive()) {
    // Re-create the remote client if it is no longer active
    close();
    createRemoteClient();
  }
  try {
    return submit(driverContext, sparkWork);
  } catch (Throwable cause) {
    throw new Exception("Failed to submit Spark work, please retry later", cause);
  }
}
@Override
public int getExecutorCount() throws Exception {
  return getExecutorCount(sparkClientTimeout, TimeUnit.SECONDS);
}
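// A minimal sketch of the timed overload the method above delegates to, assuming the remote
// client exposes a Future-returning executor-count call (the prewarm loop below relies on the
// same timed variant).
private int getExecutorCount(long timeout, TimeUnit unit) throws Exception {
  Future<Integer> handler = remoteClient.getExecutorCount();
  return handler.get(timeout, unit);
}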
private synchronized void refreshLocalResources(SparkWork sparkWork, HiveConf conf)
    throws IOException {
  // add the hive-exec jar
  addJars((new JobConf(this.getClass())).getJar());

  // add aux jars
  addJars(conf.getAuxJars());
  addJars(SessionState.get() == null ? null : SessionState.get().getReloadableAuxJars());

  // add added jars
  String addedJars = Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDJARS, addedJars);
  addJars(addedJars);

  // add plugin module jars on demand;
  // jobConf will hold all the configuration for Hadoop, Tez, and Hive
  JobConf jobConf = new JobConf(conf);
  jobConf.set(MR_JAR_PROPERTY, "");
  for (BaseWork work : sparkWork.getAllWork()) {
    work.configureJobConf(jobConf);
  }
  addJars(jobConf.get(MR_JAR_PROPERTY));

  // remove the location of container tokens
  conf.unset(MR_CREDENTIALS_LOCATION_PROPERTY);

  // add added files
  String addedFiles = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDFILES, addedFiles);
  addResources(addedFiles);

  // add added archives
  String addedArchives = Utilities.getResourceFiles(conf, SessionState.ResourceType.ARCHIVE);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDARCHIVES, addedArchives);
  addResources(addedArchives);
}
private void createRemoteClient() throws Exception {
  remoteClient = SparkClientFactory.createClient(conf, hiveConf);

  if (HiveConf.getBoolVar(hiveConf, ConfVars.HIVE_PREWARM_ENABLED)
      && SparkClientUtilities.isYarnMaster(hiveConf.get("spark.master"))) {
    int minExecutors = getExecutorsToWarm();
    if (minExecutors <= 0) {
      return;
    }

    LOG.info("Prewarm Spark executors. The minimum number of executors to warm is "
        + minExecutors);

    // Spend at most MAX_PREWARM_TIME waiting for executors to come up.
    int curExecutors = 0;
    long ts = System.currentTimeMillis();
    do {
      try {
        curExecutors = getExecutorCount(MAX_PREWARM_TIME, TimeUnit.MILLISECONDS);
      } catch (TimeoutException e) {
        // Don't fail on a future timeout, since the prewarm loop enforces its own timeout.
        LOG.warn("Timed out getting executor count.", e);
      }
      if (curExecutors >= minExecutors) {
        LOG.info("Finished prewarming Spark executors. The current number of executors is "
            + curExecutors);
        return;
      }
      Thread.sleep(500); // sleep half a second
    } while (System.currentTimeMillis() - ts < MAX_PREWARM_TIME);

    LOG.info("Timeout (" + MAX_PREWARM_TIME / 1000 + "s) occurred while prewarming executors. "
        + "The current number of executors is " + curExecutors);
  }
}
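// A plausible sketch of getExecutorsToWarm(), which the prewarm path above calls but which is
// not shown in this excerpt: take the configured prewarm container count and cap it by how many
// executors the application can actually reach. The Spark config keys and defaults below are
// assumptions.
private int getExecutorsToWarm() {
  int minExecutors =
      HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.HIVE_PREWARM_NUM_CONTAINERS);
  boolean dynamicAllocation = hiveConf.getBoolean("spark.dynamicAllocation.enabled", false);
  if (dynamicAllocation) {
    // With dynamic allocation, only the minimum executor count is guaranteed to come up.
    int min = sparkConf.getInt("spark.dynamicAllocation.minExecutors", 0);
    minExecutors = Math.min(minExecutors, min);
  } else {
    int executorInstances = sparkConf.getInt("spark.executor.instances", 2);
    minExecutors = Math.min(minExecutors, executorInstances);
  }
  return minExecutors;
}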
private SparkJobRef submit(final DriverContext driverContext, final SparkWork sparkWork)
    throws Exception {
  final Context ctx = driverContext.getCtx();
  final HiveConf hiveConf = (HiveConf) ctx.getConf();
  refreshLocalResources(sparkWork, hiveConf);
  final JobConf jobConf = new JobConf(hiveConf);

  // update the credential provider location in the jobConf
  HiveConfUtil.updateJobCredentialProviders(jobConf);

  // create a temporary scratch dir
  final Path emptyScratchDir = ctx.getMRTmpPath();
  FileSystem fs = emptyScratchDir.getFileSystem(jobConf);
  fs.mkdirs(emptyScratchDir);

  // make sure NullScanFileSystem can be loaded - HIVE-18442
  jobConf.set("fs." + NullScanFileSystem.getBaseScheme() + ".impl",
      NullScanFileSystem.class.getCanonicalName());

  byte[] jobConfBytes = KryoSerializer.serializeJobConf(jobConf);
  byte[] scratchDirBytes = KryoSerializer.serialize(emptyScratchDir);
  byte[] sparkWorkBytes = KryoSerializer.serialize(sparkWork);

  JobStatusJob job = new JobStatusJob(jobConfBytes, scratchDirBytes, sparkWorkBytes);
  if (driverContext.isShutdown()) {
    throw new HiveException("Operation is cancelled.");
  }

  JobHandle<Serializable> jobHandle = remoteClient.submit(job);
  RemoteSparkJobStatus sparkJobStatus =
      new RemoteSparkJobStatus(remoteClient, jobHandle, sparkClientTimeout);
  return new RemoteSparkJobRef(hiveConf, jobHandle, sparkJobStatus);
}
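// Hypothetical caller-side sketch (not part of this excerpt): how the returned job reference
// might be consumed. The monitorJob/cancelJob method names are assumptions about the
// SparkJobRef interface.
SparkJobRef jobRef = client.execute(driverContext, sparkWork);
int rc = jobRef.monitorJob();   // block until the job finishes, reporting progress
if (rc != 0) {
  jobRef.cancelJob();           // best-effort cleanup on failure
}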
RemoteHiveSparkClient(HiveConf hiveConf, Map<String, String> conf, String sessionId)
    throws Exception {
  this.hiveConf = hiveConf;
  sparkClientTimeout = hiveConf.getTimeVar(HiveConf.ConfVars.SPARK_CLIENT_FUTURE_TIMEOUT,
      TimeUnit.SECONDS);
  sparkConf = HiveSparkClientFactory.generateSparkConf(conf);
  this.conf = conf;
  this.sessionId = sessionId;
  createRemoteClient();
}
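// Sketch of the instance state the constructor above initializes. The declarations are not
// shown in this excerpt; types are inferred from usage and should be treated as assumptions.
private final HiveConf hiveConf;
private final Map<String, String> conf;
private final SparkConf sparkConf;
private final String sessionId;
private final long sparkClientTimeout;   // seconds, from SPARK_CLIENT_FUTURE_TIMEOUT
private SparkClient remoteClient;        // re-created by execute() when no longer active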
private void refreshLocalResources(SparkWork sparkWork, HiveConf conf) throws IOException {
  // add the hive-exec jar
  addJars((new JobConf(this.getClass())).getJar());

  // add aux jars
  addJars(conf.getAuxJars());
  addJars(SessionState.get() == null ? null : SessionState.get().getReloadableAuxJars());

  // add added jars
  String addedJars = Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDJARS, addedJars);
  addJars(addedJars);

  // add plugin module jars on demand;
  // jobConf will hold all the configuration for Hadoop, Tez, and Hive
  JobConf jobConf = new JobConf(conf);
  jobConf.set(MR_JAR_PROPERTY, "");
  for (BaseWork work : sparkWork.getAllWork()) {
    work.configureJobConf(jobConf);
  }
  // read the plugin jars from the jobConf the works configured, not from the original conf
  addJars(jobConf.get(MR_JAR_PROPERTY));

  // add added files
  String addedFiles = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDFILES, addedFiles);
  addResources(addedFiles);

  // add added archives
  String addedArchives = Utilities.getResourceFiles(conf, SessionState.ResourceType.ARCHIVE);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDARCHIVES, addedArchives);
  addResources(addedArchives);
}
// Variant of the createRemoteClient() prewarm guard above: prewarming also applies to local
// masters, and the prewarm window is the configurable maxPrewarmTime rather than the
// MAX_PREWARM_TIME constant.
if (HiveConf.getBoolVar(hiveConf, ConfVars.HIVE_PREWARM_ENABLED)
    && (SparkClientUtilities.isYarnMaster(hiveConf.get("spark.master"))
        || SparkClientUtilities.isLocalMaster(hiveConf.get("spark.master")))) {
  int minExecutors = getExecutorsToWarm();
  if (minExecutors <= 0) {
    return;
  }

  int curExecutors = 0;
  long ts = System.currentTimeMillis();
  do {
    try {
      curExecutors = getExecutorCount(maxPrewarmTime, TimeUnit.MILLISECONDS);
    } catch (TimeoutException e) {
      // Don't fail on a future timeout, since the prewarm loop enforces its own timeout.
      LOG.warn("Timed out getting executor count.", e);
    }
    if (curExecutors >= minExecutors) {
      return;
    }
    Thread.sleep(500); // sleep half a second
  } while (System.currentTimeMillis() - ts < maxPrewarmTime);
}
private SparkJobRef submit(final DriverContext driverContext, final SparkWork sparkWork)
    throws Exception {
  final Context ctx = driverContext.getCtx();
  final HiveConf hiveConf = (HiveConf) ctx.getConf();
  refreshLocalResources(sparkWork, hiveConf);
  final JobConf jobConf = new JobConf(hiveConf);

  // update the credential provider location in the jobConf
  HiveConfUtil.updateJobCredentialProviders(jobConf);

  // create a temporary scratch dir
  final Path emptyScratchDir = ctx.getMRTmpPath();
  FileSystem fs = emptyScratchDir.getFileSystem(jobConf);
  fs.mkdirs(emptyScratchDir);

  byte[] jobConfBytes = KryoSerializer.serializeJobConf(jobConf);
  byte[] scratchDirBytes = KryoSerializer.serialize(emptyScratchDir);
  byte[] sparkWorkBytes = KryoSerializer.serialize(sparkWork);

  JobStatusJob job = new JobStatusJob(jobConfBytes, scratchDirBytes, sparkWorkBytes);
  if (driverContext.isShutdown()) {
    throw new HiveException("Operation is cancelled.");
  }

  JobHandle<Serializable> jobHandle = remoteClient.submit(job);
  RemoteSparkJobStatus sparkJobStatus =
      new RemoteSparkJobStatus(remoteClient, jobHandle, sparkClientTimeout);
  return new RemoteSparkJobRef(hiveConf, jobHandle, sparkJobStatus);
}
RemoteHiveSparkClient(HiveConf hiveConf, Map<String, String> conf) throws Exception {
  this.hiveConf = hiveConf;
  sparkClientTimeout = hiveConf.getTimeVar(HiveConf.ConfVars.SPARK_CLIENT_FUTURE_TIMEOUT,
      TimeUnit.SECONDS);
  sparkConf = HiveSparkClientFactory.generateSparkConf(conf);
  this.conf = conf;
  createRemoteClient();
}
private void refreshLocalResources(SparkWork sparkWork, HiveConf conf) throws IOException {
  // add the hive-exec jar
  addJars((new JobConf(this.getClass())).getJar());

  // add aux jars
  addJars(HiveConf.getVar(conf, HiveConf.ConfVars.HIVEAUXJARS));

  // add added jars
  String addedJars = Utilities.getResourceFiles(conf, SessionState.ResourceType.JAR);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDJARS, addedJars);
  addJars(addedJars);

  // add plugin module jars on demand;
  // jobConf will hold all the configuration for Hadoop, Tez, and Hive
  JobConf jobConf = new JobConf(conf);
  jobConf.set(MR_JAR_PROPERTY, "");
  for (BaseWork work : sparkWork.getAllWork()) {
    work.configureJobConf(jobConf);
  }
  // read the plugin jars from the jobConf the works configured, not from the original conf
  addJars(jobConf.get(MR_JAR_PROPERTY));

  // add added files
  String addedFiles = Utilities.getResourceFiles(conf, SessionState.ResourceType.FILE);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDFILES, addedFiles);
  addResources(addedFiles);

  // add added archives
  String addedArchives = Utilities.getResourceFiles(conf, SessionState.ResourceType.ARCHIVE);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVEADDEDARCHIVES, addedArchives);
  addResources(addedArchives);
}
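// A minimal sketch of the addJars helper used throughout refreshLocalResources, assuming the
// remote client exposes an addJar(URI) call, that jar paths arrive as a comma-separated list,
// and that a localJars set tracks what has already been shipped; all three are assumptions, and
// an addResources counterpart would follow the same shape with addFile(URI).
private void addJars(String addedJars) throws IOException {
  if (addedJars == null || addedJars.isEmpty()) {
    return;
  }
  for (String jar : addedJars.split(",")) {
    try {
      URI jarUri = new URI(jar.trim());
      if (localJars.add(jarUri)) {   // skip jars we already shipped to the remote context
        remoteClient.addJar(jarUri);
      }
    } catch (URISyntaxException e) {
      LOG.warn("Skipping jar with malformed URI: " + jar, e);
    }
  }
}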
@Override
public SparkJobRef execute(final DriverContext driverContext, final SparkWork sparkWork)
    throws Exception {
  if (SparkClientUtilities.isYarnMaster(hiveConf.get("spark.master"))
      && !remoteClient.isActive()) {
    // Re-create the remote client if it is no longer active
    close();
    createRemoteClient();
  }
  try {
    return submit(driverContext, sparkWork);
  } catch (Throwable cause) {
    throw new Exception("Failed to submit Spark work, please retry later", cause);
  }
}
public static HiveSparkClient createHiveSparkClient(HiveConf hiveconf) throws Exception {
  Map<String, String> sparkConf = initiateSparkConf(hiveconf);
  // Submit the Spark job through the local Spark context when the Spark master is in local mode;
  // otherwise submit it through the remote Spark context.
  String master = sparkConf.get("spark.master");
  if (master.equals("local") || master.startsWith("local[")) {
    // With a local Spark context, all user sessions share the same Spark context.
    return LocalHiveSparkClient.getInstance(generateSparkConf(sparkConf));
  } else {
    return new RemoteHiveSparkClient(hiveconf, sparkConf);
  }
}
@Override
public SparkJobRef execute(final DriverContext driverContext, final SparkWork sparkWork)
    throws Exception {
  final Context ctx = driverContext.getCtx();
  final HiveConf hiveConf = (HiveConf) ctx.getConf();
  refreshLocalResources(sparkWork, hiveConf);
  final JobConf jobConf = new JobConf(hiveConf);

  // create a temporary scratch dir
  final Path emptyScratchDir = ctx.getMRTmpPath();
  FileSystem fs = emptyScratchDir.getFileSystem(jobConf);
  fs.mkdirs(emptyScratchDir);

  byte[] jobConfBytes = KryoSerializer.serializeJobConf(jobConf);
  byte[] scratchDirBytes = KryoSerializer.serialize(emptyScratchDir);
  byte[] sparkWorkBytes = KryoSerializer.serialize(sparkWork);

  JobStatusJob job = new JobStatusJob(jobConfBytes, scratchDirBytes, sparkWorkBytes);
  JobHandle<Serializable> jobHandle = remoteClient.submit(job);
  RemoteSparkJobStatus sparkJobStatus =
      new RemoteSparkJobStatus(remoteClient, jobHandle, sparkClientTimeout);
  return new RemoteSparkJobRef(hiveConf, jobHandle, sparkJobStatus);
}
@Override
public int getExecutorCount() throws Exception {
  return getExecutorCount(sparkClientTimeout, TimeUnit.SECONDS);
}
public static HiveSparkClient createHiveSparkClient(HiveConf hiveconf)
    throws IOException, SparkException {
  Map<String, String> sparkConf = initiateSparkConf(hiveconf);
  // Submit the Spark job through the local Spark context when the Spark master is in local mode;
  // otherwise submit it through the remote Spark context.
  String master = sparkConf.get("spark.master");
  if (master.equals("local") || master.startsWith("local[")) {
    // With a local Spark context, all user sessions share the same Spark context.
    return LocalHiveSparkClient.getInstance(generateSparkConf(sparkConf));
  } else {
    return new RemoteHiveSparkClient(hiveconf, sparkConf);
  }
}