/** * @return True if region is online and scannable else false if an error or shutdown (Otherwise * we just block in here holding up all forward-progess). */ private boolean isRegionOnline(RegionInfo ri) throws InterruptedException { RetryCounter rc = null; while (!isStopped()) { RegionState rs = this.assignmentManager.getRegionStates().getRegionState(ri); if (rs.isOpened()) { if (this.getServerManager().isServerOnline(rs.getServerName())) { return true; } } // Region is not OPEN. Optional<Procedure<MasterProcedureEnv>> optProc = this.procedureExecutor.getProcedures(). stream().filter(p -> p instanceof ServerCrashProcedure).findAny(); // TODO: Add a page to refguide on how to do repair. Have this log message point to it. // Page will talk about loss of edits, how to schedule at least the meta WAL recovery, and // then how to assign including how to break region lock if one held. LOG.warn("{} is NOT online; state={}; ServerCrashProcedures={}. Master startup cannot " + "progress, in holding-pattern until region onlined.", ri.getRegionNameAsString(), rs, optProc.isPresent()); // Check once-a-minute. if (rc == null) { rc = new RetryCounterFactory(1000).create(); } Threads.sleep(rc.getBackoffTimeAndIncrementAttempts()); } return false; }
/**
 * Releases the hbck lock file.
 * <p>
 * Does nothing unless this instance is exclusive and the cleanup flag flips from true to false.
 * Otherwise closes the lock-file stream and deletes the lock path, retrying after an IOException
 * for as long as the retry counter allows. An interrupt while waiting between attempts restores
 * the thread's interrupt flag and abandons the cleanup.
 */
private void unlockHbck() {
  if (!isExclusive() || !hbckLockCleanup.compareAndSet(true, false)) {
    return;
  }
  final RetryCounter attempts = lockFileRetryCounterFactory.create();
  do {
    try {
      IOUtils.closeQuietly(hbckOutFd);
      FSUtils.delete(FSUtils.getCurrentFileSystem(getConf()), HBCK_LOCK_PATH, true);
      LOG.info("Finishing hbck");
      return;
    } catch (IOException deleteFailure) {
      LOG.info("Failed to delete " + HBCK_LOCK_PATH + ", try="
          + (attempts.getAttemptTimes() + 1) + " of "
          + attempts.getMaxAttempts());
      LOG.debug("Failed to delete " + HBCK_LOCK_PATH, deleteFailure);
    }
    // Only reached after a failed delete; the success path returned above.
    try {
      attempts.sleepUntilNextRetry();
    } catch (InterruptedException interrupted) {
      Thread.currentThread().interrupt();
      LOG.warn("Interrupted while deleting lock file" + HBCK_LOCK_PATH);
      return;
    }
  } while (attempts.shouldRetry());
}
/**
 * Builds the retry counter factory used when creating the hbck znode. The attempt count, the
 * sleep interval between attempts and the maximum sleep time are read from the configuration,
 * falling back to the class defaults when a key is absent.
 * @param conf configuration to read the retry settings from
 * @return A retry counter factory configured for retrying znode creation.
 */
private static RetryCounterFactory createZnodeRetryCounterFactory(Configuration conf) {
  final int maxAttempts =
      conf.getInt("hbase.hbck.createznode.attempts", DEFAULT_MAX_CREATE_ZNODE_ATTEMPTS);
  final int sleepInterval = conf.getInt("hbase.hbck.createznode.attempt.sleep.interval",
      DEFAULT_CREATE_ZNODE_ATTEMPT_SLEEP_INTERVAL);
  final int maxSleepTime = conf.getInt("hbase.hbck.createznode.attempt.maxsleeptime",
      DEFAULT_CREATE_ZNODE_ATTEMPT_MAX_SLEEP_TIME);
  return new RetryCounterFactory(maxAttempts, sleepInterval, maxSleepTime);
}
checkAndMarkRunningHbck(getConf(), this.lockFileRetryCounterFactory.create()); HBCK_LOCK_PATH = pair.getFirst(); this.hbckOutFd = pair.getSecond();
/**
 * Builds the retry counter factory used when creating the hbck lock file. The attempt count,
 * the sleep interval between attempts and the maximum sleep time are read from the
 * configuration, falling back to the class defaults when a key is absent.
 * @param conf configuration to read the retry settings from
 * @return A retry counter factory configured for retrying lock file creation.
 */
public static RetryCounterFactory createLockRetryCounterFactory(Configuration conf) {
  final int maxAttempts =
      conf.getInt("hbase.hbck.lockfile.attempts", DEFAULT_MAX_LOCK_FILE_ATTEMPTS);
  final int sleepInterval = conf.getInt("hbase.hbck.lockfile.attempt.sleep.interval",
      DEFAULT_LOCK_FILE_ATTEMPT_SLEEP_INTERVAL);
  final int maxSleepTime = conf.getInt("hbase.hbck.lockfile.attempt.maxsleeptime",
      DEFAULT_LOCK_FILE_ATTEMPT_MAX_SLEEP_TIME);
  return new RetryCounterFactory(maxAttempts, sleepInterval, maxSleepTime);
}
RetryCounter retryCounter = createZNodeRetryCounterFactory.create(); hbckEphemeralNodePath = ZNodePaths.joinZNode( zkw.getZNodePaths().masterMaintZNode,
/**
 * Creates a retry counter that uses the exponential backoff policy. The configuration sets a
 * sleep interval of 100 and a maximum sleep time of 30000, both in milliseconds, allows
 * {@link Integer#MAX_VALUE} attempts and applies a jitter of 0.01.
 * @return a freshly created retry counter with the settings above
 */
public static RetryCounter getRetryCounter() {
  final RetryCounter.RetryConfig backoffConfig = new RetryCounter.RetryConfig()
      .setBackoffPolicy(new RetryCounter.ExponentialBackoffPolicy())
      .setSleepInterval(100)
      .setMaxSleepTime(30000)
      .setMaxAttempts(Integer.MAX_VALUE)
      .setTimeUnit(TimeUnit.MILLISECONDS)
      .setJitter(0.01f);
  final RetryCounterFactory factory = new RetryCounterFactory(backoffConfig);
  return factory.create();
}
}
/**
 * Sets up retry handling and identity for this instance and makes a first, best-effort call to
 * {@code checkZk()}.
 * @param quorumServers ZooKeeper ensemble string; stored and included in the startup log line
 * @param sessionTimeout session timeout; stored on this instance
 * @param watcher watcher; stored on this instance
 * @param maxRetries number of retries; the retry counter factory is created with
 *   {@code maxRetries + 1} attempts
 * @param retryIntervalMillis sleep interval handed to the retry counter factory
 * @param maxSleepTime maximum sleep time handed to the retry counter factory
 * @param identifier identifier for this process; when null or empty the runtime MXBean name
 *   (processID@hostName) is used instead
 * @throws IOException declared by the signature; failures of the initial {@code checkZk()} call
 *   are logged and not propagated
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="DE_MIGHT_IGNORE",
    justification="None. Its always been this way.")
public RecoverableZooKeeper(String quorumServers, int sessionTimeout,
    Watcher watcher, int maxRetries, int retryIntervalMillis, int maxSleepTime,
    String identifier) throws IOException {
  // TODO: Add support for zk 'chroot'; we don't add it to the quorumServers String as we should.
  this.retryCounterFactory =
      new RetryCounterFactory(maxRetries+1, retryIntervalMillis, maxSleepTime);

  if (identifier == null || identifier.isEmpty()) {
    // the identifier = processID@hostName
    identifier = ManagementFactory.getRuntimeMXBean().getName();
  }
  LOG.info("Process identifier=" + identifier +
      " connecting to ZooKeeper ensemble=" + quorumServers);
  this.identifier = identifier;
  this.id = Bytes.toBytes(identifier);

  this.watcher = watcher;
  this.sessionTimeout = sessionTimeout;
  this.quorumServers = quorumServers;

  try {
    checkZk();
  } catch (Exception x) {
    // Still best effort: construction must not fail here, but leave a trace of what went wrong
    // instead of discarding the exception silently.
    LOG.warn("Initial ZooKeeper connection check failed for ensemble=" + quorumServers
        + "; continuing", x);
  }
}
private Pair<Integer, String> execWithRetries(String hostname, ServiceType service, String... cmd) throws IOException { RetryCounter retryCounter = retryCounterFactory.create(); while (true) { try { return exec(hostname, service, cmd); } catch (IOException e) { retryOrThrow(retryCounter, e, hostname, cmd); } try { retryCounter.sleepUntilNextRetry(); } catch (InterruptedException ex) { // ignore LOG.warn("Sleep Interrupted:" + ex); } } }
/**
 * Drives a retry counter created with ten maximum attempts through its whole retry loop,
 * logging the attempt number and backoff time on each pass, and then checks that the number of
 * attempts made equals the configured maximum.
 */
@Test
public void testBasics() throws InterruptedException {
  final int maxAttempts = 10;
  final RetryCounter retryCounter = new RetryCounterFactory(maxAttempts, 10, 1000).create();
  // The sleep acts as the loop's update step: it runs after every logged attempt.
  for (; retryCounter.shouldRetry(); retryCounter.sleepUntilNextRetry()) {
    LOG.info("Attempt={}, backoffTime={}", retryCounter.getAttemptTimes(),
        retryCounter.getBackoffTime());
  }
  final boolean usedEveryAttempt = retryCounter.getAttemptTimes() == maxAttempts;
  assertTrue(usedEveryAttempt);
}
}
@Override public void setConf(Configuration conf) { super.setConf(conf); if (conf == null) { // Configured gets passed null before real conf. Why? I don't know. return; } sshUserName = conf.get("hbase.it.clustermanager.ssh.user", ""); String extraSshOptions = conf.get("hbase.it.clustermanager.ssh.opts", ""); sshOptions = System.getenv("HBASE_SSH_OPTS"); if (!extraSshOptions.isEmpty()) { sshOptions = StringUtils.join(new Object[] { sshOptions, extraSshOptions }, " "); } sshOptions = (sshOptions == null) ? "" : sshOptions; sshUserName = (sshUserName == null) ? "" : sshUserName; tunnelCmd = conf.get("hbase.it.clustermanager.ssh.cmd", DEFAULT_TUNNEL_CMD); // Print out ssh special config if any. if ((sshUserName != null && sshUserName.length() > 0) || (sshOptions != null && sshOptions.length() > 0)) { LOG.info("Running with SSH user [" + sshUserName + "] and options [" + sshOptions + "]"); } this.retryCounterFactory = new RetryCounterFactory(new RetryConfig() .setMaxAttempts(conf.getInt(RETRY_ATTEMPTS_KEY, DEFAULT_RETRY_ATTEMPTS)) .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL))); }
private String createNonSequential(String path, byte[] data, List<ACL> acl, CreateMode createMode) throws KeeperException, InterruptedException { RetryCounter retryCounter = retryCounterFactory.create(); boolean isRetry = false; // False for first attempt, true for all retries. long startTime;
// Create a one-off retry counter from maxAttempts and pause (narrowed to int).
// NOTE(review): the (int) cast suggests pause is wider than int; a large value would be
// truncated silently. Confirm the range callers can pass.
RetryCounter counter = new RetryCounterFactory(maxAttempts, (int)pause).create();
public RecoverableZooKeeper(String quorumServers, int sessionTimeout, Watcher watcher, int maxRetries, int retryIntervalMillis) throws IOException { this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher); this.retryCounterFactory = new RetryCounterFactory(maxRetries, retryIntervalMillis); // the identifier = processID@hostName this.identifier = ManagementFactory.getRuntimeMXBean().getName(); LOG.info("The identifier of this process is " + identifier); this.id = Bytes.toBytes(identifier); this.watcher = watcher; this.sessionTimeout = sessionTimeout; this.quorumServers = quorumServers; }
RetryCounter retryCounter = retryCounterFactory.create(); boolean isRetry = false; // False for first attempt, true for all retries. while (true) {
RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5); RetryCounter rc = rcf.create(); while (keepLooping()) { RegionServerStartupResponse w = reportForDuty();
public RecoverableZooKeeper(String quorumServers, int sessionTimeout, Watcher watcher, int maxRetries, int retryIntervalMillis) throws IOException { this.zk = new ZooKeeper(quorumServers, sessionTimeout, watcher); this.retryCounterFactory = new RetryCounterFactory(maxRetries, retryIntervalMillis); // the identifier = processID@hostName this.identifier = ManagementFactory.getRuntimeMXBean().getName(); LOG.info("The identifier of this process is " + identifier); this.id = Bytes.toBytes(identifier); this.watcher = watcher; this.sessionTimeout = sessionTimeout; this.quorumServers = quorumServers; salter = new SecureRandom(); }
/**
 * Lists the children of a znode inside a trace scope. getChildren is an idempotent operation,
 * so a CONNECTIONLOSS or OPERATIONTIMEOUT failure is passed to {@code retryOrThrow} and, if
 * that returns, the call is repeated after sleeping until the next retry. Any other
 * KeeperException is rethrown immediately.
 * @param path znode whose children are listed
 * @param watcher watcher passed through to ZooKeeper's getChildren
 * @return List of children znodes
 * @throws KeeperException for any non-retried ZooKeeper failure, or as thrown by
 *   {@code retryOrThrow}
 * @throws InterruptedException as declared by the underlying ZooKeeper call and the retry sleep
 */
public List<String> getChildren(String path, Watcher watcher)
    throws KeeperException, InterruptedException {
  try (TraceScope traceScope = TraceUtil.createTrace("RecoverableZookeeper.getChildren")) {
    final RetryCounter attempts = retryCounterFactory.create();
    for (;;) {
      try {
        // The timestamp is not used in this method; the call is kept as it was.
        long startTime = EnvironmentEdgeManager.currentTime();
        return checkZk().getChildren(path, watcher);
      } catch (KeeperException failure) {
        switch (failure.code()) {
          case CONNECTIONLOSS:
          case OPERATIONTIMEOUT:
            // Both transient conditions are handled the same way.
            retryOrThrow(attempts, failure, "getChildren");
            break;
          default:
            throw failure;
        }
      }
      attempts.sleepUntilNextRetry();
    }
  }
}
/**
 * Exercises the basic retry loop: a counter limited to ten attempts is polled until it reports
 * no more retries, sleeping between polls, and the final attempt count is checked against the
 * limit.
 */
@Test
public void testBasics() throws InterruptedException {
  final int limit = 10;
  final RetryCounterFactory counterFactory = new RetryCounterFactory(limit, 10, 1000);
  final RetryCounter counter = counterFactory.create();

  boolean more = counter.shouldRetry();
  while (more) {
    LOG.info("Attempt={}, backoffTime={}", counter.getAttemptTimes(),
        counter.getBackoffTime());
    counter.sleepUntilNextRetry();
    more = counter.shouldRetry();
  }

  assertTrue(counter.getAttemptTimes() == limit);
}
}
/**
 * Sets up retry handling and identity for this instance, makes a first, best-effort call to
 * {@code checkZk()} and creates the {@link Random} kept in {@code salter}.
 * @param quorumServers ZooKeeper ensemble string; stored and included in the startup log line
 * @param sessionTimeout session timeout; stored on this instance
 * @param watcher watcher; stored on this instance
 * @param maxRetries number of retries; the retry counter factory is created with
 *   {@code maxRetries + 1} attempts
 * @param retryIntervalMillis sleep interval handed to the retry counter factory
 * @param identifier identifier for this process; when null or empty the runtime MXBean name
 *   (processID@hostName) is used instead
 * @throws IOException declared by the signature; failures of the initial {@code checkZk()} call
 *   are logged and not propagated
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DE_MIGHT_IGNORE",
    justification = "None. Its always been this way.")
public RecoverableZooKeeper2(String quorumServers, int sessionTimeout, Watcher watcher,
    int maxRetries, int retryIntervalMillis, String identifier) throws IOException {
  // TODO: Add support for zk 'chroot'; we don't add it to the quorumServers String as we should.
  this.retryCounterFactory = new RetryCounterFactory(maxRetries + 1, retryIntervalMillis);

  if (identifier == null || identifier.isEmpty()) {
    // the identifier = processID@hostName
    identifier = ManagementFactory.getRuntimeMXBean().getName();
  }
  LOG.info("Process identifier=" + identifier + " connecting to ZooKeeper ensemble="
      + quorumServers);
  this.identifier = identifier;
  this.id = Bytes.toBytes(identifier);

  this.watcher = watcher;
  this.sessionTimeout = sessionTimeout;
  this.quorumServers = quorumServers;
  try {
    checkZk();
  } catch (Exception x) {
    // Still best effort: construction must not fail here, but leave a trace of what went wrong
    // instead of discarding the exception silently.
    LOG.warn("Initial ZooKeeper connection check failed for ensemble=" + quorumServers
        + "; continuing", x);
  }
  salter = new Random();
}