private void initZK() throws HadoopIllegalArgumentException, IOException, KeeperException { zkQuorum = conf.get(ZK_QUORUM_KEY); int zkTimeout = conf.getInt(ZK_SESSION_TIMEOUT_KEY, ZK_SESSION_TIMEOUT_DEFAULT); // Parse ACLs from configuration. String zkAclConf = conf.get(ZK_ACL_KEY, ZK_ACL_DEFAULT); zkAclConf = ZKUtil.resolveConfIndirection(zkAclConf); List<ACL> zkAcls = ZKUtil.parseACLs(zkAclConf); if (zkAcls.isEmpty()) { zkAcls = Ids.CREATOR_ALL_ACL; } // Parse authentication from configuration. List<ZKAuthInfo> zkAuths = SecurityUtil.getZKAuthInfos(conf, ZK_AUTH_KEY); // Sanity check configuration. Preconditions.checkArgument(zkQuorum != null, "Missing required configuration '%s' for ZooKeeper quorum", ZK_QUORUM_KEY); Preconditions.checkArgument(zkTimeout > 0, "Invalid ZK session timeout %s", zkTimeout); int maxRetryNum = conf.getInt( CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT); elector = new ActiveStandbyElector(zkQuorum, zkTimeout, getParentZnode(), zkAcls, zkAuths, new ElectorCallbacks(), maxRetryNum); }
/**
 * Ensures the elector's parent znode exists, first clearing an existing
 * one when permitted.
 *
 * @param force       when true, an existing parent znode is cleared
 *                    without asking for confirmation
 * @param interactive when true (and {@code force} is false), the result of
 *                    {@code confirmFormat()} decides whether to proceed
 * @return 0 on success, {@code ERR_CODE_FORMAT_DENIED} if an existing znode
 *         may not be cleared, or 1 if clearing it failed with an
 *         {@link IOException}
 */
private int formatZK(boolean force, boolean interactive)
    throws IOException, InterruptedException, KeeperException {
  if (elector.parentZNodeExists()) {
    // confirmFormat() is only consulted when not forced and interactive.
    boolean mayClear = force || (interactive && confirmFormat());
    if (!mayClear) {
      return ERR_CODE_FORMAT_DENIED;
    }

    try {
      elector.clearParentZNode();
    } catch (IOException clearFailure) {
      LOG.error("Unable to clear zk parent znode", clearFailure);
      return 1;
    }
  }

  elector.ensureParentZNode();
  return 0;
}
elector.joinElection(targetToData(localTarget)); if (quitElectionOnBadState) { quitElectionOnBadState = false; LOG.info("Ensuring that " + localTarget + " does not " + "participate in active master election"); elector.quitElection(false); serviceState = HAServiceState.INITIALIZING; break; LOG.info("Quitting master election for " + localTarget + " and marking that fencing is necessary"); elector.quitElection(true); serviceState = HAServiceState.INITIALIZING; break;
private void reJoinElection(int sleepTime) { LOG.info("Trying to re-establish ZK session"); // Some of the test cases rely on expiring the ZK sessions and // ensuring that the other node takes over. But, there's a race // where the original lease holder could reconnect faster than the other // thread manages to take the lock itself. This lock allows the // tests to block the reconnection. It's a shame that this leaked // into non-test code, but the lock is only acquired here so will never // be contended. sessionReestablishLockForTests.lock(); try { terminateConnection(); sleepFor(sleepTime); // Should not join election even before the SERVICE is reported // as HEALTHY from ZKFC monitoring. if (appData != null) { joinElectionInternal(); } else { LOG.info("Not joining election since service has not yet been " + "reported as healthy."); } } finally { sessionReestablishLockForTests.unlock(); } }
public synchronized void processResult(int rc, String path, Object ctx, String name) { if (isStaleClient(ctx)) return; if (LOG.isDebugEnabled()) { LOG.debug("CreateNode result: " + rc + " for path: " + path if (isSuccess(code)) { if (becomeActive()) { monitorActiveStatus(); } else { reJoinElectionAfterFailureToBecomeActive(); if (isNodeExists(code)) { if (createRetryCount == 0) { becomeStandby(); monitorActiveStatus(); return; if (shouldRetry(code)) { if (createRetryCount < maxRetryNum) { LOG.debug("Retrying createNode createRetryCount: " + createRetryCount); ++createRetryCount; createLockNodeAsync(); return; } else if (isSessionExpired(code)) {
public synchronized void processResult(int rc, String path, Object ctx, Stat stat) { if (isStaleClient(ctx)) return; monitorLockNodePending = false; if (isSuccess(code)) { if (!becomeActive()) { reJoinElectionAfterFailureToBecomeActive(); becomeStandby(); if (isNodeDoesNotExist(code)) { enterNeutralMode(); joinElectionInternal(); return; if (shouldRetry(code)) { if (statRetryCount < maxRetryNum) { ++statRetryCount; monitorLockNodeAsync(); return; } else if (isSessionExpired(code)) { fatalError(errorMessage);
@Test(timeout=15000) public void testHandleSessionExpirationOfStandby() throws Exception { electors[0].ensureParentZNode(); electors[0].joinElection(appDatas[0]); ZooKeeperServer zks = getServer(serverFactory); ActiveStandbyElectorTestUtil.waitForActiveLockData(null, electors[1].joinElection(appDatas[1]); ActiveStandbyElectorTestUtil.waitForElectorState(null, electors[1], State.STANDBY); zks.closeSession(electors[1].getZKSessionIdForTests()); electors[1].quitElection(false); electors[0].quitElection(false);
assertFalse(electors[0].parentZNodeExists()); electors[0].ensureParentZNode(); assertTrue(electors[0].parentZNodeExists()); electors[0].joinElection(appDatas[0]); ActiveStandbyElectorTestUtil.waitForActiveLockData(null, zkServer, PARENT_DIR, appDatas[0]); electors[1].joinElection(appDatas[1]); Mockito.verify(cbs[1], Mockito.timeout(1000)).becomeStandby(); checkFatalsAndReset(); electors[0].quitElection(true); ActiveStandbyElectorTestUtil.waitForActiveLockData(null, zkServer, PARENT_DIR, appDatas[1]); electors[0].joinElection(appDatas[0]); Mockito.verify(cbs[0], Mockito.timeout(1000)).becomeStandby(); checkFatalsAndReset(); electors[1].preventSessionReestablishmentForTests(); try { zkServer.closeSession(electors[1].getZKSessionIdForTests()); Mockito.verify(cbs[0], Mockito.timeout(1000)).becomeActive(); } finally { electors[1].allowSessionReestablishmentForTests(); electors[0].preventSessionReestablishmentForTests();
@Override public void start() { HAServerConfiguration conf = new HAServerConfiguration(); conf.setConf(getConf()); if (!conf.isHAEnabled()) { transitionToActive(); } else { startHttpServer(); try { elector = new ActiveStandbyElector(conf.getZkQuorum(), (int) conf.getZKTimeout(), conf.getElectorZNode(), conf.getZkAcls(), conf.getZkAuths(), this); elector.ensureParentZNode(); // Create fencer before joining election but after creating base dirs fencer = new LlamaHAFencer(this, conf); // Join election elector.joinElection(localNodeBytes); LOG.info("Join election"); } catch (Exception e) { LOG.error( "HA is enabled, but couldn't create leader elector or fencer", e); this.shutdown(1); } } }
if (!elector.parentZNodeExists()) { LOG.error("Unable to start failover controller. " + "Parent znode does not exist.\n" rpcServer.stopAndJoin(); elector.quitElection(true); healthMonitor.shutdown(); healthMonitor.join();
/**
 * Leaves the election and closes the elector's connection (when an elector
 * exists), then delegates to the superclass.
 */
@Override
protected void serviceStop() throws Exception {
  // serviceStop() can be invoked after serviceInit() failed, in which case
  // the elector may never have been created; hence the null check.
  boolean electorCreated = (elector != null);
  if (electorCreated) {
    elector.quitElection(false);
    elector.terminateConnection();
  }

  super.serviceStop();
}
.getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT)); elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout, electionZNode, zkAcls, zkAuths, this, maxRetryNum, false); elector.ensureParentZNode(); if (!isParentZnodeSafe(clusterId)) { notifyFatalError(String.format("invalid data in znode, %s, " +
delayJoiningUntilNanotime = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(1000); elector.quitElection(true); quitElectionOnBadState = true; serviceStateMismatchCount = 0;
/**
 * Puts the state back to {@code State.INIT} and then terminates the
 * current connection.
 */
private void reset() {
  // State is reset first; the connection is torn down afterwards.
  state = State.INIT;
  terminateConnection();
}
/**
 * Looks up which node is currently active according to the elector.
 *
 * The elector's monitor is taken first, then this object's monitor.
 *
 * @return the {@link HAServiceTarget} decoded from the elector's active
 *         data, or null if no node is active
 * @throws IOException if a ZK-related issue occurs
 * @throws InterruptedException if thread is interrupted
 */
private HAServiceTarget getCurrentActive()
    throws IOException, InterruptedException {
  synchronized (elector) {
    synchronized (this) {
      final byte[] activeNodeBytes;
      try {
        activeNodeBytes = elector.getActiveData();
      } catch (ActiveNotFoundException noActiveNode) {
        // Nobody holds the active role right now.
        return null;
      } catch (KeeperException zkFailure) {
        throw new IOException(
            "Unexpected ZooKeeper issue fetching active node info",
            zkFailure);
      }

      // Decoding stays outside the try so its failures are not remapped.
      return dataToTarget(activeNodeBytes);
    }
  }
}
/**
 * Joins the election using {@code localActiveNodeInfo} as this node's
 * data, then delegates to the superclass.
 */
@Override
protected void serviceStart() throws Exception {
  // The election is joined before the superclass start logic runs.
  elector.joinElection(localActiveNodeInfo);

  super.serviceStart();
}
@Test(timeout=15000) public void testDontJoinElectionOnDisconnectAndReconnect() throws Exception { electors[0].ensureParentZNode(); stopServer(); ActiveStandbyElectorTestUtil.waitForElectorState( null, electors[0], State.NEUTRAL); startServer(); waitForServerUp(hostPort, CONNECTION_TIMEOUT); // Have to sleep to allow time for the clients to reconnect. Thread.sleep(2000); Mockito.verify(cbs[0], Mockito.never()).becomeActive(); Mockito.verify(cbs[1], Mockito.never()).becomeActive(); checkFatalsAndReset(); } }
public synchronized void processResult(int rc, String path, Object ctx, Stat stat) { if (isStaleClient(ctx)) return; monitorLockNodePending = false; if (isSuccess(code)) { if (!becomeActive()) { reJoinElectionAfterFailureToBecomeActive(); becomeStandby(); if (isNodeDoesNotExist(code)) { enterNeutralMode(); joinElectionInternal(); return; if (shouldRetry(code)) { if (statRetryCount < maxRetryNum) { ++statRetryCount; monitorLockNodeAsync(); return; } else if (isSessionExpired(code)) { fatalError(errorMessage);
public synchronized void processResult(int rc, String path, Object ctx, String name) { if (isStaleClient(ctx)) return; if (LOG.isDebugEnabled()) { LOG.debug("CreateNode result: " + rc + " for path: " + path if (isSuccess(code)) { if (becomeActive()) { monitorActiveStatus(); } else { reJoinElectionAfterFailureToBecomeActive(); if (isNodeExists(code)) { if (createRetryCount == 0) { becomeStandby(); monitorActiveStatus(); return; if (shouldRetry(code)) { if (createRetryCount < maxRetryNum) { LOG.debug("Retrying createNode createRetryCount: " + createRetryCount); ++createRetryCount; createLockNodeAsync(); return; } else if (isSessionExpired(code)) {
@Test(timeout=15000) public void testHandleSessionExpirationOfStandby() throws Exception { electors[0].ensureParentZNode(); electors[0].joinElection(appDatas[0]); ZooKeeperServer zks = getServer(serverFactory); ActiveStandbyElectorTestUtil.waitForActiveLockData(null, electors[1].joinElection(appDatas[1]); ActiveStandbyElectorTestUtil.waitForElectorState(null, electors[1], State.STANDBY); zks.closeSession(electors[1].getZKSessionIdForTests()); electors[1].quitElection(false); electors[0].quitElection(false);