private synchronized void becomeStandby() { LOG.info("ZK Election indicated that " + localTarget + " should become standby"); try { int timeout = FailoverController.getGracefulFenceTimeout(conf); localTarget.getProxy(conf, timeout).transitionToStandby(createReqInfo()); LOG.info("Successfully transitioned " + localTarget + " to standby state"); } catch (Exception e) { LOG.error("Couldn't transition " + localTarget + " to standby state", e); // TODO handle this. It's a likely case since we probably got fenced // at the same time. } serviceState = HAServiceState.STANDBY; }
/**
 * Extension point through which subclasses expose additional parameters to
 * fencing implementations/scripts. How the entries are consumed is up to
 * each fencing method -- notably, the shell script implementation takes
 * each entry, prepends 'target_', substitutes '_' for '.', and adds it to
 * the environment of the script.
 *
 * <p>This base implementation publishes the target's full address, its
 * host name and its port. Overriding implementations should call the
 * superclass implementation in addition to adding their own keys.
 *
 * @param ret map which can be mutated to pass parameters to the fencer
 */
protected void addFencingParameters(Map<String, String> ret) {
  final String address = String.valueOf(getAddress());
  ret.put(ADDRESS_SUBST_KEY, address);

  final String host = getAddress().getHostName();
  ret.put(HOST_SUBST_KEY, host);

  final String port = String.valueOf(getAddress().getPort());
  ret.put(PORT_SUBST_KEY, port);
}
/**
 * Asks the ZKFC of a remote node to cede its active status for the given
 * amount of time.
 *
 * @param remote node whose ZKFC is asked to cede
 * @param timeout time in milliseconds; passed both as the timeout of the
 *                ZKFC proxy and as the argument of the cede request
 * @return the {@link ZKFCProtocol} proxy used to talk to the node
 * @throws IOException if creating the proxy or the cede request fails
 */
private ZKFCProtocol cedeRemoteActive(HAServiceTarget remote, int timeout)
    throws IOException {
  LOG.info("Asking " + remote + " to cede its active state for "
      + timeout + "ms");
  final ZKFCProtocol remoteZkfc = remote.getZKFCProxy(conf, timeout);
  remoteZkfc.cedeActive(timeout);
  return remoteZkfc;
}
/**
 * Creates a proxy to the target HA service, connecting to the address
 * reported by {@link #getAddress()}.
 *
 * @param conf configuration handed to the proxy factory
 * @param timeoutMs timeout in milliseconds handed to the proxy factory
 * @return a proxy to connect to the target HA Service.
 * @throws IOException if the proxy cannot be created
 */
public HAServiceProtocol getProxy(Configuration conf, int timeoutMs)
    throws IOException {
  final HAServiceProtocol proxy =
      getProxyForAddress(conf, timeoutMs, getAddress());
  return proxy;
}
/**
 * Creates a proxy to the target HA service for health monitoring.
 *
 * <p>When {@link #getHealthMonitorAddress()} yields a non-null address the
 * proxy connects there; otherwise it falls back to {@link #getAddress()},
 * in which case this method behaves exactly like
 * {@link #getProxy(Configuration, int)}.
 *
 * @param conf Configuration
 * @param timeoutMs timeout in milliseconds
 * @return a proxy to connect to the target HA service for health monitoring
 * @throws IOException if there is an error
 */
public HAServiceProtocol getHealthMonitorProxy(Configuration conf,
    int timeoutMs) throws IOException {
  final InetSocketAddress monitorAddr = getHealthMonitorAddress();
  // getAddress() is only consulted when no dedicated monitor address exists.
  final InetSocketAddress effectiveAddr =
      (monitorAddr != null) ? monitorAddr : getAddress();
  return getProxyForAddress(conf, timeoutMs, effectiveAddr);
}
/**
 * Prints one line per known target containing its "host:port" address and
 * either its current HA state or a connection failure message.
 *
 * @return -1 when no target IDs could be obtained, 0 otherwise (including
 *         when individual targets could not be reached)
 */
protected int getAllServiceState() {
  final Collection<String> ids = getTargetIds(null);
  if (ids.isEmpty()) {
    errOut.println("Failed to get service IDs");
    return -1;
  }

  // Two left-aligned columns: address (50 wide) and state/error (10 wide).
  final String rowFormat = "%-50s %-10s";
  for (String id : ids) {
    final HAServiceTarget svc = resolveTarget(id);
    final String address =
        svc.getAddress().getHostName() + ":" + svc.getAddress().getPort();
    try {
      final HAServiceProtocol proxy =
          svc.getProxy(getConf(), rpcTimeoutForChecks);
      out.println(String.format(rowFormat, address,
          proxy.getServiceStatus().getState()));
    } catch (IOException e) {
      // An unreachable target is reported in-line; the loop keeps going.
      out.println(String.format(rowFormat, address,
          "Failed to connect: " + e.getMessage()));
    }
  }
  return 0;
}
if (oldActive.getAddress().equals(localTarget.getAddress())) { LOG.info("Local node " + localTarget + " is already active. " + "No need to failover. Returning success."); ZKFCProtocol oldZkfc = oldActive.getZKFCProxy(conf, timeout); oldZkfc.cedeActive(timeout);
boolean forceActive) throws FailoverFailedException { Preconditions.checkArgument(fromSvc.getFencer() != null, "failover requires a fencer"); preFailoverChecks(fromSvc, toSvc, forceActive); if (!fromSvc.getFencer().fence(fromSvc)) { throw new FailoverFailedException("Unable to fence " + fromSvc + ". Fencing failed."); try { HAServiceProtocolHelper.transitionToActive( toSvc.getProxy(conf, rpcTimeoutToNewActive), createReqInfo()); } catch (ServiceFailedException sfe) {
/**
 * Encodes a target as the byte form of an {@code ActiveNodeInfo} message.
 *
 * <p>Host name, port and ZKFC port are read from {@code target}; the
 * nameservice ID and namenode ID are read from {@code localNNTarget}.
 *
 * @param target the target whose addresses are encoded
 * @return the serialized {@code ActiveNodeInfo} bytes
 */
@Override
protected byte[] targetToData(HAServiceTarget target) {
  final InetSocketAddress serviceAddr = target.getAddress();
  return ActiveNodeInfo.newBuilder()
      .setHostname(serviceAddr.getHostName())
      .setPort(serviceAddr.getPort())
      .setZkfcPort(target.getZKFCAddress().getPort())
      // NOTE(review): IDs come from localNNTarget, not from the argument --
      // presumably the two are always the same node; confirm with callers.
      .setNameserviceId(localNNTarget.getNameServiceId())
      .setNamenodeId(localNNTarget.getNameNodeId())
      .build()
      .toByteArray();
}
private void doFence(HAServiceTarget target) { LOG.info("Should fence: " + target); boolean gracefulWorked = new FailoverController(conf, RequestSource.REQUEST_BY_ZKFC).tryGracefulFence(target); if (gracefulWorked) { // It's possible that it's in standby but just about to go into active, // no? Is there some race here? LOG.info("Successfully transitioned " + target + " to standby " + "state without fencing"); return; } try { target.checkFencingConfigured(); } catch (BadFencingConfigurationException e) { LOG.error("Couldn't fence old active " + target, e); recordActiveAttempt(new ActiveAttemptRecord(false, "Unable to fence old active")); throw new RuntimeException(e); } if (!target.getFencer().fence(target)) { throw new RuntimeException("Unable to fence " + target); } }
/**
 * Collects the parameters exposed to fencing implementations.
 *
 * <p>A fresh map is created on every call and filled in by
 * {@link #addFencingParameters(Map)}.
 *
 * @return a newly created map holding the fencing parameters
 */
public final Map<String, String> getFencingParameters() {
  final Map<String, String> params = Maps.newHashMap();
  addFencingParameters(params);
  return params;
}
/**
 * Decides whether the HA state of the target may be managed manually.
 *
 * <p>When automatic failover is enabled for the target, manual management
 * is refused (with an explanation on the error stream) unless the request
 * source is {@code REQUEST_BY_USER_FORCED}, in which case a warning is
 * logged and the operation is allowed. Targets without automatic failover
 * are always allowed.
 *
 * @param target the target to check
 * @return true if manual state management is allowed
 */
private boolean checkManualStateManagementOK(HAServiceTarget target) {
  if (!target.isAutoFailoverEnabled()) {
    return true;
  }

  if (requestSource == RequestSource.REQUEST_BY_USER_FORCED) {
    LOG.warn("Proceeding with manual HA state management even though\n" +
        "automatic failover is enabled for " + target);
    return true;
  }

  errOut.println(
      "Automatic failover is enabled for " + target + "\n" +
      "Refusing to manually manage HA state, since it may cause\n" +
      "a split-brain scenario or other incorrect state.\n" +
      "If you are very sure you know what you are doing, please \n" +
      "specify the --" + FORCEMANUAL + " flag.");
  return false;
}
@Override protected HAServiceTarget resolveTarget(String nnId) { HAServiceTarget target = super.resolveTarget(nnId); HAServiceTarget spy = Mockito.spy(target); // OVerride the target to return our mock protocol try { Mockito.doReturn(mockProtocol).when(spy).getProxy( Mockito.<Configuration>any(), Mockito.anyInt()); Mockito.doReturn(mockZkfcProtocol).when(spy).getZKFCProxy( Mockito.<Configuration>any(), Mockito.anyInt()); } catch (IOException e) { throw new AssertionError(e); // mock setup doesn't really throw } return spy; } };
/** * @return a proxy to the ZKFC which is associated with this HA service. */ public ZKFCProtocol getZKFCProxy(Configuration conf, int timeoutMs) throws IOException { Configuration confCopy = new Configuration(conf); // Lower the timeout so we quickly fail to connect confCopy.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, 1); SocketFactory factory = NetUtils.getDefaultSocketFactory(confCopy); return new ZKFCProtocolClientSideTranslatorPB( getZKFCAddress(), confCopy, factory, timeoutMs); }
/**
 * Adds information about the target to the environment of the subprocess.
 *
 * <p>Each fencing parameter of the target becomes one environment entry
 * whose name is {@code TARGET_PREFIX} followed by the parameter key, with
 * every '.' in the resulting name replaced by '_'.
 *
 * @param target the target whose fencing parameters are exported
 * @param environment the environment map to add the entries to
 */
private void addTargetInfoAsEnvVars(HAServiceTarget target,
    Map<String, String> environment) {
  for (Map.Entry<String, String> param :
      target.getFencingParameters().entrySet()) {
    // The replacement runs over the whole name, prefix included.
    final String envName =
        (TARGET_PREFIX + param.getKey()).replace('.', '_');
    environment.put(envName, param.getValue());
  }
}
}
localTarget.checkFencingConfigured(); } catch (BadFencingConfigurationException e) { LOG.error("Fencing is not configured for " + localTarget + ".\n" +
if (oldActive.getAddress().equals(localTarget.getAddress())) { LOG.info("Local node " + localTarget + " is already active. " + "No need to failover. Returning success."); ZKFCProtocol oldZkfc = oldActive.getZKFCProxy(conf, timeout); oldZkfc.cedeActive(timeout);
boolean forceActive) throws FailoverFailedException { Preconditions.checkArgument(fromSvc.getFencer() != null, "failover requires a fencer"); preFailoverChecks(fromSvc, toSvc, forceActive); if (!fromSvc.getFencer().fence(fromSvc)) { throw new FailoverFailedException("Unable to fence " + fromSvc + ". Fencing failed."); try { HAServiceProtocolHelper.transitionToActive( toSvc.getProxy(conf, rpcTimeoutToNewActive), createReqInfo()); } catch (ServiceFailedException sfe) {
HAServiceProtocol toSvc; if (from.getAddress().equals(target.getAddress())) { throw new FailoverFailedException( "Can't failover a service to itself"); toSvc = target.getProxy(conf, rpcTimeoutToNewActive); toSvcStatus = toSvc.getServiceStatus(); } catch (IOException e) {
/**
 * Encodes a target as the byte form of an {@code ActiveNodeInfo} message.
 *
 * <p>Host name, port and ZKFC port are read from {@code target}; the
 * nameservice ID and namenode ID are read from {@code localNNTarget}.
 *
 * @param target the target whose addresses are encoded
 * @return the serialized {@code ActiveNodeInfo} bytes
 */
@Override
protected byte[] targetToData(HAServiceTarget target) {
  final InetSocketAddress serviceAddr = target.getAddress();
  return ActiveNodeInfo.newBuilder()
      .setHostname(serviceAddr.getHostName())
      .setPort(serviceAddr.getPort())
      .setZkfcPort(target.getZKFCAddress().getPort())
      // NOTE(review): IDs come from localNNTarget, not from the argument --
      // presumably the two are always the same node; confirm with callers.
      .setNameserviceId(localNNTarget.getNameServiceId())
      .setNamenodeId(localNNTarget.getNameNodeId())
      .build()
      .toByteArray();
}