/** * Connect to the specified node with the default connection profile * * @param node the node to connect to */ public void connectToNode(DiscoveryNode node) throws ConnectTransportException { connectToNode(node, null); }
/** * Establishes the node connections. If validateInHandshake is set to true, the connection will fail if * node returned in the handshake response is different than the discovery node. */ List<DiscoveryNode> establishNodeConnections(Set<DiscoveryNode> nodes) { for (Iterator<DiscoveryNode> it = nodes.iterator(); it.hasNext(); ) { DiscoveryNode node = it.next(); if (!transportService.nodeConnected(node)) { try { logger.trace("connecting to node [{}]", node); transportService.connectToNode(node); } catch (Exception e) { it.remove(); logger.debug(() -> new ParameterizedMessage("failed to connect to discovered node [{}]", node), e); } } } return Collections.unmodifiableList(new ArrayList<>(nodes)); } }
void validateAndConnectIfNeeded(DiscoveryNode node) { assert nodeLocks.isHeldByCurrentThread(node) : "validateAndConnectIfNeeded must be called under lock"; if (lifecycle.stoppedOrClosed() || nodes.containsKey(node) == false) { // we double check existence of node since connectToNode might take time... // nothing to do } else { try { // connecting to an already connected node is a noop transportService.connectToNode(node); nodes.put(node, 0); } catch (Exception e) { Integer nodeFailureCount = nodes.get(node); assert nodeFailureCount != null : node + " didn't have a counter in nodes map"; nodeFailureCount = nodeFailureCount + 1; // log every 6th failure if ((nodeFailureCount % 6) == 1) { final int finalNodeFailureCount = nodeFailureCount; logger.warn(() -> new ParameterizedMessage( "failed to connect to node {} (tried [{}] times)", node, finalNodeFailureCount), e); } nodes.put(node, nodeFailureCount); } } }
@Override protected void handleTransportDisconnect(DiscoveryNode node) { NodeFD nodeFD = nodesFD.remove(node); if (nodeFD == null) { return; } if (connectOnNetworkDisconnect) { NodeFD fd = new NodeFD(node); try { transportService.connectToNode(node); nodesFD.put(node, fd); // we use schedule with a 0 time value to run the pinger on the pool as it will run on later threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, fd); } catch (Exception e) { logger.trace("[node ] [{}] transport disconnected (with verified connect)", node); // clean up if needed, just to be safe.. nodesFD.remove(node, fd); notifyNodeFailure(node, "transport disconnected (with verified connect)"); } } else { logger.trace("[node ] [{}] transport disconnected", node); notifyNodeFailure(node, "transport disconnected"); } }
void handleJoinRequest(final DiscoveryNode node, final ClusterState state, final MembershipAction.JoinCallback callback) { if (nodeJoinController == null) { throw new IllegalStateException("discovery module is not yet started"); } else { // we do this in a couple of places including the cluster update thread. This one here is really just best effort // to ensure we fail as fast as possible. onJoinValidators.stream().forEach(a -> a.accept(node, state)); if (state.getBlocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) { MembershipAction.ensureMajorVersionBarrier(node.getVersion(), state.getNodes().getMinNodeVersion()); } // try and connect to the node, if it fails, we can raise an exception back to the client... transportService.connectToNode(node); // validate the join request, will throw a failure if it fails, which will get back to the // node calling the join request try { membership.sendValidateJoinRequestBlocking(node, state, joinTimeout); } catch (Exception e) { logger.warn(() -> new ParameterizedMessage("failed to validate incoming join request from node [{}]", node), e); callback.onFailure(new IllegalStateException("failure when sending a validation request to node", e)); return; } nodeJoinController.handleJoinRequest(node, callback); } }
try { transportService.connectToNode(masterNode); } catch (Exception e) { logger.warn(() -> new ParameterizedMessage("failed to connect to master [{}], retrying...", masterNode), e);
@Override protected void handleTransportDisconnect(DiscoveryNode node) { synchronized (masterNodeMutex) { if (!node.equals(this.masterNode)) { return; } if (connectOnNetworkDisconnect) { try { transportService.connectToNode(node); // if all is well, make sure we restart the pinger if (masterPinger != null) { masterPinger.stop(); } this.masterPinger = new MasterPinger(); // we use schedule with a 0 time value to run the pinger on the pool as it will run on later threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, masterPinger); } catch (Exception e) { logger.trace("[master] [{}] transport disconnected (with verified connect)", masterNode); notifyMasterFailure(masterNode, null, "transport disconnected (with verified connect)"); } } else { logger.trace("[master] [{}] transport disconnected", node); notifyMasterFailure(node, null, "transport disconnected"); } } }
private void handleAnotherMaster(ClusterState localClusterState, final DiscoveryNode otherMaster, long otherClusterStateVersion, String reason) { assert localClusterState.nodes().isLocalNodeElectedMaster() : "handleAnotherMaster called but current node is not a master"; assert Thread.holdsLock(stateMutex); if (otherClusterStateVersion > localClusterState.version()) { rejoin("zen-disco-discovered another master with a new cluster_state [" + otherMaster + "][" + reason + "]"); } else { // TODO: do this outside mutex logger.warn("discovered [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster ([{}])", otherMaster, otherMaster, reason); try { // make sure we're connected to this node (connect to node does nothing if we're already connected) // since the network connections are asymmetric, it may be that we received a state but have disconnected from the node // in the past (after a master failure, for example) transportService.connectToNode(otherMaster); transportService.sendRequest(otherMaster, DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequest(localClusterState.nodes().getLocalNodeId()), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleException(TransportException exp) { logger.warn(() -> new ParameterizedMessage("failed to send rejoin request to [{}]", otherMaster), exp); } }); } catch (Exception e) { logger.warn(() -> new ParameterizedMessage("failed to send rejoin request to [{}]", otherMaster), e); } } }
/** * Connect to the specified node with the default connection profile * * @param node the node to connect to */ public void connectToNode(DiscoveryNode node) throws ConnectTransportException { connectToNode(node, null); }
public void connectToNode(DiscoveryNode node) throws ConnectTransportException { connectToNode(node, null); }
/** * validates a set of potentially newly discovered nodes and returns an immutable * list of the nodes that has passed. */ protected List<DiscoveryNode> validateNewNodes(Set<DiscoveryNode> nodes) { for (Iterator<DiscoveryNode> it = nodes.iterator(); it.hasNext(); ) { DiscoveryNode node = it.next(); if (!transportService.nodeConnected(node)) { try { logger.trace("connecting to node [{}]", node); transportService.connectToNode(node); } catch (Exception e) { it.remove(); logger.debug((Supplier<?>) () -> new ParameterizedMessage("failed to connect to discovered node [{}]", node), e); } } } return Collections.unmodifiableList(new ArrayList<>(nodes)); }
/** * Establishes the node connections. If validateInHandshake is set to true, the connection will fail if * node returned in the handshake response is different than the discovery node. */ List<DiscoveryNode> establishNodeConnections(Set<DiscoveryNode> nodes) { for (Iterator<DiscoveryNode> it = nodes.iterator(); it.hasNext(); ) { DiscoveryNode node = it.next(); if (!transportService.nodeConnected(node)) { try { logger.trace("connecting to node [{}]", node); transportService.connectToNode(node); } catch (Exception e) { it.remove(); logger.debug(() -> new ParameterizedMessage("failed to connect to discovered node [{}]", node), e); } } } return Collections.unmodifiableList(new ArrayList<>(nodes)); } }
/** * validates a set of potentially newly discovered nodes and returns an immutable * list of the nodes that has passed. */ protected List<DiscoveryNode> validateNewNodes(Set<DiscoveryNode> nodes) { for (Iterator<DiscoveryNode> it = nodes.iterator(); it.hasNext(); ) { DiscoveryNode node = it.next(); if (!transportService.nodeConnected(node)) { try { logger.trace("connecting to node [{}]", node); transportService.connectToNode(node); } catch (Throwable e) { it.remove(); logger.debug("failed to connect to discovered node [" + node + "]", e); } } } return Collections.unmodifiableList(new ArrayList<>(nodes)); }
void validateAndConnectIfNeeded(DiscoveryNode node) { assert nodeLocks.isHeldByCurrentThread(node) : "validateAndConnectIfNeeded must be called under lock"; if (lifecycle.stoppedOrClosed() || nodes.containsKey(node) == false) { // we double check existence of node since connectToNode might take time... // nothing to do } else { try { // connecting to an already connected node is a noop transportService.connectToNode(node); nodes.put(node, 0); } catch (Exception e) { Integer nodeFailureCount = nodes.get(node); assert nodeFailureCount != null : node + " didn't have a counter in nodes map"; nodeFailureCount = nodeFailureCount + 1; // log every 6th failure if ((nodeFailureCount % 6) == 1) { final int finalNodeFailureCount = nodeFailureCount; logger.warn(() -> new ParameterizedMessage( "failed to connect to node {} (tried [{}] times)", node, finalNodeFailureCount), e); } nodes.put(node, nodeFailureCount); } } }
@Override protected void handleTransportDisconnect(DiscoveryNode node) { NodeFD nodeFD = nodesFD.remove(node); if (nodeFD == null) { return; } if (connectOnNetworkDisconnect) { NodeFD fd = new NodeFD(node); try { transportService.connectToNode(node); nodesFD.put(node, fd); // we use schedule with a 0 time value to run the pinger on the pool as it will run on later threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, fd); } catch (Exception e) { logger.trace("[node ] [{}] transport disconnected (with verified connect)", node); // clean up if needed, just to be safe.. nodesFD.remove(node, fd); notifyNodeFailure(node, "transport disconnected (with verified connect)"); } } else { logger.trace("[node ] [{}] transport disconnected", node); notifyNodeFailure(node, "transport disconnected"); } }
@Override protected void handleTransportDisconnect(DiscoveryNode node) { NodeFD nodeFD = nodesFD.remove(node); if (nodeFD == null) { return; } if (connectOnNetworkDisconnect) { NodeFD fd = new NodeFD(node); try { transportService.connectToNode(node); nodesFD.put(node, fd); // we use schedule with a 0 time value to run the pinger on the pool as it will run on later threadPool.schedule(TimeValue.timeValueMillis(0), ThreadPool.Names.SAME, fd); } catch (Exception e) { logger.trace("[node ] [{}] transport disconnected (with verified connect)", node); // clean up if needed, just to be safe.. nodesFD.remove(node, fd); notifyNodeFailure(node, "transport disconnected (with verified connect)"); } } else { logger.trace("[node ] [{}] transport disconnected", node); notifyNodeFailure(node, "transport disconnected"); } }
private void innerStart(final DiscoveryNode masterNode) { this.masterNode = masterNode; this.retryCount = 0; this.notifiedMasterFailure.set(false); // try and connect to make sure we are connected try { transportService.connectToNode(masterNode); } catch (final Exception e) { // notify master failure (which stops also) and bail.. notifyMasterFailure(masterNode, "failed to perform initial connect [" + e.getMessage() + "]"); return; } if (masterPinger != null) { masterPinger.stop(); } this.masterPinger = new MasterPinger(); // we start pinging slightly later to allow the chosen master to complete it's own master election threadPool.schedule(pingInterval, ThreadPool.Names.SAME, masterPinger); }
void handleJoinRequest(final DiscoveryNode node, final ClusterState state, final MembershipAction.JoinCallback callback) { if (!transportService.addressSupported(node.getAddress().getClass())) { // TODO, what should we do now? Maybe inform that node that its crap? logger.warn("received a wrong address type from [{}], ignoring...", node); } else if (nodeJoinController == null) { throw new IllegalStateException("discovery module is not yet started"); } else { // we do this in a couple of places including the cluster update thread. This one here is really just best effort // to ensure we fail as fast as possible. MembershipAction.ensureIndexCompatibility(node.getVersion(), state.getMetaData()); // try and connect to the node, if it fails, we can raise an exception back to the client... transportService.connectToNode(node); // validate the join request, will throw a failure if it fails, which will get back to the // node calling the join request try { membership.sendValidateJoinRequestBlocking(node, state, joinTimeout); } catch (Exception e) { logger.warn((Supplier<?>) () -> new ParameterizedMessage("failed to validate incoming join request from node [{}]", node), e); callback.onFailure(new IllegalStateException("failure when sending a validation request to node", e)); return; } nodeJoinController.handleJoinRequest(node, callback); } }
@Override protected void doRun() throws Exception { // make sure we're connected to this node (connect to node does nothing if we're already connected) // since the network connections are asymmetric, it may be that we received a state but have disconnected from the node // in the past (after a master failure, for example) transportService.connectToNode(otherMaster); transportService.sendRequest(otherMaster, DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequest(localNode().getId()), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleException(TransportException exp) { logger.warn((Supplier<?>) () -> new ParameterizedMessage("failed to send rejoin request to [{}]", otherMaster), exp); } }); } });
private void handleAnotherMaster(ClusterState localClusterState, final DiscoveryNode otherMaster, long otherClusterStateVersion, String reason) { assert localClusterState.nodes().isLocalNodeElectedMaster() : "handleAnotherMaster called but current node is not a master"; assert Thread.holdsLock(stateMutex); if (otherClusterStateVersion > localClusterState.version()) { rejoin("zen-disco-discovered another master with a new cluster_state [" + otherMaster + "][" + reason + "]"); } else { // TODO: do this outside mutex logger.warn("discovered [{}] which is also master but with an older cluster_state, telling [{}] to rejoin the cluster ([{}])", otherMaster, otherMaster, reason); try { // make sure we're connected to this node (connect to node does nothing if we're already connected) // since the network connections are asymmetric, it may be that we received a state but have disconnected from the node // in the past (after a master failure, for example) transportService.connectToNode(otherMaster); transportService.sendRequest(otherMaster, DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequest(localClusterState.nodes().getLocalNodeId()), new EmptyTransportResponseHandler(ThreadPool.Names.SAME) { @Override public void handleException(TransportException exp) { logger.warn(() -> new ParameterizedMessage("failed to send rejoin request to [{}]", otherMaster), exp); } }); } catch (Exception e) { logger.warn(() -> new ParameterizedMessage("failed to send rejoin request to [{}]", otherMaster), e); } } }