From da3c43d76bb8667df43e53227608e1b3344d0f1c Mon Sep 17 00:00:00 2001 From: Zhijun Liao <158313558+zhijun42@users.noreply.github.com> Date: Wed, 26 Nov 2025 09:07:21 +0800 Subject: [PATCH] Additional log information for cluster accept handler and message processing (#2815) Enhance debugging for cluster logs [1] Add human node names in cluster tests so that we can easily understand which nodes we are interacting with: ``` pong packet received from: 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) from client: :0 node 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) announces that it is a primary in shard c6d1152caee49a5e70cb4b77d1549386078be603 Reconfiguring node 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) as primary for shard c6d1152caee49a5e70cb4b77d1549386078be603 Configuration change detected. Reconfiguring myself as a replica of node 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) in shard c6d1152caee49a5e70cb4b77d1549386078be603 ``` [2] Currently there are logs showing the address of incoming connections: ``` Accepting cluster node connection from 127.0.0.1:59956 Accepting cluster node connection from 127.0.0.1:59957 Accepting cluster node connection from 127.0.0.1:59958 Accepting cluster node connection from 127.0.0.1:59959 ``` but we have no idea which nodes these connections refer to. I added a logging statement when the node is set to the inbound link connection. ``` Bound cluster node 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) to connection of client 127.0.0.1:59956 ``` [3] Add a debug log when processing a packet to show the packet type, sender node name, and sender client port (this also has the benefit of telling us whether this is an inbound or outbound link). ``` pong packet received from: 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) from client: :0 ping packet received from: 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) from client: 127.0.0.1:59973 fail packet received from: 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) from client: 127.0.0.1:59973 auth-req packet received from: 92a960ffd62f1bd04efeb260b30fe9ca6b9294ed (R4) from client: 127.0.0.1:59973 ``` --------- Signed-off-by: Zhijun --- src/cluster_legacy.c | 27 +++++++++++++++++++++++--- tests/support/cluster_util.tcl | 3 +++ tests/unit/cluster/manual-failover.tcl | 10 +++++----- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index b7df963b4..a1bdbf9a5 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1708,6 +1708,17 @@ void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { serverAssert(!node->inbound_link); node->inbound_link = link; link->node = node; + if (server.verbosity <= LL_VERBOSE) { + char ip[NET_IP_STR_LEN]; + int port; + if (connAddrPeerName(link->conn, ip, sizeof(ip), &port) != -1) { + serverLog(LL_VERBOSE, "Bound cluster node %.40s (%s) to connection of client %s:%d", + node->name, node->human_nodename, ip, port); + } else { + serverLog(LL_VERBOSE, "Error resolving the inbound connection address of node %.40s (%s)", + node->name, node->human_nodename); + } + } } static void clusterConnAcceptHandler(connection *conn) { @@ -3833,11 +3844,21 @@ int clusterProcessPacket(clusterLink *link) { clusterSendPing(link, CLUSTERMSG_TYPE_PONG); } + if (server.verbosity <= LL_DEBUG) { + char ip[NET_IP_STR_LEN]; + int port; + if (connAddrPeerName(link->conn, ip, sizeof(ip), &port) != -1) { + serverLog(LL_DEBUG, "%s packet received from: %.40s (%s) from client: %s:%d", + clusterGetMessageTypeString(type), + clusterLinkGetNodeName(link), clusterLinkGetHumanNodeName(link), + ip, port); + } else { + serverLog(LL_DEBUG, "Error resolving the address of packet sender %.40s (%s)", + clusterLinkGetNodeName(link), clusterLinkGetHumanNodeName(link)); + } + } /* PING, PONG, MEET: process config information. */ if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || type == CLUSTERMSG_TYPE_MEET) { - serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type), - clusterLinkGetNodeName(link)); - if (sender && nodeInMeetState(sender)) { /* Once we get a response for MEET from the sender, we can stop sending more MEET. */ sender->flags &= ~CLUSTER_NODE_MEET; diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index ee14c5864..1dd112d3e 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -214,6 +214,9 @@ proc cluster_setup {masters replicas node_count slot_allocator replica_allocator for {set i 0} {$i < $node_count} {incr i} { R $i CLUSTER SET-CONFIG-EPOCH $config_epoch incr config_epoch + # Make it easier to understand how the server interacts with + # other nodes when reading the server logs. + R $i CONFIG SET cluster-announce-human-nodename "R$i" } # Have all nodes meet diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 56952ff00..e8871a1b2 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -478,21 +478,21 @@ start_cluster 3 1 {tags {external:skip cluster}} { # verify we print the logs. # Both importing slots and migrating slots are move to R3. - set pattern "*Failover occurred in migration source. Update importing source for slot 0 to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*Failover occurred in migration source. Update importing source for slot 0 to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 - set pattern "*Failover occurred in migration target. Slot 5462 is now being migrated to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*Failover occurred in migration target. Slot 5462 is now being migrated to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 # Both slots are move to R3. set R0_slots 5462 - set pattern "*A failover occurred in shard $R3_shardid; node $R0_nodeid () lost $R0_slots slot(s) and failed over to node $R3_nodeid*" + set pattern "*A failover occurred in shard $R3_shardid; node $R0_nodeid * lost $R0_slots slot(s) and failed over to node $R3_nodeid*" verify_log_message -1 $pattern $loglines1 verify_log_message -2 $pattern $loglines2 # Both importing slots and migrating slots are move to R3. - set pattern "*A failover occurred in migration source. Update importing source of 1 slot(s) to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*A failover occurred in migration source. Update importing source of 1 slot(s) to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 - set pattern "*A failover occurred in migration target. Update migrating target of 1 slot(s) to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*A failover occurred in migration target. Update migrating target of 1 slot(s) to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 R 1 debug disable-cluster-reconnection 0