Converge divergent shard-id persisted in nodes.conf to primary's shard id (#2174)

Fixes #2171

Handle divergent shard-id across primary and replica from nodes.conf and
reconcile all the nodes in the shard to the primary node's shard-id.

---------

Signed-off-by: Harkrishn Patro <harkrisp@amazon.com>
Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
This commit is contained in:
Harkrishn Patro 2025-06-20 15:07:13 -07:00 committed by Viktor Söderqvist
parent d462c7d977
commit 3dfcc504ac
7 changed files with 48 additions and 6 deletions

View File

@ -293,10 +293,16 @@ int auxShardIdSetter(clusterNode *n, void *value, size_t length) {
}
memcpy(n->shard_id, value, CLUSTER_NAMELEN);
/* if n already has replicas, make sure they all agree
* on the shard id */
* on the shard id. If not, update them. */
for (int i = 0; i < n->num_replicas; i++) {
if (memcmp(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) {
return C_ERR;
serverLog(LL_NOTICE,
"Node %.40s has a different shard id (%.40s) than its primary's shard id %.40s (%.40s). "
"Updating replica's shard id to match primary's shard id.",
n->replicas[i]->name, n->replicas[i]->shard_id, n->name, n->shard_id);
clusterRemoveNodeFromShard(n->replicas[i]);
memcpy(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN);
clusterAddNodeToShard(n->shard_id, n->replicas[i]);
}
}
clusterAddNodeToShard(value, n);
@ -696,10 +702,16 @@ int clusterLoadConfig(char *filename) {
clusterAddNodeToShard(primary->shard_id, n);
} else if (clusterGetNodesInMyShard(primary) != NULL &&
memcmp(primary->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) {
/* If the primary has been added to a shard, make sure this
* node has the same persisted shard id as the primary. */
sdsfreesplitres(argv, argc);
goto fmterr;
/* If the primary has been added to a shard and this replica has
* a different shard id stored in nodes.conf, update it to match
* the primary instead of aborting the startup. */
serverLog(LL_NOTICE,
"Node %.40s has a different shard id (%.40s) than its primary %.40s (%.40s). "
"Updating replica's shard id to match primary's shard id.",
n->name, n->shard_id, primary->name, primary->shard_id);
clusterRemoveNodeFromShard(n);
memcpy(n->shard_id, primary->shard_id, CLUSTER_NAMELEN);
clusterAddNodeToShard(primary->shard_id, n);
}
n->replicaof = primary;
clusterNodeAddReplica(primary, n);

View File

@ -0,0 +1,3 @@
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
vars currentEpoch 13 lastVoteEpoch 9

View File

@ -0,0 +1,3 @@
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383
vars currentEpoch 13 lastVoteEpoch 9

View File

@ -0,0 +1,3 @@
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
vars currentEpoch 13 lastVoteEpoch 9

View File

@ -0,0 +1,3 @@
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383
vars currentEpoch 13 lastVoteEpoch 9

View File

@ -288,6 +288,7 @@ proc cleanup {} {
if {!$::quiet} {puts -nonewline "Cleanup: may take some time... "}
flush stdout
catch {exec rm -rf {*}[glob tests/tmp/valkey.conf.*]}
catch {exec rm -rf {*}[glob tests/tmp/nodes.conf.*]}
catch {exec rm -rf {*}[glob tests/tmp/server*.*]}
catch {exec rm -rf {*}[glob tests/tmp/*.acl.*]}
if {!$::quiet} {puts "OK"}

View File

@ -0,0 +1,17 @@
# Starts a cluster-enabled server from each divergent-shard fixture and checks
# that the shard id reported by CLUSTER MYSHARDID shows up on exactly two lines
# of the cluster config file the server was started with.
tags {external:skip cluster singledb} {
    # Pin ::singledb to 1 while this file runs; the previous value is put back below.
    set saved_singledb $::singledb
    set ::singledb 1

    test "divergent cluster shardid conflict" {
        # Fixtures divergent-shard-1.conf through divergent-shard-4.conf are tried in turn.
        foreach i {1 2 3 4} {
            if {$::verbose} {
                puts "Testing for tests/assets/divergent-shard-$i.conf"
                flush stdout
            }

            # Stage the fixture under one fixed name so the server overrides
            # below are the same for every iteration.
            exec cp -f tests/assets/divergent-shard-$i.conf tests/tmp/nodes.conf.divergent

            # NOTE(review): the "../" prefix presumably resolves to tests/tmp
            # relative to the server's working directory - confirm.
            start_server {overrides {"cluster-enabled" "yes" "cluster-config-file" "../nodes.conf.divergent"}} {
                set my_shard [r CLUSTER MYSHARDID]
                # grep -c prints the number of lines mentioning the shard id.
                set matches [exec grep -c $my_shard tests/tmp/nodes.conf.divergent]
                assert_equal $matches 2 "Expect shard ID to be present twice in the configuration file"
            }
        }
    }

    set ::singledb $saved_singledb
}