mirror of https://github.com/valkey-io/valkey
Converge divergent shard-id persisted in nodes.conf to primary's shard id (#2174)
Fixes #2171. Handles a shard-id that diverges between a primary and its replica in nodes.conf by reconciling all the nodes in the shard to the primary node's shard-id.
---------
Signed-off-by: Harkrishn Patro <harkrisp@amazon.com>
Signed-off-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
This commit is contained in:
parent
d462c7d977
commit
3dfcc504ac
|
|
@ -293,10 +293,16 @@ int auxShardIdSetter(clusterNode *n, void *value, size_t length) {
|
|||
}
|
||||
memcpy(n->shard_id, value, CLUSTER_NAMELEN);
|
||||
/* if n already has replicas, make sure they all agree
|
||||
* on the shard id */
|
||||
* on the shard id. If not, update them. */
|
||||
for (int i = 0; i < n->num_replicas; i++) {
|
||||
if (memcmp(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) {
|
||||
return C_ERR;
|
||||
serverLog(LL_NOTICE,
|
||||
"Node %.40s has a different shard id (%.40s) than its primary's shard id %.40s (%.40s). "
|
||||
"Updating replica's shard id to match primary's shard id.",
|
||||
n->replicas[i]->name, n->replicas[i]->shard_id, n->name, n->shard_id);
|
||||
clusterRemoveNodeFromShard(n->replicas[i]);
|
||||
memcpy(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN);
|
||||
clusterAddNodeToShard(n->shard_id, n->replicas[i]);
|
||||
}
|
||||
}
|
||||
clusterAddNodeToShard(value, n);
|
||||
|
|
@ -696,10 +702,16 @@ int clusterLoadConfig(char *filename) {
|
|||
clusterAddNodeToShard(primary->shard_id, n);
|
||||
} else if (clusterGetNodesInMyShard(primary) != NULL &&
|
||||
memcmp(primary->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) {
|
||||
/* If the primary has been added to a shard, make sure this
|
||||
* node has the same persisted shard id as the primary. */
|
||||
sdsfreesplitres(argv, argc);
|
||||
goto fmterr;
|
||||
/* If the primary has been added to a shard and this replica has
|
||||
* a different shard id stored in nodes.conf, update it to match
|
||||
* the primary instead of aborting the startup. */
|
||||
serverLog(LL_NOTICE,
|
||||
"Node %.40s has a different shard id (%.40s) than its primary %.40s (%.40s). "
|
||||
"Updating replica's shard id to match primary's shard id.",
|
||||
n->name, n->shard_id, primary->name, primary->shard_id);
|
||||
clusterRemoveNodeFromShard(n);
|
||||
memcpy(n->shard_id, primary->shard_id, CLUSTER_NAMELEN);
|
||||
clusterAddNodeToShard(primary->shard_id, n);
|
||||
}
|
||||
n->replicaof = primary;
|
||||
clusterNodeAddReplica(primary, n);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,3 @@
|
|||
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383
|
||||
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
|
||||
vars currentEpoch 13 lastVoteEpoch 9
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
|
||||
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383
|
||||
vars currentEpoch 13 lastVoteEpoch 9
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383
|
||||
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
|
||||
vars currentEpoch 13 lastVoteEpoch 9
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
|
||||
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383
|
||||
vars currentEpoch 13 lastVoteEpoch 9
|
||||
|
|
@ -288,6 +288,7 @@ proc cleanup {} {
|
|||
if {!$::quiet} {puts -nonewline "Cleanup: may take some time... "}
|
||||
flush stdout
|
||||
catch {exec rm -rf {*}[glob tests/tmp/valkey.conf.*]}
|
||||
catch {exec rm -rf {*}[glob tests/tmp/nodes.conf.*]}
|
||||
catch {exec rm -rf {*}[glob tests/tmp/server*.*]}
|
||||
catch {exec rm -rf {*}[glob tests/tmp/*.acl.*]}
|
||||
if {!$::quiet} {puts "OK"}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,17 @@
|
|||
tags {external:skip cluster singledb} {
    # Force single-db mode for the duration of this block; the previous
    # value is put back at the end.
    set saved_singledb $::singledb
    set ::singledb 1

    # Boot a cluster-enabled server from each nodes.conf fixture in which the
    # primary and its replica were persisted with different shard IDs.
    test "divergent cluster shardid conflict" {
        foreach fixture_no {1 2 3 4} {
            set fixture "tests/assets/divergent-shard-$fixture_no.conf"
            if {$::verbose} {
                puts "Testing for $fixture"
                flush stdout
            }

            # Each run starts from a fresh copy of the fixture.
            exec cp -f $fixture tests/tmp/nodes.conf.divergent

            start_server {overrides {"cluster-enabled" "yes" "cluster-config-file" "../nodes.conf.divergent"}} {
                # The shard ID reported by the server must show up on two
                # lines of the config file (grep -c counts matching lines).
                set my_shard_id [r CLUSTER MYSHARDID]
                set matching_lines [exec grep -c $my_shard_id tests/tmp/nodes.conf.divergent]
                assert_equal $matching_lines 2 "Expect shard ID to be present twice in the configuration file"
            }
        }
    }

    set ::singledb $saved_singledb
}
|
||||
Loading…
Reference in New Issue