mirror of https://github.com/valkey-io/valkey
Sentinel: fix regression requiring "+failover" ACL in failover path (#2780)
Since Valkey Sentinel 9.0, sentinel tries to abort an ongoing failover when changing the role of a monitored instance. Since the result of the command is ignored, the "FAILOVER ABORT" command is sent irrespective of the actual failover status. However, when using the documented pre 9.0 ACLs for a dedicated sentinel user, the FAILOVER command is not allowed and _all_ failover cases fail. (Additionally, the necessary ACL adaptation was not communicated well.) Address this by: - Updating the documentation in "sentinel.conf" to reflect the need for an adapted ACL - Only abort a failover when sentinel detected an ongoing (probably stuck) failover. This means that standard failover and manual failover continue to work with unchanged pre 9.0 ACLs. Only the new "SENTINEL FAILOVER COORDINATED" requires to adapt the ACL on all Valkey nodes. - Actually use a dedicated sentinel user and ACLs when testing standard failover, manual failover, and manual coordinated failover. Fixes #2779 Signed-off-by: Simon Baatz <gmbnomis@gmail.com>
This commit is contained in:
parent
189c69e315
commit
6cbc1a31d7
|
|
@ -119,8 +119,11 @@ sentinel monitor mymaster 127.0.0.1 6379 2
|
|||
# In the Valkey servers side, the ACL to provide just minimal access to
|
||||
# Sentinel instances, should be configured along the following lines:
|
||||
#
|
||||
# user sentinel-user >somepassword +client +subscribe +publish \
|
||||
# user sentinel-user >somepassword +subscribe +publish +failover +script|kill \
|
||||
# +ping +info +multi +slaveof +config +client +exec &__sentinel__:hello on
|
||||
#
|
||||
# Since Valkey Sentinel 9.0, the sentinel user requires the +failover permission
|
||||
# on all monitored Valkey instances for proper operation.
|
||||
|
||||
# sentinel down-after-milliseconds <master-name> <milliseconds>
|
||||
#
|
||||
|
|
|
|||
|
|
@ -4859,7 +4859,7 @@ int sentinelSendReplicaOf(sentinelValkeyInstance *ri, const sentinelAddr *addr)
|
|||
if (retval == C_ERR) return retval;
|
||||
ri->link->pending_commands++;
|
||||
|
||||
if (ri->monitored_instance_failover_state != SENTINEL_MONITORED_INSTANCE_FAILOVER_NS) {
|
||||
if (ri->monitored_instance_failover_state == SENTINEL_MONITORED_INSTANCE_FAILOVER) {
|
||||
retval = valkeyAsyncCommand(ri->link->cc, sentinelDiscardReplyCallback, ri, "%s ABORT",
|
||||
sentinelInstanceMapCommand(ri, "FAILOVER"));
|
||||
if (retval == C_ERR) return retval;
|
||||
|
|
|
|||
|
|
@ -6,6 +6,9 @@ foreach_sentinel_id id {
|
|||
S $id sentinel debug default-down-after 1000
|
||||
}
|
||||
|
||||
set ::user "sentinel-user"
|
||||
set ::password "sentinel-password"
|
||||
|
||||
if {$::simulate_error} {
|
||||
test "This test will fail" {
|
||||
fail "Simulated error"
|
||||
|
|
@ -76,6 +79,10 @@ test "SENTINEL SIMULATE-FAILURE HELP list supported flags" {
|
|||
}
|
||||
|
||||
test "Basic failover works if the primary is down" {
|
||||
# Explicitly forbid the FAILOVER command to ensure backward compatibility with
|
||||
# ACLs that were documented for Valkey < 9.0
|
||||
configure_sentinel_user_acl $::user $::password 0
|
||||
|
||||
set old_port [RPort $master_id]
|
||||
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
|
||||
assert {[lindex $addr 1] == $old_port}
|
||||
|
|
@ -116,6 +123,7 @@ test "The old primary eventually gets reconfigured as a slave" {
|
|||
} else {
|
||||
fail "Old master not reconfigured as slave of new master"
|
||||
}
|
||||
reset_sentinel_user_acl $::user
|
||||
}
|
||||
|
||||
test "ODOWN is not possible without N (quorum) Sentinels reports" {
|
||||
|
|
|
|||
|
|
@ -38,31 +38,6 @@ proc server_reset_acl {id} {
|
|||
assert_equal {OK} [R $id CONFIG SET primaryauth ""]
|
||||
}
|
||||
|
||||
proc verify_sentinel_connect_replicas {id} {
|
||||
foreach replica [S $id SENTINEL REPLICAS mymaster] {
|
||||
if {[string match "*disconnected*" [dict get $replica flags]]} {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
proc wait_for_sentinels_connect_servers { {is_connect 1} } {
|
||||
foreach_sentinel_id id {
|
||||
wait_for_condition 1000 50 {
|
||||
[string match "*disconnected*" [dict get [S $id SENTINEL PRIMARY mymaster] flags]] != $is_connect
|
||||
} else {
|
||||
fail "At least some sentinel can't connect to master"
|
||||
}
|
||||
|
||||
wait_for_condition 1000 50 {
|
||||
[verify_sentinel_connect_replicas $id] == $is_connect
|
||||
} else {
|
||||
fail "At least some sentinel can't connect to replica"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
test "Sentinels (re)connection following SENTINEL SET myprimary auth-pass" {
|
||||
# 3 types of sentinels to test:
|
||||
# (re)started while primary changed pwd. Manage to connect only after setting pwd
|
||||
|
|
|
|||
|
|
@ -8,7 +8,13 @@ foreach_sentinel_id id {
|
|||
S $id sentinel debug publish-period 1000
|
||||
}
|
||||
|
||||
set ::user "sentinel-user"
|
||||
set ::password "sentinel-password"
|
||||
|
||||
|
||||
test "Manual failover works" {
|
||||
configure_sentinel_user_acl $::user $::password
|
||||
|
||||
set old_port [RPort $master_id]
|
||||
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
|
||||
assert {[lindex $addr 1] == $old_port}
|
||||
|
|
@ -60,6 +66,7 @@ test "The old primary eventually gets reconfigured as a slave" {
|
|||
} else {
|
||||
fail "Old master not reconfigured as slave of new master"
|
||||
}
|
||||
reset_sentinel_user_acl $::user
|
||||
}
|
||||
|
||||
foreach flag {crash-after-election crash-after-promotion} {
|
||||
|
|
|
|||
|
|
@ -2,12 +2,17 @@
|
|||
|
||||
source "../tests/includes/init-tests.tcl"
|
||||
|
||||
set ::user "sentinel-user"
|
||||
set ::password "sentinel-password"
|
||||
|
||||
foreach_sentinel_id id {
|
||||
S $id sentinel debug info-period 2000
|
||||
S $id sentinel debug publish-period 1000
|
||||
}
|
||||
|
||||
test "Manual coordinated failover works" {
|
||||
configure_sentinel_user_acl $::user $::password
|
||||
|
||||
set old_port [RPort $master_id]
|
||||
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
|
||||
assert {[lindex $addr 1] == $old_port}
|
||||
|
|
@ -133,6 +138,7 @@ test "No change after failed failover: All sentinels agree on primary" {
|
|||
foreach_sentinel_id id {
|
||||
assert {[lindex [S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] 1] == [lindex $addr 1]}
|
||||
}
|
||||
reset_sentinel_user_acl $::user
|
||||
}
|
||||
|
||||
foreach flag {crash-after-election crash-after-promotion} {
|
||||
|
|
|
|||
|
|
@ -34,3 +34,60 @@ proc verify_sentinel_auto_discovery {} {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
proc verify_sentinel_connect_replicas {id} {
|
||||
foreach replica [S $id SENTINEL REPLICAS mymaster] {
|
||||
if {[string match "*disconnected*" [dict get $replica flags]]} {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
return 1
|
||||
}
|
||||
|
||||
proc wait_for_sentinels_connect_servers { {is_connect 1} } {
|
||||
foreach_sentinel_id id {
|
||||
wait_for_condition 1000 50 {
|
||||
[string match "*disconnected*" [dict get [S $id SENTINEL PRIMARY mymaster] flags]] != $is_connect
|
||||
} else {
|
||||
fail "At least some sentinel can't connect to master"
|
||||
}
|
||||
|
||||
wait_for_condition 1000 50 {
|
||||
[verify_sentinel_connect_replicas $id] == $is_connect
|
||||
} else {
|
||||
fail "At least some sentinel can't connect to replica"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
proc configure_sentinel_user_acl {user password {allow_failover 1}} {
|
||||
foreach_valkey_id id {
|
||||
R $id ACL SETUSER $user >$password +subscribe +publish [expr {$allow_failover ? "+failover" : "-failover"}] +script|kill +ping +info +multi +slaveof +config +client +exec &__sentinel__:hello ON
|
||||
# Ensure default user cannot be used for failover
|
||||
R $id ACL SETUSER default -failover -slaveof
|
||||
}
|
||||
foreach_sentinel_id id {
|
||||
S $id SENTINEL SET mymaster auth-user $user
|
||||
S $id SENTINEL SET mymaster auth-pass $password
|
||||
S $id SENTINEL FLUSHCONFIG
|
||||
}
|
||||
foreach_valkey_id id {
|
||||
R $id CLIENT KILL USER default SKIPME yes
|
||||
}
|
||||
wait_for_sentinels_connect_servers
|
||||
}
|
||||
|
||||
proc reset_sentinel_user_acl {user} {
|
||||
foreach_sentinel_id id {
|
||||
S $id SENTINEL SET mymaster auth-user ""
|
||||
S $id SENTINEL SET mymaster auth-pass ""
|
||||
S $id SENTINEL FLUSHCONFIG
|
||||
}
|
||||
foreach_valkey_id id {
|
||||
R $id ACL SETUSER default +failover +slaveof
|
||||
R $id ACL DELUSER $user
|
||||
R $id CONFIG REWRITE
|
||||
}
|
||||
wait_for_sentinels_connect_servers
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue