Configurable cluster blacklist TTL (#738)

Allows cluster admins to configure the blacklist TTL as needed to allow
sufficient time for `CLUSTER FORGET` to be executed on every node in the
cluster.

Config name `cluster-blacklist-ttl`; unit seconds; default 60.

---------

Signed-off-by: Brennan Cathcart <brennancathcart@gmail.com>
This commit is contained in:
Brennan 2024-07-13 11:38:25 -07:00 committed by GitHub
parent b4ac2c406c
commit 34649bd034
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 17 additions and 7 deletions

View File

@ -1842,14 +1842,14 @@ void clusterHandleConfigEpochCollision(clusterNode *sender) {
*
* The nodes blacklist is just a way to ensure that a given node with a given
* Node ID is not re-added before some time elapsed (this time is specified
* in seconds in CLUSTER_BLACKLIST_TTL).
* in seconds by the configurable cluster-blacklist-ttl).
*
* This is useful when we want to remove a node from the cluster completely:
* when CLUSTER FORGET is called, it also puts the node into the blacklist so
* that even if we receive gossip messages from other nodes that still remember
* about the node we want to remove, we don't re-add it before some time.
*
* Currently the CLUSTER_BLACKLIST_TTL is set to 1 minute, this means
* The default blacklist ttl is 1 minute which means
* that valkey-cli has 60 seconds to send CLUSTER FORGET messages to nodes
* in the cluster without dealing with the problem of other nodes re-adding
* back the node to nodes we already sent the FORGET command to.
@ -1859,9 +1859,6 @@ void clusterHandleConfigEpochCollision(clusterNode *sender) {
* value.
* -------------------------------------------------------------------------- */
#define CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */
/* Before the addNode() or Exists() operations we always remove expired
* entries from the black list. This is an O(N) operation but it is not a
* problem since add / exists operations are called very infrequently and
@ -1893,7 +1890,7 @@ void clusterBlacklistAddNode(clusterNode *node) {
id = sdsdup(id);
}
de = dictFind(server.cluster->nodes_black_list, id);
dictSetUnsignedIntegerVal(de, time(NULL) + CLUSTER_BLACKLIST_TTL);
dictSetUnsignedIntegerVal(de, time(NULL) + server.cluster_blacklist_ttl);
sdsfree(id);
}

View File

@ -3211,6 +3211,7 @@ standardConfig static_configs[] = {
createULongConfig("active-defrag-max-scan-fields", NULL, MODIFIABLE_CONFIG, 1, LONG_MAX, server.active_defrag_max_scan_fields, 1000, INTEGER_CONFIG, NULL, NULL), /* Default: keys with more than 1000 fields will be processed separately */
createULongConfig("slowlog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.slowlog_max_len, 128, INTEGER_CONFIG, NULL, NULL),
createULongConfig("acllog-max-len", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX, server.acllog_max_len, 128, INTEGER_CONFIG, NULL, NULL),
createULongConfig("cluster-blacklist-ttl", NULL, MODIFIABLE_CONFIG, 0, ULONG_MAX, server.cluster_blacklist_ttl, 60, INTEGER_CONFIG, NULL, NULL),
/* Long Long configs */
createLongLongConfig("busy-reply-threshold", "lua-time-limit", MODIFIABLE_CONFIG, 0, LONG_MAX, server.busy_reply_threshold, 5000, INTEGER_CONFIG, NULL, NULL), /* milliseconds */

View File

@ -2079,6 +2079,8 @@ struct valkeyServer {
unsigned long long cluster_link_msg_queue_limit_bytes; /* Memory usage limit on individual link msg queue */
int cluster_drop_packet_filter; /* Debug config that allows tactically
* dropping packets of a specific type */
unsigned long cluster_blacklist_ttl; /* Duration in seconds that a node is denied re-entry into
* the cluster after it is forgotten with CLUSTER FORGET. */
/* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. */
uint32_t debug_cluster_close_link_on_packet_drop : 1;
sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */

View File

@ -1760,6 +1760,16 @@ aof-timestamp-enabled no
#
# cluster-preferred-endpoint-type ip
# The cluster blacklist is used when removing a node from the cluster completely.
# When CLUSTER FORGET is called for a node, that node is put into the blacklist for
# some time so that when gossip messages are received from other nodes that still
# remember it, it is not re-added. This gives time for CLUSTER FORGET to be sent to
# every node in the cluster. The blacklist TTL is 60 seconds by default, which should
# be sufficient for most clusters, but you may consider increasing this if you see
# nodes getting re-added while using CLUSTER FORGET.
#
# cluster-blacklist-ttl 60
# In order to setup your cluster make sure to read the documentation
# available at https://valkey.io web site.
@ -2321,4 +2331,4 @@ jemalloc-bg-thread yes
# this is only exposed via the info command for clients to use, but in the future we
# may also use this when making decisions for replication.
#
# availability-zone "zone-name"
# availability-zone "zone-name"