Fix primary crash when processing dirty slots during shutdown wait / failover wait / client pause (#1131)

We have an assert in propagateNow. If the primary node receives a
CLUSTER UPDATE such as dirty slots during SIGTERM waitting or during
a manual failover pausing or during a client pause, the delKeysInSlot
call will trigger this assert and cause primary crash.

In this case, we added a new server_del_keys_in_slot state just like
client_pause_in_transaction to track the state to avoid the assert
in propagateNow, the dirty slots will be deleted in the end without
affecting the data consistency.

Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
This commit is contained in:
Binbin 2024-11-15 16:47:15 +08:00 committed by GitHub
parent 4e2493e5c9
commit 92181b6797
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 142 additions and 4 deletions

View File

@ -6084,6 +6084,9 @@ void removeChannelsInSlot(unsigned int slot) {
unsigned int delKeysInSlot(unsigned int hashslot) {
if (!countKeysInSlot(hashslot)) return 0;
/* We may lose a slot during the pause. We need to track this
* state so that we don't assert in propagateNow(). */
server.server_del_keys_in_slot = 1;
unsigned int j = 0;
kvstoreDictIterator *kvs_di = NULL;
@ -6108,6 +6111,8 @@ unsigned int delKeysInSlot(unsigned int hashslot) {
}
kvstoreReleaseDictIterator(kvs_di);
server.server_del_keys_in_slot = 0;
serverAssert(server.execution_nesting == 0);
return j;
}

View File

@ -4571,7 +4571,7 @@ static void pauseClientsByClient(mstime_t endTime, int isPauseClientAll) {
}
/* Pause actions up to the specified unixtime (in ms) for a given type of
* commands.
* purpose.
*
* A main use case of this function is to allow pausing replication traffic
* so that a failover without data loss to occur. Replicas will continue to receive

View File

@ -3315,8 +3315,28 @@ static void propagateNow(int dbid, robj **argv, int argc, int target) {
if (!shouldPropagate(target)) return;
/* This needs to be unreachable since the dataset should be fixed during
* replica pause (otherwise data may be lost during a failover) */
serverAssert(!(isPausedActions(PAUSE_ACTION_REPLICA) && (!server.client_pause_in_transaction)));
* replica pause (otherwise data may be lost during a failover).
*
* Though, there are exceptions:
*
* 1. We allow write commands that were queued up before and after to
* execute, if a CLIENT PAUSE executed during a transaction, we will
* track the state, the CLIENT PAUSE takes effect only after a transaction
* has finished.
* 2. Primary loses a slot during the pause, deletes all keys and replicates
* DEL to its replicas. In this case, we will track the state, the dirty
* slots will be deleted in the end without affecting the data consistency.
*
* Note that case 2 can happen in one of the following scenarios:
* 1) The primary waits for the replica to replicate before exiting, see
* shutdown-timeout in conf for more details. In this case, primary lost
* a slot during the SIGTERM waiting.
* 2) The primary waits for the replica to replicate during a manual failover.
* In this case, primary lost a slot during the pausing.
* 3) The primary was paused by CLIENT PAUSE, and lost a slot during the
* pausing. */
serverAssert(!isPausedActions(PAUSE_ACTION_REPLICA) || server.client_pause_in_transaction ||
server.server_del_keys_in_slot);
if (server.aof_state != AOF_OFF && target & PROPAGATE_AOF) feedAppendOnlyFile(dbid, argv, argc);
if (target & PROPAGATE_REPL) replicationFeedReplicas(dbid, argv, argc);

View File

@ -1701,6 +1701,7 @@ struct valkeyServer {
const char *busy_module_yield_reply; /* When non-null, we are inside RM_Yield. */
char *ignore_warnings; /* Config: warnings that should be ignored. */
int client_pause_in_transaction; /* Was a client pause executed during this Exec? */
int server_del_keys_in_slot; /* The server is deleting the keys in the dirty slot. */
int thp_enabled; /* If true, THP is enabled. */
size_t page_size; /* The page size of OS. */
/* Modules */
@ -2863,7 +2864,7 @@ void flushReplicasOutputBuffers(void);
void disconnectReplicas(void);
void evictClients(void);
int listenToPort(connListener *fds);
void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions_bitmask);
void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions);
void unpauseActions(pause_purpose purpose);
uint32_t isPausedActions(uint32_t action_bitmask);
uint32_t isPausedActionsWithUpdate(uint32_t action_bitmask);

View File

@ -59,3 +59,88 @@ start_cluster 2 2 {tags {external:skip cluster}} {
}
}
}
start_cluster 3 1 {tags {external:skip cluster} overrides {shutdown-timeout 100}} {
test "Primary lost a slot during the shutdown waiting" {
R 0 set FOO 0
# Pause the replica.
pause_process [srv -3 pid]
# Incr the key and immediately shutdown the primary.
# The primary waits for the replica to replicate before exiting.
R 0 incr FOO
exec kill -SIGTERM [srv 0 pid]
wait_for_condition 50 100 {
[s 0 shutdown_in_milliseconds] > 0
} else {
fail "Primary not indicating ongoing shutdown."
}
# Move the slot to other primary
R 1 cluster bumpepoch
R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid]
# Waiting for dirty slot update.
wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10
# Resume the replica and make sure primary exits normally instead of crashing.
resume_process [srv -3 pid]
wait_for_log_messages 0 {"*Valkey is now ready to exit, bye bye*"} 0 1000 10
# Make sure that the replica will become the new primary and does not own the key.
wait_for_condition 1000 50 {
[s -3 role] eq {master}
} else {
fail "The replica was not converted into primary"
}
assert_error {ERR no such key} {R 3 debug object foo}
}
}
start_cluster 3 1 {tags {external:skip cluster}} {
test "Primary lost a slot during the manual failover pausing" {
R 0 set FOO 0
# Set primaries to drop the FAILOVER_AUTH_REQUEST packets, so that
# primary 0 will pause until the failover times out.
R 1 debug drop-cluster-packet-filter 5
R 2 debug drop-cluster-packet-filter 5
# Replica doing the manual failover.
R 3 cluster failover
# Move the slot to other primary
R 1 cluster bumpepoch
R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid]
# Waiting for dirty slot update.
wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10
# Make sure primary doesn't crash when deleting the keys.
R 0 ping
R 1 debug drop-cluster-packet-filter -1
R 2 debug drop-cluster-packet-filter -1
}
}
start_cluster 3 1 {tags {external:skip cluster}} {
test "Primary lost a slot during the client pause command" {
R 0 set FOO 0
R 0 client pause 1000000000 write
# Move the slot to other primary
R 1 cluster bumpepoch
R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid]
# Waiting for dirty slot update.
wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10
# Make sure primary doesn't crash when deleting the keys.
R 0 ping
R 0 client unpause
}
}

View File

@ -260,6 +260,33 @@ start_server {tags {"pause network"}} {
r client unpause
}
test "Test eviction is skipped during client pause" {
r flushall
set evicted_keys [s 0 evicted_keys]
r multi
r set foo{t} bar
r config set maxmemory-policy allkeys-random
r config set maxmemory 1
r client PAUSE 50000 WRITE
r exec
# No keys should actually have been evicted.
assert_match $evicted_keys [s 0 evicted_keys]
# The previous config set triggers a time event, but due to the pause,
# no eviction has been made. After the unpause, a eviction will happen.
r client unpause
wait_for_condition 1000 10 {
[expr $evicted_keys + 1] eq [s 0 evicted_keys]
} else {
fail "Key is not evicted"
}
r config set maxmemory 0
r config set maxmemory-policy noeviction
}
test "Test both active and passive expires are skipped during client pause" {
set expired_keys [s 0 expired_keys]
r multi