Trigger the election immediately when doing a manual failover (#1081)

Currently, when a manual failover is triggered, we set CLUSTER_TODO_HANDLE_FAILOVER to start the election as soon as possible in the next beforeSleep. But in fact we never delay the election in a manual failover, so waiting for the next beforeSleep to kick in delays the election by some milliseconds. We can trigger the election immediately in this case, in the same function call, without waiting for beforeSleep, which saves us some milliseconds. Signed-off-by: Binbin <binloveplay1314@qq.com>
2024-11-22 00:52:38 +00:00 · 2024-11-11 21:43:46 +08:00 · 2024-11-11 21:43:46 +08:00 · 167e8ab8de
commit 167e8ab8de
parent 4aacffa32d
1 changed files with 14 additions and 7 deletions
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -4519,8 +4519,9 @@ void clusterFailoverReplaceYourPrimary(void) {
 * 3) Perform the failover informing all the other nodes.
 */
 void clusterHandleReplicaFailover(void) {
+    mstime_t now = mstime();
    mstime_t data_age;
-    mstime_t auth_age = mstime() - server.cluster->failover_auth_time;
+    mstime_t auth_age = now - server.cluster->failover_auth_time;
    int needed_quorum = (server.cluster->size / 2) + 1;
    int manual_failover = server.cluster->mf_end != 0 && server.cluster->mf_can_start;
    mstime_t auth_timeout, auth_retry_time;
@@ -4582,7 +4583,7 @@ void clusterHandleReplicaFailover(void) {
    /* If the previous failover attempt timeout and the retry time has
     * elapsed, we can setup a new one. */
    if (auth_age > auth_retry_time) {
-        server.cluster->failover_auth_time = mstime() +
+        server.cluster->failover_auth_time = now +
                                             500 +           /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
                                             random() % 500; /* Random delay between 0 and 500 milliseconds. */
        server.cluster->failover_auth_count = 0;
@@ -4594,20 +4595,26 @@ void clusterHandleReplicaFailover(void) {
        server.cluster->failover_auth_time += server.cluster->failover_auth_rank * 1000;
        /* However if this is a manual failover, no delay is needed. */
        if (server.cluster->mf_end) {
-            server.cluster->failover_auth_time = mstime();
+            server.cluster->failover_auth_time = now;
            server.cluster->failover_auth_rank = 0;
-            clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
+            /* Reset auth_age since it is outdated now and we can bypass the auth_timeout
+             * check in the next state and start the election ASAP. */
+            auth_age = 0;
        }
        serverLog(LL_NOTICE,
                  "Start of election delayed for %lld milliseconds "
                  "(rank #%d, offset %lld).",
-                  server.cluster->failover_auth_time - mstime(), server.cluster->failover_auth_rank,
+                  server.cluster->failover_auth_time - now, server.cluster->failover_auth_rank,
                  replicationGetReplicaOffset());
        /* Now that we have a scheduled election, broadcast our offset
         * to all the other replicas so that they'll updated their offsets
         * if our offset is better. */
        clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_REPLICAS);
-        return;
+
+        /* Return ASAP if we can't start the election now. In a manual failover,
+         * we can start the election immediately, so in this case we continue to
+         * the next state without waiting for the next beforeSleep. */
+        if (now < server.cluster->failover_auth_time) return;
    }

    /* It is possible that we received more updated offsets from other
@@ -4627,7 +4634,7 @@ void clusterHandleReplicaFailover(void) {
    }

    /* Return ASAP if we can't still start the election. */
-    if (mstime() < server.cluster->failover_auth_time) {
+    if (now < server.cluster->failover_auth_time) {
        clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY);
        return;
    }