make replication tests more stable on slow machines

Fix several race conditions in replication-related tests that cause failures on slow machines. Also fix a bug in the slave buffers test: since the test is executed twice, each time with a different command count, the threshold for the delta can't be a constant.
2024-11-22 18:54:58 +00:00 · 2019-05-05 08:19:52 +03:00 · 2019-05-05 08:19:52 +03:00 · ba809f26d4
commit ba809f26d4
parent 0a6090bfd8
3 changed files with 34 additions and 4 deletions
--- a/tests/integration/psync2.tcl
+++ b/tests/integration/psync2.tcl
@ -166,12 +166,15 @@ start_server {} {
        # Pick a random slave
        set slave_id [expr {($master_id+1)%5}]
        set sync_count [status $R($master_id) sync_full]
+        set sync_partial [status $R($master_id) sync_partial_ok]
        catch {
            $R($slave_id) config rewrite
            $R($slave_id) debug restart
        }
+        # note: just waiting for connected_slaves==4 has a race condition since
+        # we might do the check before the master realized that the slave disconnected
        wait_for_condition 50 1000 {
-            [status $R($master_id) connected_slaves] == 4
+            [status $R($master_id) sync_partial_ok] == $sync_partial + 1
        } else {
            fail "Replica not reconnecting"
        }
--- a/tests/integration/replication-psync.tcl
+++ b/tests/integration/replication-psync.tcl
@ -79,6 +79,32 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless rec
                stop_bg_complex_data $load_handle0
                stop_bg_complex_data $load_handle1
                stop_bg_complex_data $load_handle2
+
+                # Wait for the slave to reach the "online"
+                # state from the POV of the master.
+                set retry 5000
+                while {$retry} {
+                    set info [$master info]
+                    if {[string match {*slave0:*state=online*} $info]} {
+                        break
+                    } else {
+                        incr retry -1
+                        after 100
+                    }
+                }
+                if {$retry == 0} {
+                    error "assertion:Slave not correctly synchronized"
+                }
+
+                # Wait for the slave to acknowledge it is online so
+                # we are sure that DBSIZE and DEBUG DIGEST will not
+                # fail because of timing issues (-LOADING error).
+                wait_for_condition 5000 100 {
+                    [lindex [$slave role] 3] eq {connected}
+                } else {
+                    fail "Slave still not connected after some time"
+                }  
+
                set retry 10
                while {$retry && ([$master debug digest] ne [$slave debug digest])}\
                {
--- a/tests/unit/maxmemory.tcl
+++ b/tests/unit/maxmemory.tcl
@ -161,7 +161,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
            }

            # make sure master doesn't disconnect slave because of timeout
-            $master config set repl-timeout 300 ;# 5 minutes
+            $master config set repl-timeout 1200 ;# 20 minutes (for valgrind and slow machines)
            $master config set maxmemory-policy allkeys-random
            $master config set client-output-buffer-limit "replica 100000000 100000000 300"
            $master config set repl-backlog-size [expr {10*1024}]
@ -212,7 +212,8 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}

            assert {[$master dbsize] == 100}
            assert {$slave_buf > 2*1024*1024} ;# some of the data may have been pushed to the OS buffers
-            assert {$delta < 50*1024 && $delta > -50*1024} ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
+            set delta_max [expr {$cmd_count / 2}] ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
+            assert {$delta < $delta_max && $delta > -$delta_max}

            $master client kill type slave
            set killed_used [s -1 used_memory]
@ -221,7 +222,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
            set killed_used_no_repl [expr {$killed_used - $killed_mem_not_counted_for_evict}]
            set delta_no_repl [expr {$killed_used_no_repl - $used_no_repl}]
            assert {$killed_slave_buf == 0}
-            assert {$delta_no_repl > -50*1024 && $delta_no_repl < 50*1024} ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
+            assert {$delta_no_repl > -$delta_max && $delta_no_repl < $delta_max}

        }
        # unfreeze slave process (after the 'test' succeeded or failed, but before we attempt to terminate the server