make replication tests more stable on slow machines

solving few replication related tests race conditions which fail on slow machines

bugfix in slave buffers test: since the test is executed twice, each time with
a different commands count, the threshold for the delta can't be a constant.
This commit is contained in:
Oran Agra 2019-05-05 08:19:52 +03:00
parent 0a6090bfd8
commit ba809f26d4
3 changed files with 34 additions and 4 deletions

View File

@ -166,12 +166,15 @@ start_server {} {
# Pick a random slave
set slave_id [expr {($master_id+1)%5}]
set sync_count [status $R($master_id) sync_full]
set sync_partial [status $R($master_id) sync_partial_ok]
catch {
$R($slave_id) config rewrite
$R($slave_id) debug restart
}
# note: just waiting for connected_slaves==4 has a race condition since
# we might do the check before the master realized that the slave disconnected
wait_for_condition 50 1000 {
[status $R($master_id) connected_slaves] == 4
[status $R($master_id) sync_partial_ok] == $sync_partial + 1
} else {
fail "Replica not reconnecting"
}

View File

@ -79,6 +79,32 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless rec
stop_bg_complex_data $load_handle0
stop_bg_complex_data $load_handle1
stop_bg_complex_data $load_handle2
# Wait for the slave to reach the "online"
# state from the POV of the master.
set retry 5000
while {$retry} {
set info [$master info]
if {[string match {*slave0:*state=online*} $info]} {
break
} else {
incr retry -1
after 100
}
}
if {$retry == 0} {
error "assertion:Slave not correctly synchronized"
}
# Wait that slave acknowledge it is online so
# we are sure that DBSIZE and DEBUG DIGEST will not
# fail because of timing issues. (-LOADING error)
wait_for_condition 5000 100 {
[lindex [$slave role] 3] eq {connected}
} else {
fail "Slave still not connected after some time"
}
set retry 10
while {$retry && ([$master debug digest] ne [$slave debug digest])}\
{

View File

@ -161,7 +161,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
}
# make sure master doesn't disconnect slave because of timeout
$master config set repl-timeout 300 ;# 5 minutes
$master config set repl-timeout 1200 ;# 20 minutes (for valgrind and slow machines)
$master config set maxmemory-policy allkeys-random
$master config set client-output-buffer-limit "replica 100000000 100000000 300"
$master config set repl-backlog-size [expr {10*1024}]
@ -212,7 +212,8 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
assert {[$master dbsize] == 100}
assert {$slave_buf > 2*1024*1024} ;# some of the data may have been pushed to the OS buffers
assert {$delta < 50*1024 && $delta > -50*1024} ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
set delta_max [expr {$cmd_count / 2}] ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
assert {$delta < $delta_max && $delta > -$delta_max}
$master client kill type slave
set killed_used [s -1 used_memory]
@ -221,7 +222,7 @@ proc test_slave_buffers {test_name cmd_count payload_len limit_memory pipeline}
set killed_used_no_repl [expr {$killed_used - $killed_mem_not_counted_for_evict}]
set delta_no_repl [expr {$killed_used_no_repl - $used_no_repl}]
assert {$killed_slave_buf == 0}
assert {$delta_no_repl > -50*1024 && $delta_no_repl < 50*1024} ;# 1 byte unaccounted for, with 1M commands will consume some 1MB
assert {$delta_no_repl > -$delta_max && $delta_no_repl < $delta_max}
}
# unfreeze slave process (after the 'test' succeeded or failed, but before we attempt to terminate the server