valkey/tests/integration/replication-psync.tcl
antirez e47c8e3f6a Test: fix, hopefully, false PSYNC failure like in issue #2715.
And many other related Github issues... all reporting the same problem.
There was probably just not enough backlog in certain unlucky runs.
I'll ask people that can reporduce if they see now this as fixed as
well.
2017-04-18 16:17:42 +02:00

129 lines
4.9 KiB
Tcl

proc start_bg_complex_data {host port db ops} {
set tclsh [info nameofexecutable]
exec $tclsh tests/helpers/bg_complex_data.tcl $host $port $db $ops &
}
proc stop_bg_complex_data {handle} {
catch {exec /bin/kill -9 $handle}
}
# Creates a master-slave pair and breaks the link continuously to force
# partial resyncs attempts, all this while flooding the master with
# write queries.
#
# You can specifiy backlog size, ttl, delay before reconnection, test duration
# in seconds, and an additional condition to verify at the end.
#
# If reconnect is > 0, the test actually try to break the connection and
# reconnect with the master, otherwise just the initial synchronization is
# checked for consistency.
proc test_psync {descr duration backlog_size backlog_ttl delay cond diskless reconnect} {
start_server {tags {"repl"}} {
start_server {} {
set master [srv -1 client]
set master_host [srv -1 host]
set master_port [srv -1 port]
set slave [srv 0 client]
$master config set repl-backlog-size $backlog_size
$master config set repl-backlog-ttl $backlog_ttl
$master config set repl-diskless-sync $diskless
$master config set repl-diskless-sync-delay 1
set load_handle0 [start_bg_complex_data $master_host $master_port 9 100000]
set load_handle1 [start_bg_complex_data $master_host $master_port 11 100000]
set load_handle2 [start_bg_complex_data $master_host $master_port 12 100000]
test {Slave should be able to synchronize with the master} {
$slave slaveof $master_host $master_port
wait_for_condition 50 100 {
[lindex [r role] 0] eq {slave} &&
[lindex [r role] 3] eq {connected}
} else {
fail "Replication not started."
}
}
# Check that the background clients are actually writing.
test {Detect write load to master} {
wait_for_condition 50 1000 {
[$master dbsize] > 100
} else {
fail "Can't detect write load from background clients."
}
}
test "Test replication partial resync: $descr (diskless: $diskless, reconnect: $reconnect)" {
# Now while the clients are writing data, break the maste-slave
# link multiple times.
if ($reconnect) {
for {set j 0} {$j < $duration*10} {incr j} {
after 100
# catch {puts "MASTER [$master dbsize] keys, SLAVE [$slave dbsize] keys"}
if {($j % 20) == 0} {
catch {
if {$delay} {
$slave multi
$slave client kill $master_host:$master_port
$slave debug sleep $delay
$slave exec
} else {
$slave client kill $master_host:$master_port
}
}
}
}
}
stop_bg_complex_data $load_handle0
stop_bg_complex_data $load_handle1
stop_bg_complex_data $load_handle2
set retry 10
while {$retry && ([$master debug digest] ne [$slave debug digest])}\
{
after 1000
incr retry -1
}
assert {[$master dbsize] > 0}
if {[$master debug digest] ne [$slave debug digest]} {
set csv1 [csvdump r]
set csv2 [csvdump {r -1}]
set fd [open /tmp/repldump1.txt w]
puts -nonewline $fd $csv1
close $fd
set fd [open /tmp/repldump2.txt w]
puts -nonewline $fd $csv2
close $fd
puts "Master - Slave inconsistency"
puts "Run diff -u against /tmp/repldump*.txt for more info"
}
assert_equal [r debug digest] [r -1 debug digest]
eval $cond
}
}
}
}
foreach diskless {no yes} {
test_psync {no reconnection, just sync} 6 1000000 3600 0 {
} $diskless 0
test_psync {ok psync} 6 100000000 3600 0 {
assert {[s -1 sync_partial_ok] > 0}
} $diskless 1
test_psync {no backlog} 6 100 3600 0.5 {
assert {[s -1 sync_partial_err] > 0}
} $diskless 1
test_psync {ok after delay} 3 100000000 3600 3 {
assert {[s -1 sync_partial_ok] > 0}
} $diskless 1
test_psync {backlog expired} 3 100000000 1 3 {
assert {[s -1 sync_partial_err] > 0}
} $diskless 1
}