mirror of
http://github.com/valkey-io/valkey
synced 2024-11-22 09:17:20 +00:00
28e055af0b
Deflake chained replicas disconnect when replica re-connect with the same master. sync_partial_ok counter might get incremented if replica timed out during test. Signed-off-by: naglera <anagler123@gmail.com>
252 lines
9.5 KiB
Tcl
252 lines
9.5 KiB
Tcl
# These tests were added together with the meaningful offset implementation
# in valkey OSS 6.0.0, which was later abandoned in 6.0.4. They used to test that
# servers are able to PSYNC with replicas even if the replication stream has
# PINGs at the end which are present in one server and missing on another.
# We keep these tests just because they reproduce edge cases in the replication
# logic, in the hope that they'll be able to spot some problem in the future.
|
|
|
|
# Scenario with two servers: R(0) is the innermost server (srv 0) and R(1) is
# the outer one (srv -1). R(1) first replicates from R(0), is then promoted
# while it is blocked inside a transaction, and finally R(0) is turned into a
# replica of R(1). The last test asserts that R(1) served no full sync before
# the role swap and that both servers end up with the same master_repl_offset.
start_server {tags {"psync2 external:skip"}} {
start_server {} {
    # Config
    set debug_msg 0                 ; # Enable additional debug messages

    # Collect the client handle, host and port of each server, and make every
    # server send replication PINGs once per second.
    for {set j 0} {$j < 2} {incr j} {
        set R($j) [srv [expr {0-$j}] client]
        set R_host($j) [srv [expr {0-$j}] host]
        set R_port($j) [srv [expr {0-$j}] port]
        $R($j) CONFIG SET repl-ping-replica-period 1
        if {$debug_msg} {puts "Log file: [srv [expr {0-$j}] stdout]"}
    }

    # Setup replication
    test "PSYNC2 pingoff: setup" {
        $R(1) replicaof $R_host(0) $R_port(0)
        $R(0) set foo bar
        # Wait until the link is up and the key is visible on both sides.
        wait_for_condition 50 1000 {
            [status $R(1) master_link_status] == "up" &&
            [$R(0) dbsize] == 1 && [$R(1) dbsize] == 1
        } else {
            fail "Replicas not replicating from master"
        }
    }

    test "PSYNC2 pingoff: write and wait replication" {
        $R(0) INCR counter
        $R(0) INCR counter
        $R(0) INCR counter
        wait_for_condition 50 1000 {
            [$R(0) GET counter] eq [$R(1) GET counter]
        } else {
            fail "Master and replica don't agree about counter"
        }
    }

    # In this test we'll make sure the replica will get stuck, but with
    # an active connection: this way the master will continue to send PINGs
    # every second (we modified the PING period earlier)
    test "PSYNC2 pingoff: pause replica and promote it" {
        # The sleep and the promotion are queued in one transaction, so the
        # replica is promoted right after the sleep, within the same EXEC.
        $R(1) MULTI
        $R(1) DEBUG SLEEP 5
        $R(1) SLAVEOF NO ONE
        $R(1) EXEC
        $R(1) ping ; # Wait for it to return back available
    }

    test "Make the old master a replica of the new one and check conditions" {
        # We set the new master's ping period to a high value, so that there's
        # no chance for a race condition of sending a PING in between the two
        # INFO calls in the assert for master_repl_offset match below.
        $R(1) CONFIG SET repl-ping-replica-period 1000

        # The new master must not have served any full sync so far.
        assert_equal [status $R(1) sync_full] 0
        $R(0) REPLICAOF $R_host(1) $R_port(1)

        wait_for_condition 50 1000 {
            [status $R(0) master_link_status] == "up"
        } else {
            fail "The new master was not able to sync"
        }

        # make sure replication is still alive and kicking
        $R(1) incr x
        wait_for_condition 50 1000 {
            [status $R(0) loading] == 0 &&
            [$R(0) get x] == 1
        } else {
            fail "replica didn't get incr"
        }
        assert_equal [status $R(0) master_repl_offset] [status $R(1) master_repl_offset]
    }
}}
|
|
|
|
|
|
# Scenario with five servers: the outermost one (srv -4) is the initial master
# and the other four (srv -3 .. srv 0) are replica1 .. replica4. The test
# reshuffles the replication topology so that some nodes have seen trailing
# PINGs that others have not, then checks the sync_full counters before and
# after the master links are killed to force a resync.
start_server {tags {"psync2 external:skip"}} {
start_server {} {
start_server {} {
start_server {} {
start_server {} {
    test {test various edge cases of repl topology changes with missing pings at the end} {
        set master [srv -4 client]
        set master_host [srv -4 host]
        set master_port [srv -4 port]
        set replica1 [srv -3 client]
        set replica2 [srv -2 client]
        set replica3 [srv -1 client]
        set replica4 [srv -0 client]

        # Attach all four replicas directly to the master.
        $replica1 replicaof $master_host $master_port
        $replica2 replicaof $master_host $master_port
        $replica3 replicaof $master_host $master_port
        $replica4 replicaof $master_host $master_port
        wait_for_condition 50 1000 {
            [status $master connected_slaves] == 4
        } else {
            fail "replicas didn't connect"
        }

        $master incr x
        wait_for_condition 50 1000 {
            [$replica1 get x] == 1 && [$replica2 get x] == 1 &&
            [$replica3 get x] == 1 && [$replica4 get x] == 1
        } else {
            fail "replicas didn't get incr"
        }

        # disconnect replica1 and replica2
        # and wait for the master to send a ping to replica3 and replica4
        $replica1 replicaof no one
        $replica2 replicaof 127.0.0.1 1 ;# we can't promote it to master since that will cycle the replication id
        $master config set repl-ping-replica-period 1
        # Remember the master's offset now; the still-connected replicas must
        # move past it.
        set replofs [status $master master_repl_offset]
        wait_for_condition 50 100 {
            [status $replica3 master_repl_offset] > $replofs &&
            [status $replica4 master_repl_offset] > $replofs
        } else {
            fail "replica didn't sync in time"
        }

        # make everyone sync from the replica1 that didn't get the last ping from the old master
        # replica4 will keep syncing from the old master which now syncs from replica1
        # and replica2 will re-connect to the old master (which went back in time)
        set new_master_host [srv -3 host]
        set new_master_port [srv -3 port]
        $replica3 replicaof $new_master_host $new_master_port
        $master replicaof $new_master_host $new_master_port
        $replica2 replicaof $master_host $master_port
        wait_for_condition 50 1000 {
            [status $replica2 master_link_status] == "up" &&
            [status $replica3 master_link_status] == "up" &&
            [status $replica4 master_link_status] == "up" &&
            [status $master master_link_status] == "up"
        } else {
            fail "replicas didn't connect"
        }

        # make sure replication is still alive and kicking
        $replica1 incr x
        wait_for_condition 50 1000 {
            [$replica2 get x] == 2 &&
            [$replica3 get x] == 2 &&
            [$replica4 get x] == 2 &&
            [$master get x] == 2
        } else {
            fail "replicas didn't get incr"
        }

        # make sure we have the right amount of full syncs
        assert_equal [status $master sync_full] 6
        assert_equal [status $replica1 sync_full] 2
        assert_equal [status $replica2 sync_full] 0
        assert_equal [status $replica3 sync_full] 0
        assert_equal [status $replica4 sync_full] 0

        # force psync
        $master client kill type master
        $replica2 client kill type master
        $replica3 client kill type master
        $replica4 client kill type master

        # make sure replication is still alive and kicking
        $replica1 incr x
        wait_for_condition 50 1000 {
            [$replica2 get x] == 3 &&
            [$replica3 get x] == 3 &&
            [$replica4 get x] == 3 &&
            [$master get x] == 3
        } else {
            fail "replicas didn't get incr"
        }

        # make sure we have the right amount of full syncs
        # (the counters must be unchanged from the check above)
        assert_equal [status $master sync_full] 6
        assert_equal [status $replica1 sync_full] 2
        assert_equal [status $replica2 sync_full] 0
        assert_equal [status $replica3 sync_full] 0
        assert_equal [status $replica4 sync_full] 0
    }
}}}}}
|
|
|
|
# Scenario with three servers forming a chain: R(0) (srv 0) is the master,
# R(1) (srv -1) replicates from R(0), and R(2) (srv -2) replicates from R(1).
# The test kills the master's replica connection and checks that R(1) comes
# back through a partial sync while R(1)'s own sync_partial_ok counter stays
# unchanged, and that all three replication offsets match afterwards.
start_server {tags {"psync2 external:skip"}} {
start_server {} {
start_server {} {

    # Collect the client handle, host and port of each server; use a 1 second
    # PING period and a repl-timeout of 300 on all of them.
    for {set j 0} {$j < 3} {incr j} {
        set R($j) [srv [expr {0-$j}] client]
        set R_host($j) [srv [expr {0-$j}] host]
        set R_port($j) [srv [expr {0-$j}] port]
        $R($j) CONFIG SET repl-ping-replica-period 1
        $R($j) config set repl-timeout 300
    }

    test "Chained replicas disconnect when replica re-connect with the same master" {
        # Add a second replica as a chained replica of the current replica
        $R(1) replicaof $R_host(0) $R_port(0)
        $R(2) replicaof $R_host(1) $R_port(1)
        wait_for_condition 50 1000 {
            [status $R(2) master_link_status] == "up"
        } else {
            fail "Chained replica not replicating from its master"
        }

        # Do a write on the master, and wait for the master to
        # send some PINGs to its replica
        $R(0) INCR counter2
        set replofs [status $R(0) master_repl_offset]
        wait_for_condition 50 100 {
            [status $R(1) master_repl_offset] > $replofs &&
            [status $R(2) master_repl_offset] > $replofs
        } else {
            fail "replica didn't sync in time"
        }
        # Baseline partial-sync counters, compared after the reconnect below.
        set sync_partial_master [status $R(0) sync_partial_ok]
        set sync_partial_replica [status $R(1) sync_partial_ok]
        $R(0) CONFIG SET repl-ping-replica-period 100

        # Disconnect the master's direct replica
        $R(0) client kill type replica
        # Expect exactly one more successful partial sync on the master and
        # none on R(1).
        wait_for_condition 50 1000 {
            [status $R(1) master_link_status] == "up" &&
            [status $R(2) master_link_status] == "up" &&
            [status $R(0) sync_partial_ok] == $sync_partial_master + 1 &&
            [status $R(1) sync_partial_ok] == $sync_partial_replica
        } else {
            fail "Disconnected replica failed to PSYNC with master"
        }

        # Verify that the replica and its replica's meaningful and real
        # offsets match with the master
        assert_equal [status $R(0) master_repl_offset] [status $R(1) master_repl_offset]
        assert_equal [status $R(0) master_repl_offset] [status $R(2) master_repl_offset]

        # make sure replication is still alive and kicking
        $R(0) incr counter2
        wait_for_condition 50 1000 {
            [$R(1) get counter2] == 2 && [$R(2) get counter2] == 2
        } else {
            fail "replicas didn't get incr"
        }
        assert_equal [status $R(0) master_repl_offset] [status $R(1) master_repl_offset]
        assert_equal [status $R(0) master_repl_offset] [status $R(2) master_repl_offset]
    }
}}}
|