Wait for cluster fully online in cluster_config_consistent (#272)

Wait for the cluster to reach a fully consistent and online state in
`cluster_config_consistent`. We expect `start_server` to create the
desired primaries and replicas before the tests start. With the
current setup, the replicas may not have completed the sync with their
primaries and can still be in the loading state. In some cases, a
replica's role can still be master because of the delay in propagating
the replicate command. The tests can show flaky behavior in such cases.
Add a check that verifies that every node's health status is 'online'
as part of the cluster consistency check. Leverage the deterministic
order of `CLUSTER SLOTS` to consider the cluster consistent, along with
the nodes' health status.

---------

Signed-off-by: Harkrishn Patro <harkrisp@amazon.com>
Signed-off-by: Ram Prasad Voleti <ramvolet@amazon.com>
Co-authored-by: Harkrishn Patro <harkrisp@amazon.com>
Co-authored-by: Ram Prasad Voleti <ramvolet@amazon.com>
This commit is contained in:
VoletiRam 2024-04-08 20:03:56 -07:00 committed by GitHub
parent e59dd41e42
commit d89ef06ce5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 28 additions and 2 deletions

View File

@ -182,8 +182,21 @@ proc cluster_write_test {id} {
}
# Check if cluster configuration is consistent.
# All the nodes in the cluster should show the same slots configuration and have
# health state "online" for the cluster to be considered consistent.
proc cluster_config_consistent {} {
for {set j 0} {$j < $::cluster_master_nodes + $::cluster_replica_nodes} {incr j} {
# Check if all the nodes are online
set shards_cfg [R $j CLUSTER SHARDS]
foreach shard_cfg $shards_cfg {
set nodes [dict get $shard_cfg nodes]
foreach node $nodes {
if {[dict get $node health] ne "online"} {
return 0
}
}
}
if {$j == 0} {
set base_cfg [R $j cluster slots]
} else {
@ -199,7 +212,7 @@ proc cluster_config_consistent {} {
# Wait for cluster configuration to propagate and be consistent across nodes.
proc wait_for_cluster_propagation {} {
wait_for_condition 50 100 {
wait_for_condition 1000 50 {
[cluster_config_consistent] eq 1
} else {
fail "cluster config did not reach a consistent state"

View File

@ -1,8 +1,21 @@
# Cluster helper functions
# Check if cluster configuration is consistent.
# All the nodes in the cluster should show the same slots configuration and have
# health state "online" for the cluster to be considered consistent.
proc cluster_config_consistent {} {
for {set j 0} {$j < [llength $::servers]} {incr j} {
# Check if all the nodes are online
set shards_cfg [R $j CLUSTER SHARDS]
foreach shard_cfg $shards_cfg {
set nodes [dict get $shard_cfg nodes]
foreach node $nodes {
if {[dict get $node health] ne "online"} {
return 0
}
}
}
if {$j == 0} {
set base_cfg [R $j cluster slots]
} else {
@ -27,7 +40,7 @@ proc cluster_size_consistent {cluster_size} {
# Wait for cluster configuration to propagate and be consistent across nodes.
proc wait_for_cluster_propagation {} {
wait_for_condition 50 100 {
wait_for_condition 1000 50 {
[cluster_config_consistent] eq 1
} else {
fail "cluster config did not reach a consistent state"