diff --git a/redis.conf b/redis.conf index 00a2f9193..fe259726e 100644 --- a/redis.conf +++ b/redis.conf @@ -563,6 +563,51 @@ lua-time-limit 5000 # # cluster-node-timeout 15000 +# A slave of a failing master will avoid starting a failover if its data +# looks too old. +# +# There is no simple way for a slave to actually have an exact measure of +# its "data age", so the following two checks are performed: +# +# 1) If there are multiple slaves able to failover, they exchange messages +# in order to try to give an advantage to the slave with the best +# replication offset (more data from the master processed). +# Slaves will try to get their rank by offset, and apply to the start +# of the failover a delay proportional to their rank. +# +# 2) Every single slave computes the time of the last interaction with +# its master. This can be the last ping or command received (if the master +# is still in the "connected" state), or the time that elapsed since the +# disconnection with the master (if the replication link is currently down). +# If the last interaction is too old, the slave will not try to failover +# at all. +# +# Point "2" can be tuned by the user. Specifically, a slave will not perform +# the failover if, since the last interaction with the master, the time +# elapsed is greater than: +# +# (node-timeout * slave-validity-factor) + repl-ping-slave-period +# +# So for example if node-timeout is 30 seconds, and the slave-validity-factor +# is 10, and assuming a default repl-ping-slave-period of 10 seconds, the +# slave will not try to failover if it was not able to talk with the master +# for longer than 310 seconds. +# +# A large slave-validity-factor may allow slaves with too old data to failover +# a master, while a too small value may prevent the cluster from being able to +# elect a slave at all.
+# +# For maximum availability, it is possible to set the slave-validity-factor +# to a value of 0, which means, that slaves will always try to failover the +# master regardless of the last time they interacted with the master. +# (However they'll always try to apply a delay proportional to their +# offset rank). +# +# Zero is the only value able to guarantee that when all the partitions heal +# the cluster will always be able to continue. +# +# cluster-slave-validity-factor 10 + # Cluster slaves are able to migrate to orphaned masters, that are masters # that are left without working slaves. This improves the cluster ability # to resist to failures as otherwise an orphaned master can't be failed over diff --git a/src/cluster.c b/src/cluster.c index 59193680c..638189dec 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2410,14 +2410,14 @@ void clusterHandleSlaveFailover(void) { if (data_age > server.cluster_node_timeout) data_age -= server.cluster_node_timeout; - /* Check if our data is recent enough. For now we just use a fixed - * constant of ten times the node timeout since the cluster should - * react much faster to a master down. + /* Check if our data is recent enough according to the slave validity + * factor configured by the user. * * Check bypassed for manual failovers. */ - if (data_age > - ((mstime_t)server.repl_ping_slave_period * 1000) + - (server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT)) + if (server.cluster_slave_validity_factor && + data_age > + (((mstime_t)server.repl_ping_slave_period * 1000) + + (server.cluster_node_timeout * server.cluster_slave_validity_factor))) { if (!manual_failover) return; } diff --git a/src/cluster.h b/src/cluster.h index 2d6960574..96072cd91 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -14,10 +14,10 @@ /* The following defines are amunt of time, sometimes expressed as * multiplicators of the node timeout value (when ending with MULT). 
*/ #define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15000 +#define REDIS_CLUSTER_DEFAULT_SLAVE_VALIDITY 10 /* Slave max data age factor. */ #define REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ #define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ #define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ -#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ #define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ #define REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER 1 #define REDIS_CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */ diff --git a/src/config.c b/src/config.c index 25068cb9f..f11120da4 100644 --- a/src/config.c +++ b/src/config.c @@ -434,7 +434,15 @@ void loadServerConfigFromString(char *config) { { server.cluster_migration_barrier = atoi(argv[1]); if (server.cluster_migration_barrier < 0) { - err = "cluster migration barrier must be positive"; + err = "cluster migration barrier must zero or positive"; + goto loaderr; + } + } else if (!strcasecmp(argv[0],"cluster-slave-validity-factor") + && argc == 2) + { + server.cluster_slave_validity_factor = atoi(argv[1]); + if (server.cluster_slave_validity_factor < 0) { + err = "cluster slave validity factor must be zero or positive"; goto loaderr; } } else if (!strcasecmp(argv[0],"lua-time-limit") && argc == 2) { @@ -897,6 +905,10 @@ void configSetCommand(redisClient *c) { if (getLongLongFromObject(o,&ll) == REDIS_ERR || ll < 0) goto badfmt; server.cluster_migration_barrier = ll; + } else if (!strcasecmp(c->argv[2]->ptr,"cluster-slave-validity-factor")) { + if (getLongLongFromObject(o,&ll) == REDIS_ERR || + ll < 0) goto badfmt; + server.cluster_slave_validity_factor = ll; } else { addReplyErrorFormat(c,"Unsupported CONFIG parameter: %s", (char*)c->argv[2]->ptr); @@ -1001,6 +1013,7 @@ void configGetCommand(redisClient *c) { config_get_numerical_field("hz",server.hz); 
config_get_numerical_field("cluster-node-timeout",server.cluster_node_timeout); config_get_numerical_field("cluster-migration-barrier",server.cluster_migration_barrier); + config_get_numerical_field("cluster-slave-validity-factor",server.cluster_slave_validity_factor); /* Bool (yes/no) values */ config_get_bool_field("no-appendfsync-on-rewrite", @@ -1770,6 +1783,7 @@ int rewriteConfig(char *path) { rewriteConfigStringOption(state,"cluster-config-file",server.cluster_configfile,REDIS_DEFAULT_CLUSTER_CONFIG_FILE); rewriteConfigNumericalOption(state,"cluster-node-timeout",server.cluster_node_timeout,REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT); rewriteConfigNumericalOption(state,"cluster-migration-barrier",server.cluster_migration_barrier,REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER); + rewriteConfigNumericalOption(state,"cluster-slave-validity-factor",server.cluster_slave_validity_factor,REDIS_CLUSTER_DEFAULT_SLAVE_VALIDITY); rewriteConfigNumericalOption(state,"slowlog-log-slower-than",server.slowlog_log_slower_than,REDIS_SLOWLOG_LOG_SLOWER_THAN); rewriteConfigNumericalOption(state,"slowlog-max-len",server.slowlog_max_len,REDIS_SLOWLOG_MAX_LEN); rewriteConfigNotifykeyspaceeventsOption(state); diff --git a/src/redis.c b/src/redis.c index 5afb24dde..7ffaea023 100644 --- a/src/redis.c +++ b/src/redis.c @@ -1434,6 +1434,7 @@ void initServerConfig() { server.cluster_enabled = 0; server.cluster_node_timeout = REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT; server.cluster_migration_barrier = REDIS_CLUSTER_DEFAULT_MIGRATION_BARRIER; + server.cluster_slave_validity_factor = REDIS_CLUSTER_DEFAULT_SLAVE_VALIDITY; server.cluster_configfile = zstrdup(REDIS_DEFAULT_CLUSTER_CONFIG_FILE); server.lua_caller = NULL; server.lua_time_limit = REDIS_LUA_TIME_LIMIT; diff --git a/src/redis.h b/src/redis.h index e6b7ea93b..aede36b0b 100644 --- a/src/redis.h +++ b/src/redis.h @@ -827,6 +827,7 @@ struct redisServer { char *cluster_configfile; /* Cluster auto-generated config file name. 
*/ struct clusterState *cluster; /* State of the cluster */ int cluster_migration_barrier; /* Cluster replicas migration barrier. */ + int cluster_slave_validity_factor; /* Slave max data age for failover. */ /* Scripting */ lua_State *lua; /* The Lua interpreter. We use just one for all clients */ redisClient *lua_client; /* The "fake client" to query Redis from Lua */