Fix occasional hangs on replication reconnection. (#7830)

This happens only on diskless replicas when attempting to reconnect after 
failing to load an RDB file. It is more likely to occur with larger datasets.

After reconnection is initiated, replicationEmptyDbCallback() may get called
and try to write to an unconnected socket. This triggers another issue: the
connection is put into an error state and the connect handler never gets
called. The problem is a regression introduced by commit c17e597.

(cherry picked from commit 1980f639b1)
This commit is contained in:
Yossi Gottlieb 2020-09-22 11:38:52 +03:00 committed by Oran Agra
parent 6a4da4958e
commit 9d0388a043
2 changed files with 14 additions and 3 deletions

View File

@ -168,7 +168,12 @@ static int connSocketWrite(connection *conn, const void *data, size_t data_len)
int ret = write(conn->fd, data, data_len);
if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
conn->state = CONN_STATE_ERROR;
/* Don't overwrite the state of a connection that is not already
* connected, not to mess with handler callbacks.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;
@ -180,7 +185,12 @@ static int connSocketRead(connection *conn, void *buf, size_t buf_len) {
conn->state = CONN_STATE_CLOSED;
} else if (ret < 0 && errno != EAGAIN) {
conn->last_errno = errno;
conn->state = CONN_STATE_ERROR;
/* Don't overwrite the state of a connection that is not already
* connected, not to mess with handler callbacks.
*/
if (conn->state == CONN_STATE_CONNECTED)
conn->state = CONN_STATE_ERROR;
}
return ret;

View File

@ -1374,7 +1374,8 @@ void replicationSendNewlineToMaster(void) {
* the new dataset received by the master. */
void replicationEmptyDbCallback(void *privdata) {
/* privdata is not read by this callback; it is only passed to UNUSED(),
 * presumably to silence the unused-parameter warning. */
UNUSED(privdata);
/* NOTE(review): this listing looks like a diff rendered without +/- markers.
 * The unconditional call below is presumably the line being removed and the
 * guarded call after it its replacement -- confirm against the real file
 * before treating both calls as live code. */
replicationSendNewlineToMaster();
/* Send the newline only while server.repl_state is REPL_STATE_TRANSFER.
 * Presumably this keeps the callback from writing to a master connection
 * that is not connected yet after a reconnect was initiated -- TODO confirm. */
if (server.repl_state == REPL_STATE_TRANSFER)
replicationSendNewlineToMaster();
}
/* Once we have a link with the master and the synchronization was