Handle recursive serverAsserts and provide more information for recursive segfaults (#12857)

This change is trying to make two failure modes a bit easier to deep dive:
1. If a serverPanic or serverAssert occurs during the info (or module)
printing, it will recursively panic, which is a lot of fun as it will
just keep recursively printing. It will eventually stack overflow, but
will generate a lot of text in the process.
2. When a segfault happens during the segfault handler, no information
is communicated other than it happened. This can be problematic because
`info` may help diagnose the real issue, but without fixing the
recursive crash it might be hard to get at that info.
This commit is contained in:
Madelyn Olson 2024-01-02 18:20:22 -08:00 committed by GitHub
parent c3f8b542ee
commit 068051e378
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 138 additions and 27 deletions

View File

@ -55,4 +55,5 @@ $TCLSH tests/test_helper.tcl \
--single unit/moduleapi/async_rm_call \ --single unit/moduleapi/async_rm_call \
--single unit/moduleapi/moduleauth \ --single unit/moduleapi/moduleauth \
--single unit/moduleapi/rdbloadsave \ --single unit/moduleapi/rdbloadsave \
--single unit/moduleapi/crash \
"${@}" "${@}"

View File

@ -72,10 +72,10 @@ static pthread_mutex_t signal_handler_lock;
static pthread_mutexattr_t signal_handler_lock_attr; static pthread_mutexattr_t signal_handler_lock_attr;
static volatile int signal_handler_lock_initialized = 0; static volatile int signal_handler_lock_initialized = 0;
/* Forward declarations */ /* Forward declarations */
void bugReportStart(void); int bugReportStart(void);
void printCrashReport(void); void printCrashReport(void);
void bugReportEnd(int killViaSignal, int sig); void bugReportEnd(int killViaSignal, int sig);
void logStackTrace(void *eip, int uplevel); void logStackTrace(void *eip, int uplevel, int current_thread);
void dbGetStats(char *buf, size_t bufsize, redisDb *db, int full, dbKeyType keyType); void dbGetStats(char *buf, size_t bufsize, redisDb *db, int full, dbKeyType keyType);
void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret); void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret);
@ -1031,15 +1031,17 @@ NULL
__attribute__ ((noinline)) __attribute__ ((noinline))
void _serverAssert(const char *estr, const char *file, int line) { void _serverAssert(const char *estr, const char *file, int line) {
bugReportStart(); int new_report = bugReportStart();
serverLog(LL_WARNING,"=== ASSERTION FAILED ==="); serverLog(LL_WARNING,"=== %sASSERTION FAILED ===", new_report ? "" : "RECURSIVE ");
serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr); serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
if (server.crashlog_enabled) { if (server.crashlog_enabled) {
#ifdef HAVE_BACKTRACE #ifdef HAVE_BACKTRACE
logStackTrace(NULL, 1); logStackTrace(NULL, 1, 0);
#endif #endif
printCrashReport(); /* If this was a recursive assertion, it what most likely generated
* from printCrashReport. */
if (new_report) printCrashReport();
} }
// remove the signal handler so on abort() we will output the crash report. // remove the signal handler so on abort() we will output the crash report.
@ -1127,16 +1129,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) {
vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap); vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap);
va_end(ap); va_end(ap);
bugReportStart(); int new_report = bugReportStart();
serverLog(LL_WARNING,"------------------------------------------------"); serverLog(LL_WARNING,"------------------------------------------------");
serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue"); serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue");
serverLog(LL_WARNING,"Guru Meditation: %s #%s:%d",fmtmsg,file,line); serverLog(LL_WARNING,"Guru Meditation: %s #%s:%d",fmtmsg,file,line);
if (server.crashlog_enabled) { if (server.crashlog_enabled) {
#ifdef HAVE_BACKTRACE #ifdef HAVE_BACKTRACE
logStackTrace(NULL, 1); logStackTrace(NULL, 1, 0);
#endif #endif
printCrashReport(); /* If this was a recursive panic, it what most likely generated
* from printCrashReport. */
if (new_report) printCrashReport();
} }
// remove the signal handler so on abort() we will output the crash report. // remove the signal handler so on abort() we will output the crash report.
@ -1144,14 +1148,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) {
bugReportEnd(0, 0); bugReportEnd(0, 0);
} }
void bugReportStart(void) { /* Start a bug report, returning 1 if this is the first time this function was called, 0 otherwise. */
int bugReportStart(void) {
pthread_mutex_lock(&bug_report_start_mutex); pthread_mutex_lock(&bug_report_start_mutex);
if (bug_report_start == 0) { if (bug_report_start == 0) {
serverLogRaw(LL_WARNING|LL_RAW, serverLogRaw(LL_WARNING|LL_RAW,
"\n\n=== REDIS BUG REPORT START: Cut & paste starting from here ===\n"); "\n\n=== REDIS BUG REPORT START: Cut & paste starting from here ===\n");
bug_report_start = 1; bug_report_start = 1;
pthread_mutex_unlock(&bug_report_start_mutex);
return 1;
} }
pthread_mutex_unlock(&bug_report_start_mutex); pthread_mutex_unlock(&bug_report_start_mutex);
return 0;
} }
#ifdef HAVE_BACKTRACE #ifdef HAVE_BACKTRACE
@ -1895,9 +1903,9 @@ static void writeStacktraces(int fd, int uplevel) {
} }
#else /* __linux__*/ #endif /* __linux__ */
__attribute__ ((noinline)) __attribute__ ((noinline))
static void writeStacktraces(int fd, int uplevel) { static void writeCurrentThreadsStackTrace(int fd, int uplevel) {
void *trace[BACKTRACE_MAX_SIZE]; void *trace[BACKTRACE_MAX_SIZE];
int trace_size = backtrace(trace, BACKTRACE_MAX_SIZE); int trace_size = backtrace(trace, BACKTRACE_MAX_SIZE);
@ -1906,7 +1914,6 @@ static void writeStacktraces(int fd, int uplevel) {
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */}; if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd); backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd);
} }
#endif /* __linux__ */
/* Logs the stack trace using the backtrace() call. This function is designed /* Logs the stack trace using the backtrace() call. This function is designed
* to be called from signal handlers safely. * to be called from signal handlers safely.
@ -1916,7 +1923,7 @@ static void writeStacktraces(int fd, int uplevel) {
* __attribute__ ((noinline)) to make sure the compiler won't inline them. * __attribute__ ((noinline)) to make sure the compiler won't inline them.
*/ */
__attribute__ ((noinline)) __attribute__ ((noinline))
void logStackTrace(void *eip, int uplevel) { void logStackTrace(void *eip, int uplevel, int current_thread) {
int fd = openDirectLogFiledes(); int fd = openDirectLogFiledes();
char *msg; char *msg;
uplevel++; /* skip this function */ uplevel++; /* skip this function */
@ -1935,7 +1942,17 @@ void logStackTrace(void *eip, int uplevel) {
/* Write symbols to log file */ /* Write symbols to log file */
++uplevel; ++uplevel;
writeStacktraces(fd, uplevel); #ifdef __linux__
if (current_thread) {
writeCurrentThreadsStackTrace(fd, uplevel);
} else {
writeStacktraces(fd, uplevel);
}
#else
/* Outside of linux, we only support writing the current thread. */
UNUSED(current_thread);
writeCurrentThreadsStackTrace(fd, uplevel);
#endif
msg = "\n------ STACK TRACE DONE ------\n"; msg = "\n------ STACK TRACE DONE ------\n";
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */}; if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
@ -2218,15 +2235,14 @@ __attribute__ ((noinline))
static void sigsegvHandler(int sig, siginfo_t *info, void *secret) { static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
UNUSED(secret); UNUSED(secret);
UNUSED(info); UNUSED(info);
int print_full_crash_info = 1;
/* Check if it is safe to enter the signal handler. second thread crashing at the same time will deadlock. */ /* Check if it is safe to enter the signal handler. second thread crashing at the same time will deadlock. */
if(pthread_mutex_lock(&signal_handler_lock) == EDEADLK) { if(pthread_mutex_lock(&signal_handler_lock) == EDEADLK) {
/* If this thread already owns the lock (meaning we crashed during handling a signal) /* If this thread already owns the lock (meaning we crashed during handling a signal) switch
* log that the crash report can't be generated. */ * to printing the minimal information about the crash. */
serverLogRawFromHandler(LL_WARNING, serverLogRawFromHandler(LL_WARNING,
"Crashed running signal handler. Can't continue to generate the crash report"); "Crashed running signal handler. Providing reduced version of recursive crash report.");
/* gracefully exit */ print_full_crash_info = 0;
bugReportEnd(1, sig);
return;
} }
bugReportStart(); bugReportStart();
@ -2260,7 +2276,9 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
getAndSetMcontextEip(uc, ptr); getAndSetMcontextEip(uc, ptr);
} }
logStackTrace(eip, 1); /* When printing the reduced crash info, just print the current thread
* to avoid race conditions with the multi-threaded stack collector. */
logStackTrace(eip, 1, !print_full_crash_info);
if (eip == info->si_addr) { if (eip == info->si_addr) {
/* Restore old eip */ /* Restore old eip */
@ -2270,7 +2288,7 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
logRegisters(uc); logRegisters(uc);
#endif #endif
printCrashReport(); if (print_full_crash_info) printCrashReport();
#ifdef HAVE_BACKTRACE #ifdef HAVE_BACKTRACE
if (eip != NULL) if (eip != NULL)
@ -2430,7 +2448,7 @@ void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret) {
serverLogRawFromHandler(LL_WARNING, "\nReceived SIGALRM"); serverLogRawFromHandler(LL_WARNING, "\nReceived SIGALRM");
} }
#ifdef HAVE_BACKTRACE #ifdef HAVE_BACKTRACE
logStackTrace(getAndSetMcontextEip(uc, NULL), 1); logStackTrace(getAndSetMcontextEip(uc, NULL), 1, 0);
#else #else
serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace()."); serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace().");
#endif #endif

View File

@ -4,10 +4,10 @@ uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
warning_cflags = -W -Wall -Wno-missing-field-initializers warning_cflags = -W -Wall -Wno-missing-field-initializers
ifeq ($(uname_S),Darwin) ifeq ($(uname_S),Darwin)
SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=c99 -O2 SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=gnu11 -O2
SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup
else # Linux, others else # Linux, others
SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=c99 -O2 SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=gnu11 -O2
SHOBJ_LDFLAGS ?= -shared SHOBJ_LDFLAGS ?= -shared
endif endif
@ -62,7 +62,8 @@ TEST_MODULES = \
usercall.so \ usercall.so \
postnotifications.so \ postnotifications.so \
moduleauthtwo.so \ moduleauthtwo.so \
rdbloadsave.so rdbloadsave.so \
crash.so
.PHONY: all .PHONY: all

39
tests/modules/crash.c Normal file
View File

@ -0,0 +1,39 @@
#include "redismodule.h"
#include <strings.h>
#include <sys/mman.h>
#define UNUSED(V) ((void) V)
void assertCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
UNUSED(ctx);
UNUSED(for_crash_report);
RedisModule_Assert(0);
}
void segfaultCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
UNUSED(ctx);
UNUSED(for_crash_report);
/* Compiler gives warnings about writing to a random address
* e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area
* and try to write there to trigger segmentation fault. */
char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
*p = 'x';
}
int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
REDISMODULE_NOT_USED(argv);
REDISMODULE_NOT_USED(argc);
if (RedisModule_Init(ctx,"infocrash",1,REDISMODULE_APIVER_1)
== REDISMODULE_ERR) return REDISMODULE_ERR;
RedisModule_Assert(argc == 1);
if (!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "segfault")) {
if (RedisModule_RegisterInfoFunc(ctx, segfaultCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
} else if(!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "assert")) {
if (RedisModule_RegisterInfoFunc(ctx, assertCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
} else {
return REDISMODULE_ERR;
}
return REDISMODULE_OK;
}

View File

@ -0,0 +1,52 @@
# This file is used to test certain crash edge cases to make sure they produce
# correct stack traces for debugging.
set testmodule [file normalize tests/modules/crash.so]
# Valgrind will complain that the process terminated by a signal, skip it.
if {!$::valgrind} {
start_server {tags {"modules"}} {
r module load $testmodule assert
test {Test module crash when info crashes with an assertion } {
catch {r 0 info infocrash}
set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*ASSERTION FAILED*"} $loglines 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*RECURSIVE ASSERTION FAILED*"} $loglines 10 1000]
set loglines [lindex $res 1]
wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
assert_equal 2 [count_log_message 0 "ASSERTION FAILED"]
# There will be 3 crash assertions, 1 in the first stack trace and 2 in the second
assert_equal 3 [count_log_message 0 "assertCrash"]
assert_equal 1 [count_log_message 0 "RECURSIVE ASSERTION FAILED"]
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
}
}
start_server {tags {"modules"}} {
r module load $testmodule segfault
test {Test module crash when info crashes with a segfault} {
catch {r 0 info infocrash}
set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*Crashed running signal handler. Providing reduced version of recursive crash report*"} $loglines 10 1000]
set loglines [lindex $res 1]
set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
set loglines [lindex $res 1]
wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
assert_equal 1 [count_log_message 0 "Crashed running signal handler. Providing reduced version of recursive crash report"]
assert_equal 2 [count_log_message 0 "Crashed running the instruction at"]
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
}
}
}