mirror of
http://github.com/valkey-io/valkey
synced 2024-11-22 00:52:38 +00:00
Handle recursive serverAsserts and provide more information for recursive segfaults (#12857)
This change is trying to make two failure modes a bit easier to deep dive: 1. If a serverPanic or serverAssert occurs during the info (or module) printing, it will recursively panic, which is a lot of fun as it will just keep recursively printing. It will eventually stack overflow, but will generate a lot of text in the process. 2. When a segfault happens during the segfault handler, no information is communicated other than it happened. This can be problematic because `info` may help diagnose the real issue, but without fixing the recursive crash it might be hard to get at that info.
This commit is contained in:
parent
c3f8b542ee
commit
068051e378
@ -55,4 +55,5 @@ $TCLSH tests/test_helper.tcl \
|
|||||||
--single unit/moduleapi/async_rm_call \
|
--single unit/moduleapi/async_rm_call \
|
||||||
--single unit/moduleapi/moduleauth \
|
--single unit/moduleapi/moduleauth \
|
||||||
--single unit/moduleapi/rdbloadsave \
|
--single unit/moduleapi/rdbloadsave \
|
||||||
|
--single unit/moduleapi/crash \
|
||||||
"${@}"
|
"${@}"
|
||||||
|
66
src/debug.c
66
src/debug.c
@ -72,10 +72,10 @@ static pthread_mutex_t signal_handler_lock;
|
|||||||
static pthread_mutexattr_t signal_handler_lock_attr;
|
static pthread_mutexattr_t signal_handler_lock_attr;
|
||||||
static volatile int signal_handler_lock_initialized = 0;
|
static volatile int signal_handler_lock_initialized = 0;
|
||||||
/* Forward declarations */
|
/* Forward declarations */
|
||||||
void bugReportStart(void);
|
int bugReportStart(void);
|
||||||
void printCrashReport(void);
|
void printCrashReport(void);
|
||||||
void bugReportEnd(int killViaSignal, int sig);
|
void bugReportEnd(int killViaSignal, int sig);
|
||||||
void logStackTrace(void *eip, int uplevel);
|
void logStackTrace(void *eip, int uplevel, int current_thread);
|
||||||
void dbGetStats(char *buf, size_t bufsize, redisDb *db, int full, dbKeyType keyType);
|
void dbGetStats(char *buf, size_t bufsize, redisDb *db, int full, dbKeyType keyType);
|
||||||
void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret);
|
void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret);
|
||||||
|
|
||||||
@ -1031,15 +1031,17 @@ NULL
|
|||||||
|
|
||||||
__attribute__ ((noinline))
|
__attribute__ ((noinline))
|
||||||
void _serverAssert(const char *estr, const char *file, int line) {
|
void _serverAssert(const char *estr, const char *file, int line) {
|
||||||
bugReportStart();
|
int new_report = bugReportStart();
|
||||||
serverLog(LL_WARNING,"=== ASSERTION FAILED ===");
|
serverLog(LL_WARNING,"=== %sASSERTION FAILED ===", new_report ? "" : "RECURSIVE ");
|
||||||
serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
|
serverLog(LL_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
|
||||||
|
|
||||||
if (server.crashlog_enabled) {
|
if (server.crashlog_enabled) {
|
||||||
#ifdef HAVE_BACKTRACE
|
#ifdef HAVE_BACKTRACE
|
||||||
logStackTrace(NULL, 1);
|
logStackTrace(NULL, 1, 0);
|
||||||
#endif
|
#endif
|
||||||
printCrashReport();
|
/* If this was a recursive assertion, it what most likely generated
|
||||||
|
* from printCrashReport. */
|
||||||
|
if (new_report) printCrashReport();
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove the signal handler so on abort() we will output the crash report.
|
// remove the signal handler so on abort() we will output the crash report.
|
||||||
@ -1127,16 +1129,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) {
|
|||||||
vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap);
|
vsnprintf(fmtmsg,sizeof(fmtmsg),msg,ap);
|
||||||
va_end(ap);
|
va_end(ap);
|
||||||
|
|
||||||
bugReportStart();
|
int new_report = bugReportStart();
|
||||||
serverLog(LL_WARNING,"------------------------------------------------");
|
serverLog(LL_WARNING,"------------------------------------------------");
|
||||||
serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue");
|
serverLog(LL_WARNING,"!!! Software Failure. Press left mouse button to continue");
|
||||||
serverLog(LL_WARNING,"Guru Meditation: %s #%s:%d",fmtmsg,file,line);
|
serverLog(LL_WARNING,"Guru Meditation: %s #%s:%d",fmtmsg,file,line);
|
||||||
|
|
||||||
if (server.crashlog_enabled) {
|
if (server.crashlog_enabled) {
|
||||||
#ifdef HAVE_BACKTRACE
|
#ifdef HAVE_BACKTRACE
|
||||||
logStackTrace(NULL, 1);
|
logStackTrace(NULL, 1, 0);
|
||||||
#endif
|
#endif
|
||||||
printCrashReport();
|
/* If this was a recursive panic, it what most likely generated
|
||||||
|
* from printCrashReport. */
|
||||||
|
if (new_report) printCrashReport();
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove the signal handler so on abort() we will output the crash report.
|
// remove the signal handler so on abort() we will output the crash report.
|
||||||
@ -1144,14 +1148,18 @@ void _serverPanic(const char *file, int line, const char *msg, ...) {
|
|||||||
bugReportEnd(0, 0);
|
bugReportEnd(0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void bugReportStart(void) {
|
/* Start a bug report, returning 1 if this is the first time this function was called, 0 otherwise. */
|
||||||
|
int bugReportStart(void) {
|
||||||
pthread_mutex_lock(&bug_report_start_mutex);
|
pthread_mutex_lock(&bug_report_start_mutex);
|
||||||
if (bug_report_start == 0) {
|
if (bug_report_start == 0) {
|
||||||
serverLogRaw(LL_WARNING|LL_RAW,
|
serverLogRaw(LL_WARNING|LL_RAW,
|
||||||
"\n\n=== REDIS BUG REPORT START: Cut & paste starting from here ===\n");
|
"\n\n=== REDIS BUG REPORT START: Cut & paste starting from here ===\n");
|
||||||
bug_report_start = 1;
|
bug_report_start = 1;
|
||||||
|
pthread_mutex_unlock(&bug_report_start_mutex);
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
pthread_mutex_unlock(&bug_report_start_mutex);
|
pthread_mutex_unlock(&bug_report_start_mutex);
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAVE_BACKTRACE
|
#ifdef HAVE_BACKTRACE
|
||||||
@ -1895,9 +1903,9 @@ static void writeStacktraces(int fd, int uplevel) {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#else /* __linux__*/
|
#endif /* __linux__ */
|
||||||
__attribute__ ((noinline))
|
__attribute__ ((noinline))
|
||||||
static void writeStacktraces(int fd, int uplevel) {
|
static void writeCurrentThreadsStackTrace(int fd, int uplevel) {
|
||||||
void *trace[BACKTRACE_MAX_SIZE];
|
void *trace[BACKTRACE_MAX_SIZE];
|
||||||
|
|
||||||
int trace_size = backtrace(trace, BACKTRACE_MAX_SIZE);
|
int trace_size = backtrace(trace, BACKTRACE_MAX_SIZE);
|
||||||
@ -1906,7 +1914,6 @@ static void writeStacktraces(int fd, int uplevel) {
|
|||||||
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
|
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
|
||||||
backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd);
|
backtrace_symbols_fd(trace+uplevel, trace_size-uplevel, fd);
|
||||||
}
|
}
|
||||||
#endif /* __linux__ */
|
|
||||||
|
|
||||||
/* Logs the stack trace using the backtrace() call. This function is designed
|
/* Logs the stack trace using the backtrace() call. This function is designed
|
||||||
* to be called from signal handlers safely.
|
* to be called from signal handlers safely.
|
||||||
@ -1916,7 +1923,7 @@ static void writeStacktraces(int fd, int uplevel) {
|
|||||||
* __attribute__ ((noinline)) to make sure the compiler won't inline them.
|
* __attribute__ ((noinline)) to make sure the compiler won't inline them.
|
||||||
*/
|
*/
|
||||||
__attribute__ ((noinline))
|
__attribute__ ((noinline))
|
||||||
void logStackTrace(void *eip, int uplevel) {
|
void logStackTrace(void *eip, int uplevel, int current_thread) {
|
||||||
int fd = openDirectLogFiledes();
|
int fd = openDirectLogFiledes();
|
||||||
char *msg;
|
char *msg;
|
||||||
uplevel++; /* skip this function */
|
uplevel++; /* skip this function */
|
||||||
@ -1935,7 +1942,17 @@ void logStackTrace(void *eip, int uplevel) {
|
|||||||
|
|
||||||
/* Write symbols to log file */
|
/* Write symbols to log file */
|
||||||
++uplevel;
|
++uplevel;
|
||||||
writeStacktraces(fd, uplevel);
|
#ifdef __linux__
|
||||||
|
if (current_thread) {
|
||||||
|
writeCurrentThreadsStackTrace(fd, uplevel);
|
||||||
|
} else {
|
||||||
|
writeStacktraces(fd, uplevel);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
/* Outside of linux, we only support writing the current thread. */
|
||||||
|
UNUSED(current_thread);
|
||||||
|
writeCurrentThreadsStackTrace(fd, uplevel);
|
||||||
|
#endif
|
||||||
msg = "\n------ STACK TRACE DONE ------\n";
|
msg = "\n------ STACK TRACE DONE ------\n";
|
||||||
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
|
if (write(fd,msg,strlen(msg)) == -1) {/* Avoid warning. */};
|
||||||
|
|
||||||
@ -2218,15 +2235,14 @@ __attribute__ ((noinline))
|
|||||||
static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
|
static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
|
||||||
UNUSED(secret);
|
UNUSED(secret);
|
||||||
UNUSED(info);
|
UNUSED(info);
|
||||||
|
int print_full_crash_info = 1;
|
||||||
/* Check if it is safe to enter the signal handler. second thread crashing at the same time will deadlock. */
|
/* Check if it is safe to enter the signal handler. second thread crashing at the same time will deadlock. */
|
||||||
if(pthread_mutex_lock(&signal_handler_lock) == EDEADLK) {
|
if(pthread_mutex_lock(&signal_handler_lock) == EDEADLK) {
|
||||||
/* If this thread already owns the lock (meaning we crashed during handling a signal)
|
/* If this thread already owns the lock (meaning we crashed during handling a signal) switch
|
||||||
* log that the crash report can't be generated. */
|
* to printing the minimal information about the crash. */
|
||||||
serverLogRawFromHandler(LL_WARNING,
|
serverLogRawFromHandler(LL_WARNING,
|
||||||
"Crashed running signal handler. Can't continue to generate the crash report");
|
"Crashed running signal handler. Providing reduced version of recursive crash report.");
|
||||||
/* gracefully exit */
|
print_full_crash_info = 0;
|
||||||
bugReportEnd(1, sig);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bugReportStart();
|
bugReportStart();
|
||||||
@ -2260,7 +2276,9 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
|
|||||||
getAndSetMcontextEip(uc, ptr);
|
getAndSetMcontextEip(uc, ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
logStackTrace(eip, 1);
|
/* When printing the reduced crash info, just print the current thread
|
||||||
|
* to avoid race conditions with the multi-threaded stack collector. */
|
||||||
|
logStackTrace(eip, 1, !print_full_crash_info);
|
||||||
|
|
||||||
if (eip == info->si_addr) {
|
if (eip == info->si_addr) {
|
||||||
/* Restore old eip */
|
/* Restore old eip */
|
||||||
@ -2270,7 +2288,7 @@ static void sigsegvHandler(int sig, siginfo_t *info, void *secret) {
|
|||||||
logRegisters(uc);
|
logRegisters(uc);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
printCrashReport();
|
if (print_full_crash_info) printCrashReport();
|
||||||
|
|
||||||
#ifdef HAVE_BACKTRACE
|
#ifdef HAVE_BACKTRACE
|
||||||
if (eip != NULL)
|
if (eip != NULL)
|
||||||
@ -2430,7 +2448,7 @@ void sigalrmSignalHandler(int sig, siginfo_t *info, void *secret) {
|
|||||||
serverLogRawFromHandler(LL_WARNING, "\nReceived SIGALRM");
|
serverLogRawFromHandler(LL_WARNING, "\nReceived SIGALRM");
|
||||||
}
|
}
|
||||||
#ifdef HAVE_BACKTRACE
|
#ifdef HAVE_BACKTRACE
|
||||||
logStackTrace(getAndSetMcontextEip(uc, NULL), 1);
|
logStackTrace(getAndSetMcontextEip(uc, NULL), 1, 0);
|
||||||
#else
|
#else
|
||||||
serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace().");
|
serverLogRawFromHandler(LL_WARNING,"Sorry: no support for backtrace().");
|
||||||
#endif
|
#endif
|
||||||
|
@ -4,10 +4,10 @@ uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
|
|||||||
|
|
||||||
warning_cflags = -W -Wall -Wno-missing-field-initializers
|
warning_cflags = -W -Wall -Wno-missing-field-initializers
|
||||||
ifeq ($(uname_S),Darwin)
|
ifeq ($(uname_S),Darwin)
|
||||||
SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=c99 -O2
|
SHOBJ_CFLAGS ?= $(warning_cflags) -dynamic -fno-common -g -ggdb -std=gnu11 -O2
|
||||||
SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup
|
SHOBJ_LDFLAGS ?= -bundle -undefined dynamic_lookup
|
||||||
else # Linux, others
|
else # Linux, others
|
||||||
SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=c99 -O2
|
SHOBJ_CFLAGS ?= $(warning_cflags) -fno-common -g -ggdb -std=gnu11 -O2
|
||||||
SHOBJ_LDFLAGS ?= -shared
|
SHOBJ_LDFLAGS ?= -shared
|
||||||
endif
|
endif
|
||||||
|
|
||||||
@ -62,7 +62,8 @@ TEST_MODULES = \
|
|||||||
usercall.so \
|
usercall.so \
|
||||||
postnotifications.so \
|
postnotifications.so \
|
||||||
moduleauthtwo.so \
|
moduleauthtwo.so \
|
||||||
rdbloadsave.so
|
rdbloadsave.so \
|
||||||
|
crash.so
|
||||||
|
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
|
|
||||||
|
39
tests/modules/crash.c
Normal file
39
tests/modules/crash.c
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
#include "redismodule.h"
|
||||||
|
|
||||||
|
#include <strings.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
|
||||||
|
#define UNUSED(V) ((void) V)
|
||||||
|
|
||||||
|
void assertCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
|
||||||
|
UNUSED(ctx);
|
||||||
|
UNUSED(for_crash_report);
|
||||||
|
RedisModule_Assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
void segfaultCrash(RedisModuleInfoCtx *ctx, int for_crash_report) {
|
||||||
|
UNUSED(ctx);
|
||||||
|
UNUSED(for_crash_report);
|
||||||
|
/* Compiler gives warnings about writing to a random address
|
||||||
|
* e.g "*((char*)-1) = 'x';". As a workaround, we map a read-only area
|
||||||
|
* and try to write there to trigger segmentation fault. */
|
||||||
|
char *p = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||||
|
*p = 'x';
|
||||||
|
}
|
||||||
|
|
||||||
|
int RedisModule_OnLoad(RedisModuleCtx *ctx, RedisModuleString **argv, int argc) {
|
||||||
|
REDISMODULE_NOT_USED(argv);
|
||||||
|
REDISMODULE_NOT_USED(argc);
|
||||||
|
if (RedisModule_Init(ctx,"infocrash",1,REDISMODULE_APIVER_1)
|
||||||
|
== REDISMODULE_ERR) return REDISMODULE_ERR;
|
||||||
|
RedisModule_Assert(argc == 1);
|
||||||
|
if (!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "segfault")) {
|
||||||
|
if (RedisModule_RegisterInfoFunc(ctx, segfaultCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
|
||||||
|
} else if(!strcasecmp(RedisModule_StringPtrLen(argv[0], NULL), "assert")) {
|
||||||
|
if (RedisModule_RegisterInfoFunc(ctx, assertCrash) == REDISMODULE_ERR) return REDISMODULE_ERR;
|
||||||
|
} else {
|
||||||
|
return REDISMODULE_ERR;
|
||||||
|
}
|
||||||
|
|
||||||
|
return REDISMODULE_OK;
|
||||||
|
}
|
52
tests/unit/moduleapi/crash.tcl
Normal file
52
tests/unit/moduleapi/crash.tcl
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
# This file is used to test certain crash edge cases to make sure they produce
|
||||||
|
# correct stack traces for debugging.
|
||||||
|
set testmodule [file normalize tests/modules/crash.so]
|
||||||
|
|
||||||
|
# Valgrind will complain that the process terminated by a signal, skip it.
|
||||||
|
if {!$::valgrind} {
|
||||||
|
start_server {tags {"modules"}} {
|
||||||
|
r module load $testmodule assert
|
||||||
|
test {Test module crash when info crashes with an assertion } {
|
||||||
|
catch {r 0 info infocrash}
|
||||||
|
set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
|
||||||
|
set loglines [lindex $res 1]
|
||||||
|
|
||||||
|
set res [wait_for_log_messages 0 {"*ASSERTION FAILED*"} $loglines 10 1000]
|
||||||
|
set loglines [lindex $res 1]
|
||||||
|
|
||||||
|
set res [wait_for_log_messages 0 {"*RECURSIVE ASSERTION FAILED*"} $loglines 10 1000]
|
||||||
|
set loglines [lindex $res 1]
|
||||||
|
|
||||||
|
wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
|
||||||
|
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
|
||||||
|
assert_equal 2 [count_log_message 0 "ASSERTION FAILED"]
|
||||||
|
# There will be 3 crash assertions, 1 in the first stack trace and 2 in the second
|
||||||
|
assert_equal 3 [count_log_message 0 "assertCrash"]
|
||||||
|
assert_equal 1 [count_log_message 0 "RECURSIVE ASSERTION FAILED"]
|
||||||
|
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
start_server {tags {"modules"}} {
|
||||||
|
r module load $testmodule segfault
|
||||||
|
test {Test module crash when info crashes with a segfault} {
|
||||||
|
catch {r 0 info infocrash}
|
||||||
|
set res [wait_for_log_messages 0 {"*=== REDIS BUG REPORT START: Cut & paste starting from here ===*"} 0 10 1000]
|
||||||
|
set loglines [lindex $res 1]
|
||||||
|
|
||||||
|
set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
|
||||||
|
set loglines [lindex $res 1]
|
||||||
|
|
||||||
|
set res [wait_for_log_messages 0 {"*Crashed running signal handler. Providing reduced version of recursive crash report*"} $loglines 10 1000]
|
||||||
|
set loglines [lindex $res 1]
|
||||||
|
set res [wait_for_log_messages 0 {"*Crashed running the instruction at*"} $loglines 10 1000]
|
||||||
|
set loglines [lindex $res 1]
|
||||||
|
|
||||||
|
wait_for_log_messages 0 {"*=== REDIS BUG REPORT END. Make sure to include from START to END. ===*"} $loglines 10 1000
|
||||||
|
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT END. Make sure to include from START to END. ==="]
|
||||||
|
assert_equal 1 [count_log_message 0 "Crashed running signal handler. Providing reduced version of recursive crash report"]
|
||||||
|
assert_equal 2 [count_log_message 0 "Crashed running the instruction at"]
|
||||||
|
assert_equal 1 [count_log_message 0 "=== REDIS BUG REPORT START: Cut & paste starting from here ==="]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user