valkey/tests/integration/corrupt-dump-fuzzer.tcl

# tests of corrupt listpack payload with valid CRC

# The fuzzer can corrupt the server state in many places, which could
# mess up the replies, so we decided to skip logreqres.
tags {"dump" "corruption" "external:skip" "logreqres:skip"} {

# Turn SIGTERM into a Tcl error when the optional Tclx package is available.
# If one of the random commands hangs the test (usually because the server
# never put a response in the output buffers), the error raised on SIGTERM
# lets the test report which payload / command was in flight.
if {[catch {package require Tclx} err] == 0} {
    signal error SIGTERM
}

# Populate one key per collection type (hash, list, zset, set and stream),
# each named <type><suffix>, with `elements` members.
# All writes are pipelined over a deferring client; the replies are drained
# afterwards and the client is closed.
proc generate_collections {suffix elements} {
    set client [valkey_deferring_client]
    for {set i 0} {$i < $elements} {incr i} {
        # even indexes produce integer members, odd indexes string members
        if {$i % 2 == 0} {
            set member $i
        } else {
            set member "_$i"
        }
        $client hset hash$suffix $i $member
        $client lpush list$suffix $member
        $client zadd zset$suffix $i $member
        $client sadd set$suffix $member
        $client xadd stream$suffix * item 1 value $member
    }
    # five commands were queued per element, so consume that many replies
    for {set n 0} {$n < $elements * 5} {incr n} {
        $client read
    }
    $client close
}

# Fill the dataset with keys of various types and encodings: small and big
# variants of every collection type, a stream with consumer group metadata,
# plus a plain integer and a plain string.
proc generate_types {} {
    # cap the compact encodings at 5 entries
    foreach {param limit} {
        list-max-ziplist-size 5
        hash-max-ziplist-entries 5
        set-max-listpack-entries 5
        zset-max-ziplist-entries 5
        stream-node-max-entries 5
    } {
        r config set $param $limit
    }

    # small objects with 3 items (ziplist / listpack encoded)
    generate_collections "" 3

    # attach some metadata to the small stream: a consumer group that read two
    # entries, the second of which gets deleted and the first acknowledged
    r xgroup create stream mygroup 0
    set reply [r xreadgroup GROUP mygroup Alice COUNT 2 STREAMS stream >]
    set entries [lindex $reply 0 1]
    set first_id [lindex $entries 0 0]
    set second_id [lindex $entries 1 0]
    r xdel stream $second_id
    r xack stream mygroup $first_id

    # non-collection types
    r incr int
    r set string str

    # bigger objects with 10 items (more than a single ziplist / listpack)
    generate_collections big 10

    # make sure the big stream also holds a listpack record whose field names
    # differ from the ones recorded in the master entry
    r xadd streambig * item 1 value 1
    r xadd streambig * item 1 unique value
}

# Return a copy of `payload` in which one byte (or, about 10% of the time,
# two bytes) at random offsets has been overwritten with a random byte value.
# The length of the payload is never changed. The replacement byte is chosen
# uniformly from the full 0..255 range, so it may occasionally be equal to the
# byte it replaces.
proc corrupt_payload {payload} {
    set len [string length $payload]
    set count 1 ;# usually corrupt only one byte
    if {rand() > 0.9} { set count 2 }
    while { $count > 0 } {
        set idx [expr {int(rand() * $len)}]
        # rand() is strictly below 1, so scaling by 256 (not 255) is needed to
        # be able to produce every byte value, including 0xFF
        set ch [binary format c [expr {int(rand()*256)}]]
        set payload [string replace $payload $idx $idx $ch]
        incr count -1
    }
    return $payload
}

# Fuzz tester for corrupt RESTORE payloads.
# The whole scenario runs twice: once with sanitize-dump-payload set to "no"
# and once with "yes". Each cycle DUMPs a random key, corrupts the payload,
# tries to RESTORE it into a new key and, when that succeeds, runs random
# traffic against the restored key.
# valgrind will make sure there were no leaks in the rdb loader error handling code
foreach sanitize_dump {no yes} {
    # the loop below only stops once BOTH the minimal duration and the minimal
    # number of cycles were reached
    if {$::accurate} {
        set min_duration [expr {60 * 10}] ;# run at least 10 minutes
        set min_cycles 1000 ;# run at least 1k cycles (max 16 minutes)
    } else {
        set min_duration 10 ; # run at least 10 seconds
        set min_cycles 10 ; # run at least 10 cycles
    }

    # Barely execute this on FreeBSD (a single cycle / one second) due to a
    # yet-undiscovered memory issue which causes tclsh to bloat.
    if {[exec uname] == "FreeBSD"} {
        set min_cycles 1
        set min_duration 1
    }

    test "Fuzzer corrupt restore payloads - sanitize_dump: $sanitize_dump" {
        # refuse to run when the configured test timeout is less than twice
        # the minimal duration
        if {$min_duration * 2 > $::timeout} {
            fail "insufficient timeout"
        }
        # start a server, fill with data and save an RDB file once (avoid re-save)
        start_server [list overrides [list "save" "" use-exit-on-panic yes crash-memcheck-enabled no loglevel verbose] ] {
            set stdout [srv 0 stdout]
            r config set sanitize-dump-payload $sanitize_dump
            # going by the command name, this makes the server skip checksum
            # validation, presumably so that corrupted payloads are not
            # rejected merely because of a CRC mismatch
            r debug set-skip-checksum-validation 1
            set start_time [clock seconds]
            generate_types
            # remember the key count; it is compared again after each traffic
            # round to detect keys that were left behind
            set dbsize [r dbsize]
            r save
            # statistics gathered over all cycles
            set cycle 0
            set stat_terminated_in_restore 0
            set stat_terminated_in_traffic 0
            set stat_terminated_by_signal 0
            set stat_successful_restore 0
            set stat_rejected_restore 0
            set stat_traffic_commands_sent 0
            # repeatedly DUMP a random key, corrupt it and try RESTORE into a new key
            while true {
                set k [r randomkey]
                set dump [r dump $k]
                set dump [corrupt_payload $dump]
                # printable form of the corrupt payload, used in all reports
                set printable_dump [string2printable $dump]
                set restore_failed false
                set report_and_restart false
                set sent {}
                # RESTORE can fail, but hopefully not terminate
                if { [catch { r restore "_$k" 0 $dump REPLACE } err] } {
                    set restore_failed true
                    # skip if return failed with an error response.
                    if {[string match "ERR*" $err]} {
                        incr stat_rejected_restore
                    } else {
                        # any other failure is treated as a server termination
                        set report_and_restart true
                        incr stat_terminated_in_restore
                        write_log_line 0 "corrupt payload: $printable_dump"
                        if {$sanitize_dump == yes} {
                            puts "Server crashed in RESTORE with payload: $printable_dump"
                        }
                    }
                } else {
                    r ping ;# an attempt to check if the server didn't terminate (this will throw an error that will terminate the tests)
                }

                set print_commands false
                if {!$restore_failed} {
                    # if RESTORE didn't fail or terminate, run some random traffic on the new key
                    incr stat_successful_restore
                    if { [ catch {
                        set sent [generate_fuzzy_traffic_on_key "_$k" 1] ;# traffic for 1 second
                        incr stat_traffic_commands_sent [llength $sent]
                        r del "_$k" ;# in case the server terminated, here's where we'll detect it.
                        # the traffic must not leave extra keys behind; if it
                        # did, print the commands that were sent and abort
                        if {$dbsize != [r dbsize]} {
                            puts "unexpected keys"
                            puts "keys: [r keys *]"
                            puts "commands leading to it:"
                            foreach cmd $sent {
                                foreach arg $cmd {
                                    puts -nonewline "[string2printable $arg] "
                                }
                                puts ""
                            }
                            exit 1
                        }
                    } err ] } {
                        set err [format "%s" $err] ;# convert to string for pattern matching
                        # an error mentioning SIGTERM means the test itself was
                        # signalled (see the Tclx hook at the top of the file):
                        # report the payload and abort the whole run
                        if {[string match "*SIGTERM*" $err]} {
                            puts "payload that caused test to hang: $printable_dump"
                            if {$::dump_logs} {
                                set srv [get_srv 0]
                                dump_server_log $srv
                            }
                            exit 1
                        }
                        # if the server terminated update stats and restart it
                        set report_and_restart true
                        incr stat_terminated_in_traffic
                        # number of "crashed by signal" messages found in the server log
                        set by_signal [count_log_message 0 "crashed by signal"]
                        incr stat_terminated_by_signal $by_signal

                        # a crash by signal is always reported; any other
                        # termination is reported only when sanitization is on
                        if {$by_signal != 0 || $sanitize_dump == yes} {
                            if {$::dump_logs} {
                                set srv [get_srv 0]
                                dump_server_log $srv
                            }

                            puts "Server crashed (by signal: $by_signal), with payload: $printable_dump"
                            set print_commands true
                        }
                    }
                }

                # check valgrind report for invalid reads after each RESTORE
                # payload so that we have a report that is easier to reproduce
                set valgrind_errors [find_valgrind_errors [srv 0 stderr] false]
                set asan_errors [sanitizer_errors_from_file [srv 0 stderr]]
                if {$valgrind_errors != "" || $asan_errors != ""} {
                    puts "valgrind or asan found an issue for payload: $printable_dump"
                    set report_and_restart true
                    set print_commands true
                }

                if {$report_and_restart} {
                    if {$print_commands} {
                        puts "violating commands:"
                        foreach cmd $sent {
                            foreach arg $cmd {
                                puts -nonewline "[string2printable $arg] "
                            }
                            puts ""
                        }
                    }

                    # restart the server and re-apply debug configuration
                    write_log_line 0 "corrupt payload: $printable_dump"
                    restart_server 0 true true
                    r config set sanitize-dump-payload $sanitize_dump
                    r debug set-skip-checksum-validation 1
                }

                # stop only after both the minimal duration has elapsed and
                # the minimal number of cycles was executed
                incr cycle
                if { ([clock seconds]-$start_time) >= $min_duration && $cycle >= $min_cycles} {
                    break
                }
            }
            if {$::verbose} {
                puts "Done $cycle cycles in [expr {[clock seconds]-$start_time}] seconds."
                puts "RESTORE: successful: $stat_successful_restore, rejected: $stat_rejected_restore"
                puts "Total commands sent in traffic: $stat_traffic_commands_sent, crashes during traffic: $stat_terminated_in_traffic ($stat_terminated_by_signal by signal)."
            }
        }
        # if we run sanitization we never expect the server to crash at runtime
        if {$sanitize_dump == yes} {
            assert_equal $stat_terminated_in_restore 0
            assert_equal $stat_terminated_in_traffic 0
        }
        # make sure all terminations were due to assertion and not a SIGSEGV
        assert_equal $stat_terminated_by_signal 0
    }
}


} ;# tags