From aa585f7da09bcb9867a128b1845a677e886cf2dc Mon Sep 17 00:00:00 2001
From: Rob Landers
Date: Thu, 3 Oct 2024 21:53:12 +0200
Subject: [PATCH] handle worker failures gracefully (#1038)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* handle failures gracefully

* fix super-subtle race condition

* address feedback: panic instead of fatal log and make vars into consts

* pass the frankenphp context to worker-ready function

* reset backoff and failures on normal restart

* update docs

* add test and fix race condition

* fail sometimes but do not be pathological about it

* Use title case

Co-authored-by: Kévin Dunglas

* fix code style in php

* define lifecycle metrics

* ensure we unregister the metrics and fix tests

* update caddy tests and fix typo

* update docs

* no need for this

---------

Co-authored-by: Kévin Dunglas
---
 caddy/caddy_test.go         | 30 +++++++++++++
 docs/metrics.md             |  3 ++
 docs/worker.md              |  8 ++++
 frankenphp.c                |  2 +-
 frankenphp.go               |  3 ++
 frankenphp_test.go          | 12 +++++
 metrics.go                  | 87 +++++++++++++++++++++++++++++++++++--
 metrics_test.go             |  3 ++
 testdata/failing-worker.php | 18 ++++++++
 worker.go                   | 79 +++++++++++++++++++++++++++++++--
 10 files changed, 237 insertions(+), 8 deletions(-)
 create mode 100644 testdata/failing-worker.php

diff --git a/caddy/caddy_test.go b/caddy/caddy_test.go
index cc750d6..5dbafe4 100644
--- a/caddy/caddy_test.go
+++ b/caddy/caddy_test.go
@@ -445,6 +445,18 @@ func TestWorkerMetrics(t *testing.T) {
 	# HELP frankenphp_testdata_index_php_worker_request_count
 	# TYPE frankenphp_testdata_index_php_worker_request_count counter
 	frankenphp_testdata_index_php_worker_request_count 10
+
+	# HELP frankenphp_testdata_index_php_ready_workers Running workers that have successfully called frankenphp_handle_request at least once
+	# TYPE frankenphp_testdata_index_php_ready_workers gauge
+	frankenphp_testdata_index_php_ready_workers 2
+
+	# HELP frankenphp_testdata_index_php_worker_crashes Number of PHP worker crashes for this worker
+	# TYPE frankenphp_testdata_index_php_worker_crashes counter
+	frankenphp_testdata_index_php_worker_crashes 0
+
+	# HELP frankenphp_testdata_index_php_worker_restarts Number of PHP worker restarts for this worker
+	# TYPE frankenphp_testdata_index_php_worker_restarts counter
+	frankenphp_testdata_index_php_worker_restarts 0
 	`

 	require.NoError(t,
@@ -456,6 +468,9 @@ func TestWorkerMetrics(t *testing.T) {
 			"frankenphp_testdata_index_php_busy_workers",
 			"frankenphp_testdata_index_php_total_workers",
 			"frankenphp_testdata_index_php_worker_request_count",
+			"frankenphp_testdata_index_php_worker_crashes",
+			"frankenphp_testdata_index_php_worker_restarts",
+			"frankenphp_testdata_index_php_ready_workers",
 		))
 }

@@ -531,6 +546,18 @@ func TestAutoWorkerConfig(t *testing.T) {
 	# HELP frankenphp_testdata_index_php_worker_request_count
 	# TYPE frankenphp_testdata_index_php_worker_request_count counter
 	frankenphp_testdata_index_php_worker_request_count 10
+
+	# HELP frankenphp_testdata_index_php_ready_workers Running workers that have successfully called frankenphp_handle_request at least once
+	# TYPE frankenphp_testdata_index_php_ready_workers gauge
+	frankenphp_testdata_index_php_ready_workers ` + workers + `
+
+	# HELP frankenphp_testdata_index_php_worker_crashes Number of PHP worker crashes for this worker
+	# TYPE frankenphp_testdata_index_php_worker_crashes counter
+	frankenphp_testdata_index_php_worker_crashes 0
+
+	# HELP frankenphp_testdata_index_php_worker_restarts Number of PHP worker restarts for this worker
+	# TYPE frankenphp_testdata_index_php_worker_restarts counter
+	frankenphp_testdata_index_php_worker_restarts 0
 	`

 	require.NoError(t,
@@ -542,5 +569,8 @@ func TestAutoWorkerConfig(t *testing.T) {
 			"frankenphp_testdata_index_php_busy_workers",
 			"frankenphp_testdata_index_php_total_workers",
 			"frankenphp_testdata_index_php_worker_request_count",
+			"frankenphp_testdata_index_php_worker_crashes",
+			"frankenphp_testdata_index_php_worker_restarts",
+			"frankenphp_testdata_index_php_ready_workers",
 		))
 }
diff --git a/docs/metrics.md b/docs/metrics.md
index 3ee4c84..d150205 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -6,6 +6,9 @@ When [Caddy metrics](https://caddyserver.com/docs/metrics) are enabled, FrankenP
 * `frankenphp_[worker]_busy_workers`: The number of workers currently processing a request.
 * `frankenphp_[worker]_worker_request_time`: The time spent processing requests by all workers.
 * `frankenphp_[worker]_worker_request_count`: The number of requests processed by all workers.
+* `frankenphp_[worker]_ready_workers`: The number of workers that have called `frankenphp_handle_request` at least once.
+* `frankenphp_[worker]_worker_crashes`: The number of times a worker has unexpectedly terminated.
+* `frankenphp_[worker]_worker_restarts`: The number of times a worker has been deliberately restarted.
 * `frankenphp_total_threads`: The total number of PHP threads.
 * `frankenphp_busy_threads`: The number of PHP threads currently processing a request (running workers always consume a thread).
diff --git a/docs/worker.md b/docs/worker.md
index c2f4c0e..0ede0db 100644
--- a/docs/worker.md
+++ b/docs/worker.md
@@ -121,6 +121,14 @@ A workaround to using this type of code in worker mode is to restart the worker
 The previous worker snippet allows configuring a maximum number of requests to handle by setting an environment variable named `MAX_REQUESTS`.

+### Worker Failures
+
+If a worker script crashes with a non-zero exit code, FrankenPHP will restart it with an exponential backoff strategy.
+If the worker script then stays up for longer than twice the last backoff delay,
+it is not penalized: the backoff decays and the script is simply restarted again.
+However, if the worker script keeps failing with a non-zero exit code within a short period of time
+(for example, because of a typo in the script), FrankenPHP will crash with the error `too many consecutive failures`.
+
 ## Superglobals Behavior

 [PHP superglobals](https://www.php.net/manual/en/language.variables.superglobals.php) (`$_SERVER`, `$_ENV`, `$_GET`...)
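The restart strategy documented above can be summarized with a standalone sketch. This is an illustration only: the names (`superviseWorker`, `runWorker`) and the constant values are assumptions, and the actual implementation in `worker.go` (later in this patch) uses a watchdog goroutine with `sync.Once` and an `RWMutex` rather than the simple elapsed-time check used here.

```go
package sketch

import (
	"fmt"
	"time"
)

// Illustrative values; the real ones are consts in worker.go.
const (
	minBackoff             = 100 * time.Millisecond
	maxBackoff             = 1 * time.Second
	maxConsecutiveFailures = 6
)

// superviseWorker restarts a worker script forever, backing off
// exponentially on quick crashes and panicking after too many of
// them in a row.
func superviseWorker(runWorker func() (exitCode int)) {
	backoff := minBackoff
	failures := 0

	for {
		started := time.Now()
		exitCode := runWorker()

		if exitCode == 0 {
			// a normal restart resets the backoff and the failure count
			backoff = minBackoff
			failures = 0
			continue
		}

		if time.Since(started) > 2*backoff {
			// the script stayed up for a while: do not penalize it,
			// let the backoff decay and restart immediately
			failures = 0
			backoff = max(backoff/2, minBackoff)
			continue
		}

		// a quick crash: count it, wait, then double the backoff
		failures++
		if failures >= maxConsecutiveFailures {
			panic(fmt.Errorf("too many consecutive failures"))
		}
		time.Sleep(backoff)
		backoff = min(2*backoff, maxBackoff)
	}
}
```

The design intuition: a crash after more than twice the current backoff looks like an application-level failure in an otherwise working script, while a crash sooner than that looks like a script that cannot start at all (for example, a syntax error), which is the case worth aborting on.
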
diff --git a/frankenphp.c b/frankenphp.c
index 747f788..7bcc476 100644
--- a/frankenphp.c
+++ b/frankenphp.c
@@ -342,7 +342,7 @@ PHP_FUNCTION(frankenphp_handle_request) {
     ctx->worker_ready = true;

     /* Mark the worker as ready to handle requests */
-    go_frankenphp_worker_ready();
+    go_frankenphp_worker_ready(ctx->main_request);
   }

 #ifdef ZEND_MAX_EXECUTION_TIMERS
diff --git a/frankenphp.go b/frankenphp.go
index 9b4f492..6b6c2ac 100644
--- a/frankenphp.go
+++ b/frankenphp.go
@@ -125,6 +125,9 @@ type FrankenPHPContext struct {
 	// Whether the request is already closed by us
 	closed sync.Once

+	// whether the context is ready to receive requests
+	ready bool
+
 	responseWriter http.ResponseWriter
 	exitStatus     C.int

diff --git a/frankenphp_test.go b/frankenphp_test.go
index 733ec45..302b37f 100644
--- a/frankenphp_test.go
+++ b/frankenphp_test.go
@@ -609,6 +609,18 @@ func testRequestHeaders(t *testing.T, opts *testOptions) {
 	}, opts)
 }

+func TestFailingWorker(t *testing.T) {
+	runTest(t, func(handler func(http.ResponseWriter, *http.Request), _ *httptest.Server, i int) {
+		req := httptest.NewRequest("GET", "http://example.com/failing-worker.php", nil)
+		w := httptest.NewRecorder()
+		handler(w, req)
+
+		resp := w.Result()
+		body, _ := io.ReadAll(resp.Body)
+		assert.Contains(t, string(body), "ok")
+	}, &testOptions{workerScript: "failing-worker.php"})
+}
+
 func TestFileUpload_module(t *testing.T) { testFileUpload(t, &testOptions{}) }
 func TestFileUpload_worker(t *testing.T) {
 	testFileUpload(t, &testOptions{workerScript: "file-upload.php"})
diff --git a/metrics.go b/metrics.go
index 3b08323..25f671f 100644
--- a/metrics.go
+++ b/metrics.go
@@ -11,11 +11,21 @@ import (
 var metricsNameRegex = regexp.MustCompile(`\W+`)
 var metricsNameFixRegex = regexp.MustCompile(`^_+|_+$`)

+const (
+	StopReasonCrash = iota
+	StopReasonRestart
+	StopReasonShutdown
+)
+
+type StopReason int
+
 type Metrics interface {
 	// StartWorker collects started workers
 	StartWorker(name string)
+	// ReadyWorker collects ready workers
+	ReadyWorker(name string)
 	// StopWorker collects stopped workers
-	StopWorker(name string)
+	StopWorker(name string, reason StopReason)
 	// TotalWorkers collects expected workers
 	TotalWorkers(name string, num int)
 	// TotalThreads collects total threads
@@ -36,7 +46,10 @@ type nullMetrics struct{}
 func (n nullMetrics) StartWorker(name string) {
 }

-func (n nullMetrics) StopWorker(name string) {
+func (n nullMetrics) ReadyWorker(name string) {
+}
+
+func (n nullMetrics) StopWorker(name string, reason StopReason) {
 }

 func (n nullMetrics) TotalWorkers(name string, num int) {
 }
@@ -66,6 +79,9 @@ type PrometheusMetrics struct {
 	busyThreads        prometheus.Gauge
 	totalWorkers       map[string]prometheus.Gauge
 	busyWorkers        map[string]prometheus.Gauge
+	readyWorkers       map[string]prometheus.Gauge
+	workerCrashes      map[string]prometheus.Counter
+	workerRestarts     map[string]prometheus.Counter
 	workerRequestTime  map[string]prometheus.Counter
 	workerRequestCount map[string]prometheus.Counter
 	mu                 sync.Mutex
@@ -81,7 +97,15 @@ func (m *PrometheusMetrics) StartWorker(name string) {
 	m.totalWorkers[name].Inc()
 }

-func (m *PrometheusMetrics) StopWorker(name string) {
+func (m *PrometheusMetrics) ReadyWorker(name string) {
+	if _, ok := m.totalWorkers[name]; !ok {
+		return
+	}
+
+	m.readyWorkers[name].Inc()
+}
+
+func (m *PrometheusMetrics) StopWorker(name string, reason StopReason) {
 	m.busyThreads.Dec()

 	// tests do not register workers before starting them
 	if _, ok := m.totalWorkers[name]; !ok {
 		return
 	}
 	m.totalWorkers[name].Dec()
+	m.readyWorkers[name].Dec()
+
+	if reason == StopReasonCrash {
+		m.workerCrashes[name].Inc()
+	} else if reason == StopReasonRestart {
+		m.workerRestarts[name].Inc()
+	} else if reason == StopReasonShutdown {
+		m.totalWorkers[name].Dec()
+	}
 }

 func (m *PrometheusMetrics) getIdentity(name string) (string, error) {
@@ -122,6 +155,36 @@ func (m *PrometheusMetrics) TotalWorkers(name string, num int) {
 		m.registry.MustRegister(m.totalWorkers[identity])
 	}

+	if _, ok := m.workerCrashes[identity]; !ok {
+		m.workerCrashes[identity] = prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: "frankenphp",
+			Subsystem: subsystem,
+			Name:      "worker_crashes",
+			Help:      "Number of PHP worker crashes for this worker",
+		})
+		m.registry.MustRegister(m.workerCrashes[identity])
+	}
+
+	if _, ok := m.workerRestarts[identity]; !ok {
+		m.workerRestarts[identity] = prometheus.NewCounter(prometheus.CounterOpts{
+			Namespace: "frankenphp",
+			Subsystem: subsystem,
+			Name:      "worker_restarts",
+			Help:      "Number of PHP worker restarts for this worker",
+		})
+		m.registry.MustRegister(m.workerRestarts[identity])
+	}
+
+	if _, ok := m.readyWorkers[identity]; !ok {
+		m.readyWorkers[identity] = prometheus.NewGauge(prometheus.GaugeOpts{
+			Namespace: "frankenphp",
+			Subsystem: subsystem,
+			Name:      "ready_workers",
+			Help:      "Running workers that have successfully called frankenphp_handle_request at least once",
+		})
+		m.registry.MustRegister(m.readyWorkers[identity])
+	}
+
 	if _, ok := m.busyWorkers[identity]; !ok {
 		m.busyWorkers[identity] = prometheus.NewGauge(prometheus.GaugeOpts{
 			Namespace: "frankenphp",
@@ -200,6 +263,18 @@ func (m *PrometheusMetrics) Shutdown() {
 		m.registry.Unregister(c)
 	}

+	for _, c := range m.workerCrashes {
+		m.registry.Unregister(c)
+	}
+
+	for _, c := range m.workerRestarts {
+		m.registry.Unregister(c)
+	}
+
+	for _, g := range m.readyWorkers {
+		m.registry.Unregister(g)
+	}
+
 	m.totalThreads = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "frankenphp_total_threads",
 		Help: "Total number of PHP threads",
@@ -212,6 +287,9 @@
 	m.busyWorkers = map[string]prometheus.Gauge{}
 	m.workerRequestTime = map[string]prometheus.Counter{}
 	m.workerRequestCount = map[string]prometheus.Counter{}
+	m.workerRestarts = map[string]prometheus.Counter{}
+	m.workerCrashes = map[string]prometheus.Counter{}
+	m.readyWorkers = map[string]prometheus.Gauge{}

 	m.registry.MustRegister(m.totalThreads)
 	m.registry.MustRegister(m.busyThreads)
@@ -243,6 +321,9 @@ func NewPrometheusMetrics(registry prometheus.Registerer) *PrometheusMetrics {
 		busyWorkers:        map[string]prometheus.Gauge{},
 		workerRequestTime:  map[string]prometheus.Counter{},
 		workerRequestCount: map[string]prometheus.Counter{},
+		workerRestarts:     map[string]prometheus.Counter{},
+		workerCrashes:      map[string]prometheus.Counter{},
+		readyWorkers:       map[string]prometheus.Gauge{},
 	}

 	m.registry.MustRegister(m.totalThreads)
diff --git a/metrics_test.go b/metrics_test.go
index 2e6640d..0522269 100644
--- a/metrics_test.go
+++ b/metrics_test.go
@@ -38,6 +38,9 @@ func createPrometheusMetrics() *PrometheusMetrics {
 		busyWorkers:        make(map[string]prometheus.Gauge),
 		workerRequestTime:  make(map[string]prometheus.Counter),
 		workerRequestCount: make(map[string]prometheus.Counter),
+		workerCrashes:      make(map[string]prometheus.Counter),
+		workerRestarts:     make(map[string]prometheus.Counter),
+		readyWorkers:       make(map[string]prometheus.Gauge),
 		mu:                 sync.Mutex{},
 	}
 }
diff --git a/testdata/failing-worker.php b/testdata/failing-worker.php
new file mode 100644
index 0000000..e14f476
--- /dev/null
+++ b/testdata/failing-worker.php
@@ -0,0 +1,18 @@
[... the 18 added lines of this PHP script are missing from this copy of the patch ...]
diff --git a/worker.go b/worker.go
[... the start of this diff — the index header and the first hunk, which adds the backoff constants (minBackoff, maxBackoff, maxConsecutiveFailures) and the watchdog goroutine that tracks worker uptime — is missing from this copy of the patch; only the tail of that hunk survives ...]
+				if failureCount > 0 {
+					backoff = max(backoff/2, 100*time.Millisecond)
+				}
+			})
+		}()
+
 		// Create main dummy request
 		r, err := http.NewRequest(http.MethodGet, filepath.Base(absFileName), nil)
@@ -96,22 +127,58 @@ func startWorkers(fileName string, nbWorkers int, env PreparedEnv) error {
 			// TODO: make the max restart configurable
 			if _, ok := workersRequestChans.Load(absFileName); ok {
-				metrics.StopWorker(absFileName)
+				if fc.ready {
+					fc.ready = false
+					workersReadyWG.Add(1)
+				}
+
 				workersReadyWG.Add(1)
 				if fc.exitStatus == 0 {
 					if c := l.Check(zapcore.InfoLevel, "restarting"); c != nil {
 						c.Write(zap.String("worker", absFileName))
 					}
+
+					// a normal restart resets the backoff and failure count
+					backingOffLock.Lock()
+					backoff = minBackoff
+					failureCount = 0
+					backingOffLock.Unlock()
+					metrics.StopWorker(absFileName, StopReasonRestart)
 				} else {
 					if c := l.Check(zapcore.ErrorLevel, "unexpected termination, restarting"); c != nil {
-						c.Write(zap.String("worker", absFileName), zap.Int("exit_status", int(fc.exitStatus)))
+						backingOffLock.RLock()
+						c.Write(zap.String("worker", absFileName), zap.Int("failure_count", failureCount), zap.Int("exit_status", int(fc.exitStatus)), zap.Duration("waiting", backoff))
+						backingOffLock.RUnlock()
 					}
+
+					upFunc.Do(func() {
+						backingOffLock.Lock()
+						defer backingOffLock.Unlock()
+						// if we end up here, the worker has not been up for backoff*2
+						// this is probably due to a syntax error or another fatal error
+						if failureCount >= maxConsecutiveFailures {
+							panic(fmt.Errorf("worker %q: too many consecutive failures", absFileName))
+						} else {
+							failureCount += 1
+						}
+					})
+					backingOffLock.RLock()
+					wait := backoff
+					backingOffLock.RUnlock()
+					time.Sleep(wait)
+					backingOffLock.Lock()
+					backoff *= 2
+					backoff = min(backoff, maxBackoff)
+					backingOffLock.Unlock()
+					metrics.StopWorker(absFileName, StopReasonCrash)
 				}
 			} else {
 				break
 			}
 		}

+		metrics.StopWorker(absFileName, StopReasonShutdown)
+
 		// TODO: check if the termination is expected
 		if c := l.Check(zapcore.DebugLevel, "terminated"); c != nil {
 			c.Write(zap.String("worker", absFileName))
@@ -139,7 +206,11 @@ func stopWorkers() {
 }

 //export go_frankenphp_worker_ready
-func go_frankenphp_worker_ready() {
+func go_frankenphp_worker_ready(mrh C.uintptr_t) {
+	mainRequest := cgo.Handle(mrh).Value().(*http.Request)
+	fc := mainRequest.Context().Value(contextKey).(*FrankenPHPContext)
+	fc.ready = true
+	metrics.ReadyWorker(fc.scriptFilename)
 	workersReadyWG.Done()
 }
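
To see how the lifecycle hooks added by this patch fit together, here is a hypothetical walkthrough of the metrics API. It assumes it runs inside the frankenphp package (for example, as a test next to metrics_test.go); the worker path is made up, and the real callers are startWorkers and go_frankenphp_worker_ready in worker.go.

```go
package frankenphp

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
)

// Hypothetical sketch: drive one worker through the lifecycle tracked by
// the new metrics. The script path is illustrative; the maps are keyed by
// the worker's script filename.
func TestWorkerLifecycleSketch(t *testing.T) {
	registry := prometheus.NewRegistry()
	m := NewPrometheusMetrics(registry)

	worker := "/app/testdata/index.php"

	m.TotalWorkers(worker, 1) // registers the per-worker gauges and counters

	m.StartWorker(worker) // total_workers +1, busy_threads +1
	m.ReadyWorker(worker) // ready_workers +1: frankenphp_handle_request was called

	// An unexpected termination: worker_crashes +1, then the supervisor
	// starts the script again after the backoff.
	m.StopWorker(worker, StopReasonCrash)
	m.StartWorker(worker)
	m.ReadyWorker(worker)

	// A clean exit (exit status 0): worker_restarts +1.
	m.StopWorker(worker, StopReasonRestart)
	m.StartWorker(worker)
	m.ReadyWorker(worker)

	// Shutdown: total_workers is decremented, then all collectors are
	// unregistered and reset.
	m.StopWorker(worker, StopReasonShutdown)
	m.Shutdown()
}
```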