From c7db025a4811ebf99c3be2f9388e1019aa10cfe5 Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Sun, 5 Nov 2023 16:56:33 +0200 Subject: [PATCH] feat: expose fiber responsiveness metrics (#2125) Should allow tracking cases where Dragonfly is not responsive to I/O due to big CPU tasks. Also, update the local grafana dashboard. Signed-off-by: Roman Gershman --- helio | 2 +- src/server/server_family.cc | 21 ++- src/server/server_family.h | 6 + .../provisioning/dashboards/dashboard.json | 129 ++++++++++++++++-- .../monitoring/prometheus/prometheus.yml | 6 +- 5 files changed, 143 insertions(+), 21 deletions(-) diff --git a/helio b/helio index fe7ec2864..1fea6effc 160000 --- a/helio +++ b/helio @@ -1 +1 @@ -Subproject commit fe7ec28642c1b699bdc8839296f354d797ee0365 +Subproject commit 1fea6effc72919649c815afb04e9c7829b0240ab diff --git a/src/server/server_family.cc b/src/server/server_family.cc index 1c64ca895..38544d89f 100644 --- a/src/server/server_family.cc +++ b/src/server/server_family.cc @@ -850,7 +850,7 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) { AppendMetricValue("role", 1, {"role"}, {m.is_master ? "master" : "replica"}, &resp->body()); AppendMetricWithoutLabels("master", "1 if master 0 if replica", m.is_master ?
1 : 0, MetricType::GAUGE, &resp->body()); - AppendMetricWithoutLabels("uptime_in_seconds", "", m.uptime, MetricType::GAUGE, &resp->body()); + AppendMetricWithoutLabels("uptime_in_seconds", "", m.uptime, MetricType::COUNTER, &resp->body()); // Clients metrics AppendMetricWithoutLabels("connected_clients", "", m.conn_stats.num_conns, MetricType::GAUGE, @@ -923,7 +923,7 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) { &command_metrics); for (const auto& [name, stat] : m.cmd_stats_map) { const auto calls = stat.first; - const auto duration_seconds = stat.second * 0.001; + const double duration_seconds = stat.second * 0.001; AppendMetricValue("commands_total", calls, {"cmd"}, {name}, &command_metrics); AppendMetricValue("commands_duration_seconds_total", duration_seconds, {"cmd"}, {name}, &command_metrics); @@ -944,6 +944,18 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) { absl::StrAppend(&resp->body(), replication_lag_metrics); } + AppendMetricWithoutLabels("fiber_switch_total", "", m.fiber_switch_cnt, MetricType::COUNTER, + &resp->body()); + double delay_seconds = m.fiber_switch_delay_ns * 1e-9; + AppendMetricWithoutLabels("fiber_switch_delay_seconds_total", "", delay_seconds, + MetricType::COUNTER, &resp->body()); + + AppendMetricWithoutLabels("fiber_longrun_total", "", m.fiber_longrun_cnt, MetricType::COUNTER, + &resp->body()); + double longrun_seconds = m.fiber_longrun_ns * 1e-9; + AppendMetricWithoutLabels("fiber_longrun_seconds_total", "", longrun_seconds, MetricType::COUNTER, + &resp->body()); + absl::StrAppend(&resp->body(), db_key_metrics); absl::StrAppend(&resp->body(), db_key_expire_metrics); } @@ -1402,6 +1414,11 @@ Metrics ServerFamily::GetMetrics() const { lock_guard lk(mu); + result.fiber_switch_cnt += fb2::FiberSwitchEpoch(); + result.fiber_switch_delay_ns += fb2::FiberSwitchDelay(); + result.fiber_longrun_cnt += fb2::FiberLongRunCnt(); + result.fiber_longrun_ns += fb2::FiberLongRunSum(); + 
result.coordinator_stats += ss->stats; result.conn_stats += ss->connection_stats; diff --git a/src/server/server_family.h b/src/server/server_family.h index c168b0878..dfff5875b 100644 --- a/src/server/server_family.h +++ b/src/server/server_family.h @@ -86,6 +86,12 @@ struct Metrics { size_t small_string_bytes = 0; uint32_t traverse_ttl_per_sec = 0; uint32_t delete_ttl_per_sec = 0; + uint64_t fiber_switch_cnt = 0; + uint64_t fiber_switch_delay_ns = 0; + + // Statistics about fibers running for a long time (more than 1ms). + uint64_t fiber_longrun_cnt = 0; + uint64_t fiber_longrun_ns = 0; std::map> cmd_stats_map; // command call frequencies diff --git a/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json b/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json index 64a58ee0b..3a7575795 100644 --- a/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json +++ b/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json @@ -105,7 +105,7 @@ }, "textMode": "auto" }, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "targets": [ { "datasource": { @@ -191,7 +191,7 @@ }, "textMode": "auto" }, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "targets": [ { "datasource": { @@ -282,7 +282,7 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "targets": [ { "datasource": { @@ -350,7 +350,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "pointradius": 5, "points": false, "renderer": "flot", @@ -456,7 +456,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "pointradius": 5, "points": false, "renderer": "flot", @@ -574,7 +574,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "pointradius": 5, "points": false, "renderer": "flot", @@ -690,7 +690,7 @@ 
"alertThreshold": true }, "percentage": false, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "pointradius": 5, "points": false, "renderer": "flot", @@ -791,7 +791,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "pointradius": 5, "points": false, "renderer": "flot", @@ -912,7 +912,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "pointradius": 5, "points": false, "renderer": "flot", @@ -1033,7 +1033,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "10.2.0", + "pluginVersion": "9.3.6", "pointradius": 2, "points": false, "renderer": "flot", @@ -1110,7 +1110,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -1124,7 +1123,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -1155,7 +1153,7 @@ } ] }, - "unit": "µs" + "unit": "s" }, "overrides": [] }, @@ -1187,7 +1185,107 @@ "disableTextWrap": false, "editorMode": "code", "expr": - "rate(dragonfly_fiber_switch_delay_seconds_total[$__rate_interval])*1000000/rate(dragonfly_fiber_switch_total[$__rate_interval])", + "rate(dragonfly_fiber_switch_delay_seconds_total[$__rate_interval])/rate(dragonfly_fiber_switch_total[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "FiberSwitchDelay", + "transformations": [], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": + "rate(dragonfly_fiber_longrun_seconds_total[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -1203,7 +1301,8 @@ } ], "refresh": "", - "schemaVersion": 38, + "schemaVersion": 37, + "style": "dark", "tags": [ "prometheus", "dragonfly" diff --git a/tools/local/monitoring/prometheus/prometheus.yml b/tools/local/monitoring/prometheus/prometheus.yml index 67eb6b45e..6944a1696 100644 --- a/tools/local/monitoring/prometheus/prometheus.yml +++ b/tools/local/monitoring/prometheus/prometheus.yml @@ -29,14 +29,14 @@ scrape_configs: # The job name is added as a label `job=` to any timeseries scraped from this config. - job_name: dragonfly - scrape_interval: 5s + scrape_interval: 1s static_configs: - targets: ['host.docker.internal:6379'] - job_name: 'prometheus' # Override the global default and scrape targets from this job every 5 seconds. 
- scrape_interval: 5s + scrape_interval: 1s static_configs: - targets: ['localhost:9090'] @@ -45,7 +45,7 @@ scrape_configs: - job_name: 'node-exporter' # Override the global default and scrape targets from this job every 5 seconds. - scrape_interval: 5s + scrape_interval: 1s static_configs: - targets: ['node-exporter:9100'] labels: