feat: expose fiber responsiveness metrics (#2125)

Should allow tracking cases where Dragonfly is not responsive to I/O
due to big CPU tasks. Also, update the local Grafana dashboard.

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
This commit is contained in:
Roman Gershman 2023-11-05 16:56:33 +02:00 committed by GitHub
parent b9781c4903
commit c7db025a48
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 143 additions and 21 deletions

2
helio

@ -1 +1 @@
Subproject commit fe7ec28642c1b699bdc8839296f354d797ee0365 Subproject commit 1fea6effc72919649c815afb04e9c7829b0240ab

View File

@ -850,7 +850,7 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
AppendMetricValue("role", 1, {"role"}, {m.is_master ? "master" : "replica"}, &resp->body()); AppendMetricValue("role", 1, {"role"}, {m.is_master ? "master" : "replica"}, &resp->body());
AppendMetricWithoutLabels("master", "1 if master 0 if replica", m.is_master ? 1 : 0, AppendMetricWithoutLabels("master", "1 if master 0 if replica", m.is_master ? 1 : 0,
MetricType::GAUGE, &resp->body()); MetricType::GAUGE, &resp->body());
AppendMetricWithoutLabels("uptime_in_seconds", "", m.uptime, MetricType::GAUGE, &resp->body()); AppendMetricWithoutLabels("uptime_in_seconds", "", m.uptime, MetricType::COUNTER, &resp->body());
// Clients metrics // Clients metrics
AppendMetricWithoutLabels("connected_clients", "", m.conn_stats.num_conns, MetricType::GAUGE, AppendMetricWithoutLabels("connected_clients", "", m.conn_stats.num_conns, MetricType::GAUGE,
@ -923,7 +923,7 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
&command_metrics); &command_metrics);
for (const auto& [name, stat] : m.cmd_stats_map) { for (const auto& [name, stat] : m.cmd_stats_map) {
const auto calls = stat.first; const auto calls = stat.first;
const auto duration_seconds = stat.second * 0.001; const double duration_seconds = stat.second * 0.001;
AppendMetricValue("commands_total", calls, {"cmd"}, {name}, &command_metrics); AppendMetricValue("commands_total", calls, {"cmd"}, {name}, &command_metrics);
AppendMetricValue("commands_duration_seconds_total", duration_seconds, {"cmd"}, {name}, AppendMetricValue("commands_duration_seconds_total", duration_seconds, {"cmd"}, {name},
&command_metrics); &command_metrics);
@ -944,6 +944,18 @@ void PrintPrometheusMetrics(const Metrics& m, StringResponse* resp) {
absl::StrAppend(&resp->body(), replication_lag_metrics); absl::StrAppend(&resp->body(), replication_lag_metrics);
} }
AppendMetricWithoutLabels("fiber_switch_total", "", m.fiber_switch_cnt, MetricType::COUNTER,
&resp->body());
double delay_seconds = m.fiber_switch_delay_ns * 1e-9;
AppendMetricWithoutLabels("fiber_switch_delay_seconds_total", "", delay_seconds,
MetricType::COUNTER, &resp->body());
AppendMetricWithoutLabels("fiber_longrun_total", "", m.fiber_longrun_cnt, MetricType::COUNTER,
&resp->body());
double longrun_seconds = m.fiber_longrun_ns * 1e-9;
AppendMetricWithoutLabels("fiber_longrun_seconds_total", "", longrun_seconds, MetricType::COUNTER,
&resp->body());
absl::StrAppend(&resp->body(), db_key_metrics); absl::StrAppend(&resp->body(), db_key_metrics);
absl::StrAppend(&resp->body(), db_key_expire_metrics); absl::StrAppend(&resp->body(), db_key_expire_metrics);
} }
@ -1402,6 +1414,11 @@ Metrics ServerFamily::GetMetrics() const {
lock_guard lk(mu); lock_guard lk(mu);
result.fiber_switch_cnt += fb2::FiberSwitchEpoch();
result.fiber_switch_delay_ns += fb2::FiberSwitchDelay();
result.fiber_longrun_cnt += fb2::FiberLongRunCnt();
result.fiber_longrun_ns += fb2::FiberLongRunSum();
result.coordinator_stats += ss->stats; result.coordinator_stats += ss->stats;
result.conn_stats += ss->connection_stats; result.conn_stats += ss->connection_stats;

View File

@ -86,6 +86,12 @@ struct Metrics {
size_t small_string_bytes = 0; size_t small_string_bytes = 0;
uint32_t traverse_ttl_per_sec = 0; uint32_t traverse_ttl_per_sec = 0;
uint32_t delete_ttl_per_sec = 0; uint32_t delete_ttl_per_sec = 0;
uint64_t fiber_switch_cnt = 0;
uint64_t fiber_switch_delay_ns = 0;
// Statistics about fibers running for a long time (more than 1ms).
uint64_t fiber_longrun_cnt = 0;
uint64_t fiber_longrun_ns = 0;
std::map<std::string, std::pair<uint64_t, uint64_t>> cmd_stats_map; // command call frequencies std::map<std::string, std::pair<uint64_t, uint64_t>> cmd_stats_map; // command call frequencies

View File

@ -105,7 +105,7 @@
}, },
"textMode": "auto" "textMode": "auto"
}, },
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"targets": [ "targets": [
{ {
"datasource": { "datasource": {
@ -191,7 +191,7 @@
}, },
"textMode": "auto" "textMode": "auto"
}, },
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"targets": [ "targets": [
{ {
"datasource": { "datasource": {
@ -282,7 +282,7 @@
"showThresholdLabels": false, "showThresholdLabels": false,
"showThresholdMarkers": true "showThresholdMarkers": true
}, },
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"targets": [ "targets": [
{ {
"datasource": { "datasource": {
@ -350,7 +350,7 @@
"alertThreshold": true "alertThreshold": true
}, },
"percentage": false, "percentage": false,
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"pointradius": 5, "pointradius": 5,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
@ -456,7 +456,7 @@
"alertThreshold": true "alertThreshold": true
}, },
"percentage": false, "percentage": false,
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"pointradius": 5, "pointradius": 5,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
@ -574,7 +574,7 @@
"alertThreshold": true "alertThreshold": true
}, },
"percentage": false, "percentage": false,
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"pointradius": 5, "pointradius": 5,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
@ -690,7 +690,7 @@
"alertThreshold": true "alertThreshold": true
}, },
"percentage": false, "percentage": false,
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"pointradius": 5, "pointradius": 5,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
@ -791,7 +791,7 @@
"alertThreshold": true "alertThreshold": true
}, },
"percentage": false, "percentage": false,
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"pointradius": 5, "pointradius": 5,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
@ -912,7 +912,7 @@
"alertThreshold": true "alertThreshold": true
}, },
"percentage": false, "percentage": false,
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"pointradius": 5, "pointradius": 5,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
@ -1033,7 +1033,7 @@
"alertThreshold": true "alertThreshold": true
}, },
"percentage": false, "percentage": false,
"pluginVersion": "10.2.0", "pluginVersion": "9.3.6",
"pointradius": 2, "pointradius": 2,
"points": false, "points": false,
"renderer": "flot", "renderer": "flot",
@ -1110,7 +1110,6 @@
"mode": "palette-classic" "mode": "palette-classic"
}, },
"custom": { "custom": {
"axisBorderShow": false,
"axisCenteredZero": false, "axisCenteredZero": false,
"axisColorMode": "text", "axisColorMode": "text",
"axisLabel": "", "axisLabel": "",
@ -1124,7 +1123,6 @@
"tooltip": false, "tooltip": false,
"viz": false "viz": false
}, },
"insertNulls": false,
"lineInterpolation": "linear", "lineInterpolation": "linear",
"lineWidth": 1, "lineWidth": 1,
"pointSize": 5, "pointSize": 5,
@ -1155,7 +1153,7 @@
} }
] ]
}, },
"unit": "µs" "unit": "s"
}, },
"overrides": [] "overrides": []
}, },
@ -1187,7 +1185,107 @@
"disableTextWrap": false, "disableTextWrap": false,
"editorMode": "code", "editorMode": "code",
"expr": "expr":
"rate(dragonfly_fiber_switch_delay_seconds_total[$__rate_interval])*1000000/rate(dragonfly_fiber_switch_total[$__rate_interval])", "rate(dragonfly_fiber_switch_delay_seconds_total[$__rate_interval])/rate(dragonfly_fiber_switch_total[$__rate_interval])",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "FiberSwitchDelay",
"transformations": [],
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 30
},
"id": 20,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"disableTextWrap": false,
"editorMode": "code",
"expr":
"rate(dragonfly_fiber_longrun_seconds_total[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])",
"fullMetaSearch": false, "fullMetaSearch": false,
"includeNullMetadata": false, "includeNullMetadata": false,
"instant": false, "instant": false,
@ -1203,7 +1301,8 @@
} }
], ],
"refresh": "", "refresh": "",
"schemaVersion": 38, "schemaVersion": 37,
"style": "dark",
"tags": [ "tags": [
"prometheus", "prometheus",
"dragonfly" "dragonfly"

View File

@ -29,14 +29,14 @@ scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config. # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: dragonfly - job_name: dragonfly
scrape_interval: 5s scrape_interval: 1s
static_configs: static_configs:
- targets: ['host.docker.internal:6379'] - targets: ['host.docker.internal:6379']
- job_name: 'prometheus' - job_name: 'prometheus'
# Override the global default and scrape targets from this job every 5 seconds. # Override the global default and scrape targets from this job every 5 seconds.
scrape_interval: 5s scrape_interval: 1s
static_configs: static_configs:
- targets: ['localhost:9090'] - targets: ['localhost:9090']
@ -45,7 +45,7 @@ scrape_configs:
- job_name: 'node-exporter' - job_name: 'node-exporter'
# Override the global default and scrape targets from this job every 5 seconds. # Override the global default and scrape targets from this job every 5 seconds.
scrape_interval: 5s scrape_interval: 1s
static_configs: static_configs:
- targets: ['node-exporter:9100'] - targets: ['node-exporter:9100']
labels: labels: