From 0394387a5ffeee1a4ffe93e148827215ebdfe4fa Mon Sep 17 00:00:00 2001 From: Roman Gershman Date: Thu, 30 May 2024 19:10:35 +0300 Subject: [PATCH] chore: export pipeline related metrics (#3104) * chore: export pipeline related metrics Export in /metrics 1. Total pipeline queue length 2. Total pipeline commands 3. Total pipelined duration Signed-off-by: Roman Gershman --------- Signed-off-by: Roman Gershman --- .github/workflows/ci.yml | 4 + src/facade/conn_context.h | 8 +- src/facade/dragonfly_connection.cc | 28 ++-- src/facade/dragonfly_connection.h | 19 ++- src/facade/facade_types.h | 4 +- src/server/server_family.cc | 11 +- tools/local/monitoring/docker-compose.yml | 2 +- .../provisioning/dashboards/dashboard.json | 143 +++++++++++++++--- .../monitoring/prometheus/prometheus.yml | 7 +- 9 files changed, 169 insertions(+), 57 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 52176dc46..24db5c607 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,10 @@ on: branches: [main] workflow_dispatch: +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: pre-commit: runs-on: ubuntu-latest diff --git a/src/facade/conn_context.h b/src/facade/conn_context.h index b5bce8b4f..b58bef253 100644 --- a/src/facade/conn_context.h +++ b/src/facade/conn_context.h @@ -97,10 +97,14 @@ class ConnectionContext { bool async_dispatch : 1; // whether this connection is amid an async dispatch bool sync_dispatch : 1; // whether this connection is amid a sync dispatch bool journal_emulated : 1; // whether it is used to dispatch journal commands - bool paused = false; // whether this connection is paused due to CLIENT PAUSE + + bool paused = false; // whether this connection is paused due to CLIENT PAUSE // whether it's blocked on blocking commands like BLPOP, needs to be addressable bool blocked = false; + // Skip ACL validation, used by internal commands and commands run on admin port + bool skip_acl_validation = false; + // How many async subscription sources are active: monitor and/or pubsub - at most 2. uint8_t subscriptions; @@ -108,8 +112,6 @@ class ConnectionContext { std::string authed_username{"default"}; uint32_t acl_categories{dfly::acl::ALL}; std::vector acl_commands; - // Skip ACL validation, used by internal commands and commands run on admin port - bool skip_acl_validation = false; // keys dfly::acl::AclKeys keys{{}, true}; diff --git a/src/facade/dragonfly_connection.cc b/src/facade/dragonfly_connection.cc index 2d3cb8cdd..09a3fbeea 100644 --- a/src/facade/dragonfly_connection.cc +++ b/src/facade/dragonfly_connection.cc @@ -393,19 +393,6 @@ size_t Connection::MessageHandle::UsedMemory() const { return sizeof(MessageHandle) + visit(MessageSize{}, this->handle); } -bool Connection::MessageHandle::IsIntrusive() const { - return holds_alternative(handle) || - holds_alternative(handle); -} - -bool Connection::MessageHandle::IsPipelineMsg() const { - return holds_alternative(handle); -} - -bool Connection::MessageHandle::IsPubMsg() const { - return holds_alternative(handle); -} - bool Connection::MessageHandle::IsReplying() const { return IsPipelineMsg() || IsPubMsg() || holds_alternative(handle) || (holds_alternative(handle) && @@ -751,6 +738,9 @@ std::pair Connection::GetClientInfoBeforeAfterTid() co string after; absl::StrAppend(&after, " irqmatch=", int(cpu == my_cpu_id)); + if (dispatch_q_.size()) { + absl::StrAppend(&after, " pipeline=", dispatch_q_.size()); + } absl::StrAppend(&after, " age=", now - creation_time_, " idle=", now - last_interaction_); string_view phase_name = PHASE_NAMES[phase_]; @@ -1272,7 +1262,7 @@ void Connection::SquashPipeline(facade::SinkReplyBuilder* builder) { cc_->async_dispatch = false; auto it = dispatch_q_.begin(); - while (it->IsIntrusive()) // Skip all newly received intrusive messages + while (it->IsControl()) // Skip all newly received intrusive messages ++it; for (auto rit = it; rit != it + dispatched; ++rit) @@ -1291,7 +1281,7 @@ void Connection::ClearPipelinedMessages() { // As well as to avoid pubsub backpressure leakege. for (auto& msg : dispatch_q_) { FiberAtomicGuard guard; // don't suspend when concluding to avoid getting new messages - if (msg.IsIntrusive()) + if (msg.IsControl()) visit(dispatch_op, msg.handle); // to not miss checkpoints RecycleMessage(std::move(msg)); } @@ -1309,7 +1299,7 @@ std::string Connection::DebugInfo() const { absl::StrAppend(&info, "closing=", cc_->conn_closing, ", "); absl::StrAppend(&info, "dispatch_fiber:joinable=", dispatch_fb_.IsJoinable(), ", "); - bool intrusive_front = dispatch_q_.size() > 0 && dispatch_q_.front().IsIntrusive(); + bool intrusive_front = dispatch_q_.size() > 0 && dispatch_q_.front().IsControl(); absl::StrAppend(&info, "dispatch_queue:size=", dispatch_q_.size(), ", "); absl::StrAppend(&info, "dispatch_queue:pipelined=", pending_pipeline_cmd_cnt_, ", "); absl::StrAppend(&info, "dispatch_queue:intrusive=", intrusive_front, ", "); @@ -1549,7 +1539,7 @@ void Connection::SendAsync(MessageHandle msg) { // "Closing" connections might be still processing commands, as we don't interrupt them. // So we still want to deliver control messages to them (like checkpoints). - if (cc_->conn_closing && !msg.IsIntrusive()) + if (cc_->conn_closing && !msg.IsControl()) return; // If we launch while closing, it won't be awaited. Control messages will be processed on cleanup. @@ -1573,9 +1563,9 @@ void Connection::SendAsync(MessageHandle msg) { pending_pipeline_cmd_cnt_++; } - if (msg.IsIntrusive()) { + if (msg.IsControl()) { auto it = dispatch_q_.begin(); - while (it < dispatch_q_.end() && it->IsIntrusive()) + while (it < dispatch_q_.end() && it->IsControl()) ++it; dispatch_q_.insert(it, std::move(msg)); } else { diff --git a/src/facade/dragonfly_connection.h b/src/facade/dragonfly_connection.h index 25b755a3f..929ad5f8b 100644 --- a/src/facade/dragonfly_connection.h +++ b/src/facade/dragonfly_connection.h @@ -147,12 +147,21 @@ class Connection : public util::Connection { struct MessageHandle { size_t UsedMemory() const; // How much bytes this handle takes up in total. - // Intrusive messages put themselves at the front of the queue, but only after all other - // intrusive ones. Used for quick transfer of control / update messages. - bool IsIntrusive() const; + // Control messages put themselves at the front of the queue, but only after all other + // control ones. Used for management messages. + bool IsControl() const { + return std::holds_alternative(handle) || + std::holds_alternative(handle); + } + + bool IsPipelineMsg() const { + return std::holds_alternative(handle); + } + + bool IsPubMsg() const { + return std::holds_alternative(handle); + } - bool IsPipelineMsg() const; - bool IsPubMsg() const; bool IsReplying() const; // control messges don't reply, messages carrying data do std::variantbody()); AppendMetricWithoutLabels("blocked_clients", "", conn_stats.num_blocked_clients, MetricType::GAUGE, &resp->body()); - AppendMetricWithoutLabels("dispatch_queue_bytes", "", conn_stats.dispatch_queue_bytes, + AppendMetricWithoutLabels("pipeline_queue_bytes", "", conn_stats.dispatch_queue_bytes, + MetricType::GAUGE, &resp->body()); + AppendMetricWithoutLabels("pipeline_queue_length", "", conn_stats.dispatch_queue_entries, MetricType::GAUGE, &resp->body()); AppendMetricWithoutLabels("pipeline_cmd_cache_bytes", "", conn_stats.pipeline_cmd_cache_bytes, MetricType::GAUGE, &resp->body()); + AppendMetricWithoutLabels("pipeline_commands_total", "", conn_stats.pipelined_cmd_cnt, + MetricType::COUNTER, &resp->body()); + AppendMetricWithoutLabels("pipeline_commands_duration_seconds", "", + conn_stats.pipelined_cmd_latency * 1e-6, MetricType::COUNTER, + &resp->body()); // Memory metrics auto sdata_res = io::ReadStatusInfo(); @@ -1977,7 +1984,7 @@ void ServerFamily::Info(CmdArgList args, ConnectionContext* cntx) { append("max_clients", GetFlag(FLAGS_maxclients)); append("client_read_buffer_bytes", m.facade_stats.conn_stats.read_buf_capacity); append("blocked_clients", m.facade_stats.conn_stats.num_blocked_clients); - append("dispatch_queue_entries", m.facade_stats.conn_stats.dispatch_queue_entries); + append("pipeline_queue_length", m.facade_stats.conn_stats.dispatch_queue_entries); } if (should_enter("MEMORY")) { diff --git a/tools/local/monitoring/docker-compose.yml b/tools/local/monitoring/docker-compose.yml index eaf09ec0d..6501c83e5 100644 --- a/tools/local/monitoring/docker-compose.yml +++ b/tools/local/monitoring/docker-compose.yml @@ -6,7 +6,7 @@ volumes: services: prometheus: - image: prom/prometheus + image: prom/prometheus:v2.45.5 restart: always volumes: - ./prometheus:/etc/prometheus/ diff --git a/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json b/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json index c12d8d7b4..9c7c4d680 100644 --- a/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json +++ b/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json @@ -105,7 +105,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.3.6", + "pluginVersion": "10.1.10", "targets": [ { "datasource": { @@ -191,7 +191,7 @@ }, "textMode": "auto" }, - "pluginVersion": "9.3.6", + "pluginVersion": "10.1.10", "targets": [ { "datasource": { @@ -282,7 +282,7 @@ "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "9.3.6", + "pluginVersion": "10.1.10", "targets": [ { "datasource": { @@ -1609,10 +1609,24 @@ "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, - "legendFormat": "__auto", + "legendFormat": "switch", "range": true, "refId": "A", "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": + "rate(dragonfly_fiber_longrun_seconds[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "longrun", + "range": true, + "refId": "B" } ], "title": "FiberSwitchDelay", @@ -1622,7 +1636,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "PBFA97CFB590B2093" }, "fieldConfig": { "defaults": { @@ -1673,8 +1687,7 @@ "value": 80 } ] - }, - "unit": "s" + } }, "overrides": [] }, @@ -1684,7 +1697,7 @@ "x": 12, "y": 30 }, - "id": 20, + "id": 22, "options": { "legend": { "calcs": [], @@ -1703,21 +1716,15 @@ "type": "prometheus", "uid": "PBFA97CFB590B2093" }, - "disableTextWrap": false, "editorMode": "code", - "expr": - "rate(dragonfly_fiber_longrun_seconds_total[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])", - "fullMetaSearch": false, - "includeNullMetadata": false, + "expr": "dragonfly_pipeline_queue_length/dragonfly_connected_clients", "instant": false, - "legendFormat": "__auto", + "legendFormat": "avr_pipeline_depth", "range": true, - "refId": "A", - "useBackend": false + "refId": "A" } ], - "title": "FiberSwitchDelay", - "transformations": [], + "title": "Pipeline length", "type": "timeseries" }, { @@ -1826,6 +1833,102 @@ ], "title": "Master Replication memory", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": + "rate(dragonfly_pipeline_commands_duration_seconds[$__rate_interval])/rate(dragonfly_pipeline_commands_total[$__rate_interval])", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Pipeline latency", + "type": "timeseries" } ], "refresh": "1m", @@ -1948,7 +2051,7 @@ ] }, "time": { - "from": "now-5m", + "from": "now-15m", "to": "now" }, "timepicker": { @@ -1979,6 +2082,6 @@ "timezone": "browser", "title": "Dragonfly Dashboard", "uid": "xDLNRKUWz", - "version": 1, + "version": 4, "weekStart": "" } diff --git a/tools/local/monitoring/prometheus/prometheus.yml b/tools/local/monitoring/prometheus/prometheus.yml index 6944a1696..8c6ab22d2 100644 --- a/tools/local/monitoring/prometheus/prometheus.yml +++ b/tools/local/monitoring/prometheus/prometheus.yml @@ -1,8 +1,7 @@ # my global config global: - scrape_interval: 15s # By default, scrape targets every 15 seconds. - evaluation_interval: 15s # By default, scrape targets every 15 seconds. - # scrape_timeout is set to the global default (10s). + scrape_interval: 5s + evaluation_interval: 5s # Attach these labels to any time series or alerts when communicating with # external systems (federation, remote storage, Alertmanager). @@ -43,8 +42,6 @@ scrape_configs: - job_name: 'node-exporter' - - # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 1s static_configs: - targets: ['node-exporter:9100']