From 0394387a5ffeee1a4ffe93e148827215ebdfe4fa Mon Sep 17 00:00:00 2001
From: Roman Gershman <roman@dragonflydb.io>
Date: Thu, 30 May 2024 19:10:35 +0300
Subject: [PATCH] chore: export pipeline related metrics (#3104)

* chore: export pipeline related metrics

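Export the following via the /metrics endpoint:
1. Total pipeline queue length (dragonfly_pipeline_queue_length)
2. Total pipeline commands (dragonfly_pipeline_commands_total)
3. Total pipelined duration (dragonfly_pipeline_commands_duration_seconds)

Note: the dispatch_queue_bytes metric is renamed to pipeline_queue_bytes, and
the INFO field dispatch_queue_entries is renamed to pipeline_queue_length.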

Signed-off-by: Roman Gershman <roman@dragonflydb.io>

---------

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
---
 .github/workflows/ci.yml                      |   4 +
 src/facade/conn_context.h                     |   8 +-
 src/facade/dragonfly_connection.cc            |  28 ++--
 src/facade/dragonfly_connection.h             |  19 ++-
 src/facade/facade_types.h                     |   4 +-
 src/server/server_family.cc                   |  11 +-
 tools/local/monitoring/docker-compose.yml     |   2 +-
 .../provisioning/dashboards/dashboard.json    | 143 +++++++++++++++---
 .../monitoring/prometheus/prometheus.yml      |   7 +-
 9 files changed, 169 insertions(+), 57 deletions(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 52176dc46..24db5c607 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,6 +7,10 @@ on:
     branches: [main]
   workflow_dispatch:
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:
   pre-commit:
     runs-on: ubuntu-latest
diff --git a/src/facade/conn_context.h b/src/facade/conn_context.h
index b5bce8b4f..b58bef253 100644
--- a/src/facade/conn_context.h
+++ b/src/facade/conn_context.h
@@ -97,10 +97,14 @@ class ConnectionContext {
   bool async_dispatch : 1;    // whether this connection is amid an async dispatch
   bool sync_dispatch : 1;     // whether this connection is amid a sync dispatch
   bool journal_emulated : 1;  // whether it is used to dispatch journal commands
-  bool paused = false;        // whether this connection is paused due to CLIENT PAUSE
+
+  bool paused = false;  // whether this connection is paused due to CLIENT PAUSE
   // whether it's blocked on blocking commands like BLPOP, needs to be addressable
   bool blocked = false;
 
+  // Skip ACL validation, used by internal commands and commands run on admin port
+  bool skip_acl_validation = false;
+
   // How many async subscription sources are active: monitor and/or pubsub - at most 2.
   uint8_t subscriptions;
 
@@ -108,8 +112,6 @@ class ConnectionContext {
   std::string authed_username{"default"};
   uint32_t acl_categories{dfly::acl::ALL};
   std::vector<uint64_t> acl_commands;
-  // Skip ACL validation, used by internal commands and commands run on admin port
-  bool skip_acl_validation = false;
   // keys
   dfly::acl::AclKeys keys{{}, true};
 
diff --git a/src/facade/dragonfly_connection.cc b/src/facade/dragonfly_connection.cc
index 2d3cb8cdd..09a3fbeea 100644
--- a/src/facade/dragonfly_connection.cc
+++ b/src/facade/dragonfly_connection.cc
@@ -393,19 +393,6 @@ size_t Connection::MessageHandle::UsedMemory() const {
   return sizeof(MessageHandle) + visit(MessageSize{}, this->handle);
 }
 
-bool Connection::MessageHandle::IsIntrusive() const {
-  return holds_alternative<AclUpdateMessagePtr>(handle) ||
-         holds_alternative<CheckpointMessage>(handle);
-}
-
-bool Connection::MessageHandle::IsPipelineMsg() const {
-  return holds_alternative<PipelineMessagePtr>(handle);
-}
-
-bool Connection::MessageHandle::IsPubMsg() const {
-  return holds_alternative<PubMessagePtr>(handle);
-}
-
 bool Connection::MessageHandle::IsReplying() const {
   return IsPipelineMsg() || IsPubMsg() || holds_alternative<MonitorMessage>(handle) ||
          (holds_alternative<MCPipelineMessagePtr>(handle) &&
@@ -751,6 +738,9 @@ std::pair<std::string, std::string> Connection::GetClientInfoBeforeAfterTid() co
 
   string after;
   absl::StrAppend(&after, " irqmatch=", int(cpu == my_cpu_id));
+  if (dispatch_q_.size()) {
+    absl::StrAppend(&after, " pipeline=", dispatch_q_.size());
+  }
   absl::StrAppend(&after, " age=", now - creation_time_, " idle=", now - last_interaction_);
   string_view phase_name = PHASE_NAMES[phase_];
 
@@ -1272,7 +1262,7 @@ void Connection::SquashPipeline(facade::SinkReplyBuilder* builder) {
   cc_->async_dispatch = false;
 
   auto it = dispatch_q_.begin();
-  while (it->IsIntrusive())  // Skip all newly received intrusive messages
+  while (it->IsControl())  // Skip all newly received control messages
     ++it;
 
   for (auto rit = it; rit != it + dispatched; ++rit)
@@ -1291,7 +1281,7 @@ void Connection::ClearPipelinedMessages() {
   // As well as to avoid pubsub backpressure leakege.
   for (auto& msg : dispatch_q_) {
     FiberAtomicGuard guard;  // don't suspend when concluding to avoid getting new messages
-    if (msg.IsIntrusive())
+    if (msg.IsControl())
       visit(dispatch_op, msg.handle);  // to not miss checkpoints
     RecycleMessage(std::move(msg));
   }
@@ -1309,7 +1299,7 @@ std::string Connection::DebugInfo() const {
   absl::StrAppend(&info, "closing=", cc_->conn_closing, ", ");
   absl::StrAppend(&info, "dispatch_fiber:joinable=", dispatch_fb_.IsJoinable(), ", ");
 
-  bool intrusive_front = dispatch_q_.size() > 0 && dispatch_q_.front().IsIntrusive();
+  bool intrusive_front = dispatch_q_.size() > 0 && dispatch_q_.front().IsControl();
   absl::StrAppend(&info, "dispatch_queue:size=", dispatch_q_.size(), ", ");
   absl::StrAppend(&info, "dispatch_queue:pipelined=", pending_pipeline_cmd_cnt_, ", ");
   absl::StrAppend(&info, "dispatch_queue:intrusive=", intrusive_front, ", ");
@@ -1549,7 +1539,7 @@ void Connection::SendAsync(MessageHandle msg) {
 
   // "Closing" connections might be still processing commands, as we don't interrupt them.
   // So we still want to deliver control messages to them (like checkpoints).
-  if (cc_->conn_closing && !msg.IsIntrusive())
+  if (cc_->conn_closing && !msg.IsControl())
     return;
 
   // If we launch while closing, it won't be awaited. Control messages will be processed on cleanup.
@@ -1573,9 +1563,9 @@ void Connection::SendAsync(MessageHandle msg) {
     pending_pipeline_cmd_cnt_++;
   }
 
-  if (msg.IsIntrusive()) {
+  if (msg.IsControl()) {
     auto it = dispatch_q_.begin();
-    while (it < dispatch_q_.end() && it->IsIntrusive())
+    while (it < dispatch_q_.end() && it->IsControl())
       ++it;
     dispatch_q_.insert(it, std::move(msg));
   } else {
diff --git a/src/facade/dragonfly_connection.h b/src/facade/dragonfly_connection.h
index 25b755a3f..929ad5f8b 100644
--- a/src/facade/dragonfly_connection.h
+++ b/src/facade/dragonfly_connection.h
@@ -147,12 +147,21 @@ class Connection : public util::Connection {
   struct MessageHandle {
     size_t UsedMemory() const;  // How much bytes this handle takes up in total.
 
-    // Intrusive messages put themselves at the front of the queue, but only after all other
-    // intrusive ones. Used for quick transfer of control / update messages.
-    bool IsIntrusive() const;
+    // Control messages (ACL updates, checkpoints) are inserted at the front of the queue, but
+    // after any control messages already queued. Used for management messages.
+    bool IsControl() const {
+      return std::holds_alternative<AclUpdateMessagePtr>(handle) ||
+             std::holds_alternative<CheckpointMessage>(handle);
+    }
+
+    bool IsPipelineMsg() const {
+      return std::holds_alternative<PipelineMessagePtr>(handle);
+    }
+
+    bool IsPubMsg() const {
+      return std::holds_alternative<PubMessagePtr>(handle);
+    }
 
-    bool IsPipelineMsg() const;
-    bool IsPubMsg() const;
     bool IsReplying() const;  // control messges don't reply, messages carrying data do
 
     std::variant<MonitorMessage, PubMessagePtr, PipelineMessagePtr, MCPipelineMessagePtr,
diff --git a/src/facade/facade_types.h b/src/facade/facade_types.h
index 0b0641dd1..87cf137f6 100644
--- a/src/facade/facade_types.h
+++ b/src/facade/facade_types.h
@@ -48,13 +48,13 @@ inline std::string_view ToSV(std::string_view slice) {
 
 struct ConnectionStats {
   size_t read_buf_capacity = 0;                // total capacity of input buffers
-  size_t dispatch_queue_entries = 0;           // total number of dispatch queue entries
+  uint64_t dispatch_queue_entries = 0;         // total number of dispatch queue entries
   size_t dispatch_queue_bytes = 0;             // total size of all dispatch queue entries
   size_t dispatch_queue_subscriber_bytes = 0;  // total size of all publish messages
 
   size_t pipeline_cmd_cache_bytes = 0;
 
-  size_t io_read_cnt = 0;
+  uint64_t io_read_cnt = 0;
   size_t io_read_bytes = 0;
 
   uint64_t command_cnt = 0;
diff --git a/src/server/server_family.cc b/src/server/server_family.cc
index e5cadf57e..119020e7f 100644
--- a/src/server/server_family.cc
+++ b/src/server/server_family.cc
@@ -1071,10 +1071,17 @@ void PrintPrometheusMetrics(const Metrics& m, DflyCmd* dfly_cmd, StringResponse*
                             MetricType::GAUGE, &resp->body());
   AppendMetricWithoutLabels("blocked_clients", "", conn_stats.num_blocked_clients,
                             MetricType::GAUGE, &resp->body());
-  AppendMetricWithoutLabels("dispatch_queue_bytes", "", conn_stats.dispatch_queue_bytes,
+  AppendMetricWithoutLabels("pipeline_queue_bytes", "", conn_stats.dispatch_queue_bytes,
+                            MetricType::GAUGE, &resp->body());
+  AppendMetricWithoutLabels("pipeline_queue_length", "", conn_stats.dispatch_queue_entries,
                             MetricType::GAUGE, &resp->body());
   AppendMetricWithoutLabels("pipeline_cmd_cache_bytes", "", conn_stats.pipeline_cmd_cache_bytes,
                             MetricType::GAUGE, &resp->body());
+  AppendMetricWithoutLabels("pipeline_commands_total", "", conn_stats.pipelined_cmd_cnt,
+                            MetricType::COUNTER, &resp->body());
+  AppendMetricWithoutLabels("pipeline_commands_duration_seconds", "",
+                            conn_stats.pipelined_cmd_latency * 1e-6, MetricType::COUNTER,
+                            &resp->body());
 
   // Memory metrics
   auto sdata_res = io::ReadStatusInfo();
@@ -1977,7 +1984,7 @@ void ServerFamily::Info(CmdArgList args, ConnectionContext* cntx) {
     append("max_clients", GetFlag(FLAGS_maxclients));
     append("client_read_buffer_bytes", m.facade_stats.conn_stats.read_buf_capacity);
     append("blocked_clients", m.facade_stats.conn_stats.num_blocked_clients);
-    append("dispatch_queue_entries", m.facade_stats.conn_stats.dispatch_queue_entries);
+    append("pipeline_queue_length", m.facade_stats.conn_stats.dispatch_queue_entries);
   }
 
   if (should_enter("MEMORY")) {
diff --git a/tools/local/monitoring/docker-compose.yml b/tools/local/monitoring/docker-compose.yml
index eaf09ec0d..6501c83e5 100644
--- a/tools/local/monitoring/docker-compose.yml
+++ b/tools/local/monitoring/docker-compose.yml
@@ -6,7 +6,7 @@ volumes:
 
 services:
   prometheus:
-    image: prom/prometheus
+    image: prom/prometheus:v2.45.5
     restart: always
     volumes:
       - ./prometheus:/etc/prometheus/
diff --git a/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json b/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json
index c12d8d7b4..9c7c4d680 100644
--- a/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json
+++ b/tools/local/monitoring/grafana/provisioning/dashboards/dashboard.json
@@ -105,7 +105,7 @@
         },
         "textMode": "auto"
       },
-      "pluginVersion": "9.3.6",
+      "pluginVersion": "10.1.10",
       "targets": [
         {
           "datasource": {
@@ -191,7 +191,7 @@
         },
         "textMode": "auto"
       },
-      "pluginVersion": "9.3.6",
+      "pluginVersion": "10.1.10",
       "targets": [
         {
           "datasource": {
@@ -282,7 +282,7 @@
         "showThresholdLabels": false,
         "showThresholdMarkers": true
       },
-      "pluginVersion": "9.3.6",
+      "pluginVersion": "10.1.10",
       "targets": [
         {
           "datasource": {
@@ -1609,10 +1609,24 @@
           "fullMetaSearch": false,
           "includeNullMetadata": false,
           "instant": false,
-          "legendFormat": "__auto",
+          "legendFormat": "switch",
           "range": true,
           "refId": "A",
           "useBackend": false
+        },
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "editorMode": "code",
+          "expr":
+              "rate(dragonfly_fiber_longrun_seconds[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])",
+          "hide": false,
+          "instant": false,
+          "legendFormat": "longrun",
+          "range": true,
+          "refId": "B"
         }
       ],
       "title": "FiberSwitchDelay",
@@ -1622,7 +1636,7 @@
     {
       "datasource": {
         "type": "prometheus",
-        "uid": "${DS_PROMETHEUS}"
+        "uid": "PBFA97CFB590B2093"
       },
       "fieldConfig": {
         "defaults": {
@@ -1673,8 +1687,7 @@
                 "value": 80
               }
             ]
-          },
-          "unit": "s"
+          }
         },
         "overrides": []
       },
@@ -1684,7 +1697,7 @@
         "x": 12,
         "y": 30
       },
-      "id": 20,
+      "id": 22,
       "options": {
         "legend": {
           "calcs": [],
@@ -1703,21 +1716,15 @@
             "type": "prometheus",
             "uid": "PBFA97CFB590B2093"
           },
-          "disableTextWrap": false,
           "editorMode": "code",
-          "expr":
-              "rate(dragonfly_fiber_longrun_seconds_total[$__rate_interval])/rate(dragonfly_fiber_longrun_total[$__rate_interval])",
-          "fullMetaSearch": false,
-          "includeNullMetadata": false,
+          "expr": "dragonfly_pipeline_queue_length/dragonfly_connected_clients",
           "instant": false,
-          "legendFormat": "__auto",
+          "legendFormat": "avr_pipeline_depth",
           "range": true,
-          "refId": "A",
-          "useBackend": false
+          "refId": "A"
         }
       ],
-      "title": "FiberSwitchDelay",
-      "transformations": [],
+      "title": "Pipeline length",
       "type": "timeseries"
     },
     {
@@ -1826,6 +1833,102 @@
       ],
       "title": "Master Replication memory",
       "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "PBFA97CFB590B2093"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          },
+          "unit": "s"
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 38
+      },
+      "id": 23,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "PBFA97CFB590B2093"
+          },
+          "editorMode": "code",
+          "expr":
+              "rate(dragonfly_pipeline_commands_duration_seconds[$__rate_interval])/rate(dragonfly_pipeline_commands_total[$__rate_interval])",
+          "instant": false,
+          "legendFormat": "__auto",
+          "range": true,
+          "refId": "A"
+        }
+      ],
+      "title": "Pipeline latency",
+      "type": "timeseries"
     }
   ],
   "refresh": "1m",
@@ -1948,7 +2051,7 @@
     ]
   },
   "time": {
-    "from": "now-5m",
+    "from": "now-15m",
     "to": "now"
   },
   "timepicker": {
@@ -1979,6 +2082,6 @@
   "timezone": "browser",
   "title": "Dragonfly Dashboard",
   "uid": "xDLNRKUWz",
-  "version": 1,
+  "version": 4,
   "weekStart": ""
 }
diff --git a/tools/local/monitoring/prometheus/prometheus.yml b/tools/local/monitoring/prometheus/prometheus.yml
index 6944a1696..8c6ab22d2 100644
--- a/tools/local/monitoring/prometheus/prometheus.yml
+++ b/tools/local/monitoring/prometheus/prometheus.yml
@@ -1,8 +1,7 @@
 # my global config
 global:
-  scrape_interval:     15s # By default, scrape targets every 15 seconds.
-  evaluation_interval: 15s # By default, scrape targets every 15 seconds.
-  # scrape_timeout is set to the global default (10s).
+  scrape_interval:     5s
+  evaluation_interval: 5s
 
   # Attach these labels to any time series or alerts when communicating with
   # external systems (federation, remote storage, Alertmanager).
@@ -43,8 +42,6 @@ scrape_configs:
 
 
   - job_name: 'node-exporter'
-
-    # Override the global default and scrape targets from this job every 5 seconds.
     scrape_interval: 1s
     static_configs:
       - targets: ['node-exporter:9100']