From 93f6773297aeb90245949cff6d7a844478d77a91 Mon Sep 17 00:00:00 2001
From: Roman Gershman <roman@dragonflydb.io>
Date: Wed, 14 Aug 2024 14:45:54 +0300
Subject: [PATCH] chore: reduce pipelining latency by reusing existing shard
 fibers (#3494)

* chore: reduce pipelining latency by reusing existing shard fibers

To prove the benefits, run `./dfly_bench --pipeline=50   -n 20000  --ratio 0:1  --qps=0  --key_maximum=1`
Before: the average pipelining latency was 10ms
After: the average pipelining latency is 5ms.
Avg latency: pipelined_latency_usec / total_pipelined_squashed_commands

Also, improved counting of squashed commands - to count actual squashed ones.
---------

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
---
 src/facade/dragonfly_connection.cc            |  2 +-
 src/facade/facade.cc                          |  3 +-
 src/facade/facade_types.h                     |  2 +-
 src/server/main_service.cc                    |  6 ++--
 src/server/multi_command_squasher.cc          | 22 +++++++++++-
 src/server/server_family.cc                   |  2 +-
 src/server/server_state.cc                    | 35 +++++++++++--------
 src/server/server_state.h                     |  2 +-
 .../provisioning/datasources/datasource.yml   |  1 +
 9 files changed, 51 insertions(+), 24 deletions(-)
diff --git a/src/facade/dragonfly_connection.cc b/src/facade/dragonfly_connection.cc
index 7fd18a0b1..6f6853ec1 100644
--- a/src/facade/dragonfly_connection.cc
+++ b/src/facade/dragonfly_connection.cc
@@ -1280,7 +1280,7 @@ void Connection::SquashPipeline(facade::SinkReplyBuilder* builder) {
     auto& pmsg = get<PipelineMessagePtr>(msg.handle);
     squash_cmds.push_back(absl::MakeSpan(pmsg->args));
   }
-  stats_->squashed_commands += squash_cmds.size();
+
   cc_->async_dispatch = true;
 
   size_t dispatched = service_->DispatchManyCommands(absl::MakeSpan(squash_cmds), cc_.get());
diff --git a/src/facade/facade.cc b/src/facade/facade.cc
index 01596a4b0..ed401b78d 100644
--- a/src/facade/facade.cc
+++ b/src/facade/facade.cc
@@ -20,7 +20,7 @@ constexpr size_t kSizeConnStats = sizeof(ConnectionStats);
 
 ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
   // To break this code deliberately if we add/remove a field to this struct.
-  static_assert(kSizeConnStats == 120u);
+  static_assert(kSizeConnStats == 112u);
 
   ADD(read_buf_capacity);
   ADD(dispatch_queue_entries);
@@ -37,7 +37,6 @@ ConnectionStats& ConnectionStats::operator+=(const ConnectionStats& o) {
   ADD(num_replicas);
   ADD(num_blocked_clients);
   ADD(num_migrations);
-  ADD(squashed_commands);
 
   return *this;
 }
diff --git a/src/facade/facade_types.h b/src/facade/facade_types.h
index 5b8e0eff1..2ee50b655 100644
--- a/src/facade/facade_types.h
+++ b/src/facade/facade_types.h
@@ -117,7 +117,7 @@ struct ConnectionStats {
   uint32_t num_replicas = 0;
   uint32_t num_blocked_clients = 0;
   uint64_t num_migrations = 0;
-  uint64_t squashed_commands = 0;
+
   ConnectionStats& operator+=(const ConnectionStats& o);
 };
 
diff --git a/src/server/main_service.cc b/src/server/main_service.cc
index 2d20f457d..33cf54c2b 100644
--- a/src/server/main_service.cc
+++ b/src/server/main_service.cc
@@ -1398,6 +1398,7 @@ size_t Service::DispatchManyCommands(absl::Span<CmdArgList> args_list,
   intrusive_ptr<Transaction> dist_trans;
 
   size_t dispatched = 0;
+  auto* ss = dfly::ServerState::tlocal();
 
   auto perform_squash = [&] {
     if (stored_cmds.empty())
@@ -1416,11 +1417,12 @@ size_t Service::DispatchManyCommands(absl::Span<CmdArgList> args_list,
     dfly_cntx->transaction = nullptr;
 
     dispatched += stored_cmds.size();
+    ss->stats.squashed_commands += stored_cmds.size();
     stored_cmds.clear();
   };
 
   // Don't even start when paused. We can only continue if DispatchTracker is aware of us running.
-  if (dfly::ServerState::tlocal()->IsPaused())
+  if (ss->IsPaused())
     return 0;
 
   for (auto args : args_list) {
@@ -1451,7 +1453,7 @@ size_t Service::DispatchManyCommands(absl::Span<CmdArgList> args_list,
     perform_squash();
 
     // Stop accumulating when a pause is requested, fall back to regular dispatch
-    if (dfly::ServerState::tlocal()->IsPaused())
+    if (ss->IsPaused())
       break;
 
     // Dispatch non squashed command only after all squshed commands were executed and replied
diff --git a/src/server/multi_command_squasher.cc b/src/server/multi_command_squasher.cc
index 80d518fbf..814db9ca9 100644
--- a/src/server/multi_command_squasher.cc
+++ b/src/server/multi_command_squasher.cc
@@ -191,8 +191,12 @@ bool MultiCommandSquasher::ExecuteSquashed() {
   if (order_.empty())
     return true;
 
-  for (auto& sd : sharded_)
+  unsigned num_shards = 0;
+  for (auto& sd : sharded_) {
     sd.replies.reserve(sd.cmds.size());
+    if (!sd.cmds.empty())
+      ++num_shards;
+  }
 
   Transaction* tx = cntx_->transaction;
   ServerState::tlocal()->stats.multi_squash_executions++;
@@ -207,8 +211,24 @@ bool MultiCommandSquasher::ExecuteSquashed() {
     tx->PrepareSquashedMultiHop(base_cid_, cb);
     tx->ScheduleSingleHop([this](auto* tx, auto* es) { return SquashedHopCb(tx, es); });
   } else {
+#if 1
+    fb2::BlockingCounter bc(num_shards);
+    DVLOG(1) << "Squashing " << num_shards << " " << tx->DebugId();
+
+    auto cb = [this, tx, bc]() mutable {
+      this->SquashedHopCb(tx, EngineShard::tlocal());
+      bc->Dec();
+    };
+
+    for (unsigned i = 0; i < sharded_.size(); ++i) {
+      if (!sharded_[i].cmds.empty())
+        shard_set->AddL2(i, cb);
+    }
+    bc->Wait();
+#else
     shard_set->RunBlockingInParallel([this, tx](auto* es) { SquashedHopCb(tx, es); },
                                      [this](auto sid) { return !sharded_[sid].cmds.empty(); });
+#endif
   }
 
   uint64_t after_hop = proactor->GetMonotonicTimeNs();
diff --git a/src/server/server_family.cc b/src/server/server_family.cc
index 5ae91b35f..c1e79693b 100644
--- a/src/server/server_family.cc
+++ b/src/server/server_family.cc
@@ -2154,7 +2154,7 @@ void ServerFamily::Info(CmdArgList args, ConnectionContext* cntx) {
     append("total_commands_processed", conn_stats.command_cnt);
     append("instantaneous_ops_per_sec", m.qps);
     append("total_pipelined_commands", conn_stats.pipelined_cmd_cnt);
-    append("total_pipelined_squashed_commands", conn_stats.squashed_commands);
+    append("total_pipelined_squashed_commands", m.coordinator_stats.squashed_commands);
     append("pipelined_latency_usec", conn_stats.pipelined_cmd_latency);
     append("total_net_input_bytes", conn_stats.io_read_bytes);
     append("connection_migrations", conn_stats.num_migrations);
diff --git a/src/server/server_state.cc b/src/server/server_state.cc
index 9039176fb..0bded3d31 100644
--- a/src/server/server_state.cc
+++ b/src/server/server_state.cc
@@ -27,25 +27,29 @@ ServerState::Stats::Stats(unsigned num_shards) : tx_width_freq_arr(num_shards) {
 }
 
 ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
-  static_assert(sizeof(Stats) == 16 * 8, "Stats size mismatch");
+  static_assert(sizeof(Stats) == 17 * 8, "Stats size mismatch");
 
-  this->eval_io_coordination_cnt += other.eval_io_coordination_cnt;
-  this->eval_shardlocal_coordination_cnt += other.eval_shardlocal_coordination_cnt;
-  this->eval_squashed_flushes += other.eval_squashed_flushes;
+#define ADD(x) this->x += (other.x)
 
-  this->tx_global_cnt += other.tx_global_cnt;
-  this->tx_normal_cnt += other.tx_normal_cnt;
-  this->tx_inline_runs += other.tx_inline_runs;
-  this->tx_schedule_cancel_cnt += other.tx_schedule_cancel_cnt;
+  ADD(eval_io_coordination_cnt);
 
-  this->multi_squash_executions += other.multi_squash_executions;
-  this->multi_squash_exec_hop_usec += other.multi_squash_exec_hop_usec;
-  this->multi_squash_exec_reply_usec += other.multi_squash_exec_reply_usec;
+  ADD(eval_shardlocal_coordination_cnt);
+  ADD(eval_squashed_flushes);
 
-  this->blocked_on_interpreter += other.blocked_on_interpreter;
-  this->rdb_save_usec += other.rdb_save_usec;
-  this->rdb_save_count += other.rdb_save_count;
-  this->oom_error_cmd_cnt += other.oom_error_cmd_cnt;
+  ADD(tx_global_cnt);
+  ADD(tx_normal_cnt);
+  ADD(tx_inline_runs);
+  ADD(tx_schedule_cancel_cnt);
+
+  ADD(multi_squash_executions);
+  ADD(multi_squash_exec_hop_usec);
+  ADD(multi_squash_exec_reply_usec);
+  ADD(squashed_commands);
+
+  ADD(blocked_on_interpreter);
+  ADD(rdb_save_usec);
+  ADD(rdb_save_count);
+  ADD(oom_error_cmd_cnt);
 
   if (this->tx_width_freq_arr.size() > 0) {
     DCHECK_EQ(this->tx_width_freq_arr.size(), other.tx_width_freq_arr.size());
@@ -54,6 +58,7 @@ ServerState::Stats& ServerState::Stats::Add(const ServerState::Stats& other) {
     this->tx_width_freq_arr = other.tx_width_freq_arr;
   }
   return *this;
+#undef ADD
 }
 
 void MonitorsRepo::Add(facade::Connection* connection) {
diff --git a/src/server/server_state.h b/src/server/server_state.h
index bd0a25583..ecb392e74 100644
--- a/src/server/server_state.h
+++ b/src/server/server_state.h
@@ -116,7 +116,7 @@ class ServerState {  // public struct - to allow initialization.
     uint64_t multi_squash_executions = 0;
     uint64_t multi_squash_exec_hop_usec = 0;
     uint64_t multi_squash_exec_reply_usec = 0;
-
+    uint64_t squashed_commands = 0;
     uint64_t blocked_on_interpreter = 0;
 
     uint64_t rdb_save_usec = 0;
diff --git a/tools/local/monitoring/grafana/provisioning/datasources/datasource.yml b/tools/local/monitoring/grafana/provisioning/datasources/datasource.yml
index c02bb38b3..a9c404c9d 100644
--- a/tools/local/monitoring/grafana/provisioning/datasources/datasource.yml
+++ b/tools/local/monitoring/grafana/provisioning/datasources/datasource.yml
@@ -40,6 +40,7 @@ datasources:
      graphiteVersion: "1.1"
      tlsAuth: false
      tlsAuthWithCACert: false
+     timeInterval: 1s  # Based on https://stackoverflow.com/a/66830690
   # <string> json object of data that will be encrypted.
   secureJsonData:
     tlsCACert: "..."