From 01a4dcd9caca469b89e0f28ffd196ef1e3847d88 Mon Sep 17 00:00:00 2001
From: winniebyte <winifredchiokoye@gmail.com>
Date: Sun, 28 Jun 2026 09:18:03 +0000
Subject: [PATCH] feat(observability): add db query duration histogram and pool
 exhaustion alert

---
 performance/config/alerts.yaml                | 11 +++
 performance/config/grafana-dashboard.json     | 51 ++++++++++++++
 performance/config/grafana-slo-dashboard.json | 42 ++++++++++++
 performance/config/prometheus-slo-rules.yml   | 32 +++++++++
 services/api/src/db.rs                        | 17 ++++-
 services/api/src/metrics.rs                   | 67 +++++++++++++++++++
 6 files changed, 218 insertions(+), 2 deletions(-)

diff --git a/performance/config/alerts.yaml b/performance/config/alerts.yaml
index 778ff282..2871b34d 100644
--- a/performance/config/alerts.yaml
+++ b/performance/config/alerts.yaml
@@ -88,6 +88,17 @@ groups:
           description: "Database p99 query time is {{ $value }}s, exceeding 100ms critical threshold"
           runbook_url: "https://docs.predictiq.com/runbooks/critical-database-queries"
 
+      - alert: DBPoolExhaustion
+        expr: rate(db_pool_exhaustion_total[5m]) > 0.0167
+        for: 1m
+        labels:
+          severity: critical
+          component: database
+        annotations:
+          summary: "Database connection pool exhaustion detected"
+          description: "Pool exhaustion rate is {{ $value }} per second, exceeding 1/minute threshold"
+          runbook_url: "https://docs.predictiq.com/runbooks/db-pool-exhaustion"
+
       - alert: HighDBConnectionPoolUtilization
         expr: db_connections_active / db_connections_max > 0.8
         for: 5m
diff --git a/performance/config/grafana-dashboard.json b/performance/config/grafana-dashboard.json
index 3810f27c..ddbeef8e 100644
--- a/performance/config/grafana-dashboard.json
+++ b/performance/config/grafana-dashboard.json
@@ -484,6 +484,57 @@
           }
         }
       }
+      },
+      {
+        "id": 10,
+        "title": "DB Pool Utilisation (% of max_connections)",
+        "type": "graph",
+        "gridPos": { "x": 12, "y": 24, "w": 12, "h": 8 },
+        "targets": [
+          {
+            "expr": "db_pool_connections_active{pool=\"api\"} / (db_pool_connections_active{pool=\"api\"} + db_pool_connections_idle{pool=\"api\"}) * 100",
+            "legendFormat": "Pool Utilisation %",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {
+            "format": "percent",
+            "label": "Utilisation"
+          }
+        ],
+        "alert": {
+          "name": "High DB Pool Utilisation",
+          "conditions": [
+            {
+              "evaluator": {
+                "params": [80],
+                "type": "gt"
+              },
+              "operator": {
+                "type": "and"
+              },
+              "query": {
+                "params": ["A", "5m", "now"]
+              },
+              "reducer": {
+                "params": [],
+                "type": "avg"
+              },
+              "type": "query"
+            }
+          ],
+          "executionErrorState": "alerting",
+          "frequency": "1m",
+          "handler": 1,
+          "message": "DB pool utilisation exceeded 80%",
+          "noDataState": "no_data",
+          "notifications": []
+        }
+      }
+    ]
+  }
+}
     ]
   }
 }
diff --git a/performance/config/grafana-slo-dashboard.json b/performance/config/grafana-slo-dashboard.json
index a6e8e52a..2a354d08 100644
--- a/performance/config/grafana-slo-dashboard.json
+++ b/performance/config/grafana-slo-dashboard.json
@@ -242,6 +242,42 @@
         "type": "text",
         "content": "## Error Budget Policy\n\n- **100% remaining**: Normal operations\n- **50% remaining**: Review recent changes\n- **25% remaining**: Freeze non-critical deployments\n- **10% remaining**: Freeze all deployments\n- **0% remaining**: Emergency - rollback and incident response\n\n### Burn Rate Thresholds\n\n- **Fast Burn (14.4x)**: Exhausts budget in 2 days → Critical alert\n- **Slow Burn (6.0x)**: Exhausts budget in 5 days → Warning alert"
       }
+    ],
+      },
+      {
+        "id": 11,
+        "title": "Database Query Latency P50/P95/P99",
+        "type": "graph",
+        "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.50, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000",
+            "legendFormat": "P50 Latency",
+            "refId": "A"
+          },
+          {
+            "expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000",
+            "legendFormat": "P95 Latency",
+            "refId": "B"
+          },
+          {
+            "expr": "histogram_quantile(0.99, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000",
+            "legendFormat": "P99 Latency",
+            "refId": "C"
+          },
+          {
+            "expr": "500",
+            "legendFormat": "P95 Alert Threshold",
+            "refId": "D"
+          }
+        ],
+        "yaxes": [
+          {
+            "label": "Latency (ms)",
+            "format": "ms"
+          }
+        ]
+      }
     ],
     "time": {
       "from": "now-7d",
@@ -249,3 +285,9 @@
     }
   }
 }
+    "time": {
+      "from": "now-7d",
+      "to": "now"
+    }
+  }
+}
\ No newline at end of file
diff --git a/performance/config/prometheus-slo-rules.yml b/performance/config/prometheus-slo-rules.yml
index 6c79e027..f2b1ef78 100644
--- a/performance/config/prometheus-slo-rules.yml
+++ b/performance/config/prometheus-slo-rules.yml
@@ -39,6 +39,11 @@ groups:
             sum(rate(db_query_duration_seconds_bucket[5m])) by (le)
           ) <= 0.05
 
+      # Database Pool Exhaustion Rate SLO
+      - record: slo:db_pool_exhaustion:burn_rate
+        expr: |
+          rate(db_pool_exhaustion_total[5m])
+
       # Cache Availability SLO
       - record: slo:cache_availability:success_rate
         expr: |
@@ -171,3 +176,30 @@ groups:
           summary: "Cache availability SLO violation"
           description: "Redis cache availability below 99.95% target. Current: {{ $value }}%"
           runbook_url: "https://docs.predictiq.com/runbooks/cache-availability-slo-violation"
+
+      # Database Query P95 Latency Alert
+      - alert: DBSlowQueryP95
+        expr: |
+          histogram_quantile(0.95,
+            sum(rate(db_query_duration_seconds_bucket[5m])) by (le)
+          ) > 0.5
+        for: 5m
+        labels:
+          severity: warning
+          slo: db_query_latency
+        annotations:
+          summary: "Database P95 query latency exceeds 500ms"
+          description: "Database P95 query latency is {{ $value }}s, exceeding 500ms threshold for 5 minutes"
+          runbook_url: "https://docs.predictiq.com/runbooks/slow-database-queries"
+
+      # Database Connection Pool Exhaustion Alert
+      - alert: DBPoolExhaustion
+        expr: rate(db_pool_exhaustion_total[5m]) > 0.0167
+        for: 1m
+        labels:
+          severity: critical
+          slo: db_pool_availability
+        annotations:
+          summary: "Database connection pool exhaustion rate exceeds 1/min"
+          description: "Pool exhaustion rate is {{ $value | humanize }} per second (threshold: 0.0167/s = 1/min)"
+          runbook_url: "https://docs.predictiq.com/runbooks/db-pool-exhaustion"
diff --git a/services/api/src/db.rs b/services/api/src/db.rs
index d1e35406..fc91272b 100644
--- a/services/api/src/db.rs
+++ b/services/api/src/db.rs
@@ -173,14 +173,27 @@ impl Database {
     }
 
     /// Run `fut` with the configured query timeout.
+    /// On success, records the query duration in the `db_query_duration_seconds` histogram.
     /// On timeout, increments the `db_timeouts` metric and logs a warning.
+    /// On pool exhaustion, increments the `db_pool_exhaustion_total` counter.
     async fn with_timeout<F, T>(&self, operation: &str, fut: F) -> Result<T, DbError>
     where
         F: std::future::Future<Output = Result<T, sqlx::Error>>,
     {
+        let start = std::time::Instant::now();
         match tokio::time::timeout(self.query_timeout, fut).await {
-            Ok(Ok(v)) => Ok(v),
-            Ok(Err(e)) => Err(DbError::Other(anyhow::Error::from(e))),
+            Ok(Ok(v)) => {
+                self.metrics
+                    .observe_db_query_duration(operation, start.elapsed());
+                Ok(v)
+            }
+            Ok(Err(e)) => {
+                if matches!(&e, sqlx::Error::PoolTimedOut) {
+                    self.metrics.observe_db_pool_exhaustion("api");
+                    return Err(DbError::PoolExhausted);
+                }
+                Err(DbError::Other(anyhow::Error::from(e)))
+            }
             Err(_elapsed) => {
                 self.metrics.observe_db_timeout(operation);
                 tracing::warn!(operation, timeout_secs = ?self.query_timeout, "db query timed out");
diff --git a/services/api/src/metrics.rs b/services/api/src/metrics.rs
index e9c8b542..75f8317b 100644
--- a/services/api/src/metrics.rs
+++ b/services/api/src/metrics.rs
@@ -12,7 +12,9 @@ pub struct Metrics {
     request_latency: HistogramVec,
     rpc_errors: IntCounterVec,
     rpc_fallbacks: IntCounterVec,
+    db_query_duration: HistogramVec,
     db_timeouts: IntCounterVec,
+    db_pool_exhaustion: IntCounterVec,
     email_dlq_size: IntGauge,
     db_pool_connections_active: IntGaugeVec,
     db_pool_connections_idle: IntGaugeVec,
@@ -69,12 +71,33 @@ impl Metrics {
         )
         .context("rpc_fallbacks metric")?;
 
+        let db_query_duration = HistogramVec::new(
+            prometheus::HistogramOpts::new(
+                "db_query_duration_seconds",
+                "Database query duration in seconds by query name",
+            )
+            .buckets(vec![
+                0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0,
+            ]),
+            &["query_name"],
+        )
+        .context("db_query_duration metric")?;
+
         let db_timeouts = IntCounterVec::new(
             prometheus::Opts::new("db_timeouts_total", "DB queries that exceeded the timeout, by operation"),
             &["operation"],
         )
         .context("db_timeouts metric")?;
 
+        let db_pool_exhaustion = IntCounterVec::new(
+            prometheus::Opts::new(
+                "db_pool_exhaustion_total",
+                "Number of times the connection pool was exhausted, by pool name",
+            ),
+            &["pool"],
+        )
+        .context("db_pool_exhaustion metric")?;
+
         let email_dlq_size = IntGauge::new(
             "email_dlq_size",
             "Number of email jobs currently in the dead-letter queue",
@@ -126,7 +149,9 @@ impl Metrics {
         registry.register(Box::new(request_latency.clone()))?;
         registry.register(Box::new(rpc_errors.clone()))?;
         registry.register(Box::new(rpc_fallbacks.clone()))?;
+        registry.register(Box::new(db_query_duration.clone()))?;
         registry.register(Box::new(db_timeouts.clone()))?;
+        registry.register(Box::new(db_pool_exhaustion.clone()))?;
         registry.register(Box::new(email_dlq_size.clone()))?;
         registry.register(Box::new(db_pool_connections_active.clone()))?;
         registry.register(Box::new(db_pool_connections_idle.clone()))?;
@@ -141,7 +166,9 @@ impl Metrics {
             request_latency,
             rpc_errors,
             rpc_fallbacks,
+            db_query_duration,
             db_timeouts,
+            db_pool_exhaustion,
             email_dlq_size,
             db_pool_connections_active,
             db_pool_connections_idle,
@@ -183,10 +210,22 @@ impl Metrics {
         self.rpc_fallbacks.with_label_values(&[endpoint]).inc();
     }
 
+    pub fn observe_db_query_duration(&self, query_name: &str, duration: Duration) {
+        self.db_query_duration
+            .with_label_values(&[query_name])
+            .observe(duration.as_secs_f64());
+    }
+
     pub fn observe_db_timeout(&self, operation: &str) {
         self.db_timeouts.with_label_values(&[operation]).inc();
     }
 
+    pub fn observe_db_pool_exhaustion(&self, pool: &str) {
+        self.db_pool_exhaustion
+            .with_label_values(&[pool])
+            .inc();
+    }
+
     pub fn set_dlq_size(&self, n: i64) {
         self.email_dlq_size.set(n);
     }
@@ -234,3 +273,31 @@ impl Metrics {
         Ok(String::from_utf8(buffer)?)
     }
 }
+
+        Ok(String::from_utf8(buffer)?)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn observe_db_query_duration_records_histogram() {
+        let metrics = Metrics::new().unwrap();
+        metrics.observe_db_query_duration("test_query", Duration::from_millis(100));
+        let output = metrics.render().unwrap();
+        assert!(output.contains("db_query_duration_seconds"));
+        assert!(output.contains("query_name=\"test_query\""));
+    }
+
+    #[test]
+    fn observe_db_pool_exhaustion_increments_counter() {
+        let metrics = Metrics::new().unwrap();
+        metrics.observe_db_pool_exhaustion("api");
+        let output = metrics.render().unwrap();
+        assert!(output.contains("db_pool_exhaustion_total"));
+        assert!(output.contains("pool=\"api\""));
+        assert!(output.contains("1"));
+    }
+}
\ No newline at end of file