From 01a4dcd9caca469b89e0f28ffd196ef1e3847d88 Mon Sep 17 00:00:00 2001 From: winniebyte Date: Sun, 28 Jun 2026 09:18:03 +0000 Subject: [PATCH] feat(observability): add db query duration histogram and pool exhaustion alert --- performance/config/alerts.yaml | 11 +++ performance/config/grafana-dashboard.json | 51 ++++++++++++++ performance/config/grafana-slo-dashboard.json | 42 ++++++++++++ performance/config/prometheus-slo-rules.yml | 32 +++++++++ services/api/src/db.rs | 17 ++++- services/api/src/metrics.rs | 67 +++++++++++++++++++ 6 files changed, 218 insertions(+), 2 deletions(-) diff --git a/performance/config/alerts.yaml b/performance/config/alerts.yaml index 778ff282..2871b34d 100644 --- a/performance/config/alerts.yaml +++ b/performance/config/alerts.yaml @@ -88,6 +88,17 @@ groups: description: "Database p99 query time is {{ $value }}s, exceeding 100ms critical threshold" runbook_url: "https://docs.predictiq.com/runbooks/critical-database-queries" + - alert: DBPoolExhaustion + expr: rate(db_pool_exhaustion_total[5m]) > 0.0167 + for: 1m + labels: + severity: critical + component: database + annotations: + summary: "Database connection pool exhaustion detected" + description: "Pool exhaustion rate is {{ $value }} per second, exceeding 1/minute threshold" + runbook_url: "https://docs.predictiq.com/runbooks/db-pool-exhaustion" + - alert: HighDBConnectionPoolUtilization expr: db_connections_active / db_connections_max > 0.8 for: 5m diff --git a/performance/config/grafana-dashboard.json b/performance/config/grafana-dashboard.json index 3810f27c..ddbeef8e 100644 --- a/performance/config/grafana-dashboard.json +++ b/performance/config/grafana-dashboard.json @@ -484,6 +484,57 @@ } } } + }, + { + "id": 10, + "title": "DB Pool Utilisation (% of max_connections)", + "type": "graph", + "gridPos": { "x": 12, "y": 24, "w": 12, "h": 8 }, + "targets": [ + { + "expr": "db_pool_connections_active{pool=\"api\"} / (db_pool_connections_active{pool=\"api\"} + db_pool_connections_idle{pool=\"api\"}) * 100", + "legendFormat": "Pool Utilisation %", + "refId": "A" + } + ], + "yaxes": [ + { + "format": "percent", + "label": "Utilisation" + } + ], + "alert": { + "name": "High DB Pool Utilisation", + "conditions": [ + { + "evaluator": { + "params": [80], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": ["A", "5m", "now"] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "1m", + "handler": 1, + "message": "DB pool utilisation exceeded 80%", + "noDataState": "no_data", + "notifications": [] + } + } + ] + } +} ] } } diff --git a/performance/config/grafana-slo-dashboard.json b/performance/config/grafana-slo-dashboard.json index a6e8e52a..2a354d08 100644 --- a/performance/config/grafana-slo-dashboard.json +++ b/performance/config/grafana-slo-dashboard.json @@ -242,6 +242,42 @@ "type": "text", "content": "## Error Budget Policy\n\n- **100% remaining**: Normal operations\n- **50% remaining**: Review recent changes\n- **25% remaining**: Freeze non-critical deployments\n- **10% remaining**: Freeze all deployments\n- **0% remaining**: Emergency - rollback and incident response\n\n### Burn Rate Thresholds\n\n- **Fast Burn (14.4x)**: Exhausts budget in 2 days → Critical alert\n- **Slow Burn (6.0x)**: Exhausts budget in 5 days → Warning alert" } + ], + }, + { + "id": 11, + "title": "Database Query Latency P50/P95/P99", + "type": "graph", + "gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "P50 Latency", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "P95 Latency", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000", + "legendFormat": "P99 Latency", + "refId": "C" + }, + { + "expr": "500", + "legendFormat": "P95 Alert Threshold", + "refId": "D" + } + ], + "yaxes": [ + { + "label": "Latency (ms)", + "format": "ms" + } + ] + } ], "time": { "from": "now-7d", @@ -249,3 +285,9 @@ } } } + "time": { + "from": "now-7d", + "to": "now" + } + } +} \ No newline at end of file diff --git a/performance/config/prometheus-slo-rules.yml b/performance/config/prometheus-slo-rules.yml index 6c79e027..f2b1ef78 100644 --- a/performance/config/prometheus-slo-rules.yml +++ b/performance/config/prometheus-slo-rules.yml @@ -39,6 +39,11 @@ groups: sum(rate(db_query_duration_seconds_bucket[5m])) by (le) ) <= 0.05 + # Database Pool Exhaustion Rate SLO + - record: slo:db_pool_exhaustion:burn_rate + expr: | + rate(db_pool_exhaustion_total[5m]) + # Cache Availability SLO - record: slo:cache_availability:success_rate expr: | @@ -171,3 +176,30 @@ groups: summary: "Cache availability SLO violation" description: "Redis cache availability below 99.95% target. Current: {{ $value }}%" runbook_url: "https://docs.predictiq.com/runbooks/cache-availability-slo-violation" + + # Database Query P95 Latency Alert + - alert: DBSlowQueryP95 + expr: | + histogram_quantile(0.95, + sum(rate(db_query_duration_seconds_bucket[5m])) by (le) + ) > 0.5 + for: 5m + labels: + severity: warning + slo: db_query_latency + annotations: + summary: "Database P95 query latency exceeds 500ms" + description: "Database P95 query latency is {{ $value }}s, exceeding 500ms threshold for 5 minutes" + runbook_url: "https://docs.predictiq.com/runbooks/slow-database-queries" + + # Database Connection Pool Exhaustion Alert + - alert: DBPoolExhaustion + expr: rate(db_pool_exhaustion_total[5m]) > 0.0167 + for: 1m + labels: + severity: critical + slo: db_pool_availability + annotations: + summary: "Database connection pool exhaustion rate exceeds 1/min" + description: "Pool exhaustion rate is {{ $value | humanize }} per second (threshold: 0.0167/s = 1/min)" + runbook_url: "https://docs.predictiq.com/runbooks/db-pool-exhaustion" diff --git a/services/api/src/db.rs b/services/api/src/db.rs index d1e35406..fc91272b 100644 --- a/services/api/src/db.rs +++ b/services/api/src/db.rs @@ -173,14 +173,27 @@ impl Database { } /// Run `fut` with the configured query timeout. + /// On success, records the query duration in the `db_query_duration_seconds` histogram. /// On timeout, increments the `db_timeouts` metric and logs a warning. + /// On pool exhaustion, increments the `db_pool_exhaustion_total` counter. async fn with_timeout(&self, operation: &str, fut: F) -> Result where F: std::future::Future>, { + let start = std::time::Instant::now(); match tokio::time::timeout(self.query_timeout, fut).await { - Ok(Ok(v)) => Ok(v), - Ok(Err(e)) => Err(DbError::Other(anyhow::Error::from(e))), + Ok(Ok(v)) => { + self.metrics + .observe_db_query_duration(operation, start.elapsed()); + Ok(v) + } + Ok(Err(e)) => { + if matches!(&e, sqlx::Error::PoolTimedOut) { + self.metrics.observe_db_pool_exhaustion("api"); + return Err(DbError::PoolExhausted); + } + Err(DbError::Other(anyhow::Error::from(e))) + } Err(_elapsed) => { self.metrics.observe_db_timeout(operation); tracing::warn!(operation, timeout_secs = ?self.query_timeout, "db query timed out"); diff --git a/services/api/src/metrics.rs b/services/api/src/metrics.rs index e9c8b542..75f8317b 100644 --- a/services/api/src/metrics.rs +++ b/services/api/src/metrics.rs @@ -12,7 +12,9 @@ pub struct Metrics { request_latency: HistogramVec, rpc_errors: IntCounterVec, rpc_fallbacks: IntCounterVec, + db_query_duration: HistogramVec, db_timeouts: IntCounterVec, + db_pool_exhaustion: IntCounterVec, email_dlq_size: IntGauge, db_pool_connections_active: IntGaugeVec, db_pool_connections_idle: IntGaugeVec, @@ -69,12 +71,33 @@ impl Metrics { ) .context("rpc_fallbacks metric")?; + let db_query_duration = HistogramVec::new( + prometheus::HistogramOpts::new( + "db_query_duration_seconds", + "Database query duration in seconds by query name", + ) + .buckets(vec![ + 0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, + ]), + &["query_name"], + ) + .context("db_query_duration metric")?; + let db_timeouts = IntCounterVec::new( prometheus::Opts::new("db_timeouts_total", "DB queries that exceeded the timeout, by operation"), &["operation"], ) .context("db_timeouts metric")?; + let db_pool_exhaustion = IntCounterVec::new( + prometheus::Opts::new( + "db_pool_exhaustion_total", + "Number of times the connection pool was exhausted, by pool name", + ), + &["pool"], + ) + .context("db_pool_exhaustion metric")?; + let email_dlq_size = IntGauge::new( "email_dlq_size", "Number of email jobs currently in the dead-letter queue", @@ -126,7 +149,9 @@ impl Metrics { registry.register(Box::new(request_latency.clone()))?; registry.register(Box::new(rpc_errors.clone()))?; registry.register(Box::new(rpc_fallbacks.clone()))?; + registry.register(Box::new(db_query_duration.clone()))?; registry.register(Box::new(db_timeouts.clone()))?; + registry.register(Box::new(db_pool_exhaustion.clone()))?; registry.register(Box::new(email_dlq_size.clone()))?; registry.register(Box::new(db_pool_connections_active.clone()))?; registry.register(Box::new(db_pool_connections_idle.clone()))?; @@ -141,7 +166,9 @@ impl Metrics { request_latency, rpc_errors, rpc_fallbacks, + db_query_duration, db_timeouts, + db_pool_exhaustion, email_dlq_size, db_pool_connections_active, db_pool_connections_idle, @@ -183,10 +210,22 @@ impl Metrics { self.rpc_fallbacks.with_label_values(&[endpoint]).inc(); } + pub fn observe_db_query_duration(&self, query_name: &str, duration: Duration) { + self.db_query_duration + .with_label_values(&[query_name]) + .observe(duration.as_secs_f64()); + } + pub fn observe_db_timeout(&self, operation: &str) { self.db_timeouts.with_label_values(&[operation]).inc(); } + pub fn observe_db_pool_exhaustion(&self, pool: &str) { + self.db_pool_exhaustion + .with_label_values(&[pool]) + .inc(); + } + pub fn set_dlq_size(&self, n: i64) { self.email_dlq_size.set(n); } @@ -234,3 +273,31 @@ impl Metrics { Ok(String::from_utf8(buffer)?) } } + + Ok(String::from_utf8(buffer)?) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn observe_db_query_duration_records_histogram() { + let metrics = Metrics::new().unwrap(); + metrics.observe_db_query_duration("test_query", Duration::from_millis(100)); + let output = metrics.render().unwrap(); + assert!(output.contains("db_query_duration_seconds")); + assert!(output.contains("query_name=\"test_query\"")); + } + + #[test] + fn observe_db_pool_exhaustion_increments_counter() { + let metrics = Metrics::new().unwrap(); + metrics.observe_db_pool_exhaustion("api"); + let output = metrics.render().unwrap(); + assert!(output.contains("db_pool_exhaustion_total")); + assert!(output.contains("pool=\"api\"")); + assert!(output.contains("1")); + } +} \ No newline at end of file