Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions performance/config/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,17 @@ groups:
description: "Database p99 query time is {{ $value }}s, exceeding 100ms critical threshold"
runbook_url: "https://docs.predictiq.com/runbooks/critical-database-queries"

- alert: DBPoolExhaustion
expr: rate(db_pool_exhaustion_total[5m]) > 0.0167
for: 1m
labels:
severity: critical
component: database
annotations:
summary: "Database connection pool exhaustion detected"
description: "Pool exhaustion rate is {{ $value }} per second, exceeding 1/minute threshold"
runbook_url: "https://docs.predictiq.com/runbooks/db-pool-exhaustion"

- alert: HighDBConnectionPoolUtilization
expr: db_connections_active / db_connections_max > 0.8
for: 5m
Expand Down
51 changes: 51 additions & 0 deletions performance/config/grafana-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,57 @@
}
}
}
},
{
"id": 10,
"title": "DB Pool Utilisation (% of open connections)",
"type": "graph",
"gridPos": { "x": 12, "y": 24, "w": 12, "h": 8 },
"targets": [
{
"expr": "db_pool_connections_active{pool=\"api\"} / (db_pool_connections_active{pool=\"api\"} + db_pool_connections_idle{pool=\"api\"}) * 100",
"legendFormat": "Pool Utilisation %",
"refId": "A"
}
],
"yaxes": [
{
"format": "percent",
"label": "Utilisation"
}
],
"alert": {
"name": "High DB Pool Utilisation",
"conditions": [
{
"evaluator": {
"params": [80],
"type": "gt"
},
"operator": {
"type": "and"
},
"query": {
"params": ["A", "5m", "now"]
},
"reducer": {
"params": [],
"type": "avg"
},
"type": "query"
}
],
"executionErrorState": "alerting",
"frequency": "1m",
"handler": 1,
"message": "DB pool utilisation exceeded 80%",
"noDataState": "no_data",
"notifications": []
}
}
]
}
}
]
}
}
42 changes: 42 additions & 0 deletions performance/config/grafana-slo-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -242,10 +242,52 @@
"type": "text",
"content": "## Error Budget Policy\n\n- **100% remaining**: Normal operations\n- **50% remaining**: Review recent changes\n- **25% remaining**: Freeze non-critical deployments\n- **10% remaining**: Freeze all deployments\n- **0% remaining**: Emergency - rollback and incident response\n\n### Burn Rate Thresholds\n\n- **Fast Burn (14.4x)**: Exhausts budget in 2 days → Critical alert\n- **Slow Burn (6.0x)**: Exhausts budget in 5 days → Warning alert"
}
],
},
{
"id": 11,
"title": "Database Query Latency P50/P95/P99",
"type": "graph",
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000",
"legendFormat": "P50 Latency",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000",
"legendFormat": "P95 Latency",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(db_query_duration_seconds_bucket[5m])) by (le)) * 1000",
"legendFormat": "P99 Latency",
"refId": "C"
},
{
"expr": "500",
"legendFormat": "P95 Alert Threshold",
"refId": "D"
}
],
"yaxes": [
{
"label": "Latency (ms)",
"format": "ms"
}
]
}
],
"time": {
"from": "now-7d",
"to": "now"
}
}
}
"time": {
"from": "now-7d",
"to": "now"
}
}
}
32 changes: 32 additions & 0 deletions performance/config/prometheus-slo-rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ groups:
sum(rate(db_query_duration_seconds_bucket[5m])) by (le)
) <= 0.05

# Database Pool Exhaustion Rate SLO
- record: slo:db_pool_exhaustion:burn_rate
expr: |
rate(db_pool_exhaustion_total[5m])

# Cache Availability SLO
- record: slo:cache_availability:success_rate
expr: |
Expand Down Expand Up @@ -171,3 +176,30 @@ groups:
summary: "Cache availability SLO violation"
description: "Redis cache availability below 99.95% target. Current: {{ $value }}%"
runbook_url: "https://docs.predictiq.com/runbooks/cache-availability-slo-violation"

# Database Query P95 Latency Alert
- alert: DBSlowQueryP95
expr: |
histogram_quantile(0.95,
sum(rate(db_query_duration_seconds_bucket[5m])) by (le)
) > 0.5
for: 5m
labels:
severity: warning
slo: db_query_latency
annotations:
summary: "Database P95 query latency exceeds 500ms"
description: "Database P95 query latency is {{ $value }}s, exceeding 500ms threshold for 5 minutes"
runbook_url: "https://docs.predictiq.com/runbooks/slow-database-queries"

# Database Connection Pool Exhaustion Alert
- alert: DBPoolExhaustion
expr: rate(db_pool_exhaustion_total[5m]) > 0.0167
for: 1m
labels:
severity: critical
slo: db_pool_availability
annotations:
summary: "Database connection pool exhaustion rate exceeds 1/min"
description: "Pool exhaustion rate is {{ $value | humanize }} per second (threshold: 0.0167/s = 1/min)"
runbook_url: "https://docs.predictiq.com/runbooks/db-pool-exhaustion"
17 changes: 15 additions & 2 deletions services/api/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,27 @@ impl Database {
}

/// Run `fut` with the configured query timeout.
/// On success, records the query duration in the `db_query_duration_seconds` histogram.
/// On timeout, increments the `db_timeouts_total` counter and logs a warning.
/// On pool exhaustion, increments the `db_pool_exhaustion_total` counter.
async fn with_timeout<F, T>(&self, operation: &str, fut: F) -> Result<T, DbError>
where
F: std::future::Future<Output = Result<T, sqlx::Error>>,
{
let start = std::time::Instant::now();
match tokio::time::timeout(self.query_timeout, fut).await {
Ok(Ok(v)) => Ok(v),
Ok(Err(e)) => Err(DbError::Other(anyhow::Error::from(e))),
Ok(Ok(v)) => {
self.metrics
.observe_db_query_duration(operation, start.elapsed());
Ok(v)
}
Ok(Err(e)) => {
if matches!(&e, sqlx::Error::PoolTimedOut) {
self.metrics.observe_db_pool_exhaustion("api");
return Err(DbError::PoolExhausted);
}
Err(DbError::Other(anyhow::Error::from(e)))
}
Err(_elapsed) => {
self.metrics.observe_db_timeout(operation);
tracing::warn!(operation, timeout_secs = ?self.query_timeout, "db query timed out");
Expand Down
67 changes: 67 additions & 0 deletions services/api/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ pub struct Metrics {
request_latency: HistogramVec,
rpc_errors: IntCounterVec,
rpc_fallbacks: IntCounterVec,
db_query_duration: HistogramVec,
db_timeouts: IntCounterVec,
db_pool_exhaustion: IntCounterVec,
email_dlq_size: IntGauge,
db_pool_connections_active: IntGaugeVec,
db_pool_connections_idle: IntGaugeVec,
Expand Down Expand Up @@ -69,12 +71,33 @@ impl Metrics {
)
.context("rpc_fallbacks metric")?;

let db_query_duration = HistogramVec::new(
prometheus::HistogramOpts::new(
"db_query_duration_seconds",
"Database query duration in seconds by query name",
)
.buckets(vec![
0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0,
]),
&["query_name"],
)
.context("db_query_duration metric")?;

let db_timeouts = IntCounterVec::new(
prometheus::Opts::new("db_timeouts_total", "DB queries that exceeded the timeout, by operation"),
&["operation"],
)
.context("db_timeouts metric")?;

let db_pool_exhaustion = IntCounterVec::new(
prometheus::Opts::new(
"db_pool_exhaustion_total",
"Number of times the connection pool was exhausted, by pool name",
),
&["pool"],
)
.context("db_pool_exhaustion metric")?;

let email_dlq_size = IntGauge::new(
"email_dlq_size",
"Number of email jobs currently in the dead-letter queue",
Expand Down Expand Up @@ -126,7 +149,9 @@ impl Metrics {
registry.register(Box::new(request_latency.clone()))?;
registry.register(Box::new(rpc_errors.clone()))?;
registry.register(Box::new(rpc_fallbacks.clone()))?;
registry.register(Box::new(db_query_duration.clone()))?;
registry.register(Box::new(db_timeouts.clone()))?;
registry.register(Box::new(db_pool_exhaustion.clone()))?;
registry.register(Box::new(email_dlq_size.clone()))?;
registry.register(Box::new(db_pool_connections_active.clone()))?;
registry.register(Box::new(db_pool_connections_idle.clone()))?;
Expand All @@ -141,7 +166,9 @@ impl Metrics {
request_latency,
rpc_errors,
rpc_fallbacks,
db_query_duration,
db_timeouts,
db_pool_exhaustion,
email_dlq_size,
db_pool_connections_active,
db_pool_connections_idle,
Expand Down Expand Up @@ -183,10 +210,22 @@ impl Metrics {
self.rpc_fallbacks.with_label_values(&[endpoint]).inc();
}

/// Records how long a database query took.
///
/// `duration` is converted to fractional seconds and observed on the
/// `db_query_duration` histogram series selected by the `query_name` label value.
pub fn observe_db_query_duration(&self, query_name: &str, duration: Duration) {
    let elapsed_secs = duration.as_secs_f64();
    let series = self.db_query_duration.with_label_values(&[query_name]);
    series.observe(elapsed_secs);
}

/// Counts one timed-out database query on the `db_timeouts` counter series
/// selected by the `operation` label value.
pub fn observe_db_timeout(&self, operation: &str) {
    let timeouts = self.db_timeouts.with_label_values(&[operation]);
    timeouts.inc();
}

/// Counts one pool-exhaustion event on the `db_pool_exhaustion` counter series
/// selected by the `pool` label value.
pub fn observe_db_pool_exhaustion(&self, pool: &str) {
    let exhaustions = self.db_pool_exhaustion.with_label_values(&[pool]);
    exhaustions.inc();
}

/// Sets the `email_dlq_size` gauge to `n`.
pub fn set_dlq_size(&self, n: i64) {
    let dlq_gauge = &self.email_dlq_size;
    dlq_gauge.set(n);
}
Expand Down Expand Up @@ -234,3 +273,31 @@ impl Metrics {
Ok(String::from_utf8(buffer)?)
}
}

Ok(String::from_utf8(buffer)?)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Observing a query duration must expose the histogram family and the
    /// series for the supplied `query_name` label in the rendered output.
    #[test]
    fn observe_db_query_duration_records_histogram() {
        let metrics = Metrics::new().unwrap();
        metrics.observe_db_query_duration("test_query", Duration::from_millis(100));
        let output = metrics.render().unwrap();
        assert!(output.contains("db_query_duration_seconds"));
        assert!(output.contains("query_name=\"test_query\""));
    }

    /// A single exhaustion event must render the `pool="api"` counter sample
    /// with a value of exactly 1.
    #[test]
    fn observe_db_pool_exhaustion_increments_counter() {
        let metrics = Metrics::new().unwrap();
        metrics.observe_db_pool_exhaustion("api");
        let output = metrics.render().unwrap();
        assert!(output.contains("db_pool_exhaustion_total"));
        assert!(output.contains("pool=\"api\""));

        // `output.contains("1")` would pass on any "1" in the rendered text
        // (for example a histogram bucket bound), so check the value on this
        // counter's own sample line instead. Comment lines (`# HELP`, `# TYPE`)
        // are skipped so only the sample itself can match.
        let sample = output
            .lines()
            .filter(|line| !line.starts_with('#'))
            .find(|line| {
                line.contains("db_pool_exhaustion_total") && line.contains("pool=\"api\"")
            })
            .expect("counter sample for pool=\"api\" should be rendered");
        assert!(
            sample.ends_with(" 1"),
            "expected counter value 1, got sample line: {sample}"
        );
    }
}