Skip to content

Commit e21ee2d

Browse files
committed
more stats and alarms
1 parent 87e7794 commit e21ee2d

1 file changed

Lines changed: 143 additions & 0 deletions

File tree

  • infrastructure/modules/monitoring

infrastructure/modules/monitoring/main.tf

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,47 @@ resource "aws_cloudwatch_dashboard" "main" {
160160
region = var.aws_region
161161
title = "RDS Freeable Memory (Bytes)"
162162
}
163+
},
164+
# RDS Read/Write Latency
165+
{
166+
type = "metric"
167+
properties = {
168+
metrics = [
169+
["AWS/RDS", "ReadLatency", "DBInstanceIdentifier", var.rds_instance_id, { stat = "Average", label = "Read Latency" }],
170+
[".", "WriteLatency", ".", ".", { stat = "Average", label = "Write Latency" }]
171+
]
172+
period = 300
173+
stat = "Average"
174+
region = var.aws_region
175+
title = "RDS Read/Write Latency (Seconds)"
176+
}
177+
},
178+
# RDS Queue Depth
179+
{
180+
type = "metric"
181+
properties = {
182+
metrics = [
183+
["AWS/RDS", "DiskQueueDepth", "DBInstanceIdentifier", var.rds_instance_id, { stat = "Average" }]
184+
]
185+
period = 300
186+
stat = "Average"
187+
region = var.aws_region
188+
title = "RDS Disk Queue Depth"
189+
}
190+
},
191+
# RDS Throughput (Bytes/sec)
192+
{
193+
type = "metric"
194+
properties = {
195+
metrics = [
196+
["AWS/RDS", "ReadThroughput", "DBInstanceIdentifier", var.rds_instance_id, { stat = "Average", label = "Read Throughput" }],
197+
[".", "WriteThroughput", ".", ".", { stat = "Average", label = "Write Throughput" }]
198+
]
199+
period = 300
200+
stat = "Average"
201+
region = var.aws_region
202+
title = "RDS Disk Throughput (Bytes/sec)"
203+
}
163204
}
164205
]
165206
})
@@ -216,6 +257,108 @@ resource "aws_cloudwatch_metric_alarm" "alb_http_5xx_errors" {
216257
}
217258
}
218259

260+
#############
261+
# RDS CloudWatch Alarms
262+
#############
263+
264+
# High RDS CPU Alarm
265+
resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" { # Notifies var.sns_topic_arn when average RDS CPU stays above the threshold
266+
alarm_name = "${var.project_name}-${var.environment}-rds-cpu-high"
267+
comparison_operator = "GreaterThanThreshold"
268+
evaluation_periods = 2 # number of most recent periods evaluated against the threshold
269+
metric_name = "CPUUtilization"
270+
namespace = "AWS/RDS"
271+
period = 300 # seconds (5 minutes) per evaluated datapoint
272+
statistic = "Average"
273+
threshold = 75 # percent; CPUUtilization is reported as a percentage
274+
alarm_description = "RDS CPU utilization is high - may need optimization or larger instance"
275+
alarm_actions = [var.sns_topic_arn]
276+
277+
dimensions = { # scope the metric to the single DB instance passed into the module
278+
DBInstanceIdentifier = var.rds_instance_id
279+
}
280+
281+
tags = {
282+
Environment = var.environment
283+
Project = var.project_name
284+
}
285+
}
286+
287+
# High RDS Read Latency Alarm
288+
resource "aws_cloudwatch_metric_alarm" "rds_read_latency_high" { # Notifies var.sns_topic_arn when average RDS read latency stays above the threshold
289+
alarm_name = "${var.project_name}-${var.environment}-rds-read-latency-high"
290+
comparison_operator = "GreaterThanThreshold"
291+
evaluation_periods = 2 # number of most recent periods evaluated against the threshold
292+
metric_name = "ReadLatency"
293+
namespace = "AWS/RDS"
294+
period = 300 # seconds (5 minutes) per evaluated datapoint
295+
statistic = "Average"
296+
threshold = 0.01 # 10ms in seconds
297+
alarm_description = "RDS read latency is high - may indicate I/O bottleneck or need for indexing"
298+
alarm_actions = [var.sns_topic_arn]
299+
300+
dimensions = { # scope the metric to the single DB instance passed into the module
301+
DBInstanceIdentifier = var.rds_instance_id
302+
}
303+
304+
tags = {
305+
Environment = var.environment
306+
Project = var.project_name
307+
}
308+
}
309+
310+
# Low RDS Freeable Memory Alarm
311+
resource "aws_cloudwatch_metric_alarm" "rds_memory_low" { # Notifies var.sns_topic_arn when average RDS freeable memory stays below the threshold
312+
alarm_name = "${var.project_name}-${var.environment}-rds-memory-low"
313+
comparison_operator = "LessThanThreshold" # inverted vs. the other alarms: low free memory is the bad state
314+
evaluation_periods = 2 # number of most recent periods evaluated against the threshold
315+
metric_name = "FreeableMemory"
316+
namespace = "AWS/RDS"
317+
period = 300 # seconds (5 minutes) per evaluated datapoint
318+
statistic = "Average"
319+
threshold = 524288000 # 500 MiB in bytes (500 * 1024 * 1024)
320+
alarm_description = "RDS freeable memory is low - may need larger instance or query optimization"
321+
alarm_actions = [var.sns_topic_arn]
322+
323+
dimensions = { # scope the metric to the single DB instance passed into the module
324+
DBInstanceIdentifier = var.rds_instance_id
325+
}
326+
327+
tags = {
328+
Environment = var.environment
329+
Project = var.project_name
330+
}
331+
}
332+
333+
# High EB Instance (EC2) Memory Alarm
334+
resource "aws_cloudwatch_metric_alarm" "eb_memory_high" { # Notifies var.sns_topic_arn when average instance memory utilization stays above the threshold
335+
alarm_name = "${var.project_name}-${var.environment}-eb-memory-high"
336+
comparison_operator = "GreaterThanThreshold"
337+
evaluation_periods = 2 # number of most recent periods evaluated against the threshold
338+
threshold = 75 # presumably percent - TODO confirm the unit the agent publishes
339+
alarm_description = "This metric monitors EC2 memory utilization"
340+
alarm_actions = [var.sns_topic_arn]
341+
342+
metric_query { # single query; its result is the value the alarm evaluates
343+
id = "m1"
344+
return_data = true # marks m1 as the series compared against the threshold
345+
metric {
346+
namespace = "CWAgent" # NOTE(review): not an AWS-provided namespace; presumably published by the CloudWatch agent on the instances - confirm it is installed
347+
metric_name = "MemoryUtilization" # NOTE(review): must match the metric name the agent is configured to emit - verify
348+
period = 300 # seconds (5 minutes) per evaluated datapoint
349+
stat = "Average"
350+
dimensions = { # aggregate across the instances of the given Auto Scaling group
351+
AutoScalingGroupName = var.eb_autoscaling_group_name
352+
}
353+
}
354+
}
355+
356+
tags = {
357+
Environment = var.environment
358+
Project = var.project_name
359+
}
360+
}
361+
219362
#############
220363
# Log Groups
221364
#############

0 commit comments

Comments
 (0)