Skip to content

Commit 367d8bc

Browse files
authored
Merge pull request #3808 from Northeastern-Electric-Racing/cloudwatch-dashboard
Cloudwatch dashboard
2 parents 11e81bc + e21ee2d commit 367d8bc

7 files changed

Lines changed: 215 additions & 64 deletions

File tree

Lines changed: 9 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
files:
2-
"/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json":
3-
mode: "000644"
2+
"/opt/aws/amazon-cloudwatch-agent/bin/config.json":
3+
mode: "000600"
44
owner: root
55
group: root
66
content: |
@@ -11,6 +11,10 @@ files:
1111
},
1212
"metrics": {
1313
"namespace": "CWAgent",
14+
"append_dimensions": {
15+
"AutoScalingGroupName": "${aws:AutoScalingGroupName}",
16+
"InstanceId": "${aws:InstanceId}"
17+
},
1418
"metrics_collected": {
1519
"mem": {
1620
"measurement": [
@@ -35,29 +39,10 @@ files:
3539
"*"
3640
]
3741
}
38-
},
39-
"append_dimensions": {
40-
"AutoScalingGroupName": "${aws:AutoScalingGroupName}",
41-
"InstanceId": "${aws:InstanceId}"
4242
}
4343
}
4444
}
4545

46-
commands:
47-
01_install_cloudwatch_agent:
48-
command: |
49-
if ! command -v amazon-cloudwatch-agent-ctl &> /dev/null; then
50-
wget -q https://s3.amazonaws.com/amazoncloudwatch-agent/amazon_linux/amd64/latest/amazon-cloudwatch-agent.rpm
51-
rpm -U ./amazon-cloudwatch-agent.rpm
52-
rm -f ./amazon-cloudwatch-agent.rpm
53-
fi
54-
test: "[ ! -f /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl ]"
55-
56-
02_stop_cloudwatch_agent:
57-
command: |
58-
/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl \
59-
-a fetch-config \
60-
-m ec2 \
61-
-c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json \
62-
-s
63-
ignoreErrors: true
46+
container_commands:
47+
start_cloudwatch_agent:
48+
command: /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -s -c file:/opt/aws/amazon-cloudwatch-agent/bin/config.json

infrastructure/environments/production/main.tf

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@ provider "aws" {
3434
locals {
3535
project_name = "finishline"
3636
environment = "production"
37+
38+
# Extract ALB ARN suffix for CloudWatch metrics
39+
# Full ARN: arn:aws:elasticloadbalancing:region:account:loadbalancer/app/name/id
40+
# Suffix needed: app/name/id
41+
alb_arn_suffix = try(split("loadbalancer/", data.aws_lb.eb_alb.arn)[1], "")
3742
}
3843

3944
#############
@@ -253,6 +258,31 @@ resource "aws_sns_topic" "alerts" {
253258
}
254259
}
255260

261+
#############
262+
# Data Source: Fetch Autoscaling Group Name
263+
#############
264+
# Query AWS directly to get the actual autoscaling group name
265+
data "aws_autoscaling_groups" "eb_asg" {
266+
filter {
267+
name = "tag:elasticbeanstalk:environment-name"
268+
values = [module.elasticbeanstalk.environment_name]
269+
}
270+
271+
depends_on = [module.elasticbeanstalk]
272+
}
273+
274+
#############
275+
# Data Source: Fetch Application Load Balancer
276+
#############
277+
# Query AWS directly to get the actual ALB for metrics
278+
data "aws_lb" "eb_alb" {
279+
tags = {
280+
"elasticbeanstalk:environment-name" = module.elasticbeanstalk.environment_name
281+
}
282+
283+
depends_on = [module.elasticbeanstalk]
284+
}
285+
256286
#############
257287
# Monitoring Module
258288
#############
@@ -263,7 +293,8 @@ module "monitoring" {
263293
environment = local.environment
264294
aws_region = var.aws_region
265295
eb_environment_name = module.elasticbeanstalk.environment_name
266-
eb_autoscaling_group_name = module.elasticbeanstalk.autoscaling_groups[0]
296+
eb_autoscaling_group_name = data.aws_autoscaling_groups.eb_asg.names[0]
297+
alb_arn_suffix = local.alb_arn_suffix
267298
rds_instance_id = module.rds.db_instance_id
268299
log_retention_days = 30
269300
sns_topic_arn = aws_sns_topic.alerts.arn

infrastructure/modules/iam/main.tf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,18 @@ resource "aws_iam_role_policy" "eb_ecr_access" {
163163
})
164164
}
165165

166+
# Attach AWS managed policy for CloudWatch Agent
167+
# This policy includes permissions for:
168+
# - cloudwatch:PutMetricData
169+
# - ec2:DescribeVolumes, ec2:DescribeTags
170+
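# - logs: CreateLogGroup, CreateLogStream, PutLogEvents, DescribeLogGroups, DescribeLogStreams, PutRetentionPolicy (a fixed action list, not logs:*)
171+
# - xray: PutTraceSegments, PutTelemetryRecords, GetSampling* (trace upload and sampling only, not xray:*)
172+
# - ssm:GetParameter (only for Parameter Store parameters named AmazonCloudWatch-*)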
173+
resource "aws_iam_role_policy_attachment" "eb_cloudwatch_agent" {
174+
role = aws_iam_role.eb_ec2_role.name
175+
policy_arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
176+
}
177+
166178
# EC2 Instance Profile
167179
resource "aws_iam_instance_profile" "eb_ec2_profile" {
168180
name = "${var.project_name}-${var.environment}-eb-ec2-profile"

infrastructure/modules/monitoring/main.tf

Lines changed: 147 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,14 @@ resource "aws_cloudwatch_dashboard" "main" {
2727
}
2828
}
2929
},
30-
# EC2 Memory Utilization
30+
# EC2 Memory Utilization (Custom Metric)
3131
{
3232
type = "metric"
3333
properties = {
3434
metrics = [
35-
["CWAgent", "mem_used_percent", "AutoScalingGroupName", var.eb_autoscaling_group_name, { stat = "Average" }]
35+
[{ expression = "SELECT AVG(MemoryUtilization) FROM \"CWAgent\"", id = "m1" }]
3636
]
3737
period = 300
38-
stat = "Average"
3938
region = var.aws_region
4039
title = "EC2 Memory Utilization (%)"
4140
yAxis = {
@@ -46,25 +45,42 @@ resource "aws_cloudwatch_dashboard" "main" {
4645
}
4746
}
4847
},
49-
# EB Request Count
48+
# EC2 Disk Utilization (Custom Metric) - Root filesystem
5049
{
5150
type = "metric"
5251
properties = {
5352
metrics = [
54-
["AWS/ElasticBeanstalk", "RequestCount", "EnvironmentName", var.eb_environment_name, { stat = "Sum" }]
53+
[{ expression = "SELECT AVG(DiskUtilization) FROM \"CWAgent\" WHERE path = '/'", id = "m1" }]
54+
]
55+
period = 300
56+
region = var.aws_region
57+
title = "EC2 Disk Utilization (%) - Root"
58+
yAxis = {
59+
left = {
60+
min = 0
61+
max = 100
62+
}
63+
}
64+
}
65+
},
66+
{
67+
type = "metric"
68+
properties = {
69+
metrics = [
70+
["AWS/ApplicationELB", "RequestCount", "LoadBalancer", var.alb_arn_suffix, { stat = "Sum" }]
5571
]
5672
period = 300
5773
stat = "Sum"
5874
region = var.aws_region
5975
title = "Request Count"
6076
}
6177
},
62-
# HTTP 5xx Errors
78+
# HTTP 5xx Errors (Target Responses)
6379
{
6480
type = "metric"
6581
properties = {
6682
metrics = [
67-
["AWS/ElasticBeanstalk", "ApplicationRequests5xx", "EnvironmentName", var.eb_environment_name, { stat = "Sum" }]
83+
["AWS/ApplicationELB", "HTTPCode_Target_5XX_Count", "LoadBalancer", var.alb_arn_suffix, { stat = "Sum" }]
6884
]
6985
period = 300
7086
stat = "Sum"
@@ -144,6 +160,47 @@ resource "aws_cloudwatch_dashboard" "main" {
144160
region = var.aws_region
145161
title = "RDS Freeable Memory (Bytes)"
146162
}
163+
},
164+
# RDS Read/Write Latency
165+
{
166+
type = "metric"
167+
properties = {
168+
metrics = [
169+
["AWS/RDS", "ReadLatency", "DBInstanceIdentifier", var.rds_instance_id, { stat = "Average", label = "Read Latency" }],
170+
[".", "WriteLatency", ".", ".", { stat = "Average", label = "Write Latency" }]
171+
]
172+
period = 300
173+
stat = "Average"
174+
region = var.aws_region
175+
title = "RDS Read/Write Latency (ms)"
176+
}
177+
},
178+
# RDS Disk Queue Depth (outstanding I/O requests waiting on the disk)
179+
{
180+
type = "metric"
181+
properties = {
182+
metrics = [
183+
["AWS/RDS", "DiskQueueDepth", "DBInstanceIdentifier", var.rds_instance_id, { stat = "Average" }]
184+
]
185+
period = 300
186+
stat = "Average"
187+
region = var.aws_region
188+
title = "RDS Disk Queue Depth"
189+
}
190+
},
191+
# RDS Disk Throughput (Bytes/sec)
192+
{
193+
type = "metric"
194+
properties = {
195+
metrics = [
196+
["AWS/RDS", "ReadThroughput", "DBInstanceIdentifier", var.rds_instance_id, { stat = "Average", label = "Read Throughput" }],
197+
[".", "WriteThroughput", ".", ".", { stat = "Average", label = "Write Throughput" }]
198+
]
199+
period = 300
200+
stat = "Average"
201+
region = var.aws_region
202+
title = "RDS Disk Throughput (Bytes/sec)"
203+
}
147204
}
148205
]
149206
})
@@ -176,21 +233,22 @@ resource "aws_cloudwatch_metric_alarm" "eb_cpu_high" {
176233
}
177234
}
178235

179-
# High Memory Alarm
180-
resource "aws_cloudwatch_metric_alarm" "eb_memory_high" {
181-
alarm_name = "${var.project_name}-${var.environment}-eb-memory-high"
236+
# HTTP 5xx Error Rate Alarm
237+
# This monitors server errors which indicate application health issues
238+
resource "aws_cloudwatch_metric_alarm" "alb_http_5xx_errors" {
239+
alarm_name = "${var.project_name}-${var.environment}-alb-http-5xx-high"
182240
comparison_operator = "GreaterThanThreshold"
183241
evaluation_periods = 2
184-
metric_name = "mem_used_percent"
185-
namespace = "CWAgent"
242+
metric_name = "HTTPCode_Target_5XX_Count"
243+
namespace = "AWS/ApplicationELB"
186244
period = 300
187-
statistic = "Average"
188-
threshold = 80
189-
alarm_description = "This metric monitors EC2 memory utilization"
245+
statistic = "Sum"
246+
threshold = 10 # Alert if more than 10 5xx errors in 5 minutes
247+
alarm_description = "High rate of HTTP 5xx errors indicates application issues"
190248
alarm_actions = [var.sns_topic_arn]
191249

192250
dimensions = {
193-
AutoScalingGroupName = var.eb_autoscaling_group_name
251+
LoadBalancer = var.alb_arn_suffix
194252
}
195253

196254
tags = {
@@ -199,21 +257,25 @@ resource "aws_cloudwatch_metric_alarm" "eb_memory_high" {
199257
}
200258
}
201259

202-
# Critical Memory Alarm
203-
resource "aws_cloudwatch_metric_alarm" "eb_memory_critical" {
204-
alarm_name = "${var.project_name}-${var.environment}-eb-memory-critical"
260+
#############
261+
# RDS CloudWatch Alarms
262+
#############
263+
264+
# High RDS CPU Alarm
265+
resource "aws_cloudwatch_metric_alarm" "rds_cpu_high" {
266+
alarm_name = "${var.project_name}-${var.environment}-rds-cpu-high"
205267
comparison_operator = "GreaterThanThreshold"
206268
evaluation_periods = 2
207-
metric_name = "mem_used_percent"
208-
namespace = "CWAgent"
269+
metric_name = "CPUUtilization"
270+
namespace = "AWS/RDS"
209271
period = 300
210272
statistic = "Average"
211-
threshold = 90
212-
alarm_description = "This metric monitors EC2 memory utilization - CRITICAL"
273+
threshold = 75
274+
alarm_description = "RDS CPU utilization is high - may need optimization or larger instance"
213275
alarm_actions = [var.sns_topic_arn]
214276

215277
dimensions = {
216-
AutoScalingGroupName = var.eb_autoscaling_group_name
278+
DBInstanceIdentifier = var.rds_instance_id
217279
}
218280

219281
tags = {
@@ -222,22 +284,73 @@ resource "aws_cloudwatch_metric_alarm" "eb_memory_critical" {
222284
}
223285
}
224286

225-
# HTTP 5xx Error Rate Alarm
226-
# This monitors server errors which indicate application health issues
227-
resource "aws_cloudwatch_metric_alarm" "eb_http_5xx_errors" {
228-
alarm_name = "${var.project_name}-${var.environment}-eb-http-5xx-high"
287+
# High RDS Read Latency Alarm
288+
resource "aws_cloudwatch_metric_alarm" "rds_read_latency_high" {
289+
alarm_name = "${var.project_name}-${var.environment}-rds-read-latency-high"
229290
comparison_operator = "GreaterThanThreshold"
230291
evaluation_periods = 2
231-
metric_name = "ApplicationRequests5xx"
232-
namespace = "AWS/ElasticBeanstalk"
292+
metric_name = "ReadLatency"
293+
namespace = "AWS/RDS"
233294
period = 300
234-
statistic = "Sum"
235-
threshold = 10 # Alert if more than 10 5xx errors in 5 minutes
236-
alarm_description = "High rate of HTTP 5xx errors indicates application issues"
295+
statistic = "Average"
296+
threshold = 0.01 # 10ms in seconds
297+
alarm_description = "RDS read latency is high - may indicate I/O bottleneck or need for indexing"
298+
alarm_actions = [var.sns_topic_arn]
299+
300+
dimensions = {
301+
DBInstanceIdentifier = var.rds_instance_id
302+
}
303+
304+
tags = {
305+
Environment = var.environment
306+
Project = var.project_name
307+
}
308+
}
309+
310+
# Low RDS Freeable Memory Alarm
311+
resource "aws_cloudwatch_metric_alarm" "rds_memory_low" {
312+
alarm_name = "${var.project_name}-${var.environment}-rds-memory-low"
313+
comparison_operator = "LessThanThreshold"
314+
evaluation_periods = 2
315+
metric_name = "FreeableMemory"
316+
namespace = "AWS/RDS"
317+
period = 300
318+
statistic = "Average"
319+
threshold = 524288000 # 500MB in bytes
320+
alarm_description = "RDS freeable memory is low - may need larger instance or query optimization"
237321
alarm_actions = [var.sns_topic_arn]
238322

239323
dimensions = {
240-
EnvironmentName = var.eb_environment_name
324+
DBInstanceIdentifier = var.rds_instance_id
325+
}
326+
327+
tags = {
328+
Environment = var.environment
329+
Project = var.project_name
330+
}
331+
}
332+
333+
# High Memory Alarm
334+
resource "aws_cloudwatch_metric_alarm" "eb_memory_high" {
335+
alarm_name = "${var.project_name}-${var.environment}-eb-memory-high"
336+
comparison_operator = "GreaterThanThreshold"
337+
evaluation_periods = 2
338+
threshold = 75
339+
alarm_description = "This metric monitors EC2 memory utilization"
340+
alarm_actions = [var.sns_topic_arn]
341+
342+
metric_query {
343+
id = "m1"
344+
return_data = true
345+
metric {
346+
namespace = "CWAgent"
347+
metric_name = "MemoryUtilization"
348+
period = 300
349+
stat = "Average"
350+
dimensions = {
351+
AutoScalingGroupName = var.eb_autoscaling_group_name
352+
}
353+
}
241354
}
242355

243356
tags = {

0 commit comments

Comments
 (0)