11# Monitoring Module - CloudWatch Dashboards and Alarms
22
3- # ############
4- # SNS Topic for Alerts
5- # ############
6- resource "aws_sns_topic" "alerts" {
7- name = " ${ var . project_name } -${ var . environment } -alerts"
8-
9- tags = {
10- Name = " ${ var . project_name } -${ var . environment } -alerts"
11- Environment = var.environment
12- Project = var.project_name
13- }
14- }
15-
16- # TODO: Add email subscriptions
17- # resource "aws_sns_topic_subscription" "alerts_email" {
18- # topic_arn = aws_sns_topic.alerts.arn
19- # protocol = "email"
20- # endpoint = "your-email@example.com"
21- # }
22-
233# ############
244# CloudWatch Dashboard
255# ############
@@ -28,64 +8,123 @@ resource "aws_cloudwatch_dashboard" "main" {
288
299 dashboard_body = jsonencode ({
3010 widgets = [
31- # EB CPU Utilization
11+ # EC2 CPU Utilization
3212 {
3313 type = " metric"
3414 properties = {
3515 metrics = [
36- [" AWS/ElasticBeanstalk " , " EnvironmentHealth " , { stat = " Average" }]
16+ [" AWS/EC2 " , " CPUUtilization " , " AutoScalingGroupName " , var.eb_autoscaling_group_name , { stat = " Average" }]
3717 ]
3818 period = 300
3919 stat = " Average"
4020 region = var.aws_region
41- title = " Environment Health"
21+ title = " EC2 CPU Utilization (%)"
22+ yAxis = {
23+ left = {
24+ min = 0
25+ max = 100
26+ }
27+ }
4228 }
4329 },
44- # EB Memory Utilization
30+ # EC2 Memory Utilization
4531 {
4632 type = " metric"
4733 properties = {
4834 metrics = [
49- [" CWAgent" , " mem_used_percent" , " AutoScalingGroupName" , var.eb_autoscaling_group_name]
35+ [" CWAgent" , " mem_used_percent" , " AutoScalingGroupName" , var.eb_autoscaling_group_name, { stat = " Average " } ]
5036 ]
5137 period = 300
5238 stat = " Average"
5339 region = var.aws_region
54- title = " Memory Utilization (%)"
40+ title = " EC2 Memory Utilization (%)"
41+ yAxis = {
42+ left = {
43+ min = 0
44+ max = 100
45+ }
46+ }
5547 }
5648 },
5749 # EB Request Count
5850 {
5951 type = " metric"
6052 properties = {
6153 metrics = [
62- [" AWS/ElasticBeanstalk" , " RequestCount" , { stat = " Sum" }]
54+ [" AWS/ElasticBeanstalk" , " RequestCount" , " EnvironmentName " , var.eb_environment_name, { stat = " Sum" }]
6355 ]
6456 period = 300
6557 stat = " Sum"
6658 region = var.aws_region
6759 title = " Request Count"
6860 }
6961 },
70- # RDS CPU
62+ # HTTP 5xx Errors
7163 {
7264 type = " metric"
7365 properties = {
7466 metrics = [
75- [" AWS/RDS" , " CPUUtilization" , " DBInstanceIdentifier" , var.rds_instance_id]
67+ [" AWS/ElasticBeanstalk" , " ApplicationRequests5xx" , " EnvironmentName" , var.eb_environment_name, { stat = " Sum" }]
68+ ]
69+ period = 300
70+ stat = " Sum"
71+ region = var.aws_region
72+ title = " HTTP 5xx Errors"
73+ }
74+ },
75+ # RDS CPU Utilization
76+ {
77+ type = " metric"
78+ properties = {
79+ metrics = [
80+ [" AWS/RDS" , " CPUUtilization" , " DBInstanceIdentifier" , var.rds_instance_id, { stat = " Average" }]
7681 ]
7782 period = 300
7883 stat = " Average"
7984 region = var.aws_region
80- title = " RDS CPU Utilization"
85+ title = " RDS CPU Utilization (%)"
86+ yAxis = {
87+ left = {
88+ min = 0
89+ max = 100
90+ }
91+ }
8192 }
8293 },
83- # RDS Connections
94+ # RDS Read/Write IOPS
8495 {
8596 type = " metric"
8697 properties = {
8798 metrics = [
88- [" AWS/RDS" , " DatabaseConnections" , " DBInstanceIdentifier" , var.rds_instance_id]
99+ [" AWS/RDS" , " ReadIOPS" , " DBInstanceIdentifier" , var.rds_instance_id, { stat = " Average" , label = " Read IOPS" }],
100+ [" ." , " WriteIOPS" , " ." , " ." , { stat = " Average" , label = " Write IOPS" }]
101+ ]
102+ period = 300
103+ stat = " Average"
104+ region = var.aws_region
105+ title = " RDS Read/Write IOPS"
106+ }
107+ },
108+ # RDS Network Throughput
109+ {
110+ type = " metric"
111+ properties = {
112+ metrics = [
113+ [" AWS/RDS" , " NetworkReceiveThroughput" , " DBInstanceIdentifier" , var.rds_instance_id, { stat = " Average" , label = " Network In" }],
114+ [" ." , " NetworkTransmitThroughput" , " ." , " ." , { stat = " Average" , label = " Network Out" }]
115+ ]
116+ period = 300
117+ stat = " Average"
118+ region = var.aws_region
119+ title = " RDS Network Throughput (Bytes/sec)"
120+ }
121+ },
122+ # RDS Database Connections
123+ {
124+ type = " metric"
125+ properties = {
126+ metrics = [
127+ [" AWS/RDS" , " DatabaseConnections" , " DBInstanceIdentifier" , var.rds_instance_id, { stat = " Average" }]
89128 ]
90129 period = 300
91130 stat = " Average"
@@ -98,12 +137,12 @@ resource "aws_cloudwatch_dashboard" "main" {
98137 type = " metric"
99138 properties = {
100139 metrics = [
101- [" AWS/RDS" , " FreeableMemory" , " DBInstanceIdentifier" , var.rds_instance_id]
140+ [" AWS/RDS" , " FreeableMemory" , " DBInstanceIdentifier" , var.rds_instance_id, { stat = " Average " } ]
102141 ]
103142 period = 300
104143 stat = " Average"
105144 region = var.aws_region
106- title = " RDS Freeable Memory"
145+ title = " RDS Freeable Memory (Bytes) "
107146 }
108147 }
109148 ]
@@ -125,7 +164,7 @@ resource "aws_cloudwatch_metric_alarm" "eb_cpu_high" {
125164 statistic = " Average"
126165 threshold = 80
127166 alarm_description = " This metric monitors EC2 CPU utilization"
128- alarm_actions = [aws_sns_topic . alerts . arn ]
167+ alarm_actions = [var . sns_topic_arn ]
129168
130169 dimensions = {
131170 AutoScalingGroupName = var.eb_autoscaling_group_name
@@ -137,15 +176,7 @@ resource "aws_cloudwatch_metric_alarm" "eb_cpu_high" {
137176 }
138177}
139178
140- # NOTE: Memory alarms below require CloudWatch Agent to be installed on EB instances.
141- # To enable memory monitoring:
142- # 1. Create .ebextensions/cloudwatch-agent.config in your app
143- # 2. Follow AWS docs: https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/Install-CloudWatch-Agent.html
144- # 3. Uncomment the alarms below
145-
146- # High Memory Alarm (requires CloudWatch Agent) - COMMENTED OUT
147- # Uncomment after installing CloudWatch Agent
148- /*
179+ # High Memory Alarm
149180resource "aws_cloudwatch_metric_alarm" "eb_memory_high" {
150181 alarm_name = " ${ var . project_name } -${ var . environment } -eb-memory-high"
151182 comparison_operator = " GreaterThanThreshold"
@@ -156,7 +187,7 @@ resource "aws_cloudwatch_metric_alarm" "eb_memory_high" {
156187 statistic = " Average"
157188 threshold = 80
158189 alarm_description = " This metric monitors EC2 memory utilization"
159- alarm_actions = [aws_sns_topic.alerts.arn ]
190+ alarm_actions = [var . sns_topic_arn ]
160191
161192 dimensions = {
162193 AutoScalingGroupName = var.eb_autoscaling_group_name
@@ -167,11 +198,8 @@ resource "aws_cloudwatch_metric_alarm" "eb_memory_high" {
167198 Project = var.project_name
168199 }
169200}
170- */
171201
172- # Critical Memory Alarm (requires CloudWatch Agent) - COMMENTED OUT
173- # Uncomment after installing CloudWatch Agent
174- /*
202+ # Critical Memory Alarm
175203resource "aws_cloudwatch_metric_alarm" "eb_memory_critical" {
176204 alarm_name = " ${ var . project_name } -${ var . environment } -eb-memory-critical"
177205 comparison_operator = " GreaterThanThreshold"
@@ -182,7 +210,7 @@ resource "aws_cloudwatch_metric_alarm" "eb_memory_critical" {
182210 statistic = " Average"
183211 threshold = 90
184212 alarm_description = " This metric monitors EC2 memory utilization - CRITICAL"
185- alarm_actions = [aws_sns_topic.alerts.arn ]
213+ alarm_actions = [var . sns_topic_arn ]
186214
187215 dimensions = {
188216 AutoScalingGroupName = var.eb_autoscaling_group_name
@@ -193,20 +221,20 @@ resource "aws_cloudwatch_metric_alarm" "eb_memory_critical" {
193221 Project = var.project_name
194222 }
195223}
196- */
197-
198- # Environment Health Alarm
199- resource "aws_cloudwatch_metric_alarm" "eb_environment_health " {
200- alarm_name = " ${ var . project_name } -${ var . environment } -eb-health-degraded "
201- comparison_operator = " LessThanThreshold "
202- evaluation_periods = 1
203- metric_name = " EnvironmentHealth "
224+
225+ # HTTP 5xx Error Rate Alarm
226+ # Monitors HTTP 5xx server errors, which indicate application health issues
227+ resource "aws_cloudwatch_metric_alarm" "eb_http_5xx_errors " {
228+ alarm_name = " ${ var . project_name } -${ var . environment } -eb-http-5xx-high "
229+ comparison_operator = " GreaterThanThreshold "
230+ evaluation_periods = 2
231+ metric_name = " ApplicationRequests5xx "
204232 namespace = " AWS/ElasticBeanstalk"
205233 period = 300
206- statistic = " Average "
207- threshold = 15 # Healthy = 25, Warning = 15, Degraded = 10
208- alarm_description = " Environment health is degraded "
209- alarm_actions = [aws_sns_topic . alerts . arn ]
234+ statistic = " Sum "
235+ threshold = 10 # Alarm when the 5-minute Sum of 5xx errors exceeds 10 for 2 consecutive periods
236+ alarm_description = " High rate of HTTP 5xx errors indicates application issues "
237+ alarm_actions = [var . sns_topic_arn ]
210238
211239 dimensions = {
212240 EnvironmentName = var.eb_environment_name
0 commit comments