Skip to content

Commit 04d1526

Browse files
Merge pull request #151 from NHSDigital/CCM-14044_EventAnnomAlarms
CCM-14044 event annom alarms
2 parents c49f0e5 + 038a02d commit 04d1526

4 files changed

Lines changed: 79 additions & 0 deletions

File tree

infrastructure/terraform/modules/eventpub/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,15 @@
1818
| <a name="input_data_plane_bus_arn"></a> [data\_plane\_bus\_arn](#input\_data\_plane\_bus\_arn) | Data plane event bus arn | `string` | n/a | yes |
1919
| <a name="input_default_tags"></a> [default\_tags](#input\_default\_tags) | Default tag map for application to all taggable resources in the module | `map(string)` | `{}` | no |
2020
| <a name="input_enable_event_cache"></a> [enable\_event\_cache](#input\_enable\_event\_cache) | Enable caching of events to an S3 bucket | `bool` | `false` | no |
21+
| <a name="input_enable_event_publishing_anomaly_detection"></a> [enable\_event\_publishing\_anomaly\_detection](#input\_enable\_event\_publishing\_anomaly\_detection) | Enable CloudWatch anomaly detection alarm for SNS message publishing. Detects abnormal drops or spikes in event publishing volume. | `bool` | `true` | no |
2122
| <a name="input_enable_firehose_raw_message_delivery"></a> [enable\_firehose\_raw\_message\_delivery](#input\_enable\_firehose\_raw\_message\_delivery) | Enables raw message delivery on firehose subscription | `bool` | `false` | no |
2223
| <a name="input_enable_sns_delivery_logging"></a> [enable\_sns\_delivery\_logging](#input\_enable\_sns\_delivery\_logging) | Enable SNS Delivery Failure Notifications | `bool` | `false` | no |
2324
| <a name="input_environment"></a> [environment](#input\_environment) | The name of the terraformscaffold environment the module is called for | `string` | n/a | yes |
2425
| <a name="input_event_cache_buffer_interval"></a> [event\_cache\_buffer\_interval](#input\_event\_cache\_buffer\_interval) | The buffer interval for data firehose | `number` | `500` | no |
2526
| <a name="input_event_cache_expiry_days"></a> [event\_cache\_expiry\_days](#input\_event\_cache\_expiry\_days) | s3 archiving expiry in days | `number` | `30` | no |
27+
| <a name="input_event_publishing_anomaly_band_width"></a> [event\_publishing\_anomaly\_band\_width](#input\_event\_publishing\_anomaly\_band\_width) | The width of the anomaly detection band. Higher values (e.g. 4-6) reduce sensitivity and noise, lower values (e.g. 2-3) increase sensitivity. Typical range: 2-6. | `number` | `5` | no |
28+
| <a name="input_event_publishing_anomaly_evaluation_periods"></a> [event\_publishing\_anomaly\_evaluation\_periods](#input\_event\_publishing\_anomaly\_evaluation\_periods) | Number of evaluation periods for the publishing anomaly alarm. Each period is defined by event\_publishing\_anomaly\_period. | `number` | `3` | no |
29+
| <a name="input_event_publishing_anomaly_period"></a> [event\_publishing\_anomaly\_period](#input\_event\_publishing\_anomaly\_period) | The period in seconds over which the specified statistic is applied for anomaly detection. Minimum 300 seconds (5 minutes). Recommended: 300-600. | `number` | `300` | no |
2630
| <a name="input_force_destroy"></a> [force\_destroy](#input\_force\_destroy) | When enabled will force destroy event-cache S3 bucket | `bool` | `false` | no |
2731
| <a name="input_group"></a> [group](#input\_group) | The name of the tfscaffold group | `string` | `null` | no |
2832
| <a name="input_iam_permissions_boundary_arn"></a> [iam\_permissions\_boundary\_arn](#input\_iam\_permissions\_boundary\_arn) | The ARN of the permissions boundary to use for the IAM role | `string` | `null` | no |
@@ -42,6 +46,7 @@
4246

4347
| Name | Description |
4448
|------|-------------|
49+
| <a name="output_publishing_anomaly_alarm"></a> [publishing\_anomaly\_alarm](#output\_publishing\_anomaly\_alarm) | CloudWatch anomaly detection alarm details for SNS publishing |
4550
| <a name="output_s3_bucket_event_cache"></a> [s3\_bucket\_event\_cache](#output\_s3\_bucket\_event\_cache) | S3 Bucket ARN and Name for event cache |
4651
| <a name="output_sns_topic"></a> [sns\_topic](#output\_sns\_topic) | SNS Topic ARN and Name |
4752
<!-- vale on -->
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# CloudWatch alarm that watches how many messages are published to this module's
# SNS topic and compares that volume against an anomaly detection band instead of
# a fixed threshold. Created only when var.enable_event_publishing_anomaly_detection
# is true.
resource "aws_cloudwatch_metric_alarm" "publishing_anomaly" {
2+
count = var.enable_event_publishing_anomaly_detection ? 1 : 0
3+
4+
alarm_name = "${local.csi}-sns-publishing-anomaly"
5+
alarm_description = "RELIABILITY: Anomaly detection alarm for abnormal SNS message publishing patterns. Detects unexpected drops or spikes in event publishing volume that may indicate service degradation or misconfiguration."
6+
# Two-sided comparison: breaches when the metric is below the lower edge or above the upper edge of the band.
comparison_operator = "LessThanLowerOrGreaterThanUpperThreshold"
7+
evaluation_periods = var.event_publishing_anomaly_evaluation_periods # Number of evaluation periods for the publishing anomaly alarm.
8+
# The threshold is the band returned by the "ad1" metric_query below, not a static number.
threshold_metric_id = "ad1"
9+
# Periods with no datapoints are treated as not breaching.
treat_missing_data = "notBreaching"
10+
# NOTE(review): no alarm_actions / ok_actions / insufficient_data_actions are set in this
# resource, so enabling actions triggers nothing from here — confirm state changes are
# consumed elsewhere.
actions_enabled = true
11+
12+
tags = merge(
13+
local.default_tags,
14+
{
15+
AlarmType = "AnomalyDetection"
16+
AlarmPurpose = "EventPublishingAbnormality"
17+
}
18+
)
19+
20+
# m1: the observed metric — Sum of NumberOfMessagesPublished for the module's SNS topic.
metric_query {
21+
id = "m1"
22+
return_data = true
23+
24+
metric {
25+
metric_name = "NumberOfMessagesPublished"
26+
namespace = "AWS/SNS"
27+
period = var.event_publishing_anomaly_period # The period in seconds over which the specified statistic is applied for anomaly detection.
28+
stat = "Sum"
29+
30+
dimensions = {
31+
TopicName = aws_sns_topic.main.name
32+
}
33+
}
34+
}
35+
36+
# ad1: the expected band computed from m1; referenced above through threshold_metric_id.
metric_query {
37+
id = "ad1"
38+
expression = "ANOMALY_DETECTION_BAND(m1, ${var.event_publishing_anomaly_band_width})" # The width of the anomaly detection band. Higher values (e.g. 4-6) reduce sensitivity and noise, lower values (e.g. 2-3) increase sensitivity.
39+
label = "NumberOfMessagesPublished (expected)"
40+
return_data = true
41+
}
42+
}

infrastructure/terraform/modules/eventpub/outputs.tf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,11 @@ output "s3_bucket_event_cache" {
1313
bucket = module.s3bucket_event_cache[0].bucket
1414
} : {}
1515
}
16+
17+
# Exposes the ARN and name of the SNS publishing anomaly alarm.
# Evaluates to null when var.enable_event_publishing_anomaly_detection is false; the
# conditional also keeps the [0] index from being evaluated when the alarm is not created.
output "publishing_anomaly_alarm" {
18+
description = "CloudWatch anomaly detection alarm details for SNS publishing"
19+
value = var.enable_event_publishing_anomaly_detection ? {
20+
arn = aws_cloudwatch_metric_alarm.publishing_anomaly[0].arn
21+
name = aws_cloudwatch_metric_alarm.publishing_anomaly[0].alarm_name
22+
} : null
23+
}

infrastructure/terraform/modules/eventpub/variables.tf

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,27 @@ variable "additional_policies_for_event_cache_bucket" {
129129
description = "A list of JSON policies to use to build the bucket policy"
130130
default = []
131131
}
132+
133+
# Feature flag for the SNS publishing anomaly alarm. Defaults to true, so the alarm is
# created unless a caller explicitly sets this to false.
variable "enable_event_publishing_anomaly_detection" {
134+
type = bool
135+
description = "Enable CloudWatch anomaly detection alarm for SNS message publishing. Detects abnormal drops or spikes in event publishing volume."
136+
default = true
137+
}
138+
139+
# Number of periods the anomaly alarm evaluates; passed to the alarm's evaluation_periods.
# The length of each period comes from var.event_publishing_anomaly_period.
variable "event_publishing_anomaly_evaluation_periods" {
140+
type = number
141+
description = "Number of evaluation periods for the publishing anomaly alarm. Each period is defined by event_publishing_anomaly_period."
142+
default = 3
143+
}
144+
145+
# Aggregation period, in seconds, for the published-messages metric behind the anomaly alarm.
# NOTE(review): the 300-second minimum stated in the description is not enforced here —
# there is no validation block, so smaller values are accepted by Terraform.
variable "event_publishing_anomaly_period" {
146+
type = number
147+
description = "The period in seconds over which the specified statistic is applied for anomaly detection. Minimum 300 seconds (5 minutes). Recommended: 300-600."
148+
default = 300
149+
}
150+
151+
# Width of the anomaly detection band; interpolated as the second argument of the
# ANOMALY_DETECTION_BAND expression on the publishing anomaly alarm.
# The description previously recommended 2-4 while the default is 5; the wording now
# gives a range that includes the default. The default itself is unchanged.
variable "event_publishing_anomaly_band_width" {
152+
type = number
153+
description = "The width of the anomaly detection band. Higher values (e.g. 4-6) reduce sensitivity and noise, lower values (e.g. 2-3) increase sensitivity. Typical range: 2-6."
154+
default = 5
155+
}

0 commit comments

Comments
 (0)