Skip to content

Commit b023ab0

Browse files
Add container net alert (#688)
* remove dock and add prometheus
* remove stage hosts
* fix pd key location
* update alert params
1 parent 0a720a8 commit b023ab0

13 files changed

Lines changed: 19 additions & 164 deletions

File tree

ansible/group_vars/all.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ github_protocol: https
55

66
# ops
77
ops_slack_channel_url: https://hooks.slack.com/services/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
8-
pager_duty_key: testkey
98
environment_root: "../environments/{{ env }}/"
109
opts_root: "{{ environment_root }}/k8/{{ name }}"
1110
secrets_root: "{{ environment_root }}/secrets/"
@@ -41,8 +40,6 @@ dockerfile: basic_node/Dockerfile
4140
docker_network: 172.17.0.0/16
4241
base_dockerfile: node_base
4342

44-
docker_config: runnable
45-
4643
# slack rooms to send notifications
4744
slack_token: T029DEC10/B1RSX8LNS/qLLSYEEqkGddohOdE44eDf3j
4845
slack_channels: [ '#ops' ]

ansible/group_vars/alpha-dock-init.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@ name: dock-init
22
app_name: "{{ name }}"
33
app_repo: git@github.com:CodeNow/{{ name }}.git
44

5-
# for docker role
6-
docker_config: dock
7-
85
# consul values
96
consul_values:
107
- key: "{{ name }}/version"

ansible/group_vars/alpha-prometheus-alerts.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name: prometheus-alerts
22

33
container_image: prom/alertmanager
4-
container_tag: v0.5.0
4+
container_tag: v0.6.2
55
hosted_ports: ["{{ prometheus_alert_port }}"]
66

77
memory_hard_limit: 10G

ansible/roles/datadog/tasks/main.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@
4949

5050
- name: install network checks for dock services
5151
become: true
52-
when: docker_config == "docks"
5352
template:
5453
src=tcp_check.yaml.j2
5554
dest=/etc/dd-agent/conf.d/tcp_check.yaml

ansible/roles/docker/tasks/main.yml

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,6 @@
1515
owner=root
1616
group=root
1717

18-
- name: create core file dir
19-
become: true
20-
when: docker_config == "runnable" and core_file_dir != "/var/log"
21-
tags: coreDump
22-
file:
23-
path="{{ core_file_dir }}"
24-
state=directory
25-
owner=root
26-
group=root
27-
mode=0755
28-
29-
- name: change core dump path
30-
become: true
31-
when: docker_config == "runnable"
32-
tags: coreDump
33-
sysctl:
34-
name=kernel.core_pattern
35-
reload=true
36-
state=present
37-
value="{{ core_file_dir }}/core.%h.%e.%t"
38-
3918
- name: install aufs with linux-image-extra-{{ ansible_kernel }}
4019
become: true
4120
tags: aufs
@@ -70,7 +49,7 @@
7049
- name: copy docker config file
7150
become: true
7251
template:
73-
src={{ docker_config }}
52+
src=dock
7453
dest=/etc/default/docker
7554
register: copied_config
7655
tags: genDockerConfig

ansible/roles/prometheus-alerts/templates/prometheus-alerts.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,11 @@ data:
1111
1212
slack_api_url: "{{ ops_slack_channel_url }}"
1313
14-
pagerduty_url: "https://events.pagerduty.com/generic/2010-04-15/create_event.json"
15-
1614
# The root node of the routing tree.
1715
route:
1816
# A default receiver
1917
receiver: slack
2018
21-
continue: true
22-
2319
routes:
2420
- match_re:
2521
reportTo: .*pagerduty.*

ansible/roles/prometheus/files/alerts.conf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,15 @@ ALERT DockHighLoad
8888
summary = "({{ $labels.env }}) Dock is experiencing high load host={{ $labels.hostIp }} labels={{ $labels }}",
8989
description = "ssh {{ $labels.hostIp }} into dock make sure it is responsive, if it is not, unhealthy. `docks unhealthy -e delta {{ $labels.hostIp }}`"
9090
}
91+
92+
ALERT ContainerUsingTooMuchNetwork
93+
IF container_network_transmit_bytes_total{interface="eth0",container_label_type="user-container"} / 1000000 > 200
94+
FOR 30m
95+
LABELS {
96+
reportTo = "pagerduty"
97+
}
98+
ANNOTATIONS {
99+
summary = "({{ $labels.env }}) container is using to much network id={{ $labels.id }} org={{ $labels.githubOrgId }} org name: {{ $labels.container_label_ownerUsername }} ",
100+
description = "ssh {{ $labels.hostIp }} and docker rm and docker kill offending container {{ $labels.id }}"
101+
}
102+

ansible/roles/prometheus/templates/prometheus.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ data:
5656
- source_labels: [__meta_ec2_tag_env]
5757
target_label: env
5858
59-
# pulls server list from ec2 and drops all servers that are not production gamma or a dock
59+
# pulls server list from ec2 and drops all servers that are not in env or a dock
6060
- job_name: server_info
6161
# keys to access this region and port of prom
6262
ec2_sd_configs:

ansible/stage-hosts/docks.js

Lines changed: 0 additions & 75 deletions
This file was deleted.

ansible/stage-hosts/hosts

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)