-
Notifications
You must be signed in to change notification settings - Fork 168
/
Copy pathrabbitmq.alerts.yml
66 lines (66 loc) · 3.48 KB
/
rabbitmq.alerts.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# Prometheus alerting rules for RabbitMQ.
#
# This file is an ERB template: each `for:` duration is filled in by a
# p('rabbitmq_alerts....evaluation_time') lookup when the template is rendered.
# It is NOT processed by Helm, so Prometheus templates are written as plain
# {{ $labels.name }} actions without any extra escaping layer.
groups:
  - name: rabbitmq
    rules:
      # Fires when the minimum rabbitmq_up value for an instance is not 1.
      - alert: RabbitMQServerDown
        expr: min(rabbitmq_up) by(instance) != 1
        for: <%= p('rabbitmq_alerts.server_down.evaluation_time') %>
        labels:
          service: rabbitmq
          severity: critical
        annotations:
          summary: "Rabbitmq Server instance `{{$labels.instance}}` is down"
          description: "The RabbitMQ Server instance at `{{$labels.instance}}` has been down the last <%= p('rabbitmq_alerts.server_down.evaluation_time') %>"

      # Per job, fires when the sum of rabbitmq_up is lower than the sum of up.
      # Both sides aggregate `by (job)`, so `job` is the only label left on the
      # result; the annotations therefore reference only $labels.job.
      - alert: RabbitMqClusterNotAllNodesRunning
        expr: (sum(rabbitmq_up{job=~".*rabbitmq.*"}) by (job)) < (sum(up{job=~".*rabbitmq.*"}) by (job))
        for: <%= p('rabbitmq_alerts.rabbitmq_up.evaluation_time') %>
        labels:
          job: rabbitmq
          severity: critical
        annotations:
          description: "Some RabbitMQ Cluster Nodes Are Down in job {{ $labels.job }}"
          summary: "Some RabbitMQ Cluster Nodes Are Down in job {{ $labels.job }}"

      # Fires while rabbitmq_node_disk_free_alarm equals 1.
      - alert: RabbitMqDiskSpaceAlarm
        expr: rabbitmq_node_disk_free_alarm{job=~".*rabbitmq.*"} == 1
        for: <%= p('rabbitmq_alerts.rabbitmq_node_disk_free_alarm.evaluation_time') %>
        labels:
          job: rabbitmq
          severity: critical
        annotations:
          description: "RabbitMQ {{ $labels.instance }} Disk Space Alarm is going off. Which means the node hit highwater mark and has cut off network connectivity, see RabbitMQ WebUI"
          summary: RabbitMQ is Out of Disk Space

      # Fires while rabbitmq_node_mem_alarm equals 1.
      - alert: RabbitMqMemoryAlarm
        expr: rabbitmq_node_mem_alarm{job=~".*rabbitmq.*"} == 1
        for: <%= p('rabbitmq_alerts.rabbitmq_node_mem_alarm.evaluation_time') %>
        labels:
          job: rabbitmq
          severity: critical
        annotations:
          description: "RabbitMQ {{ $labels.instance }} High Memory Alarm is going off. Which means the node hit highwater mark and has cut off network connectivity, see RabbitMQ WebUI"
          summary: RabbitMQ is Out of Memory

      # Fires when used memory divided by the memory limit exceeds 0.9.
      - alert: RabbitMqMemoryUsageHigh
        expr: (rabbitmq_node_mem_used{job=~".*rabbitmq.*"} / rabbitmq_node_mem_limit{job=~".*rabbitmq.*"}) > .9
        for: <%= p('rabbitmq_alerts.rabbitmq_node_mem_used.evaluation_time') %>
        labels:
          job: rabbitmq
          severity: critical
        annotations:
          description: "RabbitMQ {{ $labels.instance }} Memory Usage > 90%"
          summary: "RabbitMQ Node > 90% Memory Usage"

      # Fires when used file descriptors divided by the total exceeds 0.9.
      - alert: RabbitMqFileDescriptorsLow
        expr: (rabbitmq_fd_used{job=~".*rabbitmq.*"} / rabbitmq_fd_total{job=~".*rabbitmq.*"}) > .9
        for: <%= p('rabbitmq_alerts.rabbitmq_fd_used.evaluation_time') %>
        labels:
          job: rabbitmq
          severity: critical
        annotations:
          description: "RabbitMQ {{ $labels.instance }} File Descriptor Usage > 90%"
          summary: RabbitMQ Low File Descriptor Available

      # Extrapolates free disk space one hour ahead (1 * 60 * 60 seconds) from
      # the last 15 minutes of samples and fires when the predicted value is
      # below rabbitmq_node_disk_free_limit.
      - alert: RabbitMqDiskSpaceLow
        expr: predict_linear(rabbitmq_node_disk_free{job=~".*rabbitmq.*"}[15m], 1 * 60 * 60) < rabbitmq_node_disk_free_limit{job=~".*rabbitmq.*"}
        for: <%= p('rabbitmq_alerts.rabbitmq_node_disk_free.evaluation_time') %>
        labels:
          job: rabbitmq
          severity: critical
        annotations:
          description: "RabbitMQ {{ $labels.instance }} will hit disk limit in the next hr based on last 15 mins trend."
          summary: RabbitMQ is Low on Disk Space and will Run Out in the next hour