-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlogservice.alerts.yml
118 lines (107 loc) · 5.62 KB
/
logservice.alerts.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
groups:
- name: logservice
rules:
# Growth of the logs_sent_duration `_sum` and `_count` series over the last
# 10 minutes. They are recorded separately so that the latency alert can
# divide one by the other.
- record: job:logs_sent_duration_sum:increase10m
  expr: increase(logs_sent_duration_sum[10m])
- record: job:logs_sent_duration_count:increase10m
  expr: increase(logs_sent_duration_count[10m])
# --- Capacity model -------------------------------------------------------
# Send rate of each instance, averaged over a 5 minute window.
- record: logs_capacity:rate_instance
  expr: sum(rate(logs_sent_total[5m])) by (instance)
# Send rate summed over every instance.
- record: logs_capacity:rate
  expr: sum(logs_capacity:rate_instance)
# Highest total send rate observed during the last 24 hours.
- record: logs_capacity:rate_max24h
  expr: max_over_time(logs_capacity:rate[24h])
# Number of instances currently reporting a send rate.
- record: logs_capacity:instance_count
  expr: count(logs_capacity:rate_instance)
# Rate that can be sustained once the configured number of rolling instances
# is subtracted from the instance count.
- record: logs_capacity:rate_max
  expr: |
    (logs_capacity:instance_count - <%= p('logservice_alerts.capacity.rolling_instance') %>)
    *
    <%= p('logservice_alerts.capacity.instance_max_rate') %>
# The configured redundancy factor is divided by 100, i.e. the property is
# read as a percentage and recorded as a fraction.
- record: logs_capacity:redundancy_factor
  expr: <%= p('logservice_alerts.capacity.redundancy_factor') %> / 100
# Configured alerting threshold, exposed as a series so that the capacity
# alert can compare against it.
- record: logs_capacity:threshold
  expr: <%= p('logservice_alerts.capacity.threshold') %>
# Headroom: sustainable rate minus the 24h peak inflated by the redundancy
# fraction.
- record: logs_capacity:rate_remaining
  expr: |
    logs_capacity:rate_max
    -
    (logs_capacity:rate_max24h * (1 + logs_capacity:redundancy_factor))
# Fires when the recorded capacity headroom (logs_capacity:rate_remaining)
# has stayed below the recorded threshold for 5 minutes.
- alert: LogserviceCapacityTooLow
  expr: logs_capacity:rate_remaining < logs_capacity:threshold
  for: 5m
  labels:
    service: logservice
    severity: warning
  annotations:
    summary: "Logservice doesn't have enough instances to process logs safely"
    description: |
      Logservice needs scale-up. It has not enough instances to process all logs while
      undergoing simultaneously a cluster failure and a rolling update.
      Impact:
      - chances of losing application logs in case of cluster failure or maintenance
      Resolution:
      - contact administrator to scale up the component
# Per-instance latency alert. The inner ratio is the mean send duration over
# the last 10 minutes (recorded sum / recorded count); `> 0` keeps only series
# with a positive ratio. The median of those per instance is multiplied by
# 1000 (presumably seconds to milliseconds - confirm the metric's unit) and
# compared with the configured threshold.
- alert: LogserviceDurationTooHigh
  expr: |
    quantile(0.5,
    (job:logs_sent_duration_sum:increase10m / job:logs_sent_duration_count:increase10m) > 0
    ) by (instance) * 1000 > <%= p('logservice_alerts.duration.threshold_ms') %>
  for: <%= p('logservice_alerts.duration.evaluation_time') %>
  labels:
    service: logservice
    severity: warning
  annotations:
    summary: "Logservice has high latency at instance `{{$labels.instance}}`"
    description: "Average process time of logs is greater than `<%= p('logservice_alerts.duration.threshold_ms') %>ms` at instance `{{$labels.instance}}` during the last <%= p('logservice_alerts.duration.evaluation_time') %>"
# Fires for each (instance, org, space, app) combination whose 10 minute error
# rate exceeds the configured threshold. The label matcher inside the braces
# is injected verbatim from the errors.filter property.
- alert: LogserviceDropRateTooHigh
  expr: sum(rate(logs_sent_errors_total{<%= p('logservice_alerts.errors.filter') %>}[10m])) by (instance, org, space, app) > <%= p('logservice_alerts.errors.threshold') %>
  for: <%= p('logservice_alerts.errors.evaluation_time') %>
  labels:
    service: logservice
    severity: critical
  annotations:
    summary: "Logservice is dropping some logs at instance `{{$labels.instance}}` for app at `{{$labels.org}}/{{$labels.space}}/{{$labels.app}}`"
    description: "Logservice is dropping more than `<%= p('logservice_alerts.errors.threshold') %>` logs/s at instance `{{$labels.instance}}` for app at `{{$labels.org}}/{{$labels.space}}/{{$labels.app}}` during the last <%= p('logservice_alerts.errors.evaluation_time') %>"
# Fires when the total number of forwarded logs has not increased at all over
# the configured span. `or vector(0)` supplies a zero sample when the metric
# has no series, so a completely missing metric also triggers the alert.
#
# nologs.evaluation_time is rendered as the `for:` value, so it is already a
# duration with its unit (for example 5m). The annotations therefore print it
# as-is instead of appending a second unit.
- alert: LogserviceNoLogs
  expr: (sum(increase(logs_sent_total[<%= p('logservice_alerts.nologs.span_time') %>])) or vector(0)) == 0
  for: <%= p('logservice_alerts.nologs.evaluation_time') %>
  labels:
    service: logservice
    severity: critical
  annotations:
    summary: "Logservice reports no forwarded logs for the past <%= p('logservice_alerts.nologs.evaluation_time') %>"
    description: |
      No logs were forwarded by logservice in the last <%= p('logservice_alerts.nologs.evaluation_time') %>
      This should not impact customers because applications are not affected but
      it's still a serious failure since produced logs are definitely lost.
      Please contact Cloud Foundry admin team
# Fires when logs_db_status has reported any value other than 1 for 5 minutes.
# NOTE(review): the comparison matches nothing when the metric has no series,
# so a missing metric does not trigger this alert - confirm that is intended.
- alert: LogserviceDBFailure
  expr: logs_db_status != 1
  for: 5m
  labels:
    service: logservice
    severity: critical
  annotations:
    summary: "Logservice reports communication failure with database"
    description: |
      Database could not be reached for more than 5m
      This is a serious failure since produced logs are definitely lost.
      Please contact Cloud Foundry admin team
<% if p('logservice_alerts.cache.enable') %>
# Only rendered when the cache.enable property is truthy.
# Fires for each (org, space, app) whose count of logs sent without cache grew
# by more than the configured threshold over 5 minutes.
- alert: LogserviceSendingLogsWithoutUsingCache
  expr: (sum(increase(logs_sent_without_cache_total[5m])) by (org, space, app) or vector(0)) > <%= p('logservice_alerts.cache.threshold') %>
  for: <%= p('logservice_alerts.cache.evaluation_time') %>
  labels:
    service: logservice
    severity: critical
  annotations:
    summary: "Logservice reports that app `{{$labels.org}}/{{$labels.space}}/{{$labels.app}}` is sending logs without using cache during the last <%= p('logservice_alerts.cache.evaluation_time') %>"
    description: |
      App `{{$labels.org}}/{{$labels.space}}/{{$labels.app}}` is sending too much logs without using cache.
      This must be an issue with logservice.
      Please contact Cloud Foundry admin team
<% end %>
# Local Variables:
# ispell-local-dictionary: "american"
# End: