fix(prometheus_rule): use Helm release name instead of hardcoded value

!8
2024-02-04 20:05:28 +09:30
parent 73f25cfaa2
commit 39af78c6ea
2 changed files with 45 additions and 45 deletions
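
The change swaps the hardcoded "alertmanager-main" job selector for the Helm release name, so the alert rules match whichever name the chart is installed under. A minimal sketch of how one selector renders, assuming a hypothetical release named monitoring-stack:

    # Template source (as in this commit):
    max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) == 0

    # Rendered for a release named monitoring-stack:
    max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-monitoring-stack",namespace="monitoring"}[5m]) == 0

The "$." prefix resolves from the root template context, so the selector renders the same even when the rule is emitted inside a range or with block.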


@@ -24,7 +24,7 @@ spec:
       expr: |
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
+        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) == 0
       for: 10m
       labels:
         severity: critical
@@ -36,9 +36,9 @@ spec:
       expr: |
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])
+        max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
           < on (namespace,service) group_left
-        count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]))
+        count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]))
       for: 15m
       labels:
         severity: critical
@@ -49,9 +49,9 @@ spec:
         summary: An Alertmanager instance failed to send notifications.
       expr: |
         (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
+          rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
         /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
+          rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
         )
         > 0.01
       for: 5m
@@ -64,9 +64,9 @@ spec:
         summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
       expr: |
         min by (namespace,service, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
+          rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m])
         /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
+          rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m])
         )
         > 0.01
       for: 5m
@@ -79,9 +79,9 @@ spec:
         summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
       expr: |
         min by (namespace,service, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
+          rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m])
         /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
+          rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m])
         )
         > 0.01
       for: 5m
@@ -94,7 +94,7 @@ spec:
         summary: Alertmanager instances within the same cluster have different configurations.
       expr: |
         count by (namespace,service) (
-          count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})
+          count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"})
         )
         != 1
       for: 20m
@@ -108,11 +108,11 @@ spec:
       expr: |
         (
           count by (namespace,service) (
-            avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5
+            avg_over_time(up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) < 0.5
           )
         /
           count by (namespace,service) (
-            up{job="alertmanager-main",namespace="monitoring"}
+            up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}
           )
         )
         >= 0.5
@@ -127,11 +127,11 @@ spec:
       expr: |
         (
           count by (namespace,service) (
-            changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4
+            changes(process_start_time_seconds{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[10m]) > 4
           )
         /
           count by (namespace,service) (
-            up{job="alertmanager-main",namespace="monitoring"}
+            up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}
          )
         )
         >= 0.5
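
To verify the rendered selectors before deploying, the chart can be templated locally; the command below is a sketch assuming the chart sits in the current directory and uses the hypothetical release name monitoring-stack:

    helm template monitoring-stack . | grep 'job="alertmanager-'

Every match should read job="alertmanager-monitoring-stack"; any leftover job="alertmanager-main" would point to a selector this commit missed.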