fix(prometheus_rule): use Helm release name instead of hardcoded value

!8
2024-02-04 20:05:28 +09:30
parent 73f25cfaa2
commit 39af78c6ea
2 changed files with 45 additions and 45 deletions
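
The change swaps the hardcoded "alertmanager-main" job selector for the Helm release name, so the alert rules match whichever name the chart is installed under. A minimal sketch of how one selector renders, assuming a hypothetical release named monitoring-stack:

    # Template source (as in this commit):
    max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) == 0

    # Rendered for a release named monitoring-stack:
    max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-monitoring-stack",namespace="monitoring"}[5m]) == 0

The "$." prefix resolves from the root template context, so the selector renders the same even when the rule is emitted inside a range or with block.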


@@ -24,7 +24,7 @@ spec:
       expr: |
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
+        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) == 0
       for: 10m
       labels:
         severity: critical
@@ -36,9 +36,9 @@ spec:
       expr: |
         # Without max_over_time, failed scrapes could create false negatives, see
         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
-        max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])
+        max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
           < on (namespace,service) group_left
-        count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]))
+        count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]))
       for: 15m
       labels:
         severity: critical
@@ -49,9 +49,9 @@ spec:
         summary: An Alertmanager instance failed to send notifications.
       expr: |
         (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
+          rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
         /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
+          rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
         )
         > 0.01
       for: 5m
@@ -64,9 +64,9 @@ spec:
         summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
       expr: |
         min by (namespace,service, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
+          rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m])
         /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
+          rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m])
         )
         > 0.01
       for: 5m
@@ -79,9 +79,9 @@ spec:
         summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
       expr: |
         min by (namespace,service, integration) (
-          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
+          rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m])
         /
-          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
+          rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m])
         )
         > 0.01
       for: 5m
@@ -94,7 +94,7 @@ spec:
         summary: Alertmanager instances within the same cluster have different configurations.
       expr: |
         count by (namespace,service) (
-          count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})
+          count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"})
         )
         != 1
       for: 20m
@@ -108,11 +108,11 @@ spec:
       expr: |
         (
           count by (namespace,service) (
-            avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5
+            avg_over_time(up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) < 0.5
           )
         /
           count by (namespace,service) (
-            up{job="alertmanager-main",namespace="monitoring"}
+            up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}
           )
         )
         >= 0.5
@@ -127,11 +127,11 @@ spec:
       expr: |
         (
           count by (namespace,service) (
-            changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4
+            changes(process_start_time_seconds{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[10m]) > 4
           )
         /
           count by (namespace,service) (
-            up{job="alertmanager-main",namespace="monitoring"}
+            up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}
          )
         )
         >= 0.5
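
To verify the rendered selectors before deploying, the chart can be templated locally; the command below is a sketch assuming the chart sits in the current directory and uses the hypothetical release name monitoring-stack:

    helm template monitoring-stack . | grep 'job="alertmanager-'

Every match should read job="alertmanager-monitoring-stack"; any leftover job="alertmanager-main" would point to a selector this commit missed.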