| @ -24,7 +24,7 @@ spec: | ||||
|       expr: | | ||||
|         # Without max_over_time, failed scrapes could create false negatives, see | ||||
|         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | ||||
|         max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 | ||||
|         max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) == 0 | ||||
|       for: 10m | ||||
|       labels: | ||||
|         severity: critical | ||||
| @ -36,9 +36,9 @@ spec: | ||||
|       expr: | | ||||
|         # Without max_over_time, failed scrapes could create false negatives, see | ||||
|         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | ||||
|           max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) | ||||
|           max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) | ||||
|         < on (namespace,service) group_left | ||||
|           count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) | ||||
|           count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])) | ||||
|       for: 15m | ||||
|       labels: | ||||
|         severity: critical | ||||
| @ -49,9 +49,9 @@ spec: | ||||
|         summary: An Alertmanager instance failed to send notifications. | ||||
|       expr: | | ||||
|         ( | ||||
|           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) | ||||
|           rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) | ||||
|         / | ||||
|           rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) | ||||
|           rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) | ||||
|         ) | ||||
|         > 0.01 | ||||
|       for: 5m | ||||
| @ -64,9 +64,9 @@ spec: | ||||
|         summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. | ||||
|       expr: | | ||||
|         min by (namespace,service, integration) ( | ||||
|           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) | ||||
|           rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m]) | ||||
|         / | ||||
|           rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) | ||||
|           rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m]) | ||||
|         ) | ||||
|         > 0.01 | ||||
|       for: 5m | ||||
| @ -79,9 +79,9 @@ spec: | ||||
|         summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. | ||||
|       expr: | | ||||
|         min by (namespace,service, integration) ( | ||||
|           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) | ||||
|           rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m]) | ||||
|         / | ||||
|           rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) | ||||
|           rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m]) | ||||
|         ) | ||||
|         > 0.01 | ||||
|       for: 5m | ||||
| @ -94,7 +94,7 @@ spec: | ||||
|         summary: Alertmanager instances within the same cluster have different configurations. | ||||
|       expr: | | ||||
|         count by (namespace,service) ( | ||||
|           count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) | ||||
|           count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}) | ||||
|         ) | ||||
|         != 1 | ||||
|       for: 20m | ||||
| @ -108,11 +108,11 @@ spec: | ||||
|       expr: | | ||||
|         ( | ||||
|           count by (namespace,service) ( | ||||
|             avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 | ||||
|             avg_over_time(up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) < 0.5 | ||||
|           ) | ||||
|         / | ||||
|           count by (namespace,service) ( | ||||
|             up{job="alertmanager-main",namespace="monitoring"} | ||||
|             up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"} | ||||
|           ) | ||||
|         ) | ||||
|         >= 0.5 | ||||
| @ -127,11 +127,11 @@ spec: | ||||
|       expr: | | ||||
|         ( | ||||
|           count by (namespace,service) ( | ||||
|             changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 | ||||
|             changes(process_start_time_seconds{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[10m]) > 4 | ||||
|           ) | ||||
|         / | ||||
|           count by (namespace,service) ( | ||||
|             up{job="alertmanager-main",namespace="monitoring"} | ||||
|             up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"} | ||||
|           ) | ||||
|         ) | ||||
|         >= 0.5 | ||||
|  | ||||
		Reference in New Issue
	
	Block a user