chore: migrate chart to public repo #8
39
templates/AlertManager-k8s.yaml
Normal file
39
templates/AlertManager-k8s.yaml
Normal file
@ -0,0 +1,39 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: Alertmanager
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
name: main
|
||||
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||
spec:
|
||||
image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}"
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
podMetadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
replicas: 3
|
||||
resources:
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 100Mi
|
||||
requests:
|
||||
cpu: 4m
|
||||
memory: 100Mi
|
||||
securityContext:
|
||||
fsGroup: 2000
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
serviceAccountName: alertmanager-main
|
||||
version: 0.25.0
|
21
templates/PodDisruptionBudget-alertmanager.yaml
Normal file
21
templates/PodDisruptionBudget-alertmanager.yaml
Normal file
@ -0,0 +1,21 @@
|
||||
---
|
||||
apiVersion: policy/v1
|
||||
kind: PodDisruptionBudget
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
name: alertmanager-main
|
||||
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||
spec:
|
||||
maxUnavailable: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
141
templates/PrometheusRule-alertmanager.yaml
Normal file
141
templates/PrometheusRule-alertmanager.yaml
Normal file
@ -0,0 +1,141 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
prometheus: k8s
|
||||
role: alert-rules
|
||||
name: alertmanager-main-rules
|
||||
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||
spec:
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
rules:
|
||||
- alert: AlertmanagerFailedReload
|
||||
annotations:
|
||||
description: Configuration has failed to load for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
|
||||
summary: Reloading an Alertmanager configuration has failed.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AlertmanagerMembersInconsistent
|
||||
annotations:
|
||||
description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} has only found {{ `{{` }} $value }} members of the {{ `{{` }}$labels.job}} cluster.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
|
||||
summary: A member of an Alertmanager cluster has not found all other cluster members.
|
||||
expr: |
|
||||
# Without max_over_time, failed scrapes could create false negatives, see
|
||||
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||
max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])
|
||||
< on (namespace,service) group_left
|
||||
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]))
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AlertmanagerFailedToSendAlerts
|
||||
annotations:
|
||||
description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} failed to send {{ `{{` }} $value | humanizePercentage }} of notifications to {{ `{{` }} $labels.integration }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
|
||||
summary: An Alertmanager instance failed to send notifications.
|
||||
expr: |
|
||||
(
|
||||
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
|
||||
expr: |
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||
annotations:
|
||||
description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
|
||||
expr: |
|
||||
min by (namespace,service, integration) (
|
||||
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
annotations:
|
||||
description: Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have different configurations.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
|
||||
summary: Alertmanager instances within the same cluster have different configurations.
|
||||
expr: |
|
||||
count by (namespace,service) (
|
||||
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})
|
||||
)
|
||||
!= 1
|
||||
for: 20m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterDown
|
||||
annotations:
|
||||
description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have been up for less than half of the last 5m.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are down.
|
||||
expr: |
|
||||
(
|
||||
count by (namespace,service) (
|
||||
avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5
|
||||
)
|
||||
/
|
||||
count by (namespace,service) (
|
||||
up{job="alertmanager-main",namespace="monitoring"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: AlertmanagerClusterCrashlooping
|
||||
annotations:
|
||||
description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have restarted at least 5 times in the last 10m.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
|
||||
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
|
||||
expr: |
|
||||
(
|
||||
count by (namespace,service) (
|
||||
changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4
|
||||
)
|
||||
/
|
||||
count by (namespace,service) (
|
||||
up{job="alertmanager-main",namespace="monitoring"}
|
||||
)
|
||||
)
|
||||
>= 0.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
61
templates/Secret-alertmanager.yaml
Normal file
61
templates/Secret-alertmanager.yaml
Normal file
@ -0,0 +1,61 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
name: alertmanager-main
|
||||
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||
stringData:
|
||||
alertmanager.yaml: |-
|
||||
"global":
|
||||
"resolve_timeout": "5m"
|
||||
"inhibit_rules":
|
||||
- "equal":
|
||||
- "namespace"
|
||||
- "alertname"
|
||||
"source_matchers":
|
||||
- "severity = critical"
|
||||
"target_matchers":
|
||||
- "severity =~ warning|info"
|
||||
- "equal":
|
||||
- "namespace"
|
||||
- "alertname"
|
||||
"source_matchers":
|
||||
- "severity = warning"
|
||||
"target_matchers":
|
||||
- "severity = info"
|
||||
- "equal":
|
||||
- "namespace"
|
||||
"source_matchers":
|
||||
- "alertname = InfoInhibitor"
|
||||
"target_matchers":
|
||||
- "severity = info"
|
||||
"receivers":
|
||||
- "name": "Default"
|
||||
- "name": "Watchdog"
|
||||
- "name": "Critical"
|
||||
- "name": "null"
|
||||
"route":
|
||||
"group_by":
|
||||
- "namespace"
|
||||
"group_interval": "5m"
|
||||
"group_wait": "30s"
|
||||
"receiver": "Default"
|
||||
"repeat_interval": "12h"
|
||||
"routes":
|
||||
- "matchers":
|
||||
- "alertname = Watchdog"
|
||||
"receiver": "Watchdog"
|
||||
- "matchers":
|
||||
- "alertname = InfoInhibitor"
|
||||
"receiver": "null"
|
||||
- "matchers":
|
||||
- "severity = critical"
|
||||
"receiver": "Critical"
|
||||
type: Opaque
|
27
templates/Service-alertmanager.yaml
Normal file
27
templates/Service-alertmanager.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
name: alertmanager-main
|
||||
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||
spec:
|
||||
ports:
|
||||
- name: web
|
||||
port: 9093
|
||||
targetPort: web
|
||||
- name: reloader-web
|
||||
port: 8080
|
||||
targetPort: reloader-web
|
||||
selector:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
sessionAffinity: ClientIP
|
14
templates/ServiceAccount-alertmanager.yaml
Normal file
14
templates/ServiceAccount-alertmanager.yaml
Normal file
@ -0,0 +1,14 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
automountServiceAccountToken: false
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
name: alertmanager-main
|
||||
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
29
templates/serviceMonitor-alertmanager.yaml
Normal file
29
templates/serviceMonitor-alertmanager.yaml
Normal file
@ -0,0 +1,29 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
name: alertmanager
|
||||
namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
|
||||
|
||||
spec:
|
||||
endpoints:
|
||||
- interval: 30s
|
||||
port: web
|
||||
- interval: 30s
|
||||
port: reloader-web
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: alert-router
|
||||
app.kubernetes.io/instance: main
|
||||
app.kubernetes.io/name: alertmanager
|
||||
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
Reference in New Issue
Block a user