From 534377edbdafab9c3939288330fc3e845445c898 Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 19 Sep 2023 15:14:29 +0930
Subject: [PATCH] feat: alert manager deployment !1

---
 templates/AlertManager-k8s.yaml            |  39 +++++
 .../PodDisruptionBudget-alertmanager.yaml  |  21 +++
 templates/PrometheusRule-alertmanager.yaml | 141 ++++++++++++++++++
 templates/Secret-alertmanager.yaml         |  61 ++++++++
 templates/Service-alertmanager.yaml        |  27 ++++
 templates/ServiceAccount-alertmanager.yaml |  14 ++
 templates/serviceMonitor-alertmanager.yaml |  29 ++++
 7 files changed, 332 insertions(+)
 create mode 100644 templates/AlertManager-k8s.yaml
 create mode 100644 templates/PodDisruptionBudget-alertmanager.yaml
 create mode 100644 templates/PrometheusRule-alertmanager.yaml
 create mode 100644 templates/Secret-alertmanager.yaml
 create mode 100644 templates/Service-alertmanager.yaml
 create mode 100644 templates/ServiceAccount-alertmanager.yaml
 create mode 100644 templates/serviceMonitor-alertmanager.yaml

diff --git a/templates/AlertManager-k8s.yaml b/templates/AlertManager-k8s.yaml
new file mode 100644
index 0000000..1c2f654
--- /dev/null
+++ b/templates/AlertManager-k8s.yaml
@@ -0,0 +1,39 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: Alertmanager
+metadata:
+  labels:
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+  name: main
+  namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
+spec:
+  image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}"
+  nodeSelector:
+    kubernetes.io/os: linux
+  podMetadata:
+    labels:
+      app.kubernetes.io/component: alert-router
+      app.kubernetes.io/instance: main
+      app.kubernetes.io/name: alertmanager
+      app.kubernetes.io/part-of: {{ $.Chart.Name }}
+      app.kubernetes.io/managed-by: {{ $.Release.Service }}
+      app.kubernetes.io/version: {{ $.Chart.Version }}
+  replicas: 3
+  resources:
+    limits:
+      cpu: 100m
+      memory: 100Mi
+    requests:
+      cpu: 4m
+      memory: 100Mi
+  securityContext:
+    fsGroup: 2000
+    runAsNonRoot: true
+    runAsUser: 1000
+  serviceAccountName: alertmanager-main
+  version: 0.25.0
diff --git a/templates/PodDisruptionBudget-alertmanager.yaml b/templates/PodDisruptionBudget-alertmanager.yaml
new file mode 100644
index 0000000..43ed6fe
--- /dev/null
+++ b/templates/PodDisruptionBudget-alertmanager.yaml
@@ -0,0 +1,21 @@
+---
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  labels:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+  name: alertmanager-main
+  namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
+spec:
+  maxUnavailable: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: alert-router
+      app.kubernetes.io/instance: main
+      app.kubernetes.io/name: alertmanager
+      app.kubernetes.io/part-of: {{ $.Chart.Name }}
diff --git a/templates/PrometheusRule-alertmanager.yaml b/templates/PrometheusRule-alertmanager.yaml
new file mode 100644
index 0000000..988dad9
--- /dev/null
+++ b/templates/PrometheusRule-alertmanager.yaml
@@ -0,0 +1,141 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+    prometheus: k8s
+    role: alert-rules
+  name: alertmanager-main-rules
+  namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
+spec:
+  groups:
+  - name: alertmanager.rules
+    rules:
+    - alert: AlertmanagerFailedReload
+      annotations:
+        description: Configuration has failed to load for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}}.
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
+        summary: Reloading an Alertmanager configuration has failed.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
+      for: 10m
+      labels:
+        severity: critical
+    - alert: AlertmanagerMembersInconsistent
+      annotations:
+        description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} has only found {{ `{{` }} $value }} members of the {{ `{{` }}$labels.job}} cluster.
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
+        summary: A member of an Alertmanager cluster has not found all other cluster members.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])
+          < on (namespace,service) group_left
+        count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]))
+      for: 15m
+      labels:
+        severity: critical
+    - alert: AlertmanagerFailedToSendAlerts
+      annotations:
+        description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} failed to send {{ `{{` }} $value | humanizePercentage }} of notifications to {{ `{{` }} $labels.integration }}.
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
+        summary: An Alertmanager instance failed to send notifications.
+      expr: |
+        (
+          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
+        /
+          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
+        )
+        > 0.01
+      for: 5m
+      labels:
+        severity: warning
+    - alert: AlertmanagerClusterFailedToSendAlerts
+      annotations:
+        description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
+        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
+      expr: |
+        min by (namespace,service, integration) (
+          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
+        /
+          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
+        )
+        > 0.01
+      for: 5m
+      labels:
+        severity: critical
+    - alert: AlertmanagerClusterFailedToSendAlerts
+      annotations:
+        description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
+        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
+      expr: |
+        min by (namespace,service, integration) (
+          rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
+        /
+          rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
+        )
+        > 0.01
+      for: 5m
+      labels:
+        severity: warning
+    - alert: AlertmanagerConfigInconsistent
+      annotations:
+        description: Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have different configurations.
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
+        summary: Alertmanager instances within the same cluster have different configurations.
+      expr: |
+        count by (namespace,service) (
+          count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})
+        )
+        != 1
+      for: 20m
+      labels:
+        severity: critical
+    - alert: AlertmanagerClusterDown
+      annotations:
+        description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have been up for less than half of the last 5m.'
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
+        summary: Half or more of the Alertmanager instances within the same cluster are down.
+      expr: |
+        (
+          count by (namespace,service) (
+            avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5
+          )
+        /
+          count by (namespace,service) (
+            up{job="alertmanager-main",namespace="monitoring"}
+          )
+        )
+        >= 0.5
+      for: 5m
+      labels:
+        severity: critical
+    - alert: AlertmanagerClusterCrashlooping
+      annotations:
+        description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have restarted at least 5 times in the last 10m.'
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
+        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
+      expr: |
+        (
+          count by (namespace,service) (
+            changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4
+          )
+        /
+          count by (namespace,service) (
+            up{job="alertmanager-main",namespace="monitoring"}
+          )
+        )
+        >= 0.5
+      for: 5m
+      labels:
+        severity: critical
diff --git a/templates/Secret-alertmanager.yaml b/templates/Secret-alertmanager.yaml
new file mode 100644
index 0000000..4e710be
--- /dev/null
+++ b/templates/Secret-alertmanager.yaml
@@ -0,0 +1,61 @@
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  labels:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+  name: alertmanager-main
+  namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
+stringData:
+  alertmanager.yaml: |-
+    "global":
+      "resolve_timeout": "5m"
+    "inhibit_rules":
+    - "equal":
+      - "namespace"
+      - "alertname"
+      "source_matchers":
+      - "severity = critical"
+      "target_matchers":
+      - "severity =~ warning|info"
+    - "equal":
+      - "namespace"
+      - "alertname"
+      "source_matchers":
+      - "severity = warning"
+      "target_matchers":
+      - "severity = info"
+    - "equal":
+      - "namespace"
+      "source_matchers":
+      - "alertname = InfoInhibitor"
+      "target_matchers":
+      - "severity = info"
+    "receivers":
+    - "name": "Default"
+    - "name": "Watchdog"
+    - "name": "Critical"
+    - "name": "null"
+    "route":
+      "group_by":
+      - "namespace"
+      "group_interval": "5m"
+      "group_wait": "30s"
+      "receiver": "Default"
+      "repeat_interval": "12h"
+      "routes":
+      - "matchers":
+        - "alertname = Watchdog"
+        "receiver": "Watchdog"
+      - "matchers":
+        - "alertname = InfoInhibitor"
+        "receiver": "null"
+      - "matchers":
+        - "severity = critical"
+        "receiver": "Critical"
+type: Opaque
diff --git a/templates/Service-alertmanager.yaml b/templates/Service-alertmanager.yaml
new file mode 100644
index 0000000..974e77e
--- /dev/null
+++ b/templates/Service-alertmanager.yaml
@@ -0,0 +1,27 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+  name: alertmanager-main
+  namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
+spec:
+  ports:
+  - name: web
+    port: 9093
+    targetPort: web
+  - name: reloader-web
+    port: 8080
+    targetPort: reloader-web
+  selector:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+  sessionAffinity: ClientIP
diff --git a/templates/ServiceAccount-alertmanager.yaml b/templates/ServiceAccount-alertmanager.yaml
new file mode 100644
index 0000000..f8c73b3
--- /dev/null
+++ b/templates/ServiceAccount-alertmanager.yaml
@@ -0,0 +1,14 @@
+---
+apiVersion: v1
+automountServiceAccountToken: false
+kind: ServiceAccount
+metadata:
+  labels:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+  name: alertmanager-main
+  namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
diff --git a/templates/serviceMonitor-alertmanager.yaml b/templates/serviceMonitor-alertmanager.yaml
new file mode 100644
index 0000000..c7f08fc
--- /dev/null
+++ b/templates/serviceMonitor-alertmanager.yaml
@@ -0,0 +1,29 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    app.kubernetes.io/component: alert-router
+    app.kubernetes.io/instance: main
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+  name: alertmanager
+  namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
+
+spec:
+  endpoints:
+  - interval: 30s
+    port: web
+  - interval: 30s
+    port: reloader-web
+  namespaceSelector:
+    matchNames:
+    - "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: alert-router
+      app.kubernetes.io/instance: main
+      app.kubernetes.io/name: alertmanager
+      app.kubernetes.io/part-of: {{ $.Chart.Name }}
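
Note: the templates above resolve their namespace and image from `.Values.nfc_monitoring.alert_manager`, but no values file is part of this patch. The sketch below shows the assumed values.yaml layout; the key paths are taken from the template references, while the example values are illustrative assumptions only (the image name assumes the upstream quay.io/prometheus/alertmanager image, the tag is chosen to line up with the hardcoded `version: 0.25.0` in AlertManager-k8s.yaml, and the namespace matches the `namespace="monitoring"` selectors in the PrometheusRule expressions).

# Assumed values.yaml layout for these templates (sketch only, not part of the patch)
nfc_monitoring:
  alert_manager:
    namespace: monitoring                    # assumed; should match the namespace="monitoring" selectors in the rules
    image:
      name: quay.io/prometheus/alertmanager  # assumed upstream image; substitute your registry or mirror
      tag: v0.25.0                           # assumed; should agree with spec.version (0.25.0) in AlertManager-k8s.yaml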