
feat(alert-manager): Add manifests

ref: #5
2025-06-11 23:30:46 +09:30
parent a4a2348ea4
commit 03c5fce8ff
14 changed files with 440 additions and 1 deletion

View File

@@ -0,0 +1,29 @@
---
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  labels:
    app.kubernetes.io/component: alerting
  name: cluster
spec:
  externalUrl: alert-manager.local
  # image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}"
  nodeSelector:
    kubernetes.io/os: linux
  podMetadata:
    labels:
      app.kubernetes.io/component: alerting
  replicas: 1
  resources:
    limits:
      cpu: 100m
      memory: 100Mi
    requests:
      cpu: 4m
      memory: 100Mi
  securityContext:
    fsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: alertmanager
  version: v0.27.0
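
The base kustomization later in this commit keeps Secret-alertmanager.yaml commented out. For context, the Prometheus Operator expects the configuration for an Alertmanager named cluster in a Secret named alertmanager-cluster (key alertmanager.yaml) unless spec.configSecret points elsewhere. A minimal sketch of such a Secret, with a purely illustrative null route:

---
apiVersion: v1
kind: Secret
metadata:
  labels:
    app.kubernetes.io/component: alerting
  name: alertmanager-cluster
stringData:
  alertmanager.yaml: |
    route:
      receiver: "null"
      group_by:
        - namespace
    receivers:
      - name: "null"
type: Opaque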

View File

@@ -0,0 +1,18 @@
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: alerting
  name: cluster
spec:
  ports:
    - name: web
      port: 9093
      targetPort: web
    - name: reloader-web
      port: 8080
      targetPort: reloader-web
  selector:
    app.kubernetes.io/component: alerting
  sessionAffinity: ClientIP

View File

@@ -0,0 +1,8 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
  labels:
    app.kubernetes.io/component: alerting
  name: alertmanager

View File

@@ -0,0 +1,49 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: alert

labels:
  - includeSelectors: true
    pairs:
      app.kubernetes.io/instance: cluster
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: alertmanager

resources:
  # - Secret-alertmanager.yaml
  - ServiceAccount-alertmanager.yaml
  - AlertManager-cluster.yaml
  - Service-alertmanager.yaml
  # - PrometheusRule-alertmanager.yaml
  # - serviceMonitor-alertmanager.yaml

patches:
  - target:
      kind: Alertmanager
      name: cluster
    patch: |-
      - op: replace
        path: /spec/version
        value: v0.27.0
      - op: replace
        path: /spec/externalUrl
        value: alert-manager.local

replacements:
  - source:
      kind: Alertmanager
      name: cluster
      fieldPath: metadata.labels
    targets:
      - select:
          kind: Alertmanager
          name: cluster
        fieldPaths:
          - spec.podMetadata.labels
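
Roughly speaking, when this kustomization is built, the labels transformer stamps the three pairs onto every resource (and, because of includeSelectors: true, onto the Service selector as well), and the replacement then copies the Alertmanager's metadata labels into spec.podMetadata.labels so the generated pods carry the same labels the selector matches. A sketch of the rendered Alertmanager metadata under that assumption (trimmed, field order may differ):

apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  labels:
    app.kubernetes.io/component: alerting
    app.kubernetes.io/instance: cluster
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: alertmanager
  name: cluster
  namespace: alert
spec:
  podMetadata:
    labels:
      app.kubernetes.io/component: alerting
      app.kubernetes.io/instance: cluster
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: alertmanager
  # ...remaining spec fields unchanged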

View File

@@ -0,0 +1,17 @@
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  labels:
    app.kubernetes.io/component: alerting
  name: alertmanager
spec:
  allowCrossNamespaceImport: true
  folder: "General"
  resyncPeriod: 24h
  instanceSelector:
    matchLabels:
      dashboards: grafana
  grafanaCom:
    id: 9578
    revision: 4 # as @ 19-09-23
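
The instanceSelector above ties this dashboard to any grafana-operator managed Grafana instance labelled dashboards: grafana. A minimal sketch of such a Grafana resource (the name and config are illustrative only):

---
apiVersion: grafana.integreatly.org/v1beta1
kind: Grafana
metadata:
  name: grafana
  labels:
    dashboards: grafana
spec:
  config:
    log:
      mode: console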

View File

@@ -0,0 +1,7 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

resources:
  - GrafanaDashboard-AlertManager.yaml

View File

@@ -0,0 +1,34 @@
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  annotations:
    cert-manager.io/common-name: alert-manager.local
    cert-manager.io/duration: 2160h
    cert-manager.io/private-key-algorithm: ECDSA
    cert-manager.io/private-key-rotation-policy: Always
    cert-manager.io/private-key-size: "384"
    cert-manager.io/subject-countries: N/A
    cert-manager.io/subject-organizationalunits: N/A
    cert-manager.io/subject-organizations: N/A
    cert-manager.io/subject-provinces: N/A
  labels:
    app.kubernetes.io/component: alerting
  name: alert-manager
spec:
  ingressClassName: nginx
  rules:
    - host: alert-manager.local
      http:
        paths:
          - backend:
              service:
                name: cluster
                port:
                  name: web
            path: /
            pathType: Prefix
  tls:
    - hosts:
        - alert-manager.local
      secretName: certificate-tls-alert-manager
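
The cert-manager annotations above describe the Certificate that ingress-shim would create, but cert-manager only acts on an Ingress once an issuer annotation is present; the component's commented patch in the next file is where that would be supplied. A sketch, assuming a ClusterIssuer named cluster as in that patch, of the extra annotation the Ingress would then carry:

metadata:
  annotations:
    cert-manager.io/cluster-issuer: cluster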

View File

@@ -0,0 +1,36 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

resources:
  - Ingress-alert-manager.yaml


#############################
# Items to Configure
#############################

# patches:
#   - patch: |-
#       - op: replace
#         path: /metadata/annotations/cert-manager.io~1cluster-issuer
#         value: cluster
#       - op: replace
#         path: /metadata/annotations/cert-manager.io~1common-name
#         value: alert-manager.local
#       - op: replace
#         path: /spec/rules/0/host
#         value: alert-manager.local
#       - op: replace
#         path: /spec/tls/0/hosts/0
#         value: alert-manager.local
#     target:
#       kind: Ingress
#       name: alert-manager
#       version: v1

View File

@@ -0,0 +1,30 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app.kubernetes.io/component: alerting
  name: alertmanager
spec:
  endpoints:
    - interval: 30s
      port: web
      relabelings:
        - action: replace
          regex: (.*)
          replacement: $1
          sourceLabels:
            - __meta_kubernetes_pod_name
          targetLabel: instance
    - interval: 30s
      port: reloader-web
      relabelings:
        - action: replace
          regex: (.*)
          replacement: $1
          sourceLabels:
            - __meta_kubernetes_pod_name
          targetLabel: instance
  selector:
    matchLabels:
      app.kubernetes.io/component: alerting
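
One thing to verify when enabling this component: the Prometheus Operator derives the scraped job label from the Service name (here cluster) unless spec.jobLabel names a Service label to use instead, and the PrometheusRule added later in this commit selects job="alertmanager",namespace="monitoring". A possible adjustment, assuming the kustomize label pairs are applied to the Service, would be (sketch only):

spec:
  jobLabel: app.kubernetes.io/name

The hard-coded namespace="monitoring" matchers in those rule expressions would likewise need patching (or dropping) to match the alert/metrics namespaces used in this repository.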

View File

@@ -0,0 +1,7 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

resources:
  - ServiceMonitor-alertmanager.yaml

View File

@@ -0,0 +1,133 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: alerting
  name: alertmanager-main-rules
spec:
  groups:
    - name: alertmanager.rules
      rules:
        - alert: AlertmanagerFailedReload
          annotations:
            description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
            summary: Reloading an Alertmanager configuration has failed.
          expr: |
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max_over_time(alertmanager_config_last_reload_successful{job="alertmanager",namespace="monitoring"}[5m]) == 0
          for: 10m
          labels:
            severity: critical
        - alert: AlertmanagerMembersInconsistent
          annotations:
            description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{ $labels.job}} cluster.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
            summary: A member of an Alertmanager cluster has not found all other cluster members.
          expr: |
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max_over_time(alertmanager_cluster_members{job="alertmanager",namespace="monitoring"}[5m])
              < on (namespace,service) group_left
            count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager",namespace="monitoring"}[5m]))
          for: 15m
          labels:
            severity: critical
        - alert: AlertmanagerFailedToSendAlerts
          annotations:
            description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
            summary: An Alertmanager instance failed to send notifications.
          expr: |
            (
              rate(alertmanager_notifications_failed_total{job="alertmanager",namespace="monitoring"}[5m])
            /
              rate(alertmanager_notifications_total{job="alertmanager",namespace="monitoring"}[5m])
            )
            > 0.01
          for: 5m
          labels:
            severity: warning
        - alert: AlertmanagerClusterFailedToSendAlerts
          annotations:
            description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{ $labels.job}} cluster is {{ $value | humanizePercentage }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
            summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
          expr: |
            min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="alertmanager",namespace="monitoring", integration=~`.*`}[5m])
            /
              rate(alertmanager_notifications_total{job="alertmanager",namespace="monitoring", integration=~`.*`}[5m])
            )
            > 0.01
          for: 5m
          labels:
            severity: critical
        - alert: AlertmanagerClusterFailedToSendAlerts
          annotations:
            description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{ $labels.job}} cluster is {{ $value | humanizePercentage }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
            summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
          expr: |
            min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="alertmanager",namespace="monitoring", integration!~`.*`}[5m])
            /
              rate(alertmanager_notifications_total{job="alertmanager",namespace="monitoring", integration!~`.*`}[5m])
            )
            > 0.01
          for: 5m
          labels:
            severity: warning
        - alert: AlertmanagerConfigInconsistent
          annotations:
            description: Alertmanager instances within the {{ $labels.job}} cluster have different configurations.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
            summary: Alertmanager instances within the same cluster have different configurations.
          expr: |
            count by (namespace,service) (
              count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager",namespace="monitoring"})
            )
            != 1
          for: 20m
          labels:
            severity: critical
        - alert: AlertmanagerClusterDown
          annotations:
            description: "{{ $value | humanizePercentage }} of Alertmanager instances within the {{ $labels.job}} cluster have been up for less than half of the last 5m."
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
            summary: Half or more of the Alertmanager instances within the same cluster are down.
          expr: |
            (
              count by (namespace,service) (
                avg_over_time(up{job="alertmanager",namespace="monitoring"}[5m]) < 0.5
              )
            /
              count by (namespace,service) (
                up{job="alertmanager",namespace="monitoring"}
              )
            )
            >= 0.5
          for: 5m
          labels:
            severity: critical
        - alert: AlertmanagerClusterCrashlooping
          annotations:
            description: "{{ $value | humanizePercentage }} of Alertmanager instances within the {{ $labels.job}} cluster have restarted at least 5 times in the last 10m."
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
            summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
          expr: |
            (
              count by (namespace,service) (
                changes(process_start_time_seconds{job="alertmanager",namespace="monitoring"}[10m]) > 4
              )
            /
              count by (namespace,service) (
                up{job="alertmanager",namespace="monitoring"}
              )
            )
            >= 0.5
          for: 5m
          labels:
            severity: critical

View File

@@ -0,0 +1,7 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component

resources:
  - PrometheusRule-alertmanager.yaml

View File

@@ -0,0 +1,64 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: metrics

labels:
  - includeSelectors: true
    pairs:
      app.kubernetes.io/instance: cluster
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: alertmanager

patches:
  - target:
      kind: Alertmanager
      name: cluster
    patch: |-
      - op: replace
        path: /spec/version
        value: v0.27.0
      - op: replace
        path: /spec/externalUrl
        value: alert-manager.local

  #
  # Ingress Setup
  #
  # - patch: |-
  #     - op: replace
  #       path: /metadata/annotations/cert-manager.io~1cluster-issuer
  #       value: cluster
  #     - op: replace
  #       path: /metadata/annotations/cert-manager.io~1common-name
  #       value: prometheus.local
  #     - op: replace
  #       path: /spec/rules/0/host
  #       value: prometheus.local
  #     - op: replace
  #       path: /spec/tls/0/hosts/0
  #       value: prometheus.local
  #   target:
  #     kind: Ingress
  #     name: prometheus
  #     version: v1

resources:
  - ../../base

components:
  - ../../components/dashboard
  - ../../components/ingress
  # - ../../components/metrics
  # - ../../components/prometheus-rules

View File

@@ -48,4 +48,4 @@ components:
   # - ../../components/alertmanager
   - ../../components/prometheus-rules
   - ../../components/ingress
-  - ../../components/grafana-datasource
+  # - ../../components/grafana-datasource