mirror of
https://github.com/nofusscomputing/kubernetes.git
synced 2025-08-02 04:22:42 +00:00
29
manifests/alert-manager/base/AlertManager-cluster.yaml
Normal file
29
manifests/alert-manager/base/AlertManager-cluster.yaml
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: Alertmanager
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
name: cluster
|
||||||
|
spec:
|
||||||
|
# Must be an absolute http(s) URL: the value is handed to Alertmanager's
# --web.external-url flag, which rejects a URL without an http/https scheme.
# https matches the TLS-terminated Ingress host for this instance.
externalUrl: https://alert-manager.local
|
||||||
|
# image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}"
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/os: linux
|
||||||
|
podMetadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
replicas: 1
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
requests:
|
||||||
|
cpu: 4m
|
||||||
|
memory: 100Mi
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 2000
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
serviceAccountName: alertmanager
|
||||||
|
version: v0.27.0
|
18
manifests/alert-manager/base/Service-alertmanager.yaml
Normal file
18
manifests/alert-manager/base/Service-alertmanager.yaml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
name: cluster
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- name: web
|
||||||
|
port: 9093
|
||||||
|
targetPort: web
|
||||||
|
- name: reloader-web
|
||||||
|
port: 8080
|
||||||
|
targetPort: reloader-web
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
sessionAffinity: ClientIP
|
@ -0,0 +1,8 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
name: alertmanager
|
49
manifests/alert-manager/base/kustomization.yaml
Normal file
49
manifests/alert-manager/base/kustomization.yaml
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
---
|
||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
|
||||||
|
namespace: alert
|
||||||
|
|
||||||
|
labels:
|
||||||
|
- includeSelectors: true
|
||||||
|
pairs:
|
||||||
|
app.kubernetes.io/instance: cluster
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: alertmanager
|
||||||
|
|
||||||
|
|
||||||
|
resources:
|
||||||
|
# - Secret-alertmanager.yaml
|
||||||
|
- ServiceAccount-alertmanager.yaml
|
||||||
|
- AlertManager-cluster.yaml
|
||||||
|
- Service-alertmanager.yaml
|
||||||
|
# - PrometheusRule-alertmanager.yaml
|
||||||
|
# - serviceMonitor-alertmanager.yaml
|
||||||
|
|
||||||
|
|
||||||
|
patches:
|
||||||
|
- target:
|
||||||
|
kind: Alertmanager
|
||||||
|
name: cluster
|
||||||
|
patch: |-
|
||||||
|
- op: replace
|
||||||
|
path: /spec/version
|
||||||
|
value: v0.27.0
|
||||||
|
|
||||||
|
- op: replace
|
||||||
|
path: /spec/externalUrl
|
||||||
|
value: https://alert-manager.local
|
||||||
|
|
||||||
|
|
||||||
|
replacements:
|
||||||
|
- source:
|
||||||
|
kind: Alertmanager
|
||||||
|
name: cluster
|
||||||
|
fieldPath: metadata.labels
|
||||||
|
targets:
|
||||||
|
- select:
|
||||||
|
kind: Alertmanager
|
||||||
|
name: cluster
|
||||||
|
fieldPaths:
|
||||||
|
- spec.podMetadata.labels
|
@ -0,0 +1,17 @@
|
|||||||
|
---
|
||||||
|
apiVersion: grafana.integreatly.org/v1beta1
|
||||||
|
kind: GrafanaDashboard
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
name: alertmanager
|
||||||
|
spec:
|
||||||
|
allowCrossNamespaceImport: true
|
||||||
|
folder: "General"
|
||||||
|
resyncPeriod: 24h
|
||||||
|
instanceSelector:
|
||||||
|
matchLabels:
|
||||||
|
dashboards: grafana
|
||||||
|
grafanaCom:
|
||||||
|
id: 9578
|
||||||
|
revision: 4 # as @ 19-09-23
|
@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
apiVersion: kustomize.config.k8s.io/v1alpha1
|
||||||
|
kind: Component
|
||||||
|
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- GrafanaDashboard-AlertManager.yaml
|
@ -0,0 +1,34 @@
|
|||||||
|
---
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: Ingress
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
cert-manager.io/common-name: alert-manager.local
|
||||||
|
cert-manager.io/duration: 2160h
|
||||||
|
cert-manager.io/private-key-algorithm: ECDSA
|
||||||
|
cert-manager.io/private-key-rotation-policy: Always
|
||||||
|
cert-manager.io/private-key-size: "384"
|
||||||
|
cert-manager.io/subject-countries: N/A
|
||||||
|
cert-manager.io/subject-organizationalunits: N/A
|
||||||
|
cert-manager.io/subject-organizations: N/A
|
||||||
|
cert-manager.io/subject-provinces: N/A
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
name: alert-manager
|
||||||
|
spec:
|
||||||
|
ingressClassName: nginx
|
||||||
|
rules:
|
||||||
|
- host: alert-manager.local
|
||||||
|
http:
|
||||||
|
paths:
|
||||||
|
- backend:
|
||||||
|
service:
|
||||||
|
name: cluster
|
||||||
|
port:
|
||||||
|
name: web
|
||||||
|
path: /
|
||||||
|
pathType: Prefix
|
||||||
|
tls:
|
||||||
|
- hosts:
|
||||||
|
- alert-manager.local
|
||||||
|
secretName: certificate-tls-alert-manager
|
@ -0,0 +1,36 @@
|
|||||||
|
---
|
||||||
|
apiVersion: kustomize.config.k8s.io/v1alpha1
|
||||||
|
kind: Component
|
||||||
|
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- Ingress-alert-manager.yaml
|
||||||
|
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# Items to Configure
|
||||||
|
#############################
|
||||||
|
|
||||||
|
# patches:
|
||||||
|
# - patch: |-
|
||||||
|
# - op: add
|
||||||
|
# path: /metadata/annotations/cert-manager.io~1cluster-issuer
|
||||||
|
# value: cluster
|
||||||
|
|
||||||
|
# - op: replace
|
||||||
|
# path: /metadata/annotations/cert-manager.io~1common-name
|
||||||
|
# value: alert-manager.local
|
||||||
|
|
||||||
|
# - op: replace
|
||||||
|
# path: /spec/rules/0/host
|
||||||
|
# value: alert-manager.local
|
||||||
|
|
||||||
|
# - op: replace
|
||||||
|
# path: /spec/tls/0/hosts/0
|
||||||
|
# value: alert-manager.local
|
||||||
|
|
||||||
|
# target:
|
||||||
|
# kind: Ingress
|
||||||
|
# name: alert-manager
|
||||||
|
# version: v1
|
||||||
|
|
@ -0,0 +1,30 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
name: alertmanager
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- interval: 30s
|
||||||
|
port: web
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_name
|
||||||
|
targetLabel: instance
|
||||||
|
- interval: 30s
|
||||||
|
port: reloader-web
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_name
|
||||||
|
targetLabel: instance
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: alerting
|
@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
apiVersion: kustomize.config.k8s.io/v1alpha1
|
||||||
|
kind: Component
|
||||||
|
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- ServiceMonitor-alertmanager.yaml
|
@ -0,0 +1,133 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alerting
|
||||||
|
name: alertmanager-main-rules
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: alertmanager.rules
|
||||||
|
rules:
|
||||||
|
- alert: AlertmanagerFailedReload
|
||||||
|
annotations:
|
||||||
|
description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
|
||||||
|
summary: Reloading an Alertmanager configuration has failed.
|
||||||
|
expr: |
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager",namespace="monitoring"}[5m]) == 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: AlertmanagerMembersInconsistent
|
||||||
|
annotations:
|
||||||
|
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{ $labels.job}} cluster.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
|
||||||
|
summary: A member of an Alertmanager cluster has not found all other cluster members.
|
||||||
|
expr: |
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
max_over_time(alertmanager_cluster_members{job="alertmanager",namespace="monitoring"}[5m])
|
||||||
|
< on (namespace,service) group_left
|
||||||
|
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager",namespace="monitoring"}[5m]))
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: AlertmanagerFailedToSendAlerts
|
||||||
|
annotations:
|
||||||
|
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
|
||||||
|
summary: An Alertmanager instance failed to send notifications.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
rate(alertmanager_notifications_failed_total{job="alertmanager",namespace="monitoring"}[5m])
|
||||||
|
/
|
||||||
|
rate(alertmanager_notifications_total{job="alertmanager",namespace="monitoring"}[5m])
|
||||||
|
)
|
||||||
|
> 0.01
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||||
|
annotations:
|
||||||
|
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{ $labels.job}} cluster is {{ $value | humanizePercentage }}.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||||
|
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
|
||||||
|
expr: |
|
||||||
|
min by (namespace,service, integration) (
|
||||||
|
rate(alertmanager_notifications_failed_total{job="alertmanager",namespace="monitoring", integration=~`.*`}[5m])
|
||||||
|
/
|
||||||
|
rate(alertmanager_notifications_total{job="alertmanager",namespace="monitoring", integration=~`.*`}[5m])
|
||||||
|
)
|
||||||
|
> 0.01
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: AlertmanagerClusterFailedToSendAlerts
|
||||||
|
annotations:
|
||||||
|
description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{ $labels.job}} cluster is {{ $value | humanizePercentage }}.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
|
||||||
|
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
|
||||||
|
expr: |
|
||||||
|
min by (namespace,service, integration) (
|
||||||
|
rate(alertmanager_notifications_failed_total{job="alertmanager",namespace="monitoring", integration!~`.*`}[5m])
|
||||||
|
/
|
||||||
|
rate(alertmanager_notifications_total{job="alertmanager",namespace="monitoring", integration!~`.*`}[5m])
|
||||||
|
)
|
||||||
|
> 0.01
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: AlertmanagerConfigInconsistent
|
||||||
|
annotations:
|
||||||
|
description: Alertmanager instances within the {{ $labels.job}} cluster have different configurations.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
|
||||||
|
summary: Alertmanager instances within the same cluster have different configurations.
|
||||||
|
expr: |
|
||||||
|
count by (namespace,service) (
|
||||||
|
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager",namespace="monitoring"})
|
||||||
|
)
|
||||||
|
!= 1
|
||||||
|
for: 20m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: AlertmanagerClusterDown
|
||||||
|
annotations:
|
||||||
|
description: "{{ $value | humanizePercentage }} of Alertmanager instances within the {{ $labels.job}} cluster have been up for less than half of the last 5m."
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
|
||||||
|
summary: Half or more of the Alertmanager instances within the same cluster are down.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
count by (namespace,service) (
|
||||||
|
avg_over_time(up{job="alertmanager",namespace="monitoring"}[5m]) < 0.5
|
||||||
|
)
|
||||||
|
/
|
||||||
|
count by (namespace,service) (
|
||||||
|
up{job="alertmanager",namespace="monitoring"}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
>= 0.5
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: AlertmanagerClusterCrashlooping
|
||||||
|
annotations:
|
||||||
|
description: "{{ $value | humanizePercentage }} of Alertmanager instances within the {{ $labels.job}} cluster have restarted at least 5 times in the last 10m."
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
|
||||||
|
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
count by (namespace,service) (
|
||||||
|
changes(process_start_time_seconds{job="alertmanager",namespace="monitoring"}[10m]) > 4
|
||||||
|
)
|
||||||
|
/
|
||||||
|
count by (namespace,service) (
|
||||||
|
up{job="alertmanager",namespace="monitoring"}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
>= 0.5
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
@ -0,0 +1,7 @@
|
|||||||
|
---
|
||||||
|
apiVersion: kustomize.config.k8s.io/v1alpha1
|
||||||
|
kind: Component
|
||||||
|
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- PrometheusRule-alertmanager.yaml
|
@ -0,0 +1,64 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
|
||||||
|
namespace: metrics
|
||||||
|
|
||||||
|
|
||||||
|
labels:
|
||||||
|
- includeSelectors: true
|
||||||
|
pairs:
|
||||||
|
app.kubernetes.io/instance: cluster
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: alertmanager
|
||||||
|
|
||||||
|
|
||||||
|
patches:
|
||||||
|
- target:
|
||||||
|
kind: Alertmanager
|
||||||
|
name: cluster
|
||||||
|
patch: |-
|
||||||
|
- op: replace
|
||||||
|
path: /spec/version
|
||||||
|
value: v0.27.0
|
||||||
|
|
||||||
|
- op: replace
|
||||||
|
path: /spec/externalUrl
|
||||||
|
value: https://alert-manager.local
|
||||||
|
|
||||||
|
#
|
||||||
|
# Ingress Setup
|
||||||
|
#
|
||||||
|
# - patch: |-
|
||||||
|
# - op: add
|
||||||
|
# path: /metadata/annotations/cert-manager.io~1cluster-issuer
|
||||||
|
# value: cluster
|
||||||
|
|
||||||
|
# - op: replace
|
||||||
|
# path: /metadata/annotations/cert-manager.io~1common-name
|
||||||
|
# value: alert-manager.local
|
||||||
|
|
||||||
|
# - op: replace
|
||||||
|
# path: /spec/rules/0/host
|
||||||
|
# value: alert-manager.local
|
||||||
|
|
||||||
|
# - op: replace
|
||||||
|
# path: /spec/tls/0/hosts/0
|
||||||
|
# value: alert-manager.local
|
||||||
|
|
||||||
|
# target:
|
||||||
|
# kind: Ingress
|
||||||
|
# name: alert-manager
|
||||||
|
# version: v1
|
||||||
|
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- ../../base
|
||||||
|
|
||||||
|
components:
|
||||||
|
- ../../components/dashboard
|
||||||
|
- ../../components/ingress
|
||||||
|
# - ../../components/metrics
|
||||||
|
# - ../../components/prometheus-rules
|
@ -48,4 +48,4 @@ components:
|
|||||||
# - ../../components/alertmanager
|
# - ../../components/alertmanager
|
||||||
- ../../components/prometheus-rules
|
- ../../components/prometheus-rules
|
||||||
- ../../components/ingress
|
- ../../components/ingress
|
||||||
- ../../components/grafana-datasource
|
# - ../../components/grafana-datasource
|
||||||
|
Reference in New Issue
Block a user