39
									
								
								templates/AlertManager-k8s.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								templates/AlertManager-k8s.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,39 @@ | |||||||
|  | --- | ||||||
|  | # Alertmanager custom resource "main" (monitoring.coreos.com/v1). | ||||||
|  | # Namespace and container image are read from .Values.nfc_monitoring.alert_manager. | ||||||
|  | apiVersion: monitoring.coreos.com/v1 | ||||||
|  | kind: Alertmanager | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     # Templated values are quoted so number-like output (e.g. a chart | ||||||
|  |     # version of 1.0) is still rendered as a string label value. | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |   name: main | ||||||
|  |   namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
|  | spec: | ||||||
|  |   image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}" | ||||||
|  |   nodeSelector: | ||||||
|  |     kubernetes.io/os: linux | ||||||
|  |   # Labels applied to the pods; the PodDisruptionBudget, Service and | ||||||
|  |   # ServiceMonitor templates of this chart select on these same keys. | ||||||
|  |   podMetadata: | ||||||
|  |     labels: | ||||||
|  |       app.kubernetes.io/component: alert-router | ||||||
|  |       app.kubernetes.io/instance: main | ||||||
|  |       app.kubernetes.io/name: alertmanager | ||||||
|  |       app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |       app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |       app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |   replicas: 3 | ||||||
|  |   resources: | ||||||
|  |     limits: | ||||||
|  |       cpu: 100m | ||||||
|  |       memory: 100Mi | ||||||
|  |     requests: | ||||||
|  |       cpu: 4m | ||||||
|  |       memory: 100Mi | ||||||
|  |   securityContext: | ||||||
|  |     fsGroup: 2000 | ||||||
|  |     runAsNonRoot: true | ||||||
|  |     runAsUser: 1000 | ||||||
|  |   serviceAccountName: alertmanager-main | ||||||
|  |   # NOTE(review): version is pinned here while the image tag above is | ||||||
|  |   # templated from values - confirm the two are kept in sync. | ||||||
|  |   version: 0.25.0 | ||||||
							
								
								
									
										21
									
								
								templates/PodDisruptionBudget-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								templates/PodDisruptionBudget-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,21 @@ | |||||||
|  | --- | ||||||
|  | # PodDisruptionBudget for the Alertmanager pods: at most one pod matching | ||||||
|  | # the selector below may be unavailable at a time. | ||||||
|  | apiVersion: policy/v1 | ||||||
|  | kind: PodDisruptionBudget | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     # Templated values are quoted so number-like output stays a string. | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |   name: alertmanager-main | ||||||
|  |   namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
|  | spec: | ||||||
|  |   maxUnavailable: 1 | ||||||
|  |   selector: | ||||||
|  |     matchLabels: | ||||||
|  |       app.kubernetes.io/component: alert-router | ||||||
|  |       app.kubernetes.io/instance: main | ||||||
|  |       app.kubernetes.io/name: alertmanager | ||||||
|  |       app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
							
								
								
									
										141
									
								
								templates/PrometheusRule-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										141
									
								
								templates/PrometheusRule-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,141 @@ | |||||||
|  | --- | ||||||
|  | # PrometheusRule holding the alerting rules for the Alertmanager cluster. | ||||||
|  | # Every expression filters on job="alertmanager-main" and on the namespace | ||||||
|  | # taken from .Values.nfc_monitoring.alert_manager.namespace, i.e. the same | ||||||
|  | # namespace the other Alertmanager templates of this chart deploy into. | ||||||
|  | # Prometheus template braces in the annotations are escaped with a | ||||||
|  | # backtick-quoted Helm string so Helm emits them literally. | ||||||
|  | apiVersion: monitoring.coreos.com/v1 | ||||||
|  | kind: PrometheusRule | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     # Templated values are quoted so number-like output stays a string. | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |     prometheus: k8s | ||||||
|  |     role: alert-rules | ||||||
|  |   name: alertmanager-main-rules | ||||||
|  |   namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
|  | spec: | ||||||
|  |   groups: | ||||||
|  |   - name: alertmanager.rules | ||||||
|  |     rules: | ||||||
|  |     - alert: AlertmanagerFailedReload | ||||||
|  |       annotations: | ||||||
|  |         description: Configuration has failed to load for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}}. | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload | ||||||
|  |         summary: Reloading an Alertmanager configuration has failed. | ||||||
|  |       expr: | | ||||||
|  |         # Without max_over_time, failed scrapes could create false negatives, see | ||||||
|  |         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | ||||||
|  |         max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) == 0 | ||||||
|  |       for: 10m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     - alert: AlertmanagerMembersInconsistent | ||||||
|  |       annotations: | ||||||
|  |         description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} has only found {{ `{{` }} $value }} members of the {{ `{{` }}$labels.job}} cluster. | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent | ||||||
|  |         summary: A member of an Alertmanager cluster has not found all other cluster members. | ||||||
|  |       expr: | | ||||||
|  |         # Without max_over_time, failed scrapes could create false negatives, see | ||||||
|  |         # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | ||||||
|  |           max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) | ||||||
|  |         < on (namespace,service) group_left | ||||||
|  |           count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m])) | ||||||
|  |       for: 15m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     - alert: AlertmanagerFailedToSendAlerts | ||||||
|  |       annotations: | ||||||
|  |         description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} failed to send {{ `{{` }} $value | humanizePercentage }} of notifications to {{ `{{` }} $labels.integration }}. | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts | ||||||
|  |         summary: An Alertmanager instance failed to send notifications. | ||||||
|  |       expr: | | ||||||
|  |         ( | ||||||
|  |           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) | ||||||
|  |         / | ||||||
|  |           rate(alertmanager_notifications_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) | ||||||
|  |         ) | ||||||
|  |         > 0.01 | ||||||
|  |       for: 5m | ||||||
|  |       labels: | ||||||
|  |         severity: warning | ||||||
|  |     - alert: AlertmanagerClusterFailedToSendAlerts | ||||||
|  |       annotations: | ||||||
|  |         description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}. | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts | ||||||
|  |         summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. | ||||||
|  |       expr: | | ||||||
|  |         min by (namespace,service, integration) ( | ||||||
|  |           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration=~`.*`}[5m]) | ||||||
|  |         / | ||||||
|  |           rate(alertmanager_notifications_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration=~`.*`}[5m]) | ||||||
|  |         ) | ||||||
|  |         > 0.01 | ||||||
|  |       for: 5m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     # NOTE(review): the selector below uses integration!~ against a regex that | ||||||
|  |     # matches every value, so it selects no series and this warning-level | ||||||
|  |     # variant cannot fire as written - confirm this is intended. | ||||||
|  |     - alert: AlertmanagerClusterFailedToSendAlerts | ||||||
|  |       annotations: | ||||||
|  |         description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}. | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts | ||||||
|  |         summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. | ||||||
|  |       expr: | | ||||||
|  |         min by (namespace,service, integration) ( | ||||||
|  |           rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration!~`.*`}[5m]) | ||||||
|  |         / | ||||||
|  |           rate(alertmanager_notifications_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration!~`.*`}[5m]) | ||||||
|  |         ) | ||||||
|  |         > 0.01 | ||||||
|  |       for: 5m | ||||||
|  |       labels: | ||||||
|  |         severity: warning | ||||||
|  |     - alert: AlertmanagerConfigInconsistent | ||||||
|  |       annotations: | ||||||
|  |         description: Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have different configurations. | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent | ||||||
|  |         summary: Alertmanager instances within the same cluster have different configurations. | ||||||
|  |       expr: | | ||||||
|  |         count by (namespace,service) ( | ||||||
|  |           count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}) | ||||||
|  |         ) | ||||||
|  |         != 1 | ||||||
|  |       for: 20m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     - alert: AlertmanagerClusterDown | ||||||
|  |       annotations: | ||||||
|  |         description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have been up for less than half of the last 5m.' | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown | ||||||
|  |         summary: Half or more of the Alertmanager instances within the same cluster are down. | ||||||
|  |       expr: | | ||||||
|  |         ( | ||||||
|  |           count by (namespace,service) ( | ||||||
|  |             avg_over_time(up{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) < 0.5 | ||||||
|  |           ) | ||||||
|  |         / | ||||||
|  |           count by (namespace,service) ( | ||||||
|  |             up{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"} | ||||||
|  |           ) | ||||||
|  |         ) | ||||||
|  |         >= 0.5 | ||||||
|  |       for: 5m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     - alert: AlertmanagerClusterCrashlooping | ||||||
|  |       annotations: | ||||||
|  |         description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have restarted at least 5 times in the last 10m.' | ||||||
|  |         runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping | ||||||
|  |         summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. | ||||||
|  |       expr: | | ||||||
|  |         ( | ||||||
|  |           count by (namespace,service) ( | ||||||
|  |             changes(process_start_time_seconds{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[10m]) > 4 | ||||||
|  |           ) | ||||||
|  |         / | ||||||
|  |           count by (namespace,service) ( | ||||||
|  |             up{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"} | ||||||
|  |           ) | ||||||
|  |         ) | ||||||
|  |         >= 0.5 | ||||||
|  |       for: 5m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
							
								
								
									
										61
									
								
								templates/Secret-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								templates/Secret-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,61 @@ | |||||||
|  | --- | ||||||
|  | # Secret carrying the Alertmanager configuration file under the key | ||||||
|  | # alertmanager.yaml: inhibit rules, four named receivers without any | ||||||
|  | # notifier settings, and a routing tree whose default receiver is "Default". | ||||||
|  | apiVersion: v1 | ||||||
|  | kind: Secret | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     # Templated values are quoted so number-like output stays a string. | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |   name: alertmanager-main | ||||||
|  |   namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
|  | stringData: | ||||||
|  |   alertmanager.yaml: |- | ||||||
|  |     "global": | ||||||
|  |       "resolve_timeout": "5m" | ||||||
|  |     "inhibit_rules": | ||||||
|  |     - "equal": | ||||||
|  |       - "namespace" | ||||||
|  |       - "alertname" | ||||||
|  |       "source_matchers": | ||||||
|  |       - "severity = critical" | ||||||
|  |       "target_matchers": | ||||||
|  |       - "severity =~ warning|info" | ||||||
|  |     - "equal": | ||||||
|  |       - "namespace" | ||||||
|  |       - "alertname" | ||||||
|  |       "source_matchers": | ||||||
|  |       - "severity = warning" | ||||||
|  |       "target_matchers": | ||||||
|  |       - "severity = info" | ||||||
|  |     - "equal": | ||||||
|  |       - "namespace" | ||||||
|  |       "source_matchers": | ||||||
|  |       - "alertname = InfoInhibitor" | ||||||
|  |       "target_matchers": | ||||||
|  |       - "severity = info" | ||||||
|  |     "receivers": | ||||||
|  |     - "name": "Default" | ||||||
|  |     - "name": "Watchdog" | ||||||
|  |     - "name": "Critical" | ||||||
|  |     - "name": "null" | ||||||
|  |     "route": | ||||||
|  |       "group_by": | ||||||
|  |       - "namespace" | ||||||
|  |       "group_interval": "5m" | ||||||
|  |       "group_wait": "30s" | ||||||
|  |       "receiver": "Default" | ||||||
|  |       "repeat_interval": "12h" | ||||||
|  |       "routes": | ||||||
|  |       - "matchers": | ||||||
|  |         - "alertname = Watchdog" | ||||||
|  |         "receiver": "Watchdog" | ||||||
|  |       - "matchers": | ||||||
|  |         - "alertname = InfoInhibitor" | ||||||
|  |         "receiver": "null" | ||||||
|  |       - "matchers": | ||||||
|  |         - "severity = critical" | ||||||
|  |         "receiver": "Critical" | ||||||
|  | type: Opaque | ||||||
							
								
								
									
										27
									
								
								templates/Service-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								templates/Service-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,27 @@ | |||||||
|  | --- | ||||||
|  | # Service in front of the Alertmanager pods. Exposes the named ports | ||||||
|  | # "web" (9093) and "reloader-web" (8080) with ClientIP session affinity. | ||||||
|  | apiVersion: v1 | ||||||
|  | kind: Service | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     # Templated values are quoted so number-like output stays a string. | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |   name: alertmanager-main | ||||||
|  |   namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
|  | spec: | ||||||
|  |   ports: | ||||||
|  |   - name: web | ||||||
|  |     port: 9093 | ||||||
|  |     targetPort: web | ||||||
|  |   - name: reloader-web | ||||||
|  |     port: 8080 | ||||||
|  |     targetPort: reloader-web | ||||||
|  |   selector: | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |   sessionAffinity: ClientIP | ||||||
							
								
								
									
										14
									
								
								templates/ServiceAccount-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								templates/ServiceAccount-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,14 @@ | |||||||
|  | --- | ||||||
|  | # ServiceAccount referenced by the Alertmanager resource via | ||||||
|  | # serviceAccountName; automatic mounting of the API token is disabled. | ||||||
|  | apiVersion: v1 | ||||||
|  | automountServiceAccountToken: false | ||||||
|  | kind: ServiceAccount | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     # Templated values are quoted so number-like output stays a string. | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |   name: alertmanager-main | ||||||
|  |   namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
							
								
								
									
										29
									
								
								templates/serviceMonitor-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								templates/serviceMonitor-alertmanager.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,29 @@ | |||||||
|  | --- | ||||||
|  | # ServiceMonitor that scrapes the "web" and "reloader-web" ports every 30s | ||||||
|  | # on services carrying the Alertmanager labels, restricted to the | ||||||
|  | # namespace from .Values.nfc_monitoring.alert_manager.namespace. | ||||||
|  | apiVersion: monitoring.coreos.com/v1 | ||||||
|  | kind: ServiceMonitor | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/component: alert-router | ||||||
|  |     app.kubernetes.io/instance: main | ||||||
|  |     app.kubernetes.io/name: alertmanager | ||||||
|  |     # Templated values are quoted so number-like output stays a string. | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service | quote }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version | quote }} | ||||||
|  |   name: alertmanager | ||||||
|  |   namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
|  | spec: | ||||||
|  |   endpoints: | ||||||
|  |   - interval: 30s | ||||||
|  |     port: web | ||||||
|  |   - interval: 30s | ||||||
|  |     port: reloader-web | ||||||
|  |   namespaceSelector: | ||||||
|  |     matchNames: | ||||||
|  |     - {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} | ||||||
|  |   selector: | ||||||
|  |     matchLabels: | ||||||
|  |       app.kubernetes.io/component: alert-router | ||||||
|  |       app.kubernetes.io/instance: main | ||||||
|  |       app.kubernetes.io/name: alertmanager | ||||||
|  |       app.kubernetes.io/part-of: {{ $.Chart.Name | quote }} | ||||||
		Reference in New Issue
	
	Block a user