mirror of
https://github.com/nofusscomputing/kubernetes.git
synced 2025-08-02 04:22:42 +00:00
37
manifests/grafana-agent/base/ClusterRole-GrafanaAgent.yaml
Normal file
37
manifests/grafana-agent/base/ClusterRole-GrafanaAgent.yaml
Normal file
@ -0,0 +1,37 @@
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: grafana-agent
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
rules:
|
||||
- apiGroups:
|
||||
- ""
|
||||
resources:
|
||||
- nodes
|
||||
- nodes/proxy
|
||||
- services
|
||||
- endpoints
|
||||
- pods
|
||||
- events
|
||||
verbs:
|
||||
- get
|
||||
- list
|
||||
- watch
|
||||
- nonResourceURLs:
|
||||
- /metrics
|
||||
verbs:
|
||||
- get
|
||||
- apiGroups:
|
||||
- authentication.k8s.io
|
||||
resources:
|
||||
- tokenreviews
|
||||
verbs:
|
||||
- create
|
||||
- apiGroups:
|
||||
- authorization.k8s.io
|
||||
resources:
|
||||
- subjectaccessreviews
|
||||
verbs:
|
||||
- create
|
@ -0,0 +1,15 @@
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
name: grafana-agent
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: grafana-agent
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: grafana-agent
|
||||
namespace: default
|
35
manifests/grafana-agent/base/ConfigMap-GrafanaAgent.yaml
Normal file
35
manifests/grafana-agent/base/ConfigMap-GrafanaAgent.yaml
Normal file
@ -0,0 +1,35 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
name: grafana-agent
|
||||
data:
|
||||
agent.yaml: |
|
||||
metrics:
|
||||
wal_directory: /tmp/wal
|
||||
|
||||
|
||||
integrations:
|
||||
|
||||
node_exporter:
|
||||
enabled: true
|
||||
rootfs_path: /host/root
|
||||
sysfs_path: /host/sys
|
||||
procfs_path: /host/proc
|
||||
udev_data_path: /host/root/run/udev/data
|
||||
|
||||
# collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/)
|
||||
filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)"
|
||||
#filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$"
|
||||
filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
|
||||
|
||||
|
||||
netclass_ignored_devices: "^(veth.*|cali.*|[a-f0-9]{15})$"
|
||||
netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$"
|
||||
|
||||
|
||||
include_exporter_metrics: true
|
||||
enable_collectors:
|
||||
- uname
|
122
manifests/grafana-agent/base/Daemonset-GrafanaAgent.yaml
Normal file
122
manifests/grafana-agent/base/Daemonset-GrafanaAgent.yaml
Normal file
@ -0,0 +1,122 @@
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
metricsJob: node-exporter
|
||||
cadvisormetricsJob: cadvisor
|
||||
nodeExportermetricsJob: node
|
||||
name: grafana-agent
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: exporter
|
||||
metricsJob: node-exporter
|
||||
cadvisormetricsJob: cadvisor
|
||||
nodeExportermetricsJob: node
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
metricsJob: node-exporter
|
||||
cadvisormetricsJob: cadvisor
|
||||
nodeExportermetricsJob: node
|
||||
spec:
|
||||
automountServiceAccountToken: true
|
||||
containers:
|
||||
- args:
|
||||
- --server.http.address=0.0.0.0:12345
|
||||
- --config.file=/etc/agent/agent.yaml
|
||||
- --config.expand-env=true
|
||||
name: grafana-agent
|
||||
image: "grafana/agent:v0.43.4"
|
||||
#imagePullPolicy: Never
|
||||
ports:
|
||||
- containerPort: 12345
|
||||
name: grafana-metrics
|
||||
protocol: TCP
|
||||
resources:
|
||||
limits:
|
||||
cpu: 1000m
|
||||
memory: 180Mi
|
||||
requests:
|
||||
cpu: 40m
|
||||
memory: 180Mi
|
||||
securityContext:
|
||||
capabilities:
|
||||
add:
|
||||
- SYS_TIME
|
||||
# drop:
|
||||
# - ALL
|
||||
readOnlyRootFilesystem: false
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /host/sys
|
||||
mountPropagation: HostToContainer
|
||||
name: sys
|
||||
readOnly: true
|
||||
- mountPath: /host/proc
|
||||
mountPropagation: HostToContainer
|
||||
name: proc
|
||||
readOnly: true
|
||||
- mountPath: /host/root
|
||||
mountPropagation: HostToContainer
|
||||
name: rootfs
|
||||
readOnly: true
|
||||
- mountPath: /var/log
|
||||
mountPropagation: HostToContainer
|
||||
name: logs
|
||||
readOnly: true
|
||||
- name: config
|
||||
mountPath: "/etc/agent"
|
||||
readOnly: false
|
||||
- name: temp
|
||||
mountPath: "/tmp"
|
||||
readOnly: false
|
||||
- name: agent-data
|
||||
mountPath: "/etc/agent/data"
|
||||
readOnly: false
|
||||
dnsPolicy: ClusterFirst
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /sys
|
||||
name: sys
|
||||
- hostPath:
|
||||
path: /proc
|
||||
name: proc
|
||||
- hostPath:
|
||||
path: /
|
||||
name: rootfs
|
||||
- hostPath:
|
||||
path: /var/log
|
||||
name: logs
|
||||
- name: config
|
||||
configMap:
|
||||
name: grafana-agent
|
||||
items:
|
||||
- key: "agent.yaml"
|
||||
path: "agent.yaml"
|
||||
- name: temp
|
||||
emptyDir: {}
|
||||
- name: agent-data
|
||||
emptyDir: {}
|
||||
|
||||
- name: var-run
|
||||
hostPath:
|
||||
path: /var/run
|
||||
- name: containerd
|
||||
hostPath:
|
||||
path: /var/lib/containerd
|
||||
- name: disk
|
||||
hostPath:
|
||||
path: /dev/disk
|
||||
|
||||
nodeSelector:
|
||||
kubernetes.io/os: linux
|
||||
hostNetwork: true
|
||||
hostPID: true
|
||||
priorityClassName: system-cluster-critical
|
||||
serviceAccountName: grafana-agent
|
||||
tolerations:
|
||||
- operator: Exists
|
18
manifests/grafana-agent/base/Service-GrafanaAgent.yaml
Normal file
18
manifests/grafana-agent/base/Service-GrafanaAgent.yaml
Normal file
@ -0,0 +1,18 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: grafana-agent
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
spec:
|
||||
selector:
|
||||
app.kubernetes.io/component: exporter
|
||||
ports:
|
||||
- name: grafana-metrics
|
||||
port: 12345
|
||||
targetPort: grafana-metrics
|
||||
- name: kube-ctrl-mgr
|
||||
port: 11257
|
||||
targetPort: kube-ctrl-mgr
|
||||
sessionAffinity: ClientIP
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
automountServiceAccountToken: false
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: grafana-agent
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
26
manifests/grafana-agent/base/kustomization.yaml
Normal file
26
manifests/grafana-agent/base/kustomization.yaml
Normal file
@ -0,0 +1,26 @@
|
||||
---
|
||||
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
|
||||
resources:
|
||||
- ServiceAccount-GrafanaAgent.yaml
|
||||
- ClusterRole-GrafanaAgent.yaml
|
||||
- ClusterRoleBinding-Grafana-Agent.yaml
|
||||
- ConfigMap-GrafanaAgent.yaml
|
||||
- Daemonset-GrafanaAgent.yaml
|
||||
- Service-GrafanaAgent.yaml
|
||||
|
||||
|
||||
# replacements:
|
||||
# - source:
|
||||
# kind: DaemonSet
|
||||
# name: grafana-agent
|
||||
# fieldPath: metadata.namespace
|
||||
# targets:
|
||||
# - select:
|
||||
# kind: ClusterRoleBinding
|
||||
# name: grafana-agent
|
||||
# fieldPaths:
|
||||
# - subjects.[name=grafana-agent].namespace
|
@ -0,0 +1,19 @@
|
||||
---
|
||||
apiVersion: grafana.integreatly.org/v1beta1
|
||||
kind: GrafanaDashboard
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
app.kubernetes.io/instance: cluster
|
||||
name: node-exporter
|
||||
spec:
|
||||
allowCrossNamespaceImport: true
|
||||
folder: 'General'
|
||||
resyncPeriod: 24h
|
||||
instanceSelector:
|
||||
matchLabels:
|
||||
dashboards: grafana
|
||||
grafanaCom:
|
||||
id: 1860
|
||||
# revision: 32 # as @ 19-09-23
|
||||
revision: 37 # as @ 11-01-25
|
@ -0,0 +1,7 @@
|
||||
---
|
||||
apiVersion: kustomize.config.k8s.io/v1alpha1
|
||||
kind: Component
|
||||
|
||||
|
||||
resources:
|
||||
- GrafanaDashboard-node-exporter-full.yaml
|
@ -0,0 +1,77 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
app.kubernetes.io/instance: cluster
|
||||
app.kubernetes.io/name: grafana-agent
|
||||
app.kubernetes.io/part-of: monitoring
|
||||
name: node
|
||||
spec:
|
||||
endpoints:
|
||||
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
interval: 5s
|
||||
honorLabels: true
|
||||
path: /metrics
|
||||
port: grafana-metrics
|
||||
scheme: http
|
||||
relabelings:
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: instance
|
||||
- targetLabel: "job"
|
||||
replacement: "node-exporter"
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: node
|
||||
metricRelabelings:
|
||||
- action: drop
|
||||
sourceLabels: [__name__]
|
||||
regex: '(promtail_).*'
|
||||
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
interval: 5s
|
||||
honorLabels: true
|
||||
path: /metrics
|
||||
port: grafana-metrics
|
||||
scheme: http
|
||||
relabelings:
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: instance
|
||||
- targetLabel: "job"
|
||||
replacement: "promtail"
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: node
|
||||
- action: replace
|
||||
separator: /
|
||||
sourceLabels:
|
||||
- namespace
|
||||
- job
|
||||
targetLabel: job
|
||||
metricRelabelings:
|
||||
- action: keep
|
||||
sourceLabels: [__name__]
|
||||
regex: '(promtail_).*'
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: exporter
|
||||
app.kubernetes.io/instance: cluster
|
||||
app.kubernetes.io/name: grafana-agent
|
||||
app.kubernetes.io/part-of: monitoring
|
||||
|
@ -0,0 +1,44 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
app.kubernetes.io/instance: cluster
|
||||
app.kubernetes.io/name: grafana-agent
|
||||
app.kubernetes.io/part-of: monitoring
|
||||
name: node-exporter
|
||||
namespace: monitoring
|
||||
spec:
|
||||
endpoints:
|
||||
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
interval: 5s
|
||||
honorLabels: true
|
||||
path: /integrations/node_exporter/metrics
|
||||
port: grafana-metrics
|
||||
scheme: http
|
||||
relabelings:
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: instance
|
||||
- targetLabel: "job"
|
||||
replacement: "node-exporter"
|
||||
- action: replace
|
||||
regex: (.*)
|
||||
replacement: $1
|
||||
sourceLabels:
|
||||
- __meta_kubernetes_pod_node_name
|
||||
targetLabel: node
|
||||
tlsConfig:
|
||||
insecureSkipVerify: true
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/component: exporter
|
||||
app.kubernetes.io/instance: cluster
|
||||
app.kubernetes.io/name: grafana-agent
|
||||
app.kubernetes.io/part-of: monitoring
|
||||
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
apiVersion: kustomize.config.k8s.io/v1alpha1
|
||||
kind: Component
|
||||
|
||||
|
||||
resources:
|
||||
- ServiceMonitor-node-exporter.yaml
|
||||
- ServiceMonitor-Node.yaml
|
@ -0,0 +1,115 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
app.kubernetes.io/instance: cluster
|
||||
prometheus: prometheus
|
||||
role: alert-rules
|
||||
name: grafana-agent
|
||||
spec:
|
||||
groups:
|
||||
- name: grafana_agent
|
||||
rules:
|
||||
# - annotations:
|
||||
# description: "As Grafana Agent is being used, its version is set as promtail's"
|
||||
- expr: |
|
||||
agent_build_info
|
||||
record: promtail_build_info
|
||||
- name: promtail_rules
|
||||
rules:
|
||||
- expr:
|
||||
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:promtail_request_duration_seconds:99quantile
|
||||
- expr:
|
||||
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:promtail_request_duration_seconds:50quantile
|
||||
- expr:
|
||||
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
|
||||
by (job)
|
||||
record: job:promtail_request_duration_seconds:avg
|
||||
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
|
||||
record: job:promtail_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
|
||||
record: job:promtail_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
|
||||
record: job:promtail_request_duration_seconds_count:sum_rate
|
||||
- expr:
|
||||
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, namespace))
|
||||
record: job_namespace:promtail_request_duration_seconds:99quantile
|
||||
- expr:
|
||||
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, namespace))
|
||||
record: job_namespace:promtail_request_duration_seconds:50quantile
|
||||
- expr:
|
||||
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
|
||||
/ sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds:avg
|
||||
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
|
||||
- expr:
|
||||
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, status_code, namespace))
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
|
||||
- expr:
|
||||
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, status_code, namespace))
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
|
||||
- expr:
|
||||
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
|
||||
namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
|
||||
status_code, namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds:avg
|
||||
- expr:
|
||||
sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
|
||||
namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
|
||||
- expr:
|
||||
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
|
||||
namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
|
||||
- expr:
|
||||
sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
|
||||
namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
|
||||
|
||||
- name: promtail_alerts
|
||||
rules:
|
||||
- alert: PromtailRequestsErrors
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|
||||
expr: |
|
||||
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
|
||||
/
|
||||
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
|
||||
> 10
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PromtailRequestLatency
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|
||||
expr: |
|
||||
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PromtailFileMissing
|
||||
annotations:
|
||||
message: |
|
||||
{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
|
||||
expr: |
|
||||
promtail_file_bytes_total unless promtail_read_bytes_total
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
@ -0,0 +1,324 @@
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/component: exporter
|
||||
app.kubernetes.io/instance: cluster
|
||||
prometheus: prometheus
|
||||
role: alert-rules
|
||||
name: node-exporter-rules
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
# {{ range $index, $node := (lookup "v1" "Node" "" "").items }}
|
||||
# - alert: NodeExporterJobMissing-{{ $node.metadata.name }}
|
||||
# annotations:
|
||||
# summary: Node Exporter job missing for node {{ $node.metadata.name }}. (instance {{ $labels.instance }})
|
||||
# description: "Node Exporter job has disappeared\n Node = {{ $node.metadata.name }}\n Value = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
# expr: absent(up{job="node-exporter", node="{{ $node.metadata.name }}"})
|
||||
# for: 0m
|
||||
# labels:
|
||||
# severity: critical
|
||||
# {{ end }}
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: |
|
||||
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: |
|
||||
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: |
|
||||
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector failed to scrape.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: |
|
||||
node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |
|
||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||
summary: RAID Array is degraded
|
||||
expr: |
|
||||
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array
|
||||
expr: |
|
||||
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: node-exporter.rules
|
||||
rules:
|
||||
- expr: |
|
||||
count without (cpu, mode) (
|
||||
node_cpu_seconds_total{job="node-exporter",mode="idle"}
|
||||
)
|
||||
record: instance:node_num_cpu:sum
|
||||
- expr: |
|
||||
1 - avg without (cpu) (
|
||||
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
|
||||
)
|
||||
record: instance:node_cpu_utilisation:rate5m
|
||||
- expr: |
|
||||
(
|
||||
node_load1{job="node-exporter"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
- expr: |
|
||||
1 - (
|
||||
(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"}
|
||||
or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Cached_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_MemFree_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
)
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
- expr: |
|
||||
rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||
record: instance:node_vmstat_pgmajfault:rate5m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||
- expr: |
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||
- expr: |
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
@ -0,0 +1,8 @@
|
||||
---
|
||||
apiVersion: kustomize.config.k8s.io/v1alpha1
|
||||
kind: Component
|
||||
|
||||
|
||||
resources:
|
||||
- PrometheusRule-grafana-agent.yaml
|
||||
- PrometheusRule-nodeExporter.yaml
|
@ -0,0 +1,37 @@
|
||||
---
|
||||
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
|
||||
namespace: node-exporter
|
||||
|
||||
|
||||
labels:
|
||||
- includeSelectors: true
|
||||
pairs:
|
||||
app.kubernetes.io/instance: cluster
|
||||
app.kubernetes.io/name: grafana-agent
|
||||
app.kubernetes.io/part-of: grafana-agent
|
||||
|
||||
|
||||
resources:
|
||||
- ../../base
|
||||
|
||||
# components:
|
||||
# - ../../components/dashboard
|
||||
# - ../../components/metrics
|
||||
# - ../../components/prometheus-rules
|
||||
|
||||
|
||||
replacements:
|
||||
- source:
|
||||
kind: DaemonSet
|
||||
name: grafana-agent
|
||||
fieldPath: metadata.namespace
|
||||
targets:
|
||||
- select:
|
||||
kind: ClusterRoleBinding
|
||||
name: grafana-agent
|
||||
fieldPaths:
|
||||
- subjects.[name=grafana-agent].namespace
|
Reference in New Issue
Block a user