
feat(grafana-agent): Add manifests

ref: #5
2025-06-12 00:04:10 +09:30
parent 03c5fce8ff
commit 548052736f
16 changed files with 900 additions and 0 deletions

base/ClusterRole-GrafanaAgent.yaml
@@ -0,0 +1,37 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: grafana-agent
labels:
app.kubernetes.io/component: exporter
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
- events
verbs:
- get
- list
- watch
- nonResourceURLs:
- /metrics
verbs:
- get
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create

base/ClusterRoleBinding-Grafana-Agent.yaml
@@ -0,0 +1,15 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
name: grafana-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: grafana-agent
subjects:
- kind: ServiceAccount
name: grafana-agent
namespace: default
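Note: the subject namespace "default" above is a placeholder. The overlay kustomization at the end of this commit carries a replacements entry that rewrites subjects.[name=grafana-agent].namespace to the namespace the DaemonSet is actually deployed into.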

base/ConfigMap-GrafanaAgent.yaml
@@ -0,0 +1,35 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: exporter
name: grafana-agent
data:
agent.yaml: |
metrics:
wal_directory: /tmp/wal
integrations:
node_exporter:
enabled: true
rootfs_path: /host/root
sysfs_path: /host/sys
procfs_path: /host/proc
udev_data_path: /host/root/run/udev/data
# collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/)
filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)"
#filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$"
filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
netclass_ignored_devices: "^(veth.*|cali.*|[a-f0-9]{15})$"
netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$"
include_exporter_metrics: true
enable_collectors:
- uname
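The config above only prepares the WAL and enables the node_exporter integration; nothing pushes samples anywhere, so the integration endpoint is scraped externally (see the ServiceMonitors later in this commit). If the agent itself should ship metrics instead, static mode accepts a remote_write list under metrics.global — a minimal sketch, with a hypothetical receiver URL that is not part of this commit:

metrics:
  wal_directory: /tmp/wal
  global:
    remote_write:
      # hypothetical endpoint; substitute your own receiver
      - url: https://prometheus.example.com/api/v1/write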

base/Daemonset-GrafanaAgent.yaml
@@ -0,0 +1,122 @@
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app.kubernetes.io/component: exporter
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
name: grafana-agent
spec:
selector:
matchLabels:
app.kubernetes.io/component: exporter
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
template:
metadata:
labels:
app.kubernetes.io/component: exporter
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
spec:
automountServiceAccountToken: true
containers:
- args:
- --server.http.address=0.0.0.0:12345
- --config.file=/etc/agent/agent.yaml
- --config.expand-env=true
name: grafana-agent
image: "grafana/agent:v0.43.4"
#imagePullPolicy: Never
ports:
- containerPort: 12345
name: grafana-metrics
protocol: TCP
resources:
limits:
cpu: 1000m
memory: 180Mi
requests:
cpu: 40m
memory: 180Mi
securityContext:
capabilities:
add:
- SYS_TIME
# drop:
# - ALL
readOnlyRootFilesystem: false
privileged: true
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/proc
mountPropagation: HostToContainer
name: proc
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: rootfs
readOnly: true
- mountPath: /var/log
mountPropagation: HostToContainer
name: logs
readOnly: true
- name: config
mountPath: "/etc/agent"
readOnly: false
- name: temp
mountPath: "/tmp"
readOnly: false
- name: agent-data
mountPath: "/etc/agent/data"
readOnly: false
dnsPolicy: ClusterFirst
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /proc
name: proc
- hostPath:
path: /
name: rootfs
- hostPath:
path: /var/log
name: logs
- name: config
configMap:
name: grafana-agent
items:
- key: "agent.yaml"
path: "agent.yaml"
- name: temp
emptyDir: {}
- name: agent-data
emptyDir: {}
- name: var-run
hostPath:
path: /var/run
- name: containerd
hostPath:
path: /var/lib/containerd
- name: disk
hostPath:
path: /dev/disk
nodeSelector:
kubernetes.io/os: linux
hostNetwork: true
hostPID: true
priorityClassName: system-cluster-critical
serviceAccountName: grafana-agent
tolerations:
- operator: Exists
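The --config.expand-env=true flag passed above lets agent.yaml reference environment variables as ${VAR}. A sketch of how that could be used to stamp each agent's node name onto its metrics; the NODE_NAME variable and the external_labels entry are illustrative, not part of this commit:

# added to the grafana-agent container spec:
env:
  - name: NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName
# and referenced from agent.yaml:
# metrics:
#   global:
#     external_labels:
#       node: ${NODE_NAME}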

base/Service-GrafanaAgent.yaml
@@ -0,0 +1,18 @@
---
apiVersion: v1
kind: Service
metadata:
name: grafana-agent
labels:
app.kubernetes.io/component: exporter
spec:
selector:
app.kubernetes.io/component: exporter
ports:
- name: grafana-metrics
port: 12345
targetPort: grafana-metrics
- name: kube-ctrl-mgr
port: 11257
targetPort: kube-ctrl-mgr
sessionAffinity: ClientIP
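Note that targetPort: kube-ctrl-mgr names a port the DaemonSet above never declares, so that Service port cannot resolve as written (the pods do run with hostNetwork: true, so the agent itself can still reach host ports). For the named targetPort to work, the container would need a matching port entry — a sketch, reusing the port number from this Service:

ports:
  - containerPort: 11257
    name: kube-ctrl-mgr
    protocol: TCP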

base/ServiceAccount-GrafanaAgent.yaml
@@ -0,0 +1,8 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
name: grafana-agent
labels:
app.kubernetes.io/component: exporter

base/kustomization.yaml
@@ -0,0 +1,26 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ServiceAccount-GrafanaAgent.yaml
- ClusterRole-GrafanaAgent.yaml
- ClusterRoleBinding-Grafana-Agent.yaml
- ConfigMap-GrafanaAgent.yaml
- Daemonset-GrafanaAgent.yaml
- Service-GrafanaAgent.yaml
# replacements:
# - source:
# kind: DaemonSet
# name: grafana-agent
# fieldPath: metadata.namespace
# targets:
# - select:
# kind: ClusterRoleBinding
# name: grafana-agent
# fieldPaths:
# - subjects.[name=grafana-agent].namespace

components/dashboard/GrafanaDashboard-node-exporter-full.yaml
@@ -0,0 +1,19 @@
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
name: node-exporter
spec:
allowCrossNamespaceImport: true
folder: 'General'
resyncPeriod: 24h
instanceSelector:
matchLabels:
dashboards: grafana
grafanaCom:
id: 1860
# revision: 32 # as @ 19-09-23
revision: 37 # as @ 11-01-25
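Dashboard id 1860 on grafana.com is the community "Node Exporter Full" dashboard. For grafana-operator to import it, a Grafana instance whose labels match the instanceSelector must exist — a minimal sketch (the instance name and empty spec are illustrative):

apiVersion: grafana.integreatly.org/v1beta1
kind: Grafana
metadata:
  name: grafana            # illustrative name
  labels:
    dashboards: grafana    # must match spec.instanceSelector above
spec: {}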

components/dashboard/kustomization.yaml
@@ -0,0 +1,7 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- GrafanaDashboard-node-exporter-full.yaml

components/metrics/ServiceMonitor-Node.yaml
@@ -0,0 +1,77 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring
name: node
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 5s
honorLabels: true
path: /metrics
port: grafana-metrics
scheme: http
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
- targetLabel: "job"
replacement: "node-exporter"
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: node
metricRelabelings:
- action: drop
sourceLabels: [__name__]
regex: '(promtail_).*'
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 5s
honorLabels: true
path: /metrics
port: grafana-metrics
scheme: http
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
- targetLabel: "job"
replacement: "promtail"
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: node
- action: replace
separator: /
sourceLabels:
- namespace
- job
targetLabel: job
metricRelabelings:
- action: keep
sourceLabels: [__name__]
regex: '(promtail_).*'
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring

components/metrics/ServiceMonitor-node-exporter.yaml
@@ -0,0 +1,44 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring
name: node-exporter
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 5s
honorLabels: true
path: /integrations/node_exporter/metrics
port: grafana-metrics
scheme: http
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
- targetLabel: "job"
replacement: "node-exporter"
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: node
tlsConfig:
insecureSkipVerify: true
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring

components/metrics/kustomization.yaml
@@ -0,0 +1,8 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- ServiceMonitor-node-exporter.yaml
- ServiceMonitor-Node.yaml
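These monitors are packaged as a Kustomize component, so they only take effect when an overlay opts in, e.g. by uncommenting the corresponding entry in the overlay kustomization at the end of this commit:

components:
  - ../../components/metrics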

components/prometheus-rules/PrometheusRule-grafana-agent.yaml
@@ -0,0 +1,115 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
prometheus: prometheus
role: alert-rules
name: grafana-agent
spec:
groups:
- name: grafana_agent
rules:
# - annotations:
#     description: "As Grafana Agent is being used, its version is recorded as promtail's"
- expr: |
agent_build_info
record: promtail_build_info
- name: promtail_rules
rules:
- expr:
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:99quantile
- expr:
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:50quantile
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
by (job)
record: job:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
record: job:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
record: job:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
record: job:promtail_request_duration_seconds_count:sum_rate
- expr:
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:99quantile
- expr:
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:50quantile
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
/ sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
- expr:
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
- expr:
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds:avg
- expr:
sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr:
sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
- name: promtail_alerts
rules:
- alert: PromtailRequestsErrors
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
/
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
> 10
for: 15m
labels:
severity: critical
- alert: PromtailRequestLatency
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
for: 15m
labels:
severity: critical
- alert: PromtailFileMissing
annotations:
message: |
{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
expr: |
promtail_file_bytes_total unless promtail_read_bytes_total
for: 15m
labels:
severity: warning
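Recording agent_build_info as promtail_build_info lets the stock promtail dashboards and alerts keep working against Grafana Agent's embedded promtail metrics. Rules like PromtailRequestsErrors can be unit-tested with promtool test rules; a sketch, assuming spec.groups above has been extracted into a plain rules file (the file name and series values are hypothetical):

rule_files:
  - promtail-rules.yaml    # hypothetical extracted rules file
evaluation_interval: 1m
tests:
  - interval: 30s
    input_series:
      # every request fails, so the error ratio is 100%, above the 10% threshold
      - series: 'promtail_request_duration_seconds_count{status_code="500",namespace="monitoring",job="promtail",route="api",instance="node1"}'
        values: '0+30x60'
    alert_rule_test:
      - eval_time: 20m
        alertname: PromtailRequestsErrors
        exp_alerts:
          - exp_labels:
              severity: critical
              namespace: monitoring
              job: promtail
              route: api
              instance: node1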

components/prometheus-rules/PrometheusRule-nodeExporter.yaml
@@ -0,0 +1,324 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
prometheus: prometheus
role: alert-rules
name: node-exporter-rules
spec:
groups:
- name: node-exporter
rules:
# {{ range $index, $node := (lookup "v1" "Node" "" "").items }}
# - alert: NodeExporterJobMissing-{{ $node.metadata.name }}
# annotations:
# summary: Node Exporter job missing for node {{ $node.metadata.name }}. (instance {{ $labels.instance }})
# description: "Node Exporter job has disappeared\n Node = {{ $node.metadata.name }}\n Value = {{ $value }}\n LABELS = {{ $labels }}"
# expr: absent(up{job="node-exporter", node="{{ $node.metadata.name }}"})
# for: 0m
# labels:
# severity: critical
# {{ end }}
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
node_timex_offset_seconds{job="node-exporter"} < -0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
)
for: 15m
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
)
for: 15m
labels:
severity: critical
- name: node-exporter.rules
rules:
- expr: |
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
record: instance:node_cpu_utilisation:rate5m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate5m

components/prometheus-rules/kustomization.yaml
@@ -0,0 +1,8 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- PrometheusRule-grafana-agent.yaml
- PrometheusRule-nodeExporter.yaml

kustomization.yaml (overlay)
@@ -0,0 +1,37 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: node-exporter
labels:
- includeSelectors: true
pairs:
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: grafana-agent
resources:
- ../../base
# components:
# - ../../components/dashboard
# - ../../components/metrics
# - ../../components/prometheus-rules
replacements:
- source:
kind: DaemonSet
name: grafana-agent
fieldPath: metadata.namespace
targets:
- select:
kind: ClusterRoleBinding
name: grafana-agent
fieldPaths:
- subjects.[name=grafana-agent].namespace
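With namespace: node-exporter set above, the replacements entry rewrites the ClusterRoleBinding inherited from the base, so after kustomize build its subject points at the deployment namespace:

subjects:
  - kind: ServiceAccount
    name: grafana-agent
    namespace: node-exporter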