diff --git a/templates/PrometheusRule-nodeExporter.yaml b/templates/PrometheusRule-nodeExporter.yaml new file mode 100644 index 0000000..5063919 --- /dev/null +++ b/templates/PrometheusRule-nodeExporter.yaml @@ -0,0 +1,318 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: node-exporter-rules + namespace: monitoring +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: | + rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: | + rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: | + node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock on {{ `{{` }} $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: | + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock on {{ `{{` }} $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ `{{` }} $labels.device }}' on {{ `{{` }} $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded + expr: | + node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{ `{{` }} $labels.instance }} failed. Array '{{ `{{` }} $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array + expr: | + node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - name: node-exporter.rules + rules: + - expr: | + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) + ) + record: instance:node_cpu_utilisation:rate5m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + ( + node_memory_MemAvailable_bytes{job="node-exporter"} + or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + + node_memory_Cached_bytes{job="node-exporter"} + + + node_memory_MemFree_bytes{job="node-exporter"} + + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate5m