{{ if .Values.nfc_monitoring.kube_monitor_proxy.enabled }} --- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: app.kubernetes.io/component: exporter app.kubernetes.io/instance: {{ $.Release.Name }} app.kubernetes.io/name: kube-prometheus app.kubernetes.io/part-of: {{ $.Chart.Name }} app.kubernetes.io/version: {{ $.Chart.Version }} app.kubernetes.io/managed-by: {{ $.Release.Service }} prometheus: {{ $.Release.Name }} role: alert-rules name: kube-prometheus-rules namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: general.rules rules: - alert: TargetDown annotations: description: '{{ `{{` }} printf "%.4g" $value }}% of the {{ `{{` }} $labels.job }}/{{ `{{` }} $labels.service }} targets in {{ `{{` }} $labels.namespace }} namespace are down.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 for: 10m labels: severity: warning - name: node-network rules: - alert: NodeNetworkInterfaceFlapping annotations: description: Network interface "{{ `{{` }} $labels.device }}" changing its up status often on node-exporter {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }} runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping summary: Network interface is often changing its status expr: | changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m labels: severity: warning - name: kube-prometheus-node-recording.rules rules: - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) record: instance:node_cpu:rate:sum - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) record: instance:node_cpu:ratio - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) record: cluster:node_cpu:sum_rate5m - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) record: cluster:node_cpu:ratio - name: kube-prometheus-general.rules rules: - expr: count without(instance, pod, node) (up == 1) record: count:up1 - expr: count without(instance, pod, node) (up == 0) record: count:up0 {{ end }}