
feat(grafana-agent): Add manifests

ref: #5
2025-06-12 00:04:10 +09:30
parent 03c5fce8ff
commit 548052736f
16 changed files with 900 additions and 0 deletions

base/ClusterRole-GrafanaAgent.yaml
@@ -0,0 +1,37 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: grafana-agent
labels:
app.kubernetes.io/component: exporter
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
- events
verbs:
- get
- list
- watch
- nonResourceURLs:
- /metrics
verbs:
- get
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create

base/ClusterRoleBinding-Grafana-Agent.yaml
@@ -0,0 +1,15 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
name: grafana-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: grafana-agent
subjects:
- kind: ServiceAccount
name: grafana-agent
namespace: default
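Note: the subject namespace "default" above is a placeholder. The overlay kustomization at the end of this commit carries a replacements entry that rewrites subjects.[name=grafana-agent].namespace to the namespace the DaemonSet is actually deployed into.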

base/ConfigMap-GrafanaAgent.yaml
@@ -0,0 +1,35 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: exporter
name: grafana-agent
data:
agent.yaml: |
metrics:
wal_directory: /tmp/wal
integrations:
node_exporter:
enabled: true
rootfs_path: /host/root
sysfs_path: /host/sys
procfs_path: /host/proc
udev_data_path: /host/root/run/udev/data
# collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/)
filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)"
#filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$"
filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
netclass_ignored_devices: "^(veth.*|cali.*|[a-f0-9]{15})$"
netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$"
include_exporter_metrics: true
enable_collectors:
- uname
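The config above only prepares the WAL and enables the node_exporter integration; nothing pushes samples anywhere, so the integration endpoint is scraped externally (see the ServiceMonitors later in this commit). If the agent itself should ship metrics instead, static mode accepts a remote_write list under metrics.global — a minimal sketch, with a hypothetical receiver URL that is not part of this commit:

metrics:
  wal_directory: /tmp/wal
  global:
    remote_write:
      # hypothetical endpoint; substitute your own receiver
      - url: https://prometheus.example.com/api/v1/write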

base/Daemonset-GrafanaAgent.yaml
@@ -0,0 +1,122 @@
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app.kubernetes.io/component: exporter
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
name: grafana-agent
spec:
selector:
matchLabels:
app.kubernetes.io/component: exporter
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
template:
metadata:
labels:
app.kubernetes.io/component: exporter
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
spec:
automountServiceAccountToken: true
containers:
- args:
- --server.http.address=0.0.0.0:12345
- --config.file=/etc/agent/agent.yaml
- --config.expand-env=true
name: grafana-agent
image: "grafana/agent:v0.43.4"
#imagePullPolicy: Never
ports:
- containerPort: 12345
name: grafana-metrics
protocol: TCP
resources:
limits:
cpu: 1000m
memory: 180Mi
requests:
cpu: 40m
memory: 180Mi
securityContext:
capabilities:
add:
- SYS_TIME
# drop:
# - ALL
readOnlyRootFilesystem: false
privileged: true
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/proc
mountPropagation: HostToContainer
name: proc
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: rootfs
readOnly: true
- mountPath: /var/log
mountPropagation: HostToContainer
name: logs
readOnly: true
- name: config
mountPath: "/etc/agent"
readOnly: false
- name: temp
mountPath: "/tmp"
readOnly: false
- name: agent-data
mountPath: "/etc/agent/data"
readOnly: false
dnsPolicy: ClusterFirst
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /proc
name: proc
- hostPath:
path: /
name: rootfs
- hostPath:
path: /var/log
name: logs
- name: config
configMap:
name: grafana-agent
items:
- key: "agent.yaml"
path: "agent.yaml"
- name: temp
emptyDir: {}
- name: agent-data
emptyDir: {}
- name: var-run
hostPath:
path: /var/run
- name: containerd
hostPath:
path: /var/lib/containerd
- name: disk
hostPath:
path: /dev/disk
nodeSelector:
kubernetes.io/os: linux
hostNetwork: true
hostPID: true
priorityClassName: system-cluster-critical
serviceAccountName: grafana-agent
tolerations:
- operator: Exists
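The --config.expand-env=true flag passed above lets agent.yaml reference environment variables as ${VAR}. A sketch of how that could be used to stamp each agent's node name onto its metrics; the NODE_NAME variable and the external_labels entry are illustrative, not part of this commit:

# added to the grafana-agent container spec:
env:
  - name: NODE_NAME
    valueFrom:
      fieldRef:
        fieldPath: spec.nodeName
# and referenced from agent.yaml:
# metrics:
#   global:
#     external_labels:
#       node: ${NODE_NAME}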

base/Service-GrafanaAgent.yaml
@@ -0,0 +1,18 @@
---
apiVersion: v1
kind: Service
metadata:
name: grafana-agent
labels:
app.kubernetes.io/component: exporter
spec:
selector:
app.kubernetes.io/component: exporter
ports:
- name: grafana-metrics
port: 12345
targetPort: grafana-metrics
- name: kube-ctrl-mgr
port: 11257
targetPort: kube-ctrl-mgr
sessionAffinity: ClientIP
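Note that targetPort: kube-ctrl-mgr names a port the DaemonSet above never declares, so that Service port cannot resolve as written (the pods do run with hostNetwork: true, so the agent itself can still reach host ports). For the named targetPort to work, the container would need a matching port entry — a sketch, reusing the port number from this Service:

ports:
  - containerPort: 11257
    name: kube-ctrl-mgr
    protocol: TCP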

base/ServiceAccount-GrafanaAgent.yaml
@@ -0,0 +1,8 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
name: grafana-agent
labels:
app.kubernetes.io/component: exporter

base/kustomization.yaml
@@ -0,0 +1,26 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ServiceAccount-GrafanaAgent.yaml
- ClusterRole-GrafanaAgent.yaml
- ClusterRoleBinding-Grafana-Agent.yaml
- ConfigMap-GrafanaAgent.yaml
- Daemonset-GrafanaAgent.yaml
- Service-GrafanaAgent.yaml
# replacements:
# - source:
# kind: DaemonSet
# name: grafana-agent
# fieldPath: metadata.namespace
# targets:
# - select:
# kind: ClusterRoleBinding
# name: grafana-agent
# fieldPaths:
# - subjects.[name=grafana-agent].namespace

components/dashboard/GrafanaDashboard-node-exporter-full.yaml
@@ -0,0 +1,19 @@
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
name: node-exporter
spec:
allowCrossNamespaceImport: true
folder: 'General'
resyncPeriod: 24h
instanceSelector:
matchLabels:
dashboards: grafana
grafanaCom:
id: 1860
# revision: 32 # as @ 19-09-23
revision: 37 # as @ 11-01-25
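Dashboard id 1860 on grafana.com is the community "Node Exporter Full" dashboard. For grafana-operator to import it, a Grafana instance whose labels match the instanceSelector must exist — a minimal sketch (the instance name and empty spec are illustrative):

apiVersion: grafana.integreatly.org/v1beta1
kind: Grafana
metadata:
  name: grafana            # illustrative name
  labels:
    dashboards: grafana    # must match spec.instanceSelector above
spec: {}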

components/dashboard/kustomization.yaml
@@ -0,0 +1,7 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- GrafanaDashboard-node-exporter-full.yaml

components/metrics/ServiceMonitor-Node.yaml
@@ -0,0 +1,77 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring
name: node
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 5s
honorLabels: true
path: /metrics
port: grafana-metrics
scheme: http
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
- targetLabel: "job"
replacement: "node-exporter"
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: node
metricRelabelings:
- action: drop
sourceLabels: [__name__]
regex: '(promtail_).*'
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 5s
honorLabels: true
path: /metrics
port: grafana-metrics
scheme: http
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
- targetLabel: "job"
replacement: "promtail"
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: node
- action: replace
separator: /
sourceLabels:
- namespace
- job
targetLabel: job
metricRelabelings:
- action: keep
sourceLabels: [__name__]
regex: '(promtail_).*'
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring

components/metrics/ServiceMonitor-node-exporter.yaml
@@ -0,0 +1,44 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring
name: node-exporter
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 5s
honorLabels: true
path: /integrations/node_exporter/metrics
port: grafana-metrics
scheme: http
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
- targetLabel: "job"
replacement: "node-exporter"
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: node
tlsConfig:
insecureSkipVerify: true
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: monitoring

components/metrics/kustomization.yaml
@@ -0,0 +1,8 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- ServiceMonitor-node-exporter.yaml
- ServiceMonitor-Node.yaml
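These monitors are packaged as a Kustomize component, so they only take effect when an overlay opts in, e.g. by uncommenting the corresponding entry in the overlay kustomization at the end of this commit:

components:
  - ../../components/metrics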

components/prometheus-rules/PrometheusRule-grafana-agent.yaml
@@ -0,0 +1,115 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
prometheus: prometheus
role: alert-rules
name: grafana-agent
spec:
groups:
- name: grafana_agent
rules:
# - annotations:
#     description: "As Grafana Agent is being used, its version is recorded as promtail's"
- expr: |
agent_build_info
record: promtail_build_info
- name: promtail_rules
rules:
- expr:
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:99quantile
- expr:
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:50quantile
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
by (job)
record: job:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
record: job:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
record: job:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
record: job:promtail_request_duration_seconds_count:sum_rate
- expr:
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:99quantile
- expr:
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:50quantile
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
/ sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
- expr:
histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
- expr:
histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds:avg
- expr:
sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr:
sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr:
sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
- name: promtail_alerts
rules:
- alert: PromtailRequestsErrors
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
/
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
> 10
for: 15m
labels:
severity: critical
- alert: PromtailRequestLatency
annotations:
message: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
expr: |
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
for: 15m
labels:
severity: critical
- alert: PromtailFileMissing
annotations:
message: |
{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
expr: |
promtail_file_bytes_total unless promtail_read_bytes_total
for: 15m
labels:
severity: warning
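Recording agent_build_info as promtail_build_info lets the stock promtail dashboards and alerts keep working against Grafana Agent's embedded promtail metrics. Rules like PromtailRequestsErrors can be unit-tested with promtool test rules; a sketch, assuming spec.groups above has been extracted into a plain rules file (the file name and series values are hypothetical):

rule_files:
  - promtail-rules.yaml    # hypothetical extracted rules file
evaluation_interval: 1m
tests:
  - interval: 30s
    input_series:
      # every request fails, so the error ratio is 100%, above the 10% threshold
      - series: 'promtail_request_duration_seconds_count{status_code="500",namespace="monitoring",job="promtail",route="api",instance="node1"}'
        values: '0+30x60'
    alert_rule_test:
      - eval_time: 20m
        alertname: PromtailRequestsErrors
        exp_alerts:
          - exp_labels:
              severity: critical
              namespace: monitoring
              job: promtail
              route: api
              instance: node1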

components/prometheus-rules/PrometheusRule-nodeExporter.yaml
@@ -0,0 +1,324 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: cluster
prometheus: prometheus
role: alert-rules
name: node-exporter-rules
spec:
groups:
- name: node-exporter
rules:
# {{ range $index, $node := (lookup "v1" "Node" "" "").items }}
# - alert: NodeExporterJobMissing-{{ $node.metadata.name }}
# annotations:
# summary: Node Exporter job missing for node {{ $node.metadata.name }}. (instance {{ $labels.instance }})
# description: "Node Exporter job has disappeared\n Node = {{ $node.metadata.name }}\n Value = {{ $value }}\n LABELS = {{ $labels }}"
# expr: absent(up{job="node-exporter", node="{{ $node.metadata.name }}"})
# for: 0m
# labels:
# severity: critical
# {{ end }}
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
node_timex_offset_seconds{job="node-exporter"} < -0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
)
for: 15m
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
)
for: 15m
labels:
severity: critical
- name: node-exporter.rules
rules:
- expr: |
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
record: instance:node_cpu_utilisation:rate5m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate5m

components/prometheus-rules/kustomization.yaml
@@ -0,0 +1,8 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- PrometheusRule-grafana-agent.yaml
- PrometheusRule-nodeExporter.yaml

kustomization.yaml (overlay)
@@ -0,0 +1,37 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: node-exporter
labels:
- includeSelectors: true
pairs:
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: grafana-agent
app.kubernetes.io/part-of: grafana-agent
resources:
- ../../base
# components:
# - ../../components/dashboard
# - ../../components/metrics
# - ../../components/prometheus-rules
replacements:
- source:
kind: DaemonSet
name: grafana-agent
fieldPath: metadata.namespace
targets:
- select:
kind: ClusterRoleBinding
name: grafana-agent
fieldPaths:
- subjects.[name=grafana-agent].namespace
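With namespace: node-exporter set above, the replacements entry rewrites the ClusterRoleBinding inherited from the base, so after kustomize build its subject points at the deployment namespace:

subjects:
  - kind: ServiceAccount
    name: grafana-agent
    namespace: node-exporter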