
feat(prometheus): Add manifests

ref: #5
2025-06-11 23:14:22 +09:30
parent 02ccf8dbce
commit a4a2348ea4
20 changed files with 816 additions and 0 deletions

@@ -0,0 +1,14 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus-config
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get

@@ -0,0 +1,34 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus-monitoring
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch

@@ -0,0 +1,18 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus
rules:
- apiGroups:
- ""
resources:
- nodes/metrics
verbs:
- get
- nonResourceURLs:
- /metrics
verbs:
- get

@@ -0,0 +1,15 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus-config
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-config
subjects:
- kind: ServiceAccount
name: prometheus
namespace: default

@@ -0,0 +1,16 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus-monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-monitoring
subjects:
- kind: ServiceAccount
name: prometheus
namespace: default

@@ -0,0 +1,15 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: default

@@ -0,0 +1,52 @@
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
app.kubernetes.io/component: metrics
name: cluster
spec:
alerting:
alertmanagers: []
# - apiVersion: v2
# name: alertmanager-{{ $.Release.Name }}
# namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
# port: web
enableFeatures: []
enableRemoteWriteReceiver: false
externalLabels: {}
# image: {{ .Values.nfc_monitoring.prometheus.image.name }}:{{ .Values.nfc_monitoring.prometheus.image.tag}}
nodeSelector:
kubernetes.io/os: linux
podMetadata:
labels:
app.kubernetes.io/component: metrics
podMonitorNamespaceSelector: {}
podMonitorSelector: {}
probeNamespaceSelector: {}
probeSelector: {}
replicas: 1
resources:
requests:
memory: 400Mi
retentionSize: 2GB
ruleNamespaceSelector: {}
ruleSelector:
matchLabels:
app.kubernetes.io/instance: -set-by-replacement-
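# Placeholder: filled with the Prometheus object's app.kubernetes.io/instance
# label by a replacement in the base kustomization.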
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: prometheus
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
storage:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
version: -set-by-patch-

@@ -0,0 +1,18 @@
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus
spec:
ports:
- name: web
port: 9090
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
app.kubernetes.io/component: metrics
sessionAffinity: ClientIP

@@ -0,0 +1,8 @@
---
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus

@@ -0,0 +1,25 @@
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/component: metrics
name: prometheus
spec:
endpoints:
- interval: 30s
port: web
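# The relabelings below set the instance label to the scraped pod's name and
# hard-code the job label to "prometheus".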
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_name
targetLabel: instance
- targetLabel: "job"
replacement: "prometheus"
- interval: 30s
port: reloader-web
selector:
matchLabels:
app.kubernetes.io/component: metrics

@@ -0,0 +1,91 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: metrics
#
# Note: Don't use images; use the patch below for spec.version
#
# images:
# - name:
# newTag:
labels:
- includeSelectors: true
pairs:
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: prometheus
resources:
- ClusterRole-prometheus.yaml
- ClusterRole-prometheus-config.yaml
- ClusterRole-prometheus-monitoring.yaml
- ClusterRoleBinding-prometheus.yaml
- ClusterRoleBinding-prometheus-config.yaml
- ClusterRoleBinding-prometheus-monitoring.yaml
- ServiceAccount.yaml
- Prometheus.yaml
- Service.yaml
- ServiceMonitor.yaml
patches:
- target:
kind: Prometheus
name: cluster
patch: |-
- op: replace
path: /spec/version
value: 2.47.0
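#
# The replacements below copy values from the Prometheus object: its namespace
# into the ClusterRoleBinding subjects, its labels into the pod metadata, and
# its app.kubernetes.io/instance label into the rule selector.
#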
replacements:
- source:
kind: Prometheus
name: cluster
fieldPath: metadata.namespace
targets:
- select:
kind: ClusterRoleBinding
name: prometheus
fieldPaths:
- subjects.[name=prometheus].namespace
- select:
kind: ClusterRoleBinding
name: prometheus-config
fieldPaths:
- subjects.[name=prometheus].namespace
- select:
kind: ClusterRoleBinding
name: prometheus-monitoring
fieldPaths:
- subjects.[name=prometheus].namespace
- source:
kind: Prometheus
name: cluster
fieldPath: metadata.labels
targets:
- select:
kind: Prometheus
name: cluster
fieldPaths:
- spec.podMetadata.labels
- source:
kind: Prometheus
name: cluster
fieldPath: metadata.labels.[app.kubernetes.io/instance]
targets:
- select:
kind: Prometheus
name: cluster
fieldPaths:
- spec.ruleSelector.matchLabels.[app.kubernetes.io/instance]

@@ -0,0 +1,21 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
# resources:
# - GrafanaDatasource-Prometheus.yaml
patches:
- patch: |-
- op: replace
path: /spec/alerting/alertmanagers
value:
- apiVersion: v2
name: alertmanager-cluster
namespace: alert
port: web
target:
kind: Prometheus
name: cluster

@@ -0,0 +1,24 @@
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasource
metadata:
labels:
app.kubernetes.io/component: metrics
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: monitoring
name: prometheus
spec:
allowCrossNamespaceImport: true
datasource:
access: proxy
type: prometheus
isDefault: true
# jsonData:
# timeInterval: 5s
# tlsSkipVerify: true
name: Earth Prometheus
url: -prometheus-http-url-
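# Placeholder: patched with the in-cluster Prometheus URL by the
# grafana-datasource component kustomization.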
instanceSelector:
matchLabels:
dashboards: grafana

@@ -0,0 +1,18 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- GrafanaDatasource-Prometheus.yaml
patches:
- patch: |-
- op: replace
path: /spec/datasource/url
value: http://prometheus.metrics.svc:9090
target:
kind: GrafanaDatasource
name: prometheus

@@ -0,0 +1,34 @@
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
annotations:
cert-manager.io/common-name: prometheus.local
cert-manager.io/duration: 2160h
cert-manager.io/private-key-algorithm: ECDSA
cert-manager.io/private-key-rotation-policy: Always
cert-manager.io/private-key-size: "384"
cert-manager.io/subject-countries: N/A
cert-manager.io/subject-organizationalunits: N/A
cert-manager.io/subject-organizations: N/A
cert-manager.io/subject-provinces: N/A
labels:
app.kubernetes.io/component: metrics
name: prometheus
spec:
ingressClassName: nginx
rules:
- host: prometheus.local
http:
paths:
- backend:
service:
name: prometheus
port:
name: web
path: /
pathType: Prefix
tls:
- hosts:
- prometheus.local
secretName: certificate-tls-prometheus

@@ -0,0 +1,35 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- Ingress.yaml
#############################
# Items to Configure
#############################
# patches:
# - patch: |-
# - op: replace
# path: /metadata/annotations/cert-manager.io~1cluster-issuer
# value: cluster
# - op: replace
# path: /metadata/annotations/cert-manager.io~1common-name
# value: prometheus.local
# - op: replace
# path: /spec/rules/0/host
# value: prometheus.local
# - op: replace
# path: /spec/tls/0/hosts/0
# value: prometheus.local
# target:
# kind: Ingress
# name: prometheus
# version: v1

@@ -0,0 +1,42 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: metrics
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: monitoring
name: common
spec:
groups:
- name: common.rules
rules:
- alert: Watchdog
annotations:
description: |
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none
- alert: InfoInhibitor
annotations:
description: |
This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
other alerts.
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
severity of 'warning' or 'critical' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none

@@ -0,0 +1,277 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: metrics
app.kubernetes.io/instance: cluster
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: monitoring
name: prometheus-rules
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has failed to reload its configuration.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus",namespace="monitoring"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{ $labels.namespace}}/{{ $labels.pod}} is running full.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in less than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{job="prometheus",namespace="monitoring"}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{job="prometheus",namespace="monitoring"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{ $labels.namespace}}/{{ $labels.pod}} to Alertmanager {{ $labels.alertmanager}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus",namespace="monitoring"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus",namespace="monitoring"}[5m])
)
* 100
> 1
for: 15m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} is not connected to any Alertmanagers.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus",namespace="monitoring"}[5m]) < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has detected {{ $value | humanize}} reload failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has detected {{ $value | humanize}} compaction failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} is not ingesting samples.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: |
(
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus",namespace="monitoring"}[5m]) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus",namespace="monitoring"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus",namespace="monitoring"}) > 0
)
)
for: 10m
labels:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: |
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus",namespace="monitoring"}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus",namespace="monitoring"}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus",namespace="monitoring"}[5m]))
)
)
* 100
> 1
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus",namespace="monitoring"}[5m])
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus",namespace="monitoring"}[5m])
)
> 120
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus",namespace="monitoring"}` $labels.instance | query | first | value }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus",namespace="monitoring"}[5m])
>
max_over_time(prometheus_remote_storage_shards_max{job="prometheus",namespace="monitoring"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeBodySizeLimitHit
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
summary: Prometheus has dropped some targets that exceeded body size limit.
expr: |
increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeSampleLimitHit
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
expr: |
increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{ printf "%.0f" $value }} targets in Prometheus {{ $labels.namespace}}/{{ $labels.pod}} have failed to sync because invalid configuration was supplied.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
summary: Prometheus has failed to sync targets.
expr: |
increase(prometheus_target_sync_failed_total{job="prometheus",namespace="monitoring"}[30m]) > 0
for: 5m
labels:
severity: critical
- alert: PrometheusHighQueryLoad
annotations:
description: Prometheus {{ $labels.namespace}}/{{ $labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
summary: Prometheus is reaching its maximum capacity serving concurrent requests.
expr: |
avg_over_time(prometheus_engine_queries{job="prometheus",namespace="monitoring"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus",namespace="monitoring"}[5m]) > 0.8
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{ $labels.namespace}}/{{ $labels.pod}} to any Alertmanager.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus",namespace="monitoring",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus",namespace="monitoring",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical

@@ -0,0 +1,8 @@
---
apiVersion: kustomize.config.k8s.io/v1alpha1
kind: Component
resources:
- PrometheusRule-common.yaml
- PrometheusRule-prometheus.yaml

@@ -0,0 +1,51 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: metrics
patches:
- target:
kind: Prometheus
name: cluster
patch: |-
- op: replace
path: /spec/version
value: 2.47.0
#
# Ingress Setup
#
# - patch: |-
# - op: replace
# path: /metadata/annotations/cert-manager.io~1cluster-issuer
# value: cluster
# - op: replace
# path: /metadata/annotations/cert-manager.io~1common-name
# value: prometheus.local
# - op: replace
# path: /spec/rules/0/host
# value: prometheus.local
# - op: replace
# path: /spec/tls/0/hosts/0
# value: prometheus.local
# target:
# kind: Ingress
# name: prometheus
# version: v1
resources:
- ../../base
components:
# - ../../components/alertmanager
- ../../components/prometheus-rules
- ../../components/ingress
- ../../components/grafana-datasource