From df126a576d6fb81ae7c4caa07154502b5d119f0f Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 14:58:46 +0930 Subject: [PATCH 01/61] feat: initial template, docs and ci !1 --- .cz.yaml | 7 ++ .gitlab-ci.yml | 35 ++++++++ .gitmodules | 8 ++ .helmignore | 23 ++++++ README.md | 4 + docs/articles/index.md | 0 docs/contact.md | 0 docs/index.md | 0 docs/operations/index.md | 0 docs/projects/index.md | 0 docs/projects/kubernetes_monitoring/index.md | 18 ++++ docs/tags.md | 0 docs/task-doc-template.md | 87 ++++++++++++++++++++ gitlab-ci | 1 + mkdocs.yml | 30 +++++++ website-template | 1 + 16 files changed, 214 insertions(+) create mode 100644 .cz.yaml create mode 100644 .gitlab-ci.yml create mode 100644 .gitmodules create mode 100644 .helmignore create mode 100644 docs/articles/index.md create mode 100644 docs/contact.md create mode 100644 docs/index.md create mode 100644 docs/operations/index.md create mode 100644 docs/projects/index.md create mode 100644 docs/projects/kubernetes_monitoring/index.md create mode 100644 docs/tags.md create mode 100644 docs/task-doc-template.md create mode 160000 gitlab-ci create mode 100644 mkdocs.yml create mode 160000 website-template diff --git a/.cz.yaml b/.cz.yaml new file mode 100644 index 0000000..62b2ada --- /dev/null +++ b/.cz.yaml @@ -0,0 +1,7 @@ +commitizen: + bump_message: "build(version): bump version $current_version \u2192 $new_version" + changelog_incremental: false + name: cz_conventional_commits + tag_format: $major.$minor.$patch$prerelease + update_changelog_on_bump: true + version: 0.0.1 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..a596569 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,35 @@ +variables: + MY_PROJECT_ID: "50510268" + GIT_SYNC_URL: "https://$GITHUB_USERNAME_ROBOT:$GITHUB_TOKEN_ROBOT@github.com/NoFussComputing/docker-glpi.git" + + + PAGES_ENVIRONMENT_PATH: projects/kubernetes_monitoring/ + +include: + - project: nofusscomputing/projects/gitlab-ci + ref: 
development + file: + - template/automagic.gitlab-ci.yaml + #- template: Jobs/Container-Scanning.gitlab-ci.yml # see https://gitlab.com/gitlab-org/gitlab/-/issues/381665 + + +Website.Submodule.Deploy: + extends: .submodule_update_trigger + variables: + SUBMODULE_UPDATE_TRIGGER_PROJECT: nofusscomputing/infrastructure/website + environment: + url: https://nofusscomputing.com/$PAGES_ENVIRONMENT_PATH + name: Documentation + rules: + - if: # condition_dev_branch_push + $CI_COMMIT_BRANCH == "development" && + $CI_PIPELINE_SOURCE == "push" + exists: + - '{docs/**,pages/**}/*.md' + changes: + paths: + - '{docs/**,pages/**}/*.md' + compare_to: 'master' + when: always + + - when: never diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..de6bfdd --- /dev/null +++ b/.gitmodules @@ -0,0 +1,8 @@ +[submodule "gitlab-ci"] + path = gitlab-ci + url = https://gitlab.com/nofusscomputing/projects/gitlab-ci.git + branch = development +[submodule "website-template"] + path = website-template + url = https://gitlab.com/nofusscomputing/infrastructure/website-template.git + branch = development diff --git a/.helmignore b/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/README.md b/README.md index 0732e82..cb9ad14 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,10 @@ All contributions for this project must conducted from [Gitlab](https://gitlab.c For further details on contributing please refer to the [contribution guide](CONTRIBUTING.md). 
+## Documentation + +Documentation for this project can be found at <https://nofusscomputing.com/projects/kubernetes_monitoring/> + ## Other This repo is release under this [license](LICENSE) diff --git a/docs/articles/index.md b/docs/articles/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/contact.md b/docs/contact.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/operations/index.md b/docs/operations/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/projects/index.md b/docs/projects/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/projects/kubernetes_monitoring/index.md b/docs/projects/kubernetes_monitoring/index.md new file mode 100644 index 0000000..f7f07fc --- /dev/null +++ b/docs/projects/kubernetes_monitoring/index.md @@ -0,0 +1,18 @@ +--- +title: Kubernetes Monitoring Helm Chart +description: How to use No Fuss Computing's kubernetes monitoring helm chart for full stack monitoring. +date: 2023-08-29 +template: project.html +about: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring +--- + + +## Running the container + +To quickly set up a container, the following `docker-compose.yaml` file could be used. 
+ +``` yaml title="docker-compose.yaml" linenums="1" + + + +``` diff --git a/docs/tags.md b/docs/tags.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/task-doc-template.md b/docs/task-doc-template.md new file mode 100644 index 0000000..ade9fcd --- /dev/null +++ b/docs/task-doc-template.md @@ -0,0 +1,87 @@ + + +short summary of the task file + +## {Task Name} + +- **Name**: + +- **Description**: + +- **Module**: + +- **Arguments**: + + - + +- **Conditional**: + +- **Tags**: + + - + +## {Task Name} + +- **Name**: + +- **Description**: + +- **Module**: + +- **Arguments**: + + - + +- **Registers**: + +- **Conditional**: + +- **Tags**: + + - + + +## Variables + +The following variables can be customized in this task file: + +```yaml +variable_name: "default_value" +``` + +- `variable_name`: Description of the variable. + +## Tags + +The tasks in this task file are tagged with the following tags: + +- + +## Usage + +To use this Ansible task file, you can include it in your playbook or role and provide values for the required variables. Here's an example of how you can use this task file: + +1. Create a playbook (e.g., `your_playbook.yaml`) and define the necessary variables: + +```yaml +--- + +- hosts: your_hosts + vars: + variable_name: "value" + + tasks: + - include_tasks: path/to/task_file.yaml +``` + +2. Create a separate file for the task file (e.g., `task_file.yaml`) and copy the content of the task file into it. + +3. Run the playbook: + +```shell +ansible-playbook your_playbook.yaml +``` + +Make sure to replace the placeholder values (`variable_name`, `value`) with the appropriate values for your setup. + +Note: You may need to adjust the playbook structure and additional tasks based on your specific requirements and the tasks you want to execute. 
\ No newline at end of file diff --git a/gitlab-ci b/gitlab-ci new file mode 160000 index 0000000..a5a9fa4 --- /dev/null +++ b/gitlab-ci @@ -0,0 +1 @@ +Subproject commit a5a9fa44374107657b2587ce52607d96a825be56 diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..355d4fb --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,30 @@ +INHERIT: website-template/mkdocs.yml + +docs_dir: 'docs' + +repo_name: Kubernetes Monitoring Helm Chart +repo_url: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring +edit_uri: '/-/ide/project/nofusscomputing/projects/kubernetes_monitoring/edit/development/-/docs/' + +nav: +- Home: index.md + +- Articles: + + - articles/index.md + +- Projects: + + - projects/index.md + + - Kubernetes Monitoring: + + - projects/kubernetes_monitoring/index.md + + +- Operations: + + - operations/index.md + +- Contact Us: contact.md + diff --git a/website-template b/website-template new file mode 160000 index 0000000..992b548 --- /dev/null +++ b/website-template @@ -0,0 +1 @@ +Subproject commit 992b54805b8b6c78a3d2a5ea7de71c7be2b070c8 From 0feb8794b2bcc41357ff8d8bf30f1856eb193e7d Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:00:58 +0930 Subject: [PATCH 02/61] ci: typo in github url !1 --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a596569..511c223 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ variables: MY_PROJECT_ID: "50510268" - GIT_SYNC_URL: "https://$GITHUB_USERNAME_ROBOT:$GITHUB_TOKEN_ROBOT@github.com/NoFussComputing/docker-glpi.git" + GIT_SYNC_URL: "https://$GITHUB_USERNAME_ROBOT:$GITHUB_TOKEN_ROBOT@github.com/NoFussComputing/kubernetes_monitoring.git" PAGES_ENVIRONMENT_PATH: projects/kubernetes_monitoring/ From 286d2827ad1c30f95814494425bcda96a8a9874d Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:05:49 +0930 Subject: [PATCH 03/61] feat: grafana deployment !1 --- templates/ConfigMap-Grafana.yaml | 130 
++++++++++++++++++++++++++ templates/Grafana-Grafana.yaml | 130 ++++++++++++++++++++++++++ templates/Service-Grafana.yaml | 30 ++++++ templates/ServiceAccount-Grafana.yaml | 14 +++ templates/ServiceMonitor-Grafana.yaml | 37 ++++++++ 5 files changed, 341 insertions(+) create mode 100644 templates/ConfigMap-Grafana.yaml create mode 100644 templates/Grafana-Grafana.yaml create mode 100644 templates/Service-Grafana.yaml create mode 100644 templates/ServiceAccount-Grafana.yaml create mode 100644 templates/ServiceMonitor-Grafana.yaml diff --git a/templates/ConfigMap-Grafana.yaml b/templates/ConfigMap-Grafana.yaml new file mode 100644 index 0000000..eeb28f4 --- /dev/null +++ b/templates/ConfigMap-Grafana.yaml @@ -0,0 +1,130 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-config + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +data: + prometheus.yaml: |- + { + "apiVersion": 1, + "datasources": [ + { + "access":"proxy", + "editable": true, + "name": "prometheus", + "orgId": 1, + "type": "prometheus", + "url": "http://prometheus-config-map.monitoring.svc:9090", + "version": 1 + } + ] + } + + oncall-app.yaml: |- + apiVersion: 1 + apps: + - type: grafana-oncall-app + name: grafana-oncall-app + disabled: false + jsonData: + #user: admin + grafanaUrl: http://grafana.monitoring.svc:3000 + onCallApiUrl: http://oncall.monitoring.svc:8080 + secureJsonData: + #password: admin + onCallInvitationToken: yaD3a6dkYV0CFinTlMPb42Mi9 + #http://127.0.0.1:8080/integrations/v1/alertmanager/yaD3a6dkYV0CFinTlMPb42Mi9/ + + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + 
app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana-alertmanager + namespace: monitoring +data: + alertmanager.yaml: |- + { + "template_files": {}, + "template_file_provenances": {}, + "alertmanager_config": { + "route": { + "receiver": "Grafana Alerting 😊", + "group_by": [ + "grafana_folder", + "alertname" + ], + "routes": [ + { + "receiver": "Grafana Alerting 😊", + "object_matchers": [ + [ + "severity", + "!=", + "" + ], + [ + "severity", + "=", + "critical" + ], + [ + "severity", + "=", + "info" + ], + [ + "severity", + "=", + "warning" + ] + ], + "continue": true, + "group_wait": "1s", + "group_interval": "2s", + "repeat_interval": "15m" + } + ], + "group_wait": "1s", + "group_interval": "2s", + "repeat_interval": "3m" + }, + "templates": null, + "receivers": [ + { + "name": "Grafana Alerting 😊", + "grafana_managed_receiver_configs": [ + { + "uid": "4dqvJSfVkz", + "name": "Grafana Alerting 😊", + "type": "webhook", + "disableResolveMessage": false, + "settings": { + "httpMethod": "POST", + "url": "http://host.docker.internal:8080/integrations/v1/grafana_alerting/dXStYoK9Z15VZW8R8AtfyIwtu/" + }, + "secureFields": {} + } + ] + } + ] + } + } + +--- \ No newline at end of file diff --git a/templates/Grafana-Grafana.yaml b/templates/Grafana-Grafana.yaml new file mode 100644 index 0000000..f0aca0f --- /dev/null +++ b/templates/Grafana-Grafana.yaml @@ -0,0 +1,130 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" +spec: + config: + log: + mode: "console" + auth: + 
disable_login_form: "false" + security: + admin_user: "{{ .Values.nfc_monitoring.grafana.admin_user }}" + admin_password: "{{ .Values.nfc_monitoring.grafana.admin_password }}" + deployment: + metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + spec: + replicas: {{ .Values.nfc_monitoring.grafana.replicas | int }} + selector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + + spec: + containers: + - name: grafana + image: "{{ .Values.nfc_monitoring.grafana.image.name }}:{{ .Values.nfc_monitoring.grafana.image.tag }}" + env: + - name: GF_INSTALL_PLUGINS + value: agenty-flowcharting-panel 0.9.1, ddurieux-glpi-app 1.3.1, grafana-oncall-app 1.1.38 # comma-separated "<plugin-id> <version>" pairs + - name: GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS + value: grafana-oncall-app # plugin IDs only; version pins belong in GF_INSTALL_PLUGINS + - name: GF_ANALYTICS_REPORTING_ENABLED + value: 'false' + imagePullPolicy: IfNotPresent + ports: + - containerPort: 3000 + name: grafana-http + readinessProbe: + failureThreshold: 3 + resources: + limits: + cpu: 2000m + memory: 1Gi + requests: + cpu: 100m + memory: 100Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: false + volumeMounts: + - mountPath: /var/lib/grafana + name: grafana-storage + - mountPath: /etc/grafana/provisioning/plugins + name: 
plugin-config + readOnly: false + volumes: + - name: grafana-storage + emptyDir: {} + - name: plugin-config + configMap: + # Provide the name of the ConfigMap you want to mount. + name: grafana-config + # An array of keys from the ConfigMap to create as files + items: + - key: "oncall-app.yaml" + path: "oncall-app.yaml" + # - volumeClaimTemplates: + # - metadata: + # name: data + # labels: + # app.kubernetes.io/name: loki + # app.kubernetes.io/component: logging + # app.kubernetes.io/part-of: {{ $.Chart.Name }} + # app.kubernetes.io/version: {{ $.Chart.Version }} + # app.kubernetes.io/managed-by: {{ $.Release.Service }} + # spec: + # accessModes: + # - "ReadWriteOnce" + # resources: + # requests: + # storage: "5Gi" + serviceAccountName: grafana + nodeSelector: + kubernetes.io/os: linux + + # persistentVolumeClaim: + # metadata: + # labels: + # app.kubernetes.io/component: graphing + # app.kubernetes.io/instance: k8s + # app.kubernetes.io/name: grafana + # app.kubernetes.io/managed-by: {{ $.Release.Service }} + # app.kubernetes.io/part-of: {{ $.Chart.Name }} + # app.kubernetes.io/version: {{ $.Chart.Version }} + # spec: + # resources: + # requests: + # storage: "5Gi" \ No newline at end of file diff --git a/templates/Service-Grafana.yaml b/templates/Service-Grafana.yaml new file mode 100644 index 0000000..019adc1 --- /dev/null +++ b/templates/Service-Grafana.yaml @@ -0,0 +1,30 @@ +--- + +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + selector: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: {{ $.Chart.Name }} + #type: NodePort + 
#type: LoadBalancer + #clusterIP: None + ports: + - name: grafana + port: 3000 + targetPort: grafana-http + #nodePort: 3000 + #type: LoadBalancer + sessionAffinity: ClientIP diff --git a/templates/ServiceAccount-Grafana.yaml b/templates/ServiceAccount-Grafana.yaml new file mode 100644 index 0000000..41eeb9e --- /dev/null +++ b/templates/ServiceAccount-Grafana.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + name: grafana + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} diff --git a/templates/ServiceMonitor-Grafana.yaml b/templates/ServiceMonitor-Grafana.yaml new file mode 100644 index 0000000..8a7bdb9 --- /dev/null +++ b/templates/ServiceMonitor-Grafana.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: grafana + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_label_app_kubernetes_io_instance + targetLabel: instance + scheme: http + # tlsConfig: + # insecureSkipVerify: true + targetLabels: + - cluster + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: 
grafana + app.kubernetes.io/part-of: {{ $.Chart.Name }} From 8fff819081bf0b274f0afe7fe9f5cee6cd85ef18 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:07:49 +0930 Subject: [PATCH 04/61] feat: prometheus deployment !1 --- templates/ClusterRole-binding-prometheus.yaml | 20 ++ templates/ClusterRole-prometheus.yaml | 23 ++ templates/ClusterRoleBinding-prometheus.yaml | 20 ++ templates/GrafanaDatasource-Prometheus.yaml | 38 +++ templates/PodDisruptionBudget-prometheus.yaml | 20 ++ templates/Prometheus-prometheus.yaml | 50 +++ templates/PrometheusRule-prometheus.yaml | 281 +++++++++++++++++ .../Role-SpecificNamespaces-prometheus.yaml | 290 ++++++++++++++++++ templates/RoleBinding-Config-prometheus.yaml | 21 ++ ...Binding-SpecificNamespaces-prometheus.yaml | 251 +++++++++++++++ templates/RoleConfig-prometheus.yaml | 20 ++ templates/Service-prometheus.yaml | 27 ++ templates/ServiceAccount-prometheus.yaml | 14 + templates/ServiceMonitor-prometheus.yaml | 24 ++ 14 files changed, 1099 insertions(+) create mode 100644 templates/ClusterRole-binding-prometheus.yaml create mode 100644 templates/ClusterRole-prometheus.yaml create mode 100644 templates/ClusterRoleBinding-prometheus.yaml create mode 100644 templates/GrafanaDatasource-Prometheus.yaml create mode 100644 templates/PodDisruptionBudget-prometheus.yaml create mode 100644 templates/Prometheus-prometheus.yaml create mode 100644 templates/PrometheusRule-prometheus.yaml create mode 100644 templates/Role-SpecificNamespaces-prometheus.yaml create mode 100644 templates/RoleBinding-Config-prometheus.yaml create mode 100644 templates/RoleBinding-SpecificNamespaces-prometheus.yaml create mode 100644 templates/RoleConfig-prometheus.yaml create mode 100644 templates/Service-prometheus.yaml create mode 100644 templates/ServiceAccount-prometheus.yaml create mode 100644 templates/ServiceMonitor-prometheus.yaml diff --git a/templates/ClusterRole-binding-prometheus.yaml b/templates/ClusterRole-binding-prometheus.yaml new file 
mode 100644 index 0000000..4d6bc54 --- /dev/null +++ b/templates/ClusterRole-binding-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-adapter +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/templates/ClusterRole-prometheus.yaml b/templates/ClusterRole-prometheus.yaml new file mode 100644 index 0000000..151fe77 --- /dev/null +++ b/templates/ClusterRole-prometheus.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s +rules: +- apiGroups: + - "" + resources: + - nodes/metrics + verbs: + - get +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/templates/ClusterRoleBinding-prometheus.yaml b/templates/ClusterRoleBinding-prometheus.yaml new file mode 100644 index 0000000..5468c07 --- /dev/null +++ b/templates/ClusterRoleBinding-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} 
+ name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}" # must match the namespace the Prometheus CR (and its ServiceAccount) is deployed to diff --git a/templates/GrafanaDatasource-Prometheus.yaml b/templates/GrafanaDatasource-Prometheus.yaml new file mode 100644 index 0000000..3a44401 --- /dev/null +++ b/templates/GrafanaDatasource-Prometheus.yaml @@ -0,0 +1,38 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: prometheus + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + #secrets: + # - credentials + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + allowCrossNamespaceImport: true + datasource: + name: prometheus + type: prometheus + access: proxy + # basicAuth: true + url: "http://prometheus-k8s.{{ .Values.nfc_monitoring.prometheus.namespace }}.svc:9090" + isDefault: true + # user: admin + jsonData: + # "tlsSkipVerify": true + manageAlerts: true + "timeInterval": "5s" + "orgId": 1 + # secureJsonData: + # "password": admin + editable: true \ No newline at end of file diff --git a/templates/PodDisruptionBudget-prometheus.yaml b/templates/PodDisruptionBudget-prometheus.yaml new file mode 100644 index 0000000..9ee858e --- /dev/null +++ b/templates/PodDisruptionBudget-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service 
}} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}" # a PDB only selects pods in its own namespace; follow the Prometheus CR +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/Prometheus-prometheus.yaml b/templates/Prometheus-prometheus.yaml new file mode 100644 index 0000000..25a08af --- /dev/null +++ b/templates/Prometheus-prometheus.yaml @@ -0,0 +1,50 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: k8s + namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}" +spec: + alerting: + alertmanagers: + - apiVersion: v2 + name: alertmanager-main + namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}" + port: web + enableFeatures: [] + externalLabels: {} + image: quay.io/prometheus/prometheus:v2.42.0 + nodeSelector: + kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + podMonitorNamespaceSelector: {} + podMonitorSelector: {} + probeNamespaceSelector: {} + probeSelector: {} + replicas: 3 + resources: + requests: + memory: 400Mi + ruleNamespaceSelector: {} + ruleSelector: {} + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: prometheus-k8s + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + version: 2.42.0 diff --git a/templates/PrometheusRule-prometheus.yaml b/templates/PrometheusRule-prometheus.yaml new file mode 100644 index 
0000000..e5f0d5d --- /dev/null +++ b/templates/PrometheusRule-prometheus.yaml @@ -0,0 +1,281 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + prometheus: k8s + role: alert-rules + name: prometheus-k8s-prometheus-rules + namespace: monitoring +spec: + groups: + - name: prometheus + rules: + - alert: PrometheusBadConfig + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to reload its configuration. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig + summary: Failed Prometheus configuration reload. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is running full. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull + summary: Prometheus alert notification queue predicted to run full in less than 30m. + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ ( + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{ `{{` }} printf "%.1f" $value }}% errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to Alertmanager {{ `{{` }}$labels.alertmanager}}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: | + ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not connected to any Alertmanagers. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers + summary: Prometheus is not connected to any Alertmanagers. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} reload failures over the last 3h. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing + summary: Prometheus has issues reloading blocks from disk. + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} compaction failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing + summary: Prometheus has issues compacting blocks. + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not ingesting samples. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples + summary: Prometheus is not ingesting samples. + expr: | + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 + ) + ) + for: 10m + labels: + severity: warning + - alert: PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps + summary: Prometheus is dropping samples with duplicate timestamps. 
+ expr: | + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with timestamps arriving out of order. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps + summary: Prometheus drops samples with out-of-order timestamps. + expr: | + rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} failed to send {{ `{{` }} printf "%.1f" $value }}% of the samples to {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures + summary: Prometheus fails to send samples to remote storage. 
+ expr: | + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + / + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + + + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteBehind + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write is {{ `{{` }} printf "%.1f" $value }}s behind for {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind + summary: Prometheus remote write is behind. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ ( + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + > 120 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write desired shards calculation wants to run {{ `{{` }} $value }} shards for queue {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}, which is more than the max of {{ `{{` }} printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards + summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusRuleFailures + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to evaluate {{ `{{` }} printf "%.0f" $value }} rules in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures + summary: Prometheus is failing rule evaluations. 
+ expr: | + increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has missed {{ `{{` }} printf "%.0f" $value }} rule group evaluations in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations + summary: Prometheus is missing rule evaluations due to slow rule group evaluation. + expr: | + increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. 
+ expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusScrapeBodySizeLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit + summary: Prometheus has dropped some targets that exceeded body size limit. + expr: | + increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusScrapeSampleLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit + summary: Prometheus has failed scrapes that have exceeded the configured sample limit. + expr: | + increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetSyncFailure + annotations: + description: '{{ `{{` }} printf "%.0f" $value }} targets in Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} have failed to sync because invalid configuration was supplied.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure + summary: Prometheus has failed to sync targets. 
+ expr: | + increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0 + for: 5m + labels: + severity: critical + - alert: PrometheusHighQueryLoad + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload + summary: Prometheus is reaching its maximum capacity serving concurrent requests. + expr: | + avg_over_time(prometheus_engine_queries{job="prometheus-k8s",namespace="monitoring"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.8 + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ `{{` }} printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to any Alertmanager.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. 
+ expr: | + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical diff --git a/templates/Role-SpecificNamespaces-prometheus.yaml b/templates/Role-SpecificNamespaces-prometheus.yaml new file mode 100644 index 0000000..dd91da8 --- /dev/null +++ b/templates/Role-SpecificNamespaces-prometheus.yaml @@ -0,0 +1,290 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +items: +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: default + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: kube-system + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + 
verbs: + - get + - list + - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: monitoring + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + + + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: ceph + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: loki + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - 
ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: grafana + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + + + + + + +kind: RoleList diff --git a/templates/RoleBinding-Config-prometheus.yaml b/templates/RoleBinding-Config-prometheus.yaml new file mode 100644 index 0000000..06b9035 --- /dev/null +++ b/templates/RoleBinding-Config-prometheus.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + 
app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s-config + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s-config +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml new file mode 100644 index 0000000..b157aa7 --- /dev/null +++ b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml @@ -0,0 +1,251 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +items: +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: default + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: kube-system + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring + + + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + 
labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: monitoring + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring + + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: ceph + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: loki + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: 
prometheus-k8s + namespace: grafana + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring + + + + + + + +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: kube-metrics + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: kube-dashboard + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: olm + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + 
metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: operators + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring + + + + + + + + + + + + +kind: RoleBindingList diff --git a/templates/RoleConfig-prometheus.yaml b/templates/RoleConfig-prometheus.yaml new file mode 100644 index 0000000..28da8b3 --- /dev/null +++ b/templates/RoleConfig-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s-config + namespace: monitoring +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get diff --git a/templates/Service-prometheus.yaml b/templates/Service-prometheus.yaml new file mode 100644 index 0000000..5a6d35e --- /dev/null +++ 
b/templates/Service-prometheus.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: monitoring +spec: + ports: + - name: web + port: 9090 + targetPort: web + - name: reloader-web + port: 8080 + targetPort: reloader-web + selector: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + sessionAffinity: ClientIP diff --git a/templates/ServiceAccount-prometheus.yaml b/templates/ServiceAccount-prometheus.yaml new file mode 100644 index 0000000..d1f3a21 --- /dev/null +++ b/templates/ServiceAccount-prometheus.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: true +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: monitoring diff --git a/templates/ServiceMonitor-prometheus.yaml b/templates/ServiceMonitor-prometheus.yaml new file mode 100644 index 0000000..3512aca --- /dev/null +++ b/templates/ServiceMonitor-prometheus.yaml @@ -0,0 +1,24 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: monitoring +spec: + endpoints: + - 
interval: 30s + port: web + - interval: 30s + port: reloader-web + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} From 5d808ee753afff17373dd8c4232b00bbf8904f16 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:09:20 +0930 Subject: [PATCH 05/61] feat: prometheus adaptor deployment !1 --- templates/APIService-prometheus-adapter.yaml | 21 ++++ ...ClusterRole-aggregated-metrics-reader.yaml | 26 +++++ ...-binding-delegator-prometheus-adaptor.yaml | 20 ++++ ...hpa-custom-metrics-prometheus-adaptor.yaml | 19 ++++ ...s-server-resources-prometheus-adaptor.yaml | 18 ++++ templates/ClusterRole-prometheus-adapter.yaml | 24 +++++ templates/ConfigMap-prometheus-adapter.yaml | 72 +++++++++++++ templates/Deployment-prometheus-adapter.yaml | 100 ++++++++++++++++++ ...odDisruptionBudget-prometheus-adapter.yaml | 21 ++++ ...inding-prometheus-adapter-auth-reader.yaml | 18 ++++ templates/Service-prometheus-adapter.yaml | 23 ++++ .../ServiceAccount-prometheus-adapter.yaml | 14 +++ .../ServiceMonitor-prometheus-adapter.yaml | 39 +++++++ 13 files changed, 415 insertions(+) create mode 100644 templates/APIService-prometheus-adapter.yaml create mode 100644 templates/ClusterRole-aggregated-metrics-reader.yaml create mode 100644 templates/ClusterRole-binding-delegator-prometheus-adaptor.yaml create mode 100644 templates/ClusterRole-binding-hpa-custom-metrics-prometheus-adaptor.yaml create mode 100644 templates/ClusterRole-metrics-server-resources-prometheus-adaptor.yaml create mode 100644 templates/ClusterRole-prometheus-adapter.yaml create mode 100644 templates/ConfigMap-prometheus-adapter.yaml create mode 100644 templates/Deployment-prometheus-adapter.yaml create mode 100644 templates/PodDisruptionBudget-prometheus-adapter.yaml create mode 100644 templates/RoleBinding-prometheus-adapter-auth-reader.yaml create mode 100644 
templates/Service-prometheus-adapter.yaml create mode 100644 templates/ServiceAccount-prometheus-adapter.yaml create mode 100644 templates/ServiceMonitor-prometheus-adapter.yaml diff --git a/templates/APIService-prometheus-adapter.yaml b/templates/APIService-prometheus-adapter.yaml new file mode 100644 index 0000000..04d7bbf --- /dev/null +++ b/templates/APIService-prometheus-adapter.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: v1beta1.metrics.k8s.io +spec: + group: metrics.k8s.io + groupPriorityMinimum: 100 + insecureSkipTLSVerify: true + service: + name: prometheus-adapter + namespace: monitoring + version: v1beta1 + versionPriority: 100 diff --git a/templates/ClusterRole-aggregated-metrics-reader.yaml b/templates/ClusterRole-aggregated-metrics-reader.yaml new file mode 100644 index 0000000..bc2afd0 --- /dev/null +++ b/templates/ClusterRole-aggregated-metrics-reader.yaml @@ -0,0 +1,26 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + rbac.authorization.k8s.io/aggregate-to-admin: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-view: "true" + name: system:aggregated-metrics-reader + namespace: monitoring +rules: +- apiGroups: + - metrics.k8s.io + resources: + - pods + - nodes + verbs: + - get + - list + - watch diff --git 
a/templates/ClusterRole-binding-delegator-prometheus-adaptor.yaml b/templates/ClusterRole-binding-delegator-prometheus-adaptor.yaml new file mode 100644 index 0000000..5648792 --- /dev/null +++ b/templates/ClusterRole-binding-delegator-prometheus-adaptor.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: resource-metrics:system:auth-delegator + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/templates/ClusterRole-binding-hpa-custom-metrics-prometheus-adaptor.yaml b/templates/ClusterRole-binding-hpa-custom-metrics-prometheus-adaptor.yaml new file mode 100644 index 0000000..4785870 --- /dev/null +++ b/templates/ClusterRole-binding-hpa-custom-metrics-prometheus-adaptor.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: hpa-controller-custom-metrics + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: custom-metrics-server-resources +subjects: +- kind: ServiceAccount + name: horizontal-pod-autoscaler + namespace: kube-system diff --git a/templates/ClusterRole-metrics-server-resources-prometheus-adaptor.yaml b/templates/ClusterRole-metrics-server-resources-prometheus-adaptor.yaml new file mode 100644 index 0000000..b5d028b --- /dev/null +++ 
b/templates/ClusterRole-metrics-server-resources-prometheus-adaptor.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: resource-metrics-server-resources +rules: +- apiGroups: + - metrics.k8s.io + resources: + - '*' + verbs: + - '*' diff --git a/templates/ClusterRole-prometheus-adapter.yaml b/templates/ClusterRole-prometheus-adapter.yaml new file mode 100644 index 0000000..87cd65f --- /dev/null +++ b/templates/ClusterRole-prometheus-adapter.yaml @@ -0,0 +1,24 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter +rules: +- apiGroups: + - "" + resources: + - nodes + - namespaces + - pods + - services + verbs: + - get + - list + - watch diff --git a/templates/ConfigMap-prometheus-adapter.yaml b/templates/ConfigMap-prometheus-adapter.yaml new file mode 100644 index 0000000..2f1a140 --- /dev/null +++ b/templates/ConfigMap-prometheus-adapter.yaml @@ -0,0 +1,72 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: adapter-config + namespace: monitoring +data: + config.yaml: |- + "resourceRules": + "cpu": + "containerLabel": 
"container" + "containerQuery": | + sum by (<<.GroupBy>>) ( + irate ( + container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[120s] + ) + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + 1 - irate( + node_cpu_seconds_total{mode="idle"}[60s] + ) + * on(namespace, pod) group_left(node) ( + node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>} + ) + ) + or sum by (<<.GroupBy>>) ( + 1 - irate( + windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m] + ) + ) + "resources": + "overrides": + "namespace": + "resource": "namespace" + "node": + "resource": "node" + "pod": + "resource": "pod" + "memory": + "containerLabel": "container" + "containerQuery": | + sum by (<<.GroupBy>>) ( + container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""} + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} + - + node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>} + ) + or sum by (<<.GroupBy>>) ( + windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>} + - + windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>} + ) + "resources": + "overrides": + "instance": + "resource": "node" + "namespace": + "resource": "namespace" + "pod": + "resource": "pod" + "window": "5m" diff --git a/templates/Deployment-prometheus-adapter.yaml b/templates/Deployment-prometheus-adapter.yaml new file mode 100644 index 0000000..9edd8bc --- /dev/null +++ b/templates/Deployment-prometheus-adapter.yaml @@ -0,0 +1,100 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: "{{ 
.Values.nfc_monitoring.prometheus_adaptor.namespace }}" +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + # managed-by and version are deliberately NOT selector labels: spec.selector is immutable, + # so selecting on the chart version would make "helm upgrade" fail after every version bump. + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + spec: + automountServiceAccountToken: true + containers: + - args: + - --cert-dir=/var/run/serving-cert + - --config=/etc/adapter/config.yaml + - --metrics-relist-interval=1m + - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ + - --secure-port=6443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA + image: "{{ .Values.nfc_monitoring.prometheus_adaptor.image.name }}:{{ .Values.nfc_monitoring.prometheus_adaptor.image.tag }}" + livenessProbe: + failureThreshold: 5 + httpGet: + path: /livez + port: https + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 5 + name: prometheus-adapter + ports: + - containerPort: 6443 + name: https + readinessProbe: + failureThreshold: 5 + httpGet: + path: /readyz + 
port: https + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 5 + resources: + requests: + cpu: 102m + memory: 180Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /tmp + name: tmpfs + readOnly: false + - mountPath: /var/run/serving-cert + name: volume-serving-cert + readOnly: false + - mountPath: /etc/adapter + name: config + readOnly: false + nodeSelector: + kubernetes.io/os: linux + securityContext: {} + serviceAccountName: prometheus-adapter + volumes: + - emptyDir: {} + name: tmpfs + - emptyDir: {} + name: volume-serving-cert + - configMap: + name: adapter-config + name: config diff --git a/templates/PodDisruptionBudget-prometheus-adapter.yaml b/templates/PodDisruptionBudget-prometheus-adapter.yaml new file mode 100644 index 0000000..17203c2 --- /dev/null +++ b/templates/PodDisruptionBudget-prometheus-adapter.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/RoleBinding-prometheus-adapter-auth-reader.yaml b/templates/RoleBinding-prometheus-adapter-auth-reader.yaml new file mode 100644 index 0000000..be46bee --- /dev/null +++ b/templates/RoleBinding-prometheus-adapter-auth-reader.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: 
+ app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/version: {{ $.Chart.Version }} + name: resource-metrics-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/templates/Service-prometheus-adapter.yaml b/templates/Service-prometheus-adapter.yaml new file mode 100644 index 0000000..406fd70 --- /dev/null +++ b/templates/Service-prometheus-adapter.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +spec: + ports: + - name: https + port: 443 + targetPort: 6443 + selector: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceAccount-prometheus-adapter.yaml b/templates/ServiceAccount-prometheus-adapter.yaml new file mode 100644 index 0000000..ff50c47 --- /dev/null +++ b/templates/ServiceAccount-prometheus-adapter.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring diff --git a/templates/ServiceMonitor-prometheus-adapter.yaml 
b/templates/ServiceMonitor-prometheus-adapter.yaml new file mode 100644 index 0000000..0270e70 --- /dev/null +++ b/templates/ServiceMonitor-prometheus-adapter.yaml @@ -0,0 +1,39 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: (apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*) + sourceLabels: + - __name__ + port: https + scheme: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_label_app_kubernetes_io_instance + targetLabel: instance + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} From 3865028541d6023f380745a2549654c651431a1b Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:09:59 +0930 Subject: [PATCH 06/61] feat(grafana): prometheus rules !1 --- templates/PrometheusRule-grafana.yaml | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 templates/PrometheusRule-grafana.yaml diff --git a/templates/PrometheusRule-grafana.yaml b/templates/PrometheusRule-grafana.yaml new file mode 100644 index 0000000..71f3d0d --- /dev/null +++ b/templates/PrometheusRule-grafana.yaml @@ -0,0 +1,35 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + 
app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: grafana-rules + namespace: monitoring +spec: + groups: + - name: GrafanaAlerts + rules: + - alert: GrafanaRequestsFailing + annotations: + message: '{{`{{`}} $labels.namespace }}/{{`{{`}} $labels.job }}/{{`{{`}} $labels.handler }} is experiencing {{`{{`}} $value | humanize }}% errors' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing + expr: | + 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} + / ignoring (status_code) + sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) + > 50 + for: 5m + labels: + severity: warning + - name: grafana_rules + rules: + - expr: | + sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) + record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m From e0bc34c12ff88b8e75806dc238b00e1ef464129b Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:12:04 +0930 Subject: [PATCH 07/61] feat: grafana agent deployment !1 --- templates/ClusterRole-GrafanaAgent.yaml | 42 +++ .../ClusterRoleBinding-Grafana-Agent.yaml | 20 ++ templates/ConfigMap-GrafanaAgent.yaml | 281 ++++++++++++++++++ templates/Daemonset-GrafanaAgent.yaml | 136 +++++++++ templates/PrometheusRule-grafana-agent.yaml | 23 ++ templates/Service-GrafanaAgent.yaml | 28 ++ templates/ServiceAccount-GrafanaAgent.yaml | 14 + 7 files changed, 544 insertions(+) create mode 100644 templates/ClusterRole-GrafanaAgent.yaml create mode 
100644 templates/ClusterRoleBinding-Grafana-Agent.yaml create mode 100644 templates/ConfigMap-GrafanaAgent.yaml create mode 100644 templates/Daemonset-GrafanaAgent.yaml create mode 100644 templates/PrometheusRule-grafana-agent.yaml create mode 100644 templates/Service-GrafanaAgent.yaml create mode 100644 templates/ServiceAccount-GrafanaAgent.yaml diff --git a/templates/ClusterRole-GrafanaAgent.yaml b/templates/ClusterRole-GrafanaAgent.yaml new file mode 100644 index 0000000..a5886ab --- /dev/null +++ b/templates/ClusterRole-GrafanaAgent.yaml @@ -0,0 +1,42 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: grafana-agent + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + - events + verbs: + - get + - list + - watch +- nonResourceURLs: + - /metrics + verbs: + - get +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/templates/ClusterRoleBinding-Grafana-Agent.yaml b/templates/ClusterRoleBinding-Grafana-Agent.yaml new file mode 100644 index 0000000..403db42 --- /dev/null +++ b/templates/ClusterRoleBinding-Grafana-Agent.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana-agent +roleRef: + apiGroup: 
rbac.authorization.k8s.io + kind: ClusterRole + name: grafana-agent +subjects: +- kind: ServiceAccount + name: grafana-agent + namespace: monitoring diff --git a/templates/ConfigMap-GrafanaAgent.yaml b/templates/ConfigMap-GrafanaAgent.yaml new file mode 100644 index 0000000..d866aee --- /dev/null +++ b/templates/ConfigMap-GrafanaAgent.yaml @@ -0,0 +1,281 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana-agent + namespace: monitoring +data: + agent.yaml: | + metrics: + wal_directory: /tmp/wal + + logs: + # Choose a directory to save the last read position of log files at. + # This directory will be created if it doesn't already exist. + positions_directory: "/tmp" + + configs: + + - name: journal + clients: + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:3100/loki/api/v1/push + scrape_configs: + + - job_name: systemd-journal + journal: + labels: + job: node-journal + path: /host/root/run/log/journal + json: true + relabel_configs: + + - source_labels: + - __journal__systemd_unit + target_label: systemd_unit + - source_labels: + - __journal__hostname + target_label: node + - source_labels: + - __journal_syslog_identifier + target_label: syslog_identifier + - target_label: "job_name" + replacement: "journal" + + pipeline_stages: + - json: + expressions: + pid: _PID + userId: _UID + application: _COMM + priority: PRIORITY + + - labels: + application: + #level: + pid: + userId: + priority: + - template: + source: level + template: '{{"{{"}} ToLower .Value {{"}}"}}' + - match: + selector: '{priority="7"}' + stages: + - template: + source: level + template: 
'debug' + - match: + selector: '{priority="6"}' + stages: + - template: + source: level + template: 'info' + - match: + selector: '{priority="5"}' + stages: + - template: + source: level + template: 'notice' + - match: + selector: '{priority="4"}' + stages: + - template: + source: level + template: 'warning' + - match: + selector: '{priority="3"}' + stages: + - template: + source: level + template: 'error' + - match: + selector: '{priority="2"}' + stages: + - template: + source: level + template: 'crit' + - match: + selector: '{priority="1"}' + stages: + - template: + source: level + template: 'alert' + - match: + selector: '{priority="0"}' + stages: + - template: + source: level + template: 'emerg' + - labels: + level: + + # - job_name: varlogs + # static_configs: + # - targets: [localhost] + # labels: + # # cluster: dev + # job: 'container logs' + # __path__: /var/log/pods/*/*/*.log + # pipeline_stages: + # - json: + # expressions: + # namespace: namespace + # level: level + # - regex: + # source: filename + # # expression: '/var/log/pods/(.+)/(?P.+)/(*.log)' + # # expression: '/var/log/pods/.+/(?P\\S+?)' + # #expression: '/var/log/pods/.+/(?P\\S+?)/*.log' + # #expression: '/var/log/pods/.+/(?P\S+?)/*.log' + # expression: '/var/log/pods/.+/(?P\S+?)/.+log' + # - regex: + # source: filename + # expression: '/var\/log\/pods\/.*_(?P\S+?)_.*\/.+\/.+log' + # - regex: + # source: filename + # expression: '/var/log/pods/(?P\S+?)_.*/.+/.+log' + # - regex: + # #source: message + # expression: '\s?level=(?P\S+)\s+' + # - regex: + # #source: message + # expression: '\s?(?Pdebug|info|warn|error)\s' + # - template: + # source: level + # template: '{{"{{"}} ToLower .Value {{"}}"}}' + # - labels: + # container: + # level: + # namespace: + # pod: + #relabel_configs: + + # - source_labels: [__filename__] + # separator: '/' + # regex: '/var/log/pods/*/(.*)/*.log' + # replacement: '${1}' + # target_label: pod + # - source_labels: [__filename__] + # replacement: 
'/var/log/pods/*/${1}/*.log' + # target_label: pod + # - replacement: /var/log/pods/*/${1}/*.log + # separator: / + # source_labels: + # - __meta_kubernetes_pod_uid + # - __meta_kubernetes_pod_container_name + # target_label: pod + + + + # - targets: [localhost] + # labels: + # job: varlogs + # __path__: /var/logs/* + # - targets: [localhost] + # labels: + # job: varlogs + # __path__: /host/root/var/log/* + # - targets: [localhost] + # labels: + # job: varlogs + # __path__: /host/root/var/logs/* + # - targets: [localhost] + # labels: + # job: varlogs + # __path__: /host/root/var/log/containers/* + # - targets: [localhost] + # labels: + # job: varlogs + # __path__: /host/root/var/log/*/* + + + + + + + - name: kubernetes + clients: + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:3100/loki/api/v1/push + # basic_auth: + # username: YOUR_LOKI_USERNAME + # password: YOUR_LOKI_PASSWORD + external_labels: + #cluster: dev-cluster + positions: + filename: /tmp/kub-positions.yaml + target_config: + sync_period: 10s + scrape_configs: + - job_name: pod-logs + + kubernetes_sd_configs: + - role: pod + + pipeline_stages: + - cri: {} + + - regex: + #source: msg + expression: '(\s|\t|level=)?(?Ptrace|debug|info|warn|error|TRACE|DEBUG|INFO|WARN|ERROR)(\s|\t)' + + - labels: + level: + + relabel_configs: + - source_labels: + - __meta_kubernetes_pod_node_name + target_label: __host__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - target_label: job + # source_labels: + # - __meta_kubernetes_namespace + replacement: loki/ingester_grafana-agent + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: replace + source_labels: + - __meta_kubernetes_pod_container_name + target_label: container + - replacement: 
/var/log/pods/*$1/*.log + separator: / + source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + - target_label: "job_name" + replacement: "kubernetes_sd" + + + + integrations: + + node_exporter: + enabled: true + rootfs_path: /host/root + sysfs_path: /host/sys + procfs_path: /host/proc + udev_data_path: /host/root/run/udev/data + + # collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/) + filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)" + filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$" + + scrape_integration: true + + include_exporter_metrics: true + enable_collectors: + - uname diff --git a/templates/Daemonset-GrafanaAgent.yaml b/templates/Daemonset-GrafanaAgent.yaml new file mode 100644 index 0000000..b04a6c3 --- /dev/null +++ b/templates/Daemonset-GrafanaAgent.yaml @@ -0,0 +1,136 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + metricsJob: node-exporter + cadvisormetricsJob: cadvisor + nodeExportermetricsJob: node + name: grafana-agent + namespace: "{{ .Values.nfc_monitoring.grafana_agent.namespace }}" +spec: + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + # managed-by is deliberately NOT a selector label: spec.selector is immutable once created. + app.kubernetes.io/part-of: {{ $.Chart.Name }} + # version is deliberately NOT a selector label: a chart version bump would make "helm upgrade" fail. + 
metricsJob: node-exporter + cadvisormetricsJob: cadvisor + nodeExportermetricsJob: node + template: + metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + metricsJob: node-exporter + cadvisormetricsJob: cadvisor + nodeExportermetricsJob: node + spec: + automountServiceAccountToken: true + containers: + - args: + - --server.http.address=0.0.0.0:12345 + - --config.file=/etc/agent/agent.yaml + name: grafana-agent + image: "{{ .Values.nfc_monitoring.grafana_agent.image.name }}:{{ .Values.nfc_monitoring.grafana_agent.image.tag }}" + #imagePullPolicy: Never + ports: + - containerPort: 12345 + name: grafana-metrics + protocol: TCP + resources: + limits: + cpu: 1000m + memory: 180Mi + requests: + cpu: 40m + memory: 180Mi + securityContext: + capabilities: + add: + - SYS_TIME + # drop: + # - ALL + readOnlyRootFilesystem: false + privileged: true + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: /host/proc + mountPropagation: HostToContainer + name: proc + readOnly: true + - mountPath: /host/root + mountPropagation: HostToContainer + name: rootfs + readOnly: true + - mountPath: /var/log + mountPropagation: HostToContainer + name: logs + readOnly: true + - name: config + mountPath: "/etc/agent" + readOnly: false + - name: temp + mountPath: "/tmp" + readOnly: false + - name: agent-data + mountPath: "/etc/agent/data" + readOnly: false + volumes: + - hostPath: + path: /sys + name: sys + - hostPath: + path: /proc + name: proc + - hostPath: + path: / + name: rootfs + - hostPath: + path: /var/log + name: logs + - name: config + configMap: + name: grafana-agent + items: + - key: "agent.yaml" + path: "agent.yaml" + - name: temp + emptyDir: {} + - name: agent-data + emptyDir: {} + 
+ - name: var-run + hostPath: + path: /var/run + - name: containerd + hostPath: + path: /var/lib/containerd + - name: disk + hostPath: + path: /dev/disk + + nodeSelector: + kubernetes.io/os: linux + hostNetwork: true + hostPID: true + priorityClassName: system-cluster-critical + serviceAccountName: grafana-agent + tolerations: + - operator: Exists diff --git a/templates/PrometheusRule-grafana-agent.yaml b/templates/PrometheusRule-grafana-agent.yaml new file mode 100644 index 0000000..6467f12 --- /dev/null +++ b/templates/PrometheusRule-grafana-agent.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: grafana-agent-promtail + name: grafana-agent + namespace: monitoring +spec: + groups: + - name: grafana_agent + rules: + # - annotations: + # description: "As Grafana Agent is being used, its version is set as promtail's" + - expr: | + agent_build_info + record: promtail_build_info diff --git a/templates/Service-GrafanaAgent.yaml b/templates/Service-GrafanaAgent.yaml new file mode 100644 index 0000000..fc070ab --- /dev/null +++ b/templates/Service-GrafanaAgent.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana-agent + namespace: monitoring + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + selector: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} + ports: + - name: 
grafana-metrics + port: 12345 + targetPort: grafana-metrics + - name: kube-ctrl-mgr + port: 11257 + targetPort: kube-ctrl-mgr + #type: LoadBalancer + sessionAffinity: ClientIP diff --git a/templates/ServiceAccount-GrafanaAgent.yaml b/templates/ServiceAccount-GrafanaAgent.yaml new file mode 100644 index 0000000..32363be --- /dev/null +++ b/templates/ServiceAccount-GrafanaAgent.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + name: grafana-agent + namespace: monitoring + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} From 955515de351d3b725bfbe8b3e7cb5f8edd328cb5 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:12:55 +0930 Subject: [PATCH 08/61] feat(service_monitor): use grafana agent for node exporter !1 --- templates/ServiceMonitor-Node.yaml | 43 ++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 templates/ServiceMonitor-Node.yaml diff --git a/templates/ServiceMonitor-Node.yaml b/templates/ServiceMonitor-Node.yaml new file mode 100644 index 0000000..1e21be8 --- /dev/null +++ b/templates/ServiceMonitor-Node.yaml @@ -0,0 +1,43 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + #cluster: dev + name: node + namespace: monitoring +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 10s + honorLabels: true + path: /metrics + port: grafana-metrics + scheme: http + relabelings: + - action: replace + regex: (.*) + replacement: 
$1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + - targetLabel: "job" + replacement: "node-exporter" + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: node + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} From bb0521275581f44f6000829316f0660cde52c5ec Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:13:11 +0930 Subject: [PATCH 09/61] feat(service_monitor): use grafana agent for node node-exporter !1 --- templates/ServiceMonitor-node-exporter.yaml | 44 +++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 templates/ServiceMonitor-node-exporter.yaml diff --git a/templates/ServiceMonitor-node-exporter.yaml b/templates/ServiceMonitor-node-exporter.yaml new file mode 100644 index 0000000..478326f --- /dev/null +++ b/templates/ServiceMonitor-node-exporter.yaml @@ -0,0 +1,44 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: node-exporter + namespace: monitoring +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 10s + honorLabels: true + path: /integrations/node_exporter/metrics + port: grafana-metrics + scheme: http + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + - targetLabel: "job" + replacement: "node-exporter" + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: node + tlsConfig: + insecureSkipVerify: true + 
selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} From 534377edbdafab9c3939288330fc3e845445c898 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:14:29 +0930 Subject: [PATCH 10/61] feat: alert manager deployment !1 --- templates/AlertManager-k8s.yaml | 39 +++++ .../PodDisruptionBudget-alertmanager.yaml | 21 +++ templates/PrometheusRule-alertmanager.yaml | 141 ++++++++++++++++++ templates/Secret-alertmanager.yaml | 61 ++++++++ templates/Service-alertmanager.yaml | 27 ++++ templates/ServiceAccount-alertmanager.yaml | 14 ++ templates/serviceMonitor-alertmanager.yaml | 29 ++++ 7 files changed, 332 insertions(+) create mode 100644 templates/AlertManager-k8s.yaml create mode 100644 templates/PodDisruptionBudget-alertmanager.yaml create mode 100644 templates/PrometheusRule-alertmanager.yaml create mode 100644 templates/Secret-alertmanager.yaml create mode 100644 templates/Service-alertmanager.yaml create mode 100644 templates/ServiceAccount-alertmanager.yaml create mode 100644 templates/serviceMonitor-alertmanager.yaml diff --git a/templates/AlertManager-k8s.yaml b/templates/AlertManager-k8s.yaml new file mode 100644 index 0000000..1c2f654 --- /dev/null +++ b/templates/AlertManager-k8s.yaml @@ -0,0 +1,39 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: Alertmanager +metadata: + labels: + app.kubernetes.io/instance: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +spec: + image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}" + nodeSelector: + kubernetes.io/os: linux + podMetadata: + 
labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + replicas: 3 + resources: + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 4m + memory: 100Mi + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: alertmanager-main + version: 0.25.0 diff --git a/templates/PodDisruptionBudget-alertmanager.yaml b/templates/PodDisruptionBudget-alertmanager.yaml new file mode 100644 index 0000000..43ed6fe --- /dev/null +++ b/templates/PodDisruptionBudget-alertmanager.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +spec: + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/PrometheusRule-alertmanager.yaml b/templates/PrometheusRule-alertmanager.yaml new file mode 100644 index 0000000..988dad9 --- /dev/null +++ b/templates/PrometheusRule-alertmanager.yaml @@ -0,0 +1,141 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ 
$.Chart.Version }} + prometheus: k8s + role: alert-rules + name: alertmanager-main-rules + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} has only found {{ `{{` }} $value }} members of the {{ `{{` }}$labels.job}} cluster. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
+ max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m])) + for: 15m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} failed to send {{ `{{` }} $value | humanizePercentage }} of notifications to {{ `{{` }} $labels.integration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. 
+ expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have different configurations. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. 
+ expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have been up for less than half of the last 5m.' 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="{{ .Values.nfc_monitoring.alert_manager.namespace }}"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/templates/Secret-alertmanager.yaml b/templates/Secret-alertmanager.yaml new file mode 100644 index 0000000..4e710be --- /dev/null +++ b/templates/Secret-alertmanager.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} 
+stringData: + alertmanager.yaml: |- + "global": + "resolve_timeout": "5m" + "inhibit_rules": + - "equal": + - "namespace" + - "alertname" + "source_matchers": + - 
"severity = critical" + "target_matchers": + - "severity =~ warning|info" + - "equal": + - "namespace" + - "alertname" + "source_matchers": + - "severity = warning" + "target_matchers": + - "severity = info" + - "equal": + - "namespace" + "source_matchers": + - "alertname = InfoInhibitor" + "target_matchers": + - "severity = info" + "receivers": + - "name": "Default" + - "name": "Watchdog" + - "name": "Critical" + - "name": "null" + "route": + "group_by": + - "namespace" + "group_interval": "5m" + "group_wait": "30s" + "receiver": "Default" + "repeat_interval": "12h" + "routes": + - "matchers": + - "alertname = Watchdog" + "receiver": "Watchdog" + - "matchers": + - "alertname = InfoInhibitor" + "receiver": "null" + - "matchers": + - "severity = critical" + "receiver": "Critical" +type: Opaque diff --git a/templates/Service-alertmanager.yaml b/templates/Service-alertmanager.yaml new file mode 100644 index 0000000..974e77e --- /dev/null +++ b/templates/Service-alertmanager.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +spec: + ports: + - name: web + port: 9093 + targetPort: web + - name: reloader-web + port: 8080 + targetPort: reloader-web + selector: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + sessionAffinity: ClientIP diff --git a/templates/ServiceAccount-alertmanager.yaml b/templates/ServiceAccount-alertmanager.yaml new file mode 100644 index 0000000..f8c73b3 --- /dev/null +++ b/templates/ServiceAccount-alertmanager.yaml @@ -0,0 +1,14 @@ +--- 
+apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} diff --git a/templates/serviceMonitor-alertmanager.yaml b/templates/serviceMonitor-alertmanager.yaml new file mode 100644 index 0000000..c7f08fc --- /dev/null +++ b/templates/serviceMonitor-alertmanager.yaml @@ -0,0 +1,29 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager + namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}" + +spec: + endpoints: + - interval: 30s + port: web + - interval: 30s + port: reloader-web + namespaceSelector: + matchNames: + - "{{ .Values.nfc_monitoring.alert_manager.namespace }}" + selector: + matchLabels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} From 81136232f773429b2e26ecedc79a2a660b2d9abc Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:16:55 +0930 Subject: [PATCH 11/61] feat: kubestate metrics deployment !1 --- templates/ClusterRole-kubeStateMetrics-1.yaml | 20 +++ templates/ClusterRole-kubeStateMetrics.yaml | 132 ++++++++++++++++++ templates/Deployment-kubeStateMetrics.yaml | 112 +++++++++++++++ templates/NetworkPolicy-kubeStateMetrics.yaml | 31 ++++ .../PrometheusRule-kubeStateMetrics.yaml | 67 
+++++++++ templates/Service-kubeStateMetrics.yaml | 28 ++++ .../ServiceAccount-kubeStateMetrics.yaml | 14 ++ .../ServiceMonitor-kubeStateMetrics.yaml | 46 ++++++ 8 files changed, 450 insertions(+) create mode 100644 templates/ClusterRole-kubeStateMetrics-1.yaml create mode 100644 templates/ClusterRole-kubeStateMetrics.yaml create mode 100644 templates/Deployment-kubeStateMetrics.yaml create mode 100644 templates/NetworkPolicy-kubeStateMetrics.yaml create mode 100644 templates/PrometheusRule-kubeStateMetrics.yaml create mode 100644 templates/Service-kubeStateMetrics.yaml create mode 100644 templates/ServiceAccount-kubeStateMetrics.yaml create mode 100644 templates/ServiceMonitor-kubeStateMetrics.yaml diff --git a/templates/ClusterRole-kubeStateMetrics-1.yaml b/templates/ClusterRole-kubeStateMetrics-1.yaml new file mode 100644 index 0000000..be81244 --- /dev/null +++ b/templates/ClusterRole-kubeStateMetrics-1.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}" diff --git a/templates/ClusterRole-kubeStateMetrics.yaml b/templates/ClusterRole-kubeStateMetrics.yaml new file mode 100644 index 0000000..15e72ba --- /dev/null +++ b/templates/ClusterRole-kubeStateMetrics.yaml @@ -0,0 +1,132 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + 
app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ 
$.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - serviceaccounts + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch +- apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - list + - watch +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - list + - watch +- apiGroups: + - storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + - ingressclasses + - ingresses + verbs: + - list + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - clusterroles + - rolebindings + - roles + verbs: + - list + - watch diff --git a/templates/Deployment-kubeStateMetrics.yaml b/templates/Deployment-kubeStateMetrics.yaml new file mode 
100644 index 0000000..cc60acc --- /dev/null +++ b/templates/Deployment-kubeStateMetrics.yaml @@ -0,0 +1,112 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: kube-state-metrics + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + spec: + automountServiceAccountToken: true + containers: + - args: + - --host=127.0.0.1 + - --port=8081 + - --telemetry-host=127.0.0.1 + - --telemetry-port=8082 + image: "{{ .Values.nfc_monitoring.kube_state_metrics.image.name }}:{{ .Values.nfc_monitoring.kube_state_metrics.image.tag }}" + name: kube-state-metrics + resources: + limits: + cpu: 200m + memory: 250Mi + requests: + cpu: 10m + memory: 190Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 65534 + - args: + - --logtostderr + - --secure-listen-address=:8443 + - 
--tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8081/ + image: quay.io/brancz/kube-rbac-proxy:v0.14.0 + name: kube-rbac-proxy-main + ports: + - containerPort: 8443 + name: https-main + resources: + limits: + cpu: 100m + memory: 40Mi + requests: + cpu: 20m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + - args: + - --logtostderr + - --secure-listen-address=:9443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8082/ + image: quay.io/brancz/kube-rbac-proxy:v0.14.0 + name: kube-rbac-proxy-self + ports: + - containerPort: 9443 + name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: kube-state-metrics diff --git a/templates/NetworkPolicy-kubeStateMetrics.yaml b/templates/NetworkPolicy-kubeStateMetrics.yaml new file mode 100644 index 0000000..a9b774f --- /dev/null +++ b/templates/NetworkPolicy-kubeStateMetrics.yaml @@ -0,0 +1,31 @@ +# apiVersion: networking.k8s.io/v1 +# kind: NetworkPolicy +# metadata: +# labels: +# app.kubernetes.io/component: exporter +# app.kubernetes.io/name: kube-state-metrics +# app.kubernetes.io/part-of: kube-prometheus +# app.kubernetes.io/version: 2.8.1 
+# name: kube-state-metrics +# namespace: monitoring +# spec: +# egress: +# - {} +# ingress: +# - from: +# - podSelector: +# matchLabels: +# app.kubernetes.io/name: prometheus +# ports: +# - port: 8443 +# protocol: TCP +# - port: 9443 +# protocol: TCP +# podSelector: +# matchLabels: +# app.kubernetes.io/component: exporter +# app.kubernetes.io/name: kube-state-metrics +# app.kubernetes.io/part-of: kube-prometheus +# policyTypes: +# - Egress +# - Ingress diff --git a/templates/PrometheusRule-kubeStateMetrics.yaml b/templates/PrometheusRule-kubeStateMetrics.yaml new file mode 100644 index 0000000..18d2bb0 --- /dev/null +++ b/templates/PrometheusRule-kubeStateMetrics.yaml @@ -0,0 +1,67 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: kube-state-metrics-rules + namespace: monitoring +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors + summary: kube-state-metrics is experiencing errors in list operations. + expr: | + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. 
This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors + summary: kube-state-metrics is experiencing errors in watch operations. + expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: | + stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. 
+ expr: | + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical diff --git a/templates/Service-kubeStateMetrics.yaml b/templates/Service-kubeStateMetrics.yaml new file mode 100644 index 0000000..6e4e7b2 --- /dev/null +++ b/templates/Service-kubeStateMetrics.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}" +spec: + clusterIP: None + ports: + - name: https-main + port: 8443 + targetPort: https-main + - name: https-self + port: 9443 + targetPort: https-self + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: {{ $.Chart.Name }} + diff --git a/templates/ServiceAccount-kubeStateMetrics.yaml b/templates/ServiceAccount-kubeStateMetrics.yaml new file mode 100644 index 0000000..c4f59ba --- /dev/null +++ b/templates/ServiceAccount-kubeStateMetrics.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}" diff --git a/templates/ServiceMonitor-kubeStateMetrics.yaml 
b/templates/ServiceMonitor-kubeStateMetrics.yaml new file mode 100644 index 0000000..20ca550 --- /dev/null +++ 
b/templates/ServiceMonitor-kubeStateMetrics.yaml @@ -0,0 +1,46 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}" +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kube_endpoint_address_not_ready|kube_endpoint_address_available + sourceLabels: + - __name__ + port: https-main + relabelings: + - action: labeldrop + regex: (pod|service|endpoint|namespace) + scheme: https + scrapeTimeout: 30s + tlsConfig: + insecureSkipVerify: true + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https-self + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: {{ $.Chart.Name }} From 762a9fa387be7e0cf9919075fd94705940a92a0b Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:19:47 +0930 Subject: [PATCH 12/61] feat: proxy deployment for kube scheduler controller metrics !1 --- templates/ClusterRole-kube-monitor-proxy.yaml | 21 +++ ...ClusterRoleBinding-kube-monitor-proxy.yaml | 20 +++ templates/Daemonset-kube-monitor-proxy.yaml | 136 ++++++++++++++++++ templates/Service-kube-monitor-proxy.yaml | 30 ++++ .../ServiceAccount-kube-monitor-proxy.yaml | 13 ++ ...erviceMonitor-kube-controller-manager.yaml | 82 +++++++++++ templates/ServiceMonitor-kube-scheduler.yaml | 36 +++++ 7 files changed, 338 
insertions(+) create mode 100644 
templates/ClusterRole-kube-monitor-proxy.yaml create mode 100644 templates/ClusterRoleBinding-kube-monitor-proxy.yaml create mode 100644 templates/Daemonset-kube-monitor-proxy.yaml create mode 100644 templates/Service-kube-monitor-proxy.yaml create mode 100644 templates/ServiceAccount-kube-monitor-proxy.yaml create mode 100644 templates/ServiceMonitor-kube-controller-manager.yaml create mode 100644 templates/ServiceMonitor-kube-scheduler.yaml diff --git a/templates/ClusterRole-kube-monitor-proxy.yaml b/templates/ClusterRole-kube-monitor-proxy.yaml new file mode 100644 index 0000000..e7b364a --- /dev/null +++ b/templates/ClusterRole-kube-monitor-proxy.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-monitor-proxy + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +rules: +- apiGroups: ["authentication.k8s.io"] + resources: + - tokenreviews + verbs: ["create"] +- apiGroups: ["authorization.k8s.io"] + resources: + - subjectaccessreviews + verbs: ["create"] diff --git a/templates/ClusterRoleBinding-kube-monitor-proxy.yaml b/templates/ClusterRoleBinding-kube-monitor-proxy.yaml new file mode 100644 index 0000000..0e4c1dd --- /dev/null +++ b/templates/ClusterRoleBinding-kube-monitor-proxy.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-monitor-proxy + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: 
kube-monitor-proxy
+subjects:
+- kind: ServiceAccount
+  name: kube-monitor-proxy
+  namespace: "{{ .Values.nfc_monitoring.kube_monitor_proxy.namespace }}"  # must name the namespace the ServiceAccount is created in
diff --git a/templates/Daemonset-kube-monitor-proxy.yaml b/templates/Daemonset-kube-monitor-proxy.yaml
new file mode 100644
index 0000000..a283196
--- /dev/null
+++ b/templates/Daemonset-kube-monitor-proxy.yaml
@@ -0,0 +1,136 @@
+---
+apiVersion: apps/v1
+kind: DaemonSet  # runs on control-plane nodes; two kube-rbac-proxy containers forward to local ports 10257 and 10259
+metadata:
+  labels:
+    app.kubernetes.io/component: proxy
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: kube-monitor-proxy
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+
+    nodeExportermetricsJob: node
+    name_kcm: kube-controller-manager
+
+  name: kube-monitor-proxy
+  namespace: "{{ .Values.nfc_monitoring.kube_monitor_proxy.namespace }}"
+spec:
+  selector:
+    matchLabels:
+      # Only release-stable labels belong here: spec.selector cannot be changed once the
+      # DaemonSet exists, so selecting on the chart version would break every chart upgrade.
+      app.kubernetes.io/component: proxy
+      app.kubernetes.io/instance: k8s
+      app.kubernetes.io/name: kube-monitor-proxy
+      app.kubernetes.io/part-of: {{ $.Chart.Name }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/component: proxy
+        app.kubernetes.io/instance: k8s
+        app.kubernetes.io/name: kube-monitor-proxy
+        app.kubernetes.io/managed-by: {{ $.Release.Service }}
+        app.kubernetes.io/part-of: {{ $.Chart.Name }}
+        app.kubernetes.io/version: {{ $.Chart.Version }}
+    spec:
+      automountServiceAccountToken: true
+      hostNetwork: true  # required so that 127.0.0.1 in --upstream is the node's loopback
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: node-role.kubernetes.io/control-plane  # schedule only where this node label exists
+                operator: Exists
+      containers:
+
+      - args:
+        - --logtostderr
+        - --v=10  # NOTE(review): very high log verbosity; looks like a debugging leftover - confirm
+        - --secure-listen-address=[$(IP)]:11257  # IP is status.podIP, which is the node IP under hostNetwork
+        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
+        - --upstream=https://127.0.0.1:10257/  # requests are forwarded to this node-local port
+        - --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        - --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        env:
+        - name: NODE-IP
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: status.hostIP
+        - name: IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+        image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}"
+        name: kube-rbac-proxy-kube-ctrl-mgr
+        ports:
+        - containerPort: 11257  # must equal the --secure-listen-address port; the Service resolves targetPort kube-ctrl-mgr to this number
+          #hostPort: 9100
+          name: kube-ctrl-mgr
+        resources:
+          limits:
+            cpu: 200m
+            memory: 256Mi
+          requests:
+            cpu: 10m
+            memory: 20Mi
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          readOnlyRootFilesystem: true
+          runAsGroup: 65532
+          runAsNonRoot: true
+          runAsUser: 65532
+
+
+      - args:
+        - --logtostderr
+        - --v=10  # NOTE(review): very high log verbosity; looks like a debugging leftover - confirm
+        - --secure-listen-address=[$(IP)]:11259  # IP is status.podIP, which is the node IP under hostNetwork
+        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
+        - --upstream=https://127.0.0.1:10259/  # requests are forwarded to this node-local port
+        - --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        - --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        env:
+        - name: NODE-IP
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: status.hostIP
+        - name: IP
+          valueFrom:
+            fieldRef:
+              fieldPath: status.podIP
+        image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}"
+        name: kube-rbac-proxy-kube-scheduler
+        ports:
+        - containerPort: 11259  # must equal the --secure-listen-address port; the Service resolves targetPort kube-scheduler to this number
+          #hostPort: 9100
+          name: kube-scheduler
+        resources:
+          limits:
+            cpu: 200m
+            memory: 256Mi
+          requests:
+            cpu: 10m
+            memory: 20Mi
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+            - ALL
+          readOnlyRootFilesystem: true
+          runAsGroup: 65532
+          runAsNonRoot: true
+          runAsUser: 65532
+      nodeSelector:
+        kubernetes.io/os: linux
+      hostPID: true  # NOTE(review): nothing in this manifest appears to need the host PID namespace - confirm and drop
+      priorityClassName: system-cluster-critical
+      serviceAccountName: kube-monitor-proxy
+      tolerations:
+      - operator: Exists  # tolerates every taint, so control-plane taints do not block scheduling
diff --git a/templates/Service-kube-monitor-proxy.yaml b/templates/Service-kube-monitor-proxy.yaml
new file mode 100644
index 0000000..a23d9dd
--- /dev/null
+++ b/templates/Service-kube-monitor-proxy.yaml
@@ -0,0 +1,30 @@
+---
+apiVersion: v1
+kind: Service  # fronts the kube-monitor-proxy pods; the name_kcm / name_ks labels are read by the ServiceMonitors to set the job label
+metadata:
+  name: kube-monitor-proxy
+  namespace: "{{ .Values.nfc_monitoring.kube_monitor_proxy.namespace }}"  # a Service only selects pods in its own namespace, so this follows the DaemonSet
+  labels:
+    app.kubernetes.io/component: proxy
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: kube-monitor-proxy
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+    name_kcm: kube-controller-manager
+    name_ks: kube-scheduler
+spec:
+  selector:
+    app.kubernetes.io/component: proxy
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: kube-monitor-proxy
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+
+  ports:
+  - name: kube-ctrl-mgr
+    port: 10257
+    targetPort: kube-ctrl-mgr  # named port: resolved against the container port of the same name
+  - name: kube-scheduler
+    port: 10259
+    targetPort: kube-scheduler  # named port: resolved against the container port of the same name
+  sessionAffinity: ClientIP
diff --git a/templates/ServiceAccount-kube-monitor-proxy.yaml b/templates/ServiceAccount-kube-monitor-proxy.yaml
new file mode 100644
index 0000000..b8e258f
--- /dev/null
+++ b/templates/ServiceAccount-kube-monitor-proxy.yaml
@@ -0,0 +1,13 @@
+---
+apiVersion: v1
+kind: ServiceAccount  # identity referenced by serviceAccountName in the kube-monitor-proxy DaemonSet
+metadata:
+  name: kube-monitor-proxy
+  namespace: "{{ .Values.nfc_monitoring.kube_monitor_proxy.namespace }}"  # pods can only use a ServiceAccount from their own namespace
+  labels:
+    app.kubernetes.io/component: proxy
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: kube-monitor-proxy
+    app.kubernetes.io/managed-by:
{{ $.Release.Service }}
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
diff --git a/templates/ServiceMonitor-kube-controller-manager.yaml b/templates/ServiceMonitor-kube-controller-manager.yaml
new file mode 100644
index 0000000..21d30f2
--- /dev/null
+++ b/templates/ServiceMonitor-kube-controller-manager.yaml
@@ -0,0 +1,82 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrapes the kube-ctrl-mgr port of the Service selected at the end of this manifest
+metadata:
+  labels:
+    app.kubernetes.io/component: proxy
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: kube-controller-manager
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+    name_kcm: kube-controller-manager
+  name: kube-controller-manager
+  namespace: monitoring
+spec:
+  endpoints:
+
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    interval: 10s
+    honorLabels: true  # on a label clash, the label carried by the scraped sample is kept
+    path: /metrics
+    port: kube-ctrl-mgr
+    scheme: https
+    # labels:
+    #   job: kube-controller-manager
+    relabelings:
+    # - action: replace
+    #   regex: (.*)
+    #   replacement: $1
+    #   sourceLabels:
+    #   - __meta_kubernetes_pod_node_name
+    #   targetLabel: instance
+    - action: replace  # copies the Service label name_kcm into the job label
+      regex: (.*)
+      replacement: $1
+      sourceLabels:
+      - __meta_kubernetes_service_label_name_kcm
+      targetLabel: job
+    metricRelabelings:  # every rule below drops series whose metric name (__name__) matches its regex
+    - action: drop
+      regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: transformation_(transformation_latencies_microseconds|failures_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex:
(admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_l
atency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + 
app.kubernetes.io/part-of: {{ $.Chart.Name }}
diff --git a/templates/ServiceMonitor-kube-scheduler.yaml b/templates/ServiceMonitor-kube-scheduler.yaml
new file mode 100644
index 0000000..71149ce
--- /dev/null
+++ b/templates/ServiceMonitor-kube-scheduler.yaml
@@ -0,0 +1,36 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrapes the kube-scheduler port of the kube-monitor-proxy Service
+metadata:
+  labels:
+    app.kubernetes.io/component: proxy
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: kube-scheduler
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+  name: kube-scheduler
+  namespace: monitoring
+spec:
+  endpoints:
+
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    interval: 10s
+    honorLabels: true  # on a label clash, the label carried by the scraped sample is kept
+    path: /metrics
+    port: kube-scheduler
+    scheme: https
+    relabelings:
+    - action: replace  # copies the Service label name_ks into the job label
+      regex: (.*)
+      replacement: $1
+      sourceLabels:
+      - __meta_kubernetes_service_label_name_ks
+      targetLabel: job
+    tlsConfig:
+      insecureSkipVerify: true  # the target's TLS certificate is not verified
+  selector:
+    matchLabels:  # selects the kube-monitor-proxy Service rather than a scheduler Service
+      app.kubernetes.io/instance: k8s
+      app.kubernetes.io/name: kube-monitor-proxy
+      app.kubernetes.io/part-of: {{ $.Chart.Name }}
From 53151ac6b960dff80e6168486ff3a0359745347c Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 19 Sep 2023 15:20:42 +0930
Subject: [PATCH 13/61] feat(grafana): alert manager data source !1

---
 templates/GrafanaDatasource-AlertManager.yaml | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 templates/GrafanaDatasource-AlertManager.yaml

diff --git a/templates/GrafanaDatasource-AlertManager.yaml b/templates/GrafanaDatasource-AlertManager.yaml
new file mode 100644
index 0000000..b954b84
--- /dev/null
+++ b/templates/GrafanaDatasource-AlertManager.yaml
@@ -0,0 +1,39 @@
+---
+apiVersion: grafana.integreatly.org/v1beta1
+kind: GrafanaDatasource  # declares an "alertmanager" data source for Grafana instances matched by instanceSelector
+metadata:
+  name: alertmanager
+  namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
+  labels:
+    app.kubernetes.io/component: graphing
+    app.kubernetes.io/instance: k8s
+    app.kubernetes.io/name: alertmanager
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+spec:
+  #secrets:
+  #  - credentials
+  instanceSelector:
+    matchLabels:
+      app.kubernetes.io/component: graphing
+      app.kubernetes.io/instance: k8s
+      app.kubernetes.io/name: grafana
+  allowCrossNamespaceImport: true
+  datasource:
+    name: alertmanager
+    type: alertmanager
+    access: proxy
+    # basicAuth: true
+    url: "http://alertmanager-main.{{ .Values.nfc_monitoring.alert_manager.namespace }}.svc:9093"  # in-cluster Service DNS name, plain HTTP on port 9093
+    isDefault: false
+    # user: admin
+    jsonData:
+      "tlsSkipVerify": true
+      "timeInterval": "5s"
+      "implementation": prometheus
+      "handleGrafanaManagedAlerts": false
+      # "orgId": 1
+    # secureJsonData:
+      # "password": admin
+    editable: true
\ No newline at end of file
From 8631e56028e05b7c8aab6d239114d1c3b3304862 Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 19 Sep 2023 15:21:25 +0930
Subject: [PATCH 14/61] feat(metrics): api service monitor !1

---
 templates/ServiceMonitor-APIServer.yaml | 78 +++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 templates/ServiceMonitor-APIServer.yaml

diff --git a/templates/ServiceMonitor-APIServer.yaml b/templates/ServiceMonitor-APIServer.yaml
new file mode 100644
index 0000000..7d423d9
--- /dev/null
+++ b/templates/ServiceMonitor-APIServer.yaml
@@ -0,0 +1,78 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrape definition named kube-apiserver
+metadata:
+  labels:
+    app.kubernetes.io/name: apiserver
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+  name: kube-apiserver
+  namespace: monitoring
+spec:
+  endpoints:
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    interval: 30s
+    metricRelabelings:  # every rule below drops series whose source labels match its regex
+    - action: drop
+      regex:
kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
+      sourceLabels:
+      - __name__
+    - action: drop  # drops the scheduler_* metric names listed in the regex
+      regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
+      sourceLabels:
+      - __name__
+    - action: drop  # drops the apiserver_* metric names listed in the regex
+      regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
+      sourceLabels:
+      - __name__
+    - action: drop  # drops the kubelet_docker_* metric names listed in the regex
+      regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
+      sourceLabels:
+      - __name__
+    - action: drop  # drops the reflector_* metric names listed in the regex
+      regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
+      sourceLabels:
+      - __name__
+    - action: drop  # drops the etcd_* metric names listed in the regex
+      regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
+      sourceLabels:
+      - __name__
+    - action: drop  # drops the transformation_* metric names listed in the regex
+      regex: transformation_(transformation_latencies_microseconds|failures_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex:
(admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_l
atency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|server).* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: 
apiserver_admission_step_admission_latencies_seconds_.*
+      sourceLabels:
+      - __name__
+    - action: drop  # __name__ and le are joined with ";": drops only the listed le buckets of apiserver_request_duration_seconds_bucket
+      regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
+      sourceLabels:
+      - __name__
+      - le
+    port: https
+    scheme: https
+    tlsConfig:
+      caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+      serverName: kubernetes  # name the server certificate is verified against
+  jobLabel: component  # the job label value is read from the "component" label of the selected Service
+  namespaceSelector:
+    matchNames:
+    - default
+  selector:
+    matchLabels:
+      component: apiserver
+      provider: kubernetes
From bf44961e02aa076f4dc0a5241359372d0e391b37 Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 19 Sep 2023 15:21:50 +0930
Subject: [PATCH 15/61] feat(metrics): CAdvisor service monitor !1

---
 templates/ServiceMonitor-Cadvisor.yaml | 52 ++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 templates/ServiceMonitor-Cadvisor.yaml

diff --git a/templates/ServiceMonitor-Cadvisor.yaml b/templates/ServiceMonitor-Cadvisor.yaml
new file mode 100644
index 0000000..1e8be30
--- /dev/null
+++ b/templates/ServiceMonitor-Cadvisor.yaml
@@ -0,0 +1,52 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrapes /metrics/cadvisor on the https-metrics port of the kubelet Service in kube-system
+metadata:
+  labels:
+    app.kubernetes.io/name: cadvisor
+    app.kubernetes.io/component: exporter
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+  name: cadvisor
+  namespace: monitoring
+spec:
+  endpoints:
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true  # on a label clash, the label carried by the scraped sample is kept
+    honorTimestamps: false  # timestamps exposed by the target are ignored
+    interval: 30s
+    metricRelabelings:
+    - action: drop
+      regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
+      sourceLabels:
+      - __name__
+    - action: drop  # trailing ";;" means: drop these metrics only when both pod and namespace labels are empty
+      regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
+      sourceLabels:
+      - __name__
+      - pod
+      - namespace
+    - action: drop  # ";.+" means: drop this metric only when the container label is non-empty
+      regex: (container_blkio_device_usage_total);.+
+      sourceLabels:
+      - __name__
+      - container
+    path: /metrics/cadvisor
+    port: https-metrics
+    #jobName: cadvisor
+    relabelings:
+    - sourceLabels:
+      - __metrics_path__
+      targetLabel: metrics_path  # records the scrape path as a metrics_path label
+    - targetLabel: "job"  # forces the job label to the fixed value "cadvisor"
+      replacement: "cadvisor"
+    scheme: https
+    tlsConfig:
+      insecureSkipVerify: true  # the target's TLS certificate is not verified
+  namespaceSelector:
+    matchNames:
+    - kube-system
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kubelet
From 8699d66347106ee85ac775acc625f069494b02b7 Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 19 Sep 2023 15:22:03 +0930
Subject: [PATCH 16/61] feat(metrics): CoreDNS service monitor !1

---
 templates/ServiceMonitor-CoreDNS.yaml | 28 +++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 templates/ServiceMonitor-CoreDNS.yaml

diff --git a/templates/ServiceMonitor-CoreDNS.yaml b/templates/ServiceMonitor-CoreDNS.yaml
new file mode 100644
index 0000000..5c96bb3
--- /dev/null
+++ b/templates/ServiceMonitor-CoreDNS.yaml
@@ -0,0 +1,28 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor  # scrapes the "metrics" port of Services labelled k8s-app=kube-dns in kube-system
+metadata:
+  labels:
+    app.kubernetes.io/name: coredns
+    app.kubernetes.io/part-of: {{ $.Chart.Name }}
+    app.kubernetes.io/version: {{ $.Chart.Version }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service }}
+  name: coredns
+  namespace: monitoring
+spec:
+  endpoints:
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    interval: 15s
+    metricRelabelings:
+    - action: drop  # discards the coredns_cache_misses_total series
+      regex: coredns_cache_misses_total
+      sourceLabels:
+      - __name__
+    port: metrics
+  jobLabel: app.kubernetes.io/name  # the job label value is read from this label of the selected Service
+  namespaceSelector:
+    matchNames:
+    - kube-system
+  selector:
+    matchLabels:
+      k8s-app: kube-dns
From 59316081cd02676b25e88c79c99bc142918b8c07 Mon Sep 17 00:00:00 2001
From: Jon
Date: Tue, 19 Sep 2023 15:22:15 +0930
Subject: [PATCH 17/61] feat(metrics): Kubelet service monitor !1

---
 templates/ServiceMonitor-Kubelet.yaml | 87
+++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 templates/ServiceMonitor-Kubelet.yaml diff --git a/templates/ServiceMonitor-Kubelet.yaml b/templates/ServiceMonitor-Kubelet.yaml new file mode 100644 index 0000000..b921127 --- /dev/null +++ b/templates/ServiceMonitor-Kubelet.yaml @@ -0,0 +1,87 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: kubelet + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers) + sourceLabels: + - __name__ + - action: drop + regex: 
kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionCont
roller_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condit
ion_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_endpoint_address_target_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/probes + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_endpoint_address_target_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kubelet From fe28c6a24cc786fbcc818f21f6dd971b7776110a Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:22:55 +0930 Subject: [PATCH 18/61] feat(metrics): Node-Exporter Prometheus rules !1 --- templates/PrometheusRule-nodeExporter.yaml | 318 +++++++++++++++++++++ 1 file changed, 318 insertions(+) create mode 100644 templates/PrometheusRule-nodeExporter.yaml diff --git a/templates/PrometheusRule-nodeExporter.yaml b/templates/PrometheusRule-nodeExporter.yaml new file mode 100644 index 0000000..5063919 --- /dev/null +++ b/templates/PrometheusRule-nodeExporter.yaml @@ -0,0 +1,318 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + 
app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: node-exporter-rules + namespace: monitoring +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. 
+ expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. 
+ expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. 
+ expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} receive errors in the last two minutes.' 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: | + rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: | + rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack entries is getting close to the limit. + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: | + node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock on {{ `{{` }} $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected.
+ expr: | + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock on {{ `{{` }} $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ `{{` }} $labels.device }}' on {{ `{{` }} $labels.instance }} is in degraded state due to one or more disk failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded + expr: | + node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{ `{{` }} $labels.instance }} failed. Array '{{ `{{` }} $labels.device }}' needs attention and possibly a disk swap.
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array + expr: | + node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. 
+ expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - name: node-exporter.rules + rules: + - expr: | + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) + ) + record: instance:node_cpu_utilisation:rate5m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + ( + node_memory_MemAvailable_bytes{job="node-exporter"} + or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + + node_memory_Cached_bytes{job="node-exporter"} + + + node_memory_MemFree_bytes{job="node-exporter"} + + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: 
instance:node_network_transmit_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate5m From b5d7a24d28f9f418d9bca5e47d64c18cd9309986 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:23:22 +0930 Subject: [PATCH 19/61] feat(metrics): Kubernetes control plane Prometheus rules !1 --- ...PrometheusRule-kubernetesControlPlane.yaml | 1441 +++++++++++++++++ 1 file changed, 1441 insertions(+) create mode 100644 templates/PrometheusRule-kubernetesControlPlane.yaml diff --git a/templates/PrometheusRule-kubernetesControlPlane.yaml b/templates/PrometheusRule-kubernetesControlPlane.yaml new file mode 100644 index 0000000..a41dc70 --- /dev/null +++ b/templates/PrometheusRule-kubernetesControlPlane.yaml @@ -0,0 +1,1441 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: kubernetes-monitoring-rules + namespace: monitoring +spec: + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: 'Pod {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }} ({{ `{{` }} $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. 
+ expr: | + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: | + sum by (namespace, pod, cluster) ( + max by(namespace, pod, cluster) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"} + ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( + 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. 
+ expr: | + ( + kube_deployment_spec_replicas{job="kube-state-metrics"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: StatefulSet has not matched the expected number of replicas. + expr: | + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.statefulset }} update has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out.
+ expr: | + ( + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.daemonset }} has not finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: pod/{{ `{{` }} $labels.pod }} in namespace {{ `{{` }} $labels.namespace }} on container {{ `{{` }} $labels.container}} has been in waiting state for longer than 1 hour. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: | + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ `{{` }} $value }} Pods of DaemonSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.daemonset }} are not scheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ `{{` }} $value }} Pods of DaemonSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.daemonset }} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobNotCompleted + annotations: + description: Job {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.job_name }} is taking more than {{ `{{` }} "43200" | humanizeDuration }} to complete. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted + summary: Job did not complete in time + expr: | + time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"} + and + kube_job_status_active{job="kube-state-metrics"} > 0) > 43200 + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: | + kube_job_failed{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched desired number of replicas. 
+ expr: | + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: | + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Pods by {{ `{{` }} $value }} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. 
+ expr: | + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + and + (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Pods by {{ `{{` }} $value | humanize }} bytes and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + and + (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) + / + sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. 
+ expr: | + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) + / + sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ `{{` }} $labels.namespace }} is using {{ `{{` }} $value | humanizePercentage }} of its {{ `{{` }} $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ `{{` }} $labels.namespace }} is using {{ `{{` }} $value | humanizePercentage }} of its {{ `{{` }} $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ `{{` }} $labels.namespace }} is using {{ `{{` }} $value | humanizePercentage }} of its {{ `{{` }} $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. 
+ expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} throttling of CPU in namespace {{ `{{` }} $labels.namespace }} for container {{ `{{` }} $labels.container }} in pod {{ `{{` }} $labels.pod }}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: | + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: info + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} is only {{ `{{` }} $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} is expected to fill up within four days. Currently {{ `{{` }} $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: The PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} only has {{ `{{` }} $value | humanizePercentage }} free inodes. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. + expr: | + ( + kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{job="kubelet", metrics_path="/metrics"} > 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} is expected to run out of inodes within four days. Currently {{ `{{` }} $value | humanizePercentage }} of its inodes are free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. 
+ expr: | + ( + kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ `{{` }} $labels.persistentvolume }} has status {{ `{{` }} $labels.phase }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{ `{{` }} $value }} different semantic versions of Kubernetes components running. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. + expr: | + count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ `{{` }} $labels.job }}/{{ `{{` }} $labels.instance }}' is experiencing {{ `{{` }} $value | humanizePercentage }} errors. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace) + / + sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) + > 0.01 + for: 15m + labels: + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + long: 1d + severity: warning + short: 2h + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + long: 3d + severity: warning + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + for: 5m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + for: 5m + labels: + severity: critical + - alert: KubeAggregatedAPIErrors + annotations: + description: Kubernetes aggregated API {{ `{{` }} $labels.name }}/{{ `{{` }} $labels.namespace }} has reported errors. It has appeared unavailable {{ `{{` }} $value | humanize }} times averaged over the past 10m. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors + summary: Kubernetes aggregated API has reported errors. + expr: | + sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 + labels: + severity: warning + - alert: KubeAggregatedAPIDown + annotations: + description: Kubernetes aggregated API {{ `{{` }} $labels.name }}/{{ `{{` }} $labels.namespace }} has been only {{ `{{` }} $value | humanize }}% available over the last 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown + summary: Kubernetes aggregated API is down. + expr: | + (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPITerminatedRequests + annotations: + description: The kubernetes apiserver has terminated {{ `{{` }} $value | humanizePercentage }} of its incoming requests. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests + summary: The kubernetes apiserver has terminated {{ `{{` }} $value | humanizePercentage }} of its incoming requests. 
+ expr: | + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{ `{{` }} $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready + summary: Node is not ready. + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{ `{{` }} $labels.node }} is unreachable and some workloads may be rescheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable + summary: Node is unreachable. + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{ `{{` }} $labels.node }}' is running at {{ `{{` }} $value | humanizePercentage }} of its Pod capacity. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods + summary: Kubelet is running at capacity. 
+ expr: | + count by(cluster, node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) + / + max by(cluster, node) ( + kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 + ) > 0.95 + for: 15m + labels: + severity: info + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ `{{` }} $labels.node }} has changed {{ `{{` }} $value }} times in the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping + summary: Node readiness status is flapping. + expr: | + sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ `{{` }} $value }} seconds on node {{ `{{` }} $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ `{{` }} $value }} seconds on node {{ `{{` }} $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. 
+ expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ `{{` }} $labels.node }} has failed to renew its client certificate ({{ `{{` }} $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. + expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ `{{` }} $labels.node }} has failed to renew its server certificate ({{ `{{` }} $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. + expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-controller-manager + rules: + - alert: KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical + - name: kube-apiserver-burnrate.rules + rules: + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) 
(rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum by (cluster) 
(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d])) + 
) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h])) + + + sum by (cluster) 
(rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) 
(rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: 
apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + record: apiserver_request:burnrate6h + - name: kube-apiserver-histogram.rules + rules: + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) 
(rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - interval: 3m + name: kube-apiserver-availability.rules + rules: + - expr: | + avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h])) + record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h + - expr: | + sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d + - expr: | + sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h])) + record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h + - expr: | + sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d + - expr: | + 1 - ( + ( + # write too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) 
(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + ( + # read too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + ) + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: | + 1 - ( + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + # too slow + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: | + 1 - ( + ( + # too slow + sum by (cluster) 
(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - name: k8s.rules + rules: + - expr: | + sum by (cluster, namespace, pod, container) ( + irate(container_cpu_usage_seconds_total{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on (cluster, namespace, pod) group_left(node) 
topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + - expr: | + container_memory_working_set_bytes{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: | + container_memory_rss{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss + - expr: | + container_memory_cache{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache + - expr: | + container_memory_swap{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap + - expr: | + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + 
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum + - expr: | + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} 
== 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: job + record: namespace_workload_pod:kube_pod_owner:relabel + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, 
sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + 
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - name: node.rules + rules: + - expr: | + topk by(cluster, namespace, pod) (1, + max by (cluster, node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (cluster, node) ( + node_cpu_seconds_total{mode="idle",job="node-exporter"} + * on (namespace, pod) group_left(node) + topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) + ) + record: node:node_num_cpu:sum + - expr: | + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + record: :node_memory_MemAvailable_bytes:sum + - expr: | + avg by (cluster, node) ( + sum without (mode) ( + rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m]) + ) + ) + record: node:node_cpu_utilization:ratio_rate5m + - expr: | + avg by (cluster) ( + node:node_cpu_utilization:ratio_rate5m + ) + record: cluster:node_cpu:ratio_rate5m + - name: kubelet.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, 
sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile From 9880c5d988fce19aa59aa143c3c2c6e9cf5df88b Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:23:42 +0930 Subject: [PATCH 20/61] feat(metrics): Kubeprometheus Prometheus rules !1 --- templates/PrometheusRule-kubePrometheus.yaml | 86 ++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 templates/PrometheusRule-kubePrometheus.yaml diff --git a/templates/PrometheusRule-kubePrometheus.yaml b/templates/PrometheusRule-kubePrometheus.yaml new file mode 100644 index 0000000..42ad467 --- /dev/null +++ b/templates/PrometheusRule-kubePrometheus.yaml @@ -0,0 +1,86 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: kube-prometheus-rules + namespace: monitoring +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + description: '{{ `{{` }} printf "%.4g" $value }}% of the {{ `{{` }} $labels.job }}/{{ `{{` }} $labels.service }} targets in {{ `{{` }} $labels.namespace }} namespace are down.' 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. + expr: vector(1) + labels: + severity: none + - alert: InfoInhibitor + annotations: + description: | + This is an alert that is used to inhibit info alerts. + By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with + other alerts. + This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a + severity of 'warning' or 'critical' starts firing on the same namespace. + This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor + summary: Info-level alert inhibition. 
+ expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 + labels: + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{ `{{` }} $labels.device }}" changing its up status often on node-exporter {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping + summary: Network interface is often changing its status + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 From 38730de0a7816e48a88f91179e914b69a01864e9 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:25:36 +0930 Subject: 
[PATCH 21/61] chore: initial chart and values yaml !1 --- Chart.yaml | 26 +++++++++++ values.yaml | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 Chart.yaml create mode 100644 values.yaml diff --git a/Chart.yaml b/Chart.yaml new file mode 100644 index 0000000..8948a1e --- /dev/null +++ b/Chart.yaml @@ -0,0 +1,26 @@ +apiVersion: v2 +name: nfc-monitoring +description: | + This helm chart installs the required components for full stack monitoring. + + The purpose of this chart is to provide all components required to monitor kubernetes within its entirety. + Storage of the monitoring data is not part of this chart nor will it be included. For this reason this chart can be added and removed without losing your monitoring data. + + For this chart to function correctly the following operators are required: + + - [grafana](https://grafana-operator.github.io/grafana-operator/docs/installation/helm/) + + - [prometheus](https://prometheus-operator.dev/docs/user-guides/getting-started/) + + Full Docs available at http://nofusscomputing.com/projects/kubernetes_monitoring. + +type: application + + +version: 0.1.0 + + +appVersion: "0.1.0" + + +dependencies: [] diff --git a/values.yaml b/values.yaml new file mode 100644 index 0000000..e52eea1 --- /dev/null +++ b/values.yaml @@ -0,0 +1,126 @@ +--- + +# All values within this helm chart values.yaml file are under namespace `nfc_monitoring`.
+# this provides the opportunity to include this helm chart as a dependency without +# variable collision + +nfc_monitoring: + + kubernetes: + cluster_dns_name: cluster.local + + + alert_manager: + image: + name: quay.io/prometheus/alertmanager + tag: 'v0.25.0' + namespace: alerting + + + grafana: + image: + name: grafana/grafana + tag: '10.0.5' + namespace: grafana + replicas: 1 + + admin_user: admin + admin_password: admin + + + grafana_agent: + image: + name: grafana/agent + tag: 'v0.36.1' + namespace: monitoring + + + loki: + image: + name: grafana/loki + tag: 2.7.4 + namespace: loki + service_name: loki + + + kube_monitor_proxy: + namespace: monitoring + + + kube_rbac_proxy: + # This image is used as part of kube-monitor-proxy. + image: + name: quay.io/brancz/kube-rbac-proxy + tag: 'v0.14.2' + + + kube_state_metrics: + image: + name: registry.k8s.io/kube-state-metrics/kube-state-metrics + tag: 'v2.8.1' + namespace: monitoring + + + prometheus: + image: + name: prom/prometheus + tag: 'v2.42.0' + namespace: monitoring + + + prometheus_adaptor: + image: + name: registry.k8s.io/prometheus-adapter/prometheus-adapter + tag: 'v0.11.1' + namespace: monitoring + + +loki_instance: + image: + name: grafana/loki + tag: 2.7.4 + # tag: 2.9.0 + namespace: loki + + +oncall_instance: + image: + name: grafana/oncall + tag: v1.1.40 + + +# oncall: + +# # image: +# # # Grafana OnCall docker image repository +# # repository: grafana/oncall +# # tag: v1.1.38 +# # pullPolicy: Always + +# service: +# enabled: false +# type: LoadBalancer +# port: 8080 +# annotations: {} + +# engine: +# replicaCount: 1 +# resources: +# limits: +# cpu: 100m +# memory: 128Mi +# requests: +# cpu: 100m +# memory: 128Mi + +# celery: +# replicaCount: 1 +# resources: +# limits: +# cpu: 100m +# memory: 128Mi +# requests: +# cpu: 100m +# memory: 128Mi +# database: +# type: none From fe2cc1352a1bd5dd49f2d3a3ac64b12916ad9dbf Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 15:54:49 +0930 Subject: [PATCH 
22/61] feat(metrics): ceph service monitor to enable use feature flag in values !1 --- templates/ServiceMonitor-ceph.yaml | 39 ++++++++++++++++++++++++++++++ values.yaml | 15 ++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 templates/ServiceMonitor-ceph.yaml diff --git a/templates/ServiceMonitor-ceph.yaml b/templates/ServiceMonitor-ceph.yaml new file mode 100644 index 0000000..71989d6 --- /dev/null +++ b/templates/ServiceMonitor-ceph.yaml @@ -0,0 +1,39 @@ +--- +{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}} + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: ceph + app.kubernetes.io/component: storage + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: ceph + namespace: "{{ .Values.nfc_monitoring.additions.ceph.namespace }}" +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: http-metrics + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - targetLabel: "job" + replacement: "ceph" + scheme: http + targetLabels: + - cluster + selector: +{{ .Values.nfc_monitoring.additions.ceph.ServiceMonitor.selector | toYaml | indent 4 }} + + +{{- end -}} diff --git a/values.yaml b/values.yaml index e52eea1..4543015 100644 --- a/values.yaml +++ b/values.yaml @@ -74,6 +74,21 @@ nfc_monitoring: tag: 'v0.11.1' namespace: monitoring + + additions: + + ceph: + + enabled: true + + namespace: ceph + + ServiceMonitor: + + selector: + matchLabels: + app: rook-ceph-mgr + loki_instance: image: From 16965c73046c4a4d463b5a36e246920fcad3f3fe Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 16:04:14 +0930 Subject: [PATCH 23/61] feat(metrics): ceph prometheus 
rules to enable use feature flag in values !1 --- templates/prometheusRule-ceph.yaml | 673 +++++++++++++++++++++++++++++ values.yaml | 2 + 2 files changed, 675 insertions(+) create mode 100644 templates/prometheusRule-ceph.yaml diff --git a/templates/prometheusRule-ceph.yaml b/templates/prometheusRule-ceph.yaml new file mode 100644 index 0000000..dc53b82 --- /dev/null +++ b/templates/prometheusRule-ceph.yaml @@ -0,0 +1,673 @@ +--- +{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}} + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: storage + app.kubernetes.io/name: ceph + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + prometheus: k8s + role: alert-rules + name: ceph-rules + namespace: monitoring +spec: + groups: + - name: "cluster health" + rules: + - alert: "CephHealthError" + annotations: + description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information." + summary: "Ceph is in the ERROR state" + expr: "ceph_health_status == 2" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.2.1" + severity: "critical" + type: "ceph_default" + - alert: "CephHealthWarning" + annotations: + description: "The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information." + summary: "Ceph is in the WARNING state" + expr: "ceph_health_status == 1" + for: "15m" + labels: + severity: "warning" + type: "ceph_default" + - name: "mon" + rules: + - alert: "CephMonDownQuorumAtRisk" + annotations: + description: "{{ `{{` }} $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ `{{` }} $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. 
The following monitors are down: {{ `{{` }}- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" + summary: "Monitor quorum is at risk" + expr: | + ( + (ceph_health_detail{name="MON_DOWN"} == 1) * on() ( + count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1) + ) + ) == 1 + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.3.1" + severity: "critical" + type: "ceph_default" + - alert: "CephMonDown" + annotations: + description: | + {{ `{{` }} $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ `{{` }} $s := "" }}{{ `{{` }} if gt $down 1.0 }}{{ `{{` }} $s = "s" }}{{ `{{` }} end }}You have {{ `{{` }} $down }} monitor{{ `{{` }} $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{ `{{` }}- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }} + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" + summary: "One or more monitors down" + expr: | + count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1) + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephMonDiskspaceCritical" + annotations: + description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. 
Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{ `{{` }}- range query \"ceph_mon_metadata\"}} - {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit" + summary: "Filesystem space on at least one monitor is critically low" + expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.3.2" + severity: "critical" + type: "ceph_default" + - alert: "CephMonDiskspaceLow" + annotations: + description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{ `{{` }}- range query \"ceph_mon_metadata\"}} - {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low" + summary: "Drive space on at least one monitor is approaching full" + expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephMonClockSkew" + annotations: + description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. 
This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew" + summary: "Clock skew detected among monitors" + expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - name: "osd" + rules: + - alert: "CephOSDDownHigh" + annotations: + description: "{{ `{{` }} $value | humanize }}% or {{ `{{` }} with query \"count(ceph_osd_up == 0)\" }}{{ `{{` }} . | first | value }}{{ `{{` }} end }} of {{ `{{` }} with query \"count(ceph_osd_up)\" }}{{ `{{` }} . | first | value }}{{ `{{` }} end }} OSDs are down (>= 10%). The following OSDs are down: {{ `{{` }}- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + summary: "More than 10% of OSDs are down" + expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.1" + severity: "critical" + type: "ceph_default" + - alert: "CephOSDHostDown" + annotations: + description: "The following OSDs are down: {{ `{{` }}- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ `{{` }} .Labels.hostname }} : {{ `{{` }} .Labels.ceph_daemon }} {{ `{{` }}- end }}" + summary: "An OSD host is offline" + expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.8" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDDown" + annotations: + description: | + {{ `{{` }} $num := query "count(ceph_osd_up == 0)" | first | value }}{{ `{{` }} $s := "" }}{{ `{{` }} if gt $num 1.0 }}{{ `{{` }} $s = "s" }}{{ `{{` }} end }}{{ `{{` }} $num }} OSD{{ `{{` }} $s }} down for over 5mins. 
The following OSD{{ `{{` }} $s }} {{ `{{` }} if eq $s "" }}is{{ `{{` }} else }}are{{ `{{` }} end }} down: {{ `{{` }}- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }} + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down" + summary: "An OSD has been marked down" + expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.2" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDNearFull" + annotations: + description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull" + summary: "OSD(s) running low on free space (NEARFULL)" + expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.3" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDFull" + annotations: + description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full" + summary: "OSD full, writes blocked" + expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.6" + severity: "critical" + type: "ceph_default" + - alert: "CephOSDBackfillFull" + annotations: + description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. 
Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull" + summary: "OSD(s) too full for backfill operations" + expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTooManyRepairs" + annotations: + description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs" + summary: "OSD reports a high number of read errors" + expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTimeoutsPublicNetwork" + annotations: + description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs." + summary: "Network issues delaying OSD heartbeats (public network)" + expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTimeoutsClusterNetwork" + annotations: + description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs." 
+ summary: "Network issues delaying OSD heartbeats (cluster network)" + expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDInternalDiskSizeMismatch" + annotations: + description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch" + summary: "OSD size inconsistency error" + expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDeviceFailurePredicted" + annotations: + description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info <device id>'. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2" + summary: "Device(s) predicted to fail soon" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDeviceFailurePredictionTooHigh" + annotations: + description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated."
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany" + summary: "Too many devices are predicted to fail, unable to resolve" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.7" + severity: "critical" + type: "ceph_default" + - alert: "CephDeviceFailureRelocationIncomplete" + annotations: + description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use" + summary: "Device failure is predicted, but unable to relocate data" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDFlapping" + annotations: + description: "OSD {{ `{{` }} $labels.ceph_daemon }} on {{ `{{` }} $labels.hostname }} was marked down and back up {{ `{{` }} $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)." 
+ documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds" + summary: "Network issues are causing OSDs to flap (mark each other down)" + expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.4" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDReadErrors" + annotations: + description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors" + summary: "Device read errors detected" + expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGImbalance" + annotations: + description: "OSD {{ `{{` }} $labels.ceph_daemon }} on {{ `{{` }} $labels.hostname }} deviates by more than 30% from average PG count." + summary: "PGs are not balanced across OSDs" + expr: | + abs( + ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / + on (job) group_left avg(ceph_osd_numpg > 0) by (job) + ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.5" + severity: "warning" + type: "ceph_default" + - name: "mds" + rules: + - alert: "CephFilesystemDamaged" + annotations: + description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" + summary: "CephFS filesystem is damaged." 
+ expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.1" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemOffline" + annotations: + description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down" + summary: "CephFS filesystem is offline" + expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.3" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemDegraded" + annotations: + description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded" + summary: "CephFS filesystem is degraded" + expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.4" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemMDSRanksLow" + annotations: + description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max" + summary: "Ceph MDS daemon count is lower than configured" + expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephFilesystemInsufficientStandby" + annotations: + description: "The current number of standby daemons is less than the minimum number required by standby_count_wanted. Adjust the standby count or increase the number of MDS daemons."
+ documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby" + summary: "Ceph filesystem standby daemons too few" + expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephFilesystemFailureNoStandby" + annotations: + description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds" + summary: "MDS daemon failed, no further standby available" + expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.5" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemReadOnly" + annotations: + description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" + summary: "CephFS filesystem in read only mode due to write error(s)" + expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.2" + severity: "critical" + type: "ceph_default" + - name: "mgr" + rules: + - alert: "CephMgrModuleCrash" + annotations: + description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure." 
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash" + summary: "A manager module has recently crashed" + expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.6.1" + severity: "critical" + type: "ceph_default" + - alert: "CephMgrPrometheusModuleInactive" + annotations: + description: "The mgr/prometheus module at {{ `{{` }} $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module, metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'." + summary: "The mgr/prometheus module is not available" + expr: "up{job=\"ceph\"} == 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.6.2" + severity: "critical" + type: "ceph_default" + - name: "pgs" + rules: + - alert: "CephPGsInactive" + annotations: + description: "{{ `{{` }} $value }} PGs have been inactive for more than 5 minutes in pool {{ `{{` }} $labels.name }}. Inactive placement groups are not able to serve read/write requests." + summary: "One or more placement groups are inactive" + expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.1" + severity: "critical" + type: "ceph_default" + - alert: "CephPGsUnclean" + annotations: + description: "{{ `{{` }} $value }} PGs have been unclean for more than 15 minutes in pool {{ `{{` }} $labels.name }}. Unclean PGs have not recovered from a previous failure."
+ summary: "One or more placement groups are marked unclean" + expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0" + for: "15m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.2" + severity: "warning" + type: "ceph_default" + - alert: "CephPGsDamaged" + annotations: + description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair <pgid>' command." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged" + summary: "Placement group damaged, manual intervention needed" + expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.4" + severity: "critical" + type: "ceph_default" + - alert: "CephPGRecoveryAtRisk" + annotations: + description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full" + summary: "OSDs are too full for recovery" + expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.5" + severity: "critical" + type: "ceph_default" + - alert: "CephPGUnavailableBlockingIO" + annotations: + description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O."
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability" + summary: "PG is unavailable, blocking I/O" + expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.3" + severity: "critical" + type: "ceph_default" + - alert: "CephPGBackfillAtRisk" + annotations: + description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full" + summary: "Backfill operations are blocked due to lack of free space" + expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.6" + severity: "critical" + type: "ceph_default" + - alert: "CephPGNotScrubbed" + annotations: + description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub <pgid>" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed" + summary: "Placement group(s) have not been scrubbed" + expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGsHighPerOSD" + annotations: + description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate.
You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs" + summary: "Placement groups per OSD is too high" + expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGNotDeepScrubbed" + annotations: + description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed" + summary: "Placement group(s) have not been deep scrubbed" + expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - name: "nodes" + rules: + - alert: "CephNodeRootFilesystemFull" + annotations: + description: "Root volume is dangerously full: {{ `{{` }} $value | humanize }}% free." + summary: "Root filesystem is dangerously full" + expr: "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100 < 5" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.1" + severity: "critical" + type: "ceph_default" + - alert: "CephNodeNetworkPacketDrops" + annotations: + description: "Node {{ `{{` }} $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ `{{` }} $labels.device }}." 
+ summary: "One or more NICs reports packet drops" + expr: | + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.2" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeNetworkPacketErrors" + annotations: + description: "Node {{ `{{` }} $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ `{{` }} $labels.device }}." + summary: "One or more NICs reports packet errors" + expr: | + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.3" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeNetworkBondDegraded" + annotations: + summary: "Degraded Bond on Node {{ `{{` }} $labels.instance }}" + description: "Bond {{ `{{` }} $labels.master }} is degraded on Node {{ `{{` }} $labels.instance }}." + expr: | + node_bonding_slaves - node_bonding_active != 0 + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephNodeDiskspaceWarning" + annotations: + description: "Mountpoint {{ `{{` }} $labels.mountpoint }} on {{ `{{` }} $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate." 
+ summary: "Host filesystem free space is getting low" + expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.4" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeInconsistentMTU" + annotations: + description: "Node {{ `{{` }} $labels.instance }} has a different MTU size ({{ `{{` }} $value }}) than the median of devices named {{ `{{` }} $labels.device }}." + summary: "MTU settings across Ceph hosts are inconsistent" + expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )" + labels: + severity: "warning" + type: "ceph_default" + - name: "pools" + rules: + - alert: "CephPoolGrowthWarning" + annotations: + description: "Pool '{{ `{{` }} $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours." + summary: "Pool growth rate may soon exceed capacity" + expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.9.2" + severity: "warning" + type: "ceph_default" + - alert: "CephPoolBackfillFull" + annotations: + description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity." 
+ summary: "Free space in a pool is too low for recovery/backfill" + expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPoolFull" + annotations: + description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{ `{{` }}- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ `{{` }} .Labels.name }} at {{ `{{` }} .Value }}% {{ `{{` }}- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full" + summary: "Pool is full - writes are blocked" + expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.9.1" + severity: "critical" + type: "ceph_default" + - alert: "CephPoolNearFull" + annotations: + description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set-quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active."
+ summary: "One or more Ceph pools are nearly full" + expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - name: "healthchecks" + rules: + - alert: "CephSlowOps" + annotations: + description: "{{ `{{` }} $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" + summary: "OSD operations are slow to complete" + expr: "ceph_healthcheck_slow_ops > 0" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDaemonSlowOps" + for: "30s" + expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" + labels: + severity: 'warning' + type: 'ceph_default' + annotations: + summary: "{{ `{{` }} $labels.ceph_daemon }} operations are slow to complete" + description: "{{ `{{` }} $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" + - name: "cephadm" + rules: + - alert: "CephadmUpgradeFailed" + annotations: + description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue" + summary: "Ceph version upgrade has failed" + expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.11.2" + severity: "critical" + type: "ceph_default" + - alert: "CephadmDaemonFailed" + annotations: + description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. 
You may start daemons with the 'ceph orch daemon start <daemon_id>' command." + summary: "A ceph daemon managed by cephadm is down" + expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.11.1" + severity: "critical" + type: "ceph_default" + - alert: "CephadmPaused" + annotations: + description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'" + documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused" + summary: "Orchestration tasks via cephadm are PAUSED" + expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - name: "PrometheusServer" + rules: + - alert: "PrometheusJobMissing" + annotations: + description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance." + summary: "The scrape job for Ceph is missing from Prometheus" + expr: "absent(up{job=\"ceph\"})" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.12.1" + severity: "critical" + type: "ceph_default" + - name: "rados" + rules: + - alert: "CephObjectMissing" + annotations: + description: "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified."
+ documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound" + summary: "Object(s) marked UNFOUND" + expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.1" + severity: "critical" + type: "ceph_default" + - name: "generic" + rules: + - alert: "CephDaemonCrash" + annotations: + description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive <id>' command." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash" + summary: "One or more Ceph daemons have crashed, and are pending acknowledgement" + expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.1.2" + severity: "critical" + type: "ceph_default" + +{{- end -}} \ No newline at end of file diff --git a/values.yaml b/values.yaml index 4543015..311c999 100644 --- a/values.yaml +++ b/values.yaml @@ -83,6 +83,8 @@ nfc_monitoring: namespace: ceph + PrometheusRules: true + ServiceMonitor: selector: From 6c8298b0bb57e00f7aa7ee0deae7d9175c9a5347 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 16:14:54 +0930 Subject: [PATCH 24/61] feat(metrics): loki service monitor to enable use feature flag in values !1 --- templates/ServiceMonitor-Loki.yaml | 37 ++++++++++++++++++++++++++++++ values.yaml | 11 +++++++++ 2 files changed, 48 insertions(+) create mode 100644 templates/ServiceMonitor-Loki.yaml diff --git a/templates/ServiceMonitor-Loki.yaml b/templates/ServiceMonitor-Loki.yaml new file mode 100644 index 0000000..5a90a5c --- /dev/null +++ b/templates/ServiceMonitor-Loki.yaml @@ -0,0 +1,37 @@ +--- +{{- if .Values.nfc_monitoring.loki.enabled | default false -}} + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor
+metadata: + labels: + app.kubernetes.io/name: loki + app.kubernetes.io/component: logging + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + + name: loki + namespace: {{ .Values.nfc_monitoring.loki.namespace }} +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: loki + # relabelings: + # - action: replace + # regex: (.*) + # replacement: $1 + # sourceLabels: + # - __meta_kubernetes_pod_node_name + # targetLabel: instance + scheme: http + # tlsConfig: + # insecureSkipVerify: true + targetLabels: + - cluster + jobLabel: app.kubernetes.io/name + selector: +{{ .Values.nfc_monitoring.loki.ServiceMonitor.selector | toYaml | indent 4 }} + +{{- end -}} diff --git a/values.yaml b/values.yaml index 311c999..81fa8cb 100644 --- a/values.yaml +++ b/values.yaml @@ -36,12 +36,23 @@ nfc_monitoring: loki: + + enabled: false + image: name: grafana/loki tag: 2.7.4 + namespace: loki + service_name: loki + ServiceMonitor: + selector: + matchLabels: + app.kubernetes.io/name: loki + app.kubernetes.io/component: logging + kube_monitor_proxy: namespace: monitoring From 8f49ab08b4759921b6577384b5456940d5026288 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 16:29:07 +0930 Subject: [PATCH 25/61] feat(grafana): loki data source to enable use feature flag in values !1 --- templates/GrafanaDatasource-loki.yaml | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 templates/GrafanaDatasource-loki.yaml diff --git a/templates/GrafanaDatasource-loki.yaml b/templates/GrafanaDatasource-loki.yaml new file mode 100644 index 0000000..18e9f5f --- /dev/null +++ b/templates/GrafanaDatasource-loki.yaml @@ -0,0 +1,41 @@ +--- +{{- if .Values.nfc_monitoring.loki.enabled | default false -}} + +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: loki + namespace: "{{ 
.Values.nfc_monitoring.grafana.namespace }}" + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: loki + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + #secrets: + # - credentials + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + allowCrossNamespaceImport: true + datasource: + name: loki + type: loki + access: proxy + # basicAuth: true + url: "http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:3100" + isDefault: false + # user: admin + jsonData: + # "tlsSkipVerify": true + "timeInterval": "5s" + "orgId": 1 + # secureJsonData: + # "password": admin + editable: true + +{{- end -}} \ No newline at end of file From 6ffcc76acf44e3fa124d21a5f2c3b6d169d342b3 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 17:01:02 +0930 Subject: [PATCH 26/61] docs: added initial docs !1 --- docs/projects/kubernetes_monitoring/index.md | 61 ++++++++++++++++++-- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/docs/projects/kubernetes_monitoring/index.md b/docs/projects/kubernetes_monitoring/index.md index f7f07fc..9fc69af 100644 --- a/docs/projects/kubernetes_monitoring/index.md +++ b/docs/projects/kubernetes_monitoring/index.md @@ -1,18 +1,71 @@ --- title: Kubernetes Monitoring Helm Chart description: How to use No Fuss Computings kubernetes monitoring helm chart for full stack monitoring. -date: 2023-08-29 +date: 2023-09-19 template: project.html about: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring --- +This Helm chart deploys the components needed for full stack monitoring.
-## Running the container +What this chart is: -To quickly setup a container the following `docker-compose.yaml` file could be used. +- Collectors for metrics and logging -``` yaml title="docker-compose.yaml" linenums="1" +- Distribution / Routing of metrics/logging + +- Data visualization + +What this chart is not: + +- A complete monitoring stack inclusive of long term storage. + +Intentionally we have kept this helm chart to the monitoring components only. Keeping the monitoring components separate from storage is advantageous. For all intents and purposes this helm chart is ephemeral. + +This helm chart started off with components from multiple open-source projects. As such attribution is warranted, so head on over to their projects and give them a star. Projects used to create the building blocks of this helm chart are + +- [Kube prometheus](https://github.com/prometheus-operator/kube-prometheus) + +- [prometheus adapter](https://github.com/kubernetes-sigs/prometheus-adapter) +## Features + +- Prometheus as the metrics collector + +- Grafana-Agent as the daemonset for node and node-exporter metrics and also as the logging ingestor for loki + +- Grafana as the visualization frontend + + +## Installation + +This chart has the following **dependencies:** + +- [Grafana Operator](https://artifacthub.io/packages/olm/community-operators/grafana-operator) + +- [Prometheus Operator](https://artifacthub.io/packages/olm/community-operators/prometheus) + +These dependencies must be present before installing this chart. To install, run the following commands: + +``` bash + +git clone -b development https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring.git + +helm upgrade -i nfc_monitoring kubernetes_monitoring/ + +``` + + +## Template Values + +The values file included in this helm chart is below. + +!!! note + This values file is from the development branch and as such may not reflect the current version you have deployed.
If you require a specific version, head on over to the git repository and select the git tag that matches the version you are after. + +``` yaml title="values.yaml" linenums="1" + +--8<-- "values.yaml" ``` From a615b4922f59024c17056f6f6bc802ff2188c787 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 19:02:04 +0930 Subject: [PATCH 27/61] feat(alertmanager): added dashboard !1 --- templates/GrafanaDashboard-AlertManager.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 templates/GrafanaDashboard-AlertManager.yaml diff --git a/templates/GrafanaDashboard-AlertManager.yaml b/templates/GrafanaDashboard-AlertManager.yaml new file mode 100644 index 0000000..cc9ef9e --- /dev/null +++ b/templates/GrafanaDashboard-AlertManager.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: alertmanager + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} +spec: + allowCrossNamespaceImport: true + folder: No Fuss Monitoring + resyncPeriod: 1d + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + grafanaCom: + id: 9578 + revision: 4 # as @ 19-09-23 \ No newline at end of file From 1dc85b45180b56c7ed8f2a27246f0771f8672939 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 19:02:17 +0930 Subject: [PATCH 28/61] feat(ceph): added dashboard !1 --- templates/GrafanaDashboard-Ceph.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 templates/GrafanaDashboard-Ceph.yaml diff --git a/templates/GrafanaDashboard-Ceph.yaml b/templates/GrafanaDashboard-Ceph.yaml new file mode 100644 index 0000000..38d735d --- /dev/null +++ b/templates/GrafanaDashboard-Ceph.yaml @@ -0,0 +1,21 @@ +--- +{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}} +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: ceph + namespace: {{ 
.Values.nfc_monitoring.grafana.namespace }} +spec: + allowCrossNamespaceImport: true + folder: No Fuss Monitoring + resyncPeriod: 1d + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + grafanaCom: + id: 2842 + revision: 17 # as @ 19-09-23 + +{{- end -}} \ No newline at end of file From 9e3f33ae5679e92b1d5097fb808943c59f31aa3b Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 19:02:32 +0930 Subject: [PATCH 29/61] feat(node): added dashboard !1 --- .../GrafanaDashboard-node-exporter-full.yaml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 templates/GrafanaDashboard-node-exporter-full.yaml diff --git a/templates/GrafanaDashboard-node-exporter-full.yaml b/templates/GrafanaDashboard-node-exporter-full.yaml new file mode 100644 index 0000000..b5b9abe --- /dev/null +++ b/templates/GrafanaDashboard-node-exporter-full.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: node-exporter + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} +spec: + allowCrossNamespaceImport: true + folder: No Fuss Monitoring + resyncPeriod: 1d + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + grafanaCom: + id: 1860 + revision: 32 # as @ 19-09-23 \ No newline at end of file From 63c0381bd15b077537667979ec8e3dd2e6503090 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 23:18:27 +0930 Subject: [PATCH 30/61] feat(metrics): Calico service and monitor added !1 --- docs/projects/kubernetes_monitoring/index.md | 6 ++++ templates/Service-CalicoMetrics.yaml | 20 +++++++++++ templates/ServiceMonitor-Calico.yaml | 36 ++++++++++++++++++++ 3 files changed, 62 insertions(+) create mode 100644 templates/Service-CalicoMetrics.yaml create mode 100644 templates/ServiceMonitor-Calico.yaml diff --git 
a/docs/projects/kubernetes_monitoring/index.md b/docs/projects/kubernetes_monitoring/index.md index 9fc69af..bbc4e72 100644 --- a/docs/projects/kubernetes_monitoring/index.md +++ b/docs/projects/kubernetes_monitoring/index.md @@ -37,6 +37,12 @@ This helm chart started off with components from multiple open-source projects. - Grafana as the visualization frontend +- Service monitors + + - Calico, _if Selected_ + + > enable calico metrics with `kubectl patch felixconfiguration default --type merge --patch '{"spec":{"prometheusMetricsEnabled": true}}'` + ## Installation diff --git a/templates/Service-CalicoMetrics.yaml b/templates/Service-CalicoMetrics.yaml new file mode 100644 index 0000000..e81c30b --- /dev/null +++ b/templates/Service-CalicoMetrics.yaml @@ -0,0 +1,20 @@ +--- +{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}} + +apiVersion: v1 +kind: Service +metadata: + name: calico-metrics + namespace: kube-system + labels: + k8s-app: calico-node +spec: + clusterIP: None + selector: + k8s-app: calico-node + ports: + - port: 9091 + name: metrics + targetPort: 9091 + +{{- end -}} diff --git a/templates/ServiceMonitor-Calico.yaml b/templates/ServiceMonitor-Calico.yaml new file mode 100644 index 0000000..d40a74e --- /dev/null +++ b/templates/ServiceMonitor-Calico.yaml @@ -0,0 +1,36 @@ +--- +{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}} + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: calico + app.kubernetes.io/component: networking + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + k8s-app: calico-node + name: calico + namespace: kube-system +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: metrics + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - 
__meta_kubernetes_pod_node_name + targetLabel: instance + scheme: http + # tlsConfig: + # insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + k8s-app: calico-node + +{{- end -}} From aba3cb22a97d180185eef2350988720542cb1298 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 23:19:14 +0930 Subject: [PATCH 31/61] feat(loki): removed service monitor a service monitor is included with loki !1 --- templates/ServiceMonitor-Loki.yaml | 37 ------------------------------ 1 file changed, 37 deletions(-) delete mode 100644 templates/ServiceMonitor-Loki.yaml diff --git a/templates/ServiceMonitor-Loki.yaml b/templates/ServiceMonitor-Loki.yaml deleted file mode 100644 index 5a90a5c..0000000 --- a/templates/ServiceMonitor-Loki.yaml +++ /dev/null @@ -1,37 +0,0 @@ ---- -{{- if .Values.nfc_monitoring.loki.enabled | default false -}} - -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - app.kubernetes.io/name: loki - app.kubernetes.io/component: logging - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/version: {{ $.Chart.Version }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - - name: loki - namespace: {{ .Values.nfc_monitoring.loki.namespace }} -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 15s - port: loki - # relabelings: - # - action: replace - # regex: (.*) - # replacement: $1 - # sourceLabels: - # - __meta_kubernetes_pod_node_name - # targetLabel: instance - scheme: http - # tlsConfig: - # insecureSkipVerify: true - targetLabels: - - cluster - jobLabel: app.kubernetes.io/name - selector: -{{ .Values.nfc_monitoring.loki.ServiceMonitor.selector | toYaml | indent 4 }} - -{{- end -}} From a28126555f594aa805016c5948151e860e5c93ad Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 23:19:45 +0930 Subject: [PATCH 32/61] feat(loki): add prometheus alerts and rules for mixins !1 --- templates/PrometheusRule-loki.yaml | 
112 +++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 templates/PrometheusRule-loki.yaml diff --git a/templates/PrometheusRule-loki.yaml b/templates/PrometheusRule-loki.yaml new file mode 100644 index 0000000..ef5c24f --- /dev/null +++ b/templates/PrometheusRule-loki.yaml @@ -0,0 +1,112 @@ +--- +{{- if .Values.nfc_monitoring.loki.enabled | default false -}} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: logging + app.kubernetes.io/name: loki + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: loki + namespace: monitoring +spec: + groups: + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: 
cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + 
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + message: | + {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + message: | + {{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics. + expr: | + sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + message: | + {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + message: | + {{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. 
+ expr: | + sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + for: 5m + labels: + severity: warning + +{{- end -}} From 4ced83b90d98e4805550cb2b5fda55c705e8c8bb Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 19 Sep 2023 23:20:57 +0930 Subject: [PATCH 33/61] feat(loik): be able to specify full loki url --- .helmignore | 7 +++++++ templates/ConfigMap-GrafanaAgent.yaml | 27 +++++++++++++++++++++++++-- templates/GrafanaDatasource-loki.yaml | 2 +- values.yaml | 4 +++- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/.helmignore b/.helmignore index 0e8a0eb..bda285e 100644 --- a/.helmignore +++ b/.helmignore @@ -21,3 +21,10 @@ .idea/ *.tmproj .vscode/ + +# git repo files +monitoring-mixins/ +ceph-mixins/ +gitlab-ci/ +website-template/ +docs/ diff --git a/templates/ConfigMap-GrafanaAgent.yaml b/templates/ConfigMap-GrafanaAgent.yaml index d866aee..70c1544 100644 --- a/templates/ConfigMap-GrafanaAgent.yaml +++ b/templates/ConfigMap-GrafanaAgent.yaml @@ -25,7 +25,7 @@ data: - name: journal clients: - - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:3100/loki/api/v1/push + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push scrape_configs: - job_name: systemd-journal @@ -202,7 +202,7 @@ data: - name: kubernetes clients: - - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:3100/loki/api/v1/push + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push # basic_auth: # 
username: YOUR_LOKI_USERNAME # password: YOUR_LOKI_PASSWORD @@ -279,3 +279,26 @@ data: include_exporter_metrics: true enable_collectors: - uname + + + syslog_server.yaml: | + # REF: https://grafana.com/docs/loki/latest/send-data/promtail/configuration/#example-syslog-config + server: + http_listen_port: 9080 + grpc_listen_port: 0 + + positions: + filename: /tmp/syslog_server_positions.yaml + + clients: + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push + + scrape_configs: + - job_name: syslog + syslog: + listen_address: 0.0.0.0:1514 + labels: + job: "syslog" + relabel_configs: + - source_labels: ['__syslog_message_hostname'] + target_label: 'host' diff --git a/templates/GrafanaDatasource-loki.yaml b/templates/GrafanaDatasource-loki.yaml index 18e9f5f..0b2dfc7 100644 --- a/templates/GrafanaDatasource-loki.yaml +++ b/templates/GrafanaDatasource-loki.yaml @@ -27,7 +27,7 @@ spec: type: loki access: proxy # basicAuth: true - url: "http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:3100" + url: "http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}" isDefault: false # user: admin jsonData: diff --git a/values.yaml b/values.yaml index 81fa8cb..418bff3 100644 --- a/values.yaml +++ b/values.yaml @@ -8,6 +8,7 @@ nfc_monitoring: kubernetes: cluster_dns_name: cluster.local + networking: calico alert_manager: @@ -45,7 +46,8 @@ nfc_monitoring: namespace: loki - service_name: loki + service_name: loki-gateway + service_port: 80 ServiceMonitor: selector: From 4e742ee2d5dab3e2781d62e075f4a8adcf9c6b2a Mon Sep 17 00:00:00 2001 From: Jon Date: Thu, 21 
Sep 2023 01:00:53 +0930 Subject: [PATCH 34/61] feat(metrics): node scraper set to 5s !1 --- templates/ServiceMonitor-Node.yaml | 2 +- templates/ServiceMonitor-node-exporter.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/ServiceMonitor-Node.yaml b/templates/ServiceMonitor-Node.yaml index 1e21be8..e1d32c3 100644 --- a/templates/ServiceMonitor-Node.yaml +++ b/templates/ServiceMonitor-Node.yaml @@ -15,7 +15,7 @@ spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 10s + interval: 5s honorLabels: true path: /metrics port: grafana-metrics diff --git a/templates/ServiceMonitor-node-exporter.yaml b/templates/ServiceMonitor-node-exporter.yaml index 478326f..9620458 100644 --- a/templates/ServiceMonitor-node-exporter.yaml +++ b/templates/ServiceMonitor-node-exporter.yaml @@ -14,7 +14,7 @@ spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 10s + interval: 5s honorLabels: true path: /integrations/node_exporter/metrics port: grafana-metrics From 64d612047f35349d149c8c0d4d1732908284ca9f Mon Sep 17 00:00:00 2001 From: Jon Date: Thu, 21 Sep 2023 01:01:25 +0930 Subject: [PATCH 35/61] refactor: use loki ns var !1 --- templates/Role-SpecificNamespaces-prometheus.yaml | 2 +- templates/RoleBinding-SpecificNamespaces-prometheus.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/Role-SpecificNamespaces-prometheus.yaml b/templates/Role-SpecificNamespaces-prometheus.yaml index dd91da8..f212f70 100644 --- a/templates/Role-SpecificNamespaces-prometheus.yaml +++ b/templates/Role-SpecificNamespaces-prometheus.yaml @@ -172,7 +172,7 @@ items: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: loki + namespace: {{ .Values.nfc_monitoring.loki.namespace }} rules: - apiGroups: - "" diff --git a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml 
b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml index b157aa7..7973ab0 100644 --- a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml +++ b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml @@ -99,7 +99,7 @@ items: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: loki + namespace: {{ .Values.nfc_monitoring.loki.namespace }} roleRef: apiGroup: rbac.authorization.k8s.io kind: Role From cbac640af3ffdb16c6b78ecfcf86205b7804e0fd Mon Sep 17 00:00:00 2001 From: Jon Date: Thu, 21 Sep 2023 01:02:00 +0930 Subject: [PATCH 36/61] feat(promtail): filter devices to not include temp/virtual !1 --- templates/ConfigMap-GrafanaAgent.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/templates/ConfigMap-GrafanaAgent.yaml b/templates/ConfigMap-GrafanaAgent.yaml index 70c1544..1b37e0a 100644 --- a/templates/ConfigMap-GrafanaAgent.yaml +++ b/templates/ConfigMap-GrafanaAgent.yaml @@ -272,7 +272,12 @@ data: # collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/) filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)" - filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$" + #filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$" + filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" + + + netclass_ignored_devices:
"^(veth.*|cali.*|[a-f0-9]{15})$" + netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$" scrape_integration: true From 3f2204cf3132deea4e4750ec7cf97e512986423e Mon Sep 17 00:00:00 2001 From: Jon Date: Thu, 21 Sep 2023 01:02:41 +0930 Subject: [PATCH 37/61] feat(deployment): grafana-agent dnspoly clusterfirst !1 --- templates/Daemonset-GrafanaAgent.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/templates/Daemonset-GrafanaAgent.yaml b/templates/Daemonset-GrafanaAgent.yaml index b04a6c3..56281ec 100644 --- a/templates/Daemonset-GrafanaAgent.yaml +++ b/templates/Daemonset-GrafanaAgent.yaml @@ -92,6 +92,7 @@ spec: - name: agent-data mountPath: "/etc/agent/data" readOnly: false + dnsPolicy: ClusterFirst volumes: - hostPath: path: /sys From 3281cd6552339ee16c7ae94ae9676f26c3d8000e Mon Sep 17 00:00:00 2001 From: Jon Date: Thu, 21 Sep 2023 12:53:34 +0930 Subject: [PATCH 38/61] refactor(prometheus): use values ns var !1 --- templates/ClusterRoleBinding-prometheus.yaml | 2 +- templates/PodDisruptionBudget-prometheus.yaml | 2 +- templates/PrometheusRule-alertmanager.yaml | 2 +- templates/PrometheusRule-grafana-agent.yaml | 2 +- templates/PrometheusRule-grafana.yaml | 2 +- templates/PrometheusRule-kubePrometheus.yaml | 2 +- .../PrometheusRule-kubeStateMetrics.yaml | 2 +- ...PrometheusRule-kubernetesControlPlane.yaml | 2 +- templates/PrometheusRule-loki.yaml | 2 +- templates/PrometheusRule-nodeExporter.yaml | 2 +- templates/PrometheusRule-prometheus.yaml | 2 +- .../Role-SpecificNamespaces-prometheus.yaml | 2 +- templates/RoleBinding-Config-prometheus.yaml | 4 ++-- ...Binding-SpecificNamespaces-prometheus.yaml | 20 +++++++++---------- templates/RoleConfig-prometheus.yaml | 2 +- templates/Service-prometheus.yaml | 2 +- templates/ServiceAccount-prometheus.yaml | 2 +- templates/ServiceMonitor-prometheus.yaml | 2 +- templates/prometheusRule-ceph.yaml | 2 +- 19 files changed, 29 insertions(+), 29 deletions(-) diff --git a/templates/ClusterRoleBinding-prometheus.yaml 
b/templates/ClusterRoleBinding-prometheus.yaml index 5468c07..b54fa91 100644 --- a/templates/ClusterRoleBinding-prometheus.yaml +++ b/templates/ClusterRoleBinding-prometheus.yaml @@ -17,4 +17,4 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} diff --git a/templates/PodDisruptionBudget-prometheus.yaml b/templates/PodDisruptionBudget-prometheus.yaml index 9ee858e..7a5374d 100644 --- a/templates/PodDisruptionBudget-prometheus.yaml +++ b/templates/PodDisruptionBudget-prometheus.yaml @@ -10,7 +10,7 @@ metadata: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: minAvailable: 1 selector: diff --git a/templates/PrometheusRule-alertmanager.yaml b/templates/PrometheusRule-alertmanager.yaml index 988dad9..95ef2a1 100644 --- a/templates/PrometheusRule-alertmanager.yaml +++ b/templates/PrometheusRule-alertmanager.yaml @@ -12,7 +12,7 @@ metadata: prometheus: k8s role: alert-rules name: alertmanager-main-rules - namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: alertmanager.rules diff --git a/templates/PrometheusRule-grafana-agent.yaml b/templates/PrometheusRule-grafana-agent.yaml index 6467f12..640733c 100644 --- a/templates/PrometheusRule-grafana-agent.yaml +++ b/templates/PrometheusRule-grafana-agent.yaml @@ -11,7 +11,7 @@ metadata: prometheus: k8s role: grafana-agent-promtail name: grafana-agent - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: grafana_agent diff --git a/templates/PrometheusRule-grafana.yaml b/templates/PrometheusRule-grafana.yaml index 71f3d0d..b505e52 100644 --- a/templates/PrometheusRule-grafana.yaml +++ 
b/templates/PrometheusRule-grafana.yaml @@ -11,7 +11,7 @@ metadata: prometheus: k8s role: alert-rules name: grafana-rules - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: GrafanaAlerts diff --git a/templates/PrometheusRule-kubePrometheus.yaml b/templates/PrometheusRule-kubePrometheus.yaml index 42ad467..268491b 100644 --- a/templates/PrometheusRule-kubePrometheus.yaml +++ b/templates/PrometheusRule-kubePrometheus.yaml @@ -11,7 +11,7 @@ metadata: prometheus: k8s role: alert-rules name: kube-prometheus-rules - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: general.rules diff --git a/templates/PrometheusRule-kubeStateMetrics.yaml b/templates/PrometheusRule-kubeStateMetrics.yaml index 18d2bb0..6df5352 100644 --- a/templates/PrometheusRule-kubeStateMetrics.yaml +++ b/templates/PrometheusRule-kubeStateMetrics.yaml @@ -11,7 +11,7 @@ metadata: prometheus: k8s role: alert-rules name: kube-state-metrics-rules - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: kube-state-metrics diff --git a/templates/PrometheusRule-kubernetesControlPlane.yaml b/templates/PrometheusRule-kubernetesControlPlane.yaml index a41dc70..6845600 100644 --- a/templates/PrometheusRule-kubernetesControlPlane.yaml +++ b/templates/PrometheusRule-kubernetesControlPlane.yaml @@ -10,7 +10,7 @@ metadata: prometheus: k8s role: alert-rules name: kubernetes-monitoring-rules - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: kubernetes-apps diff --git a/templates/PrometheusRule-loki.yaml b/templates/PrometheusRule-loki.yaml index ef5c24f..13a424f 100644 --- a/templates/PrometheusRule-loki.yaml +++ b/templates/PrometheusRule-loki.yaml @@ -12,7 +12,7 @@ metadata: prometheus: k8s role: alert-rules name: loki - namespace: monitoring + namespace: {{ 
.Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: loki_rules diff --git a/templates/PrometheusRule-nodeExporter.yaml b/templates/PrometheusRule-nodeExporter.yaml index 5063919..24bfbe2 100644 --- a/templates/PrometheusRule-nodeExporter.yaml +++ b/templates/PrometheusRule-nodeExporter.yaml @@ -11,7 +11,7 @@ metadata: prometheus: k8s role: alert-rules name: node-exporter-rules - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: node-exporter diff --git a/templates/PrometheusRule-prometheus.yaml b/templates/PrometheusRule-prometheus.yaml index e5f0d5d..28e9b36 100644 --- a/templates/PrometheusRule-prometheus.yaml +++ b/templates/PrometheusRule-prometheus.yaml @@ -11,7 +11,7 @@ metadata: prometheus: k8s role: alert-rules name: prometheus-k8s-prometheus-rules - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: prometheus diff --git a/templates/Role-SpecificNamespaces-prometheus.yaml b/templates/Role-SpecificNamespaces-prometheus.yaml index f212f70..959efde 100644 --- a/templates/Role-SpecificNamespaces-prometheus.yaml +++ b/templates/Role-SpecificNamespaces-prometheus.yaml @@ -90,7 +90,7 @@ items: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} rules: - apiGroups: - "" diff --git a/templates/RoleBinding-Config-prometheus.yaml b/templates/RoleBinding-Config-prometheus.yaml index 06b9035..918804f 100644 --- a/templates/RoleBinding-Config-prometheus.yaml +++ b/templates/RoleBinding-Config-prometheus.yaml @@ -10,7 +10,7 @@ metadata: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s-config - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} roleRef: apiGroup: 
rbac.authorization.k8s.io kind: Role @@ -18,4 +18,4 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} diff --git a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml index 7973ab0..d5b4dcc 100644 --- a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml +++ b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml @@ -20,7 +20,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -41,7 +41,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} @@ -64,7 +64,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - apiVersion: rbac.authorization.k8s.io/v1 @@ -86,7 +86,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -107,7 +107,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -128,7 +128,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} @@ -175,7 +175,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: @@ -195,7 +195,7 @@ items: subjects: - kind: 
ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: @@ -215,7 +215,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: @@ -235,7 +235,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} diff --git a/templates/RoleConfig-prometheus.yaml b/templates/RoleConfig-prometheus.yaml index 28da8b3..3628555 100644 --- a/templates/RoleConfig-prometheus.yaml +++ b/templates/RoleConfig-prometheus.yaml @@ -10,7 +10,7 @@ metadata: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s-config - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} rules: - apiGroups: - "" diff --git a/templates/Service-prometheus.yaml b/templates/Service-prometheus.yaml index 5a6d35e..c44eef8 100644 --- a/templates/Service-prometheus.yaml +++ b/templates/Service-prometheus.yaml @@ -10,7 +10,7 @@ metadata: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: ports: - name: web diff --git a/templates/ServiceAccount-prometheus.yaml b/templates/ServiceAccount-prometheus.yaml index d1f3a21..871071f 100644 --- a/templates/ServiceAccount-prometheus.yaml +++ b/templates/ServiceAccount-prometheus.yaml @@ -11,4 +11,4 @@ metadata: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} diff 
--git a/templates/ServiceMonitor-prometheus.yaml b/templates/ServiceMonitor-prometheus.yaml index 3512aca..824d709 100644 --- a/templates/ServiceMonitor-prometheus.yaml +++ b/templates/ServiceMonitor-prometheus.yaml @@ -9,7 +9,7 @@ metadata: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: endpoints: - interval: 30s diff --git a/templates/prometheusRule-ceph.yaml b/templates/prometheusRule-ceph.yaml index dc53b82..61c279b 100644 --- a/templates/prometheusRule-ceph.yaml +++ b/templates/prometheusRule-ceph.yaml @@ -13,7 +13,7 @@ metadata: prometheus: k8s role: alert-rules name: ceph-rules - namespace: monitoring + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} spec: groups: - name: "cluster health" From 8b966b0f0ac33571128c959ed02d985cc805668f Mon Sep 17 00:00:00 2001 From: Jon Date: Thu, 21 Sep 2023 18:55:49 +0930 Subject: [PATCH 39/61] feat(grafana-agent): expand env vars in config this allows using the hosts env vars in the config file !1 --- templates/Daemonset-GrafanaAgent.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/templates/Daemonset-GrafanaAgent.yaml b/templates/Daemonset-GrafanaAgent.yaml index 56281ec..6a77656 100644 --- a/templates/Daemonset-GrafanaAgent.yaml +++ b/templates/Daemonset-GrafanaAgent.yaml @@ -44,6 +44,7 @@ spec: - args: - --server.http.address=0.0.0.0:12345 - --config.file=/etc/agent/agent.yaml + - --config.expand-env=true name: grafana-agent image: "{{ .Values.nfc_monitoring.grafana_agent.image.name }}:{{ .Values.nfc_monitoring.grafana_agent.image.tag }}" #imagePullPolicy: Never From 2331bcbba35d561b22b35f7aa8c19cea087999fd Mon Sep 17 00:00:00 2001 From: Jon Date: Thu, 21 Sep 2023 19:20:43 +0930 Subject: [PATCH 40/61] feat(logging): add log files from /var/log !1 --- templates/ConfigMap-GrafanaAgent.yaml | 68 +++++++++++++++++++++++++-- 1 file changed, 
65 insertions(+), 3 deletions(-) diff --git a/templates/ConfigMap-GrafanaAgent.yaml b/templates/ConfigMap-GrafanaAgent.yaml index 1b37e0a..1debdec 100644 --- a/templates/ConfigMap-GrafanaAgent.yaml +++ b/templates/ConfigMap-GrafanaAgent.yaml @@ -116,6 +116,66 @@ data: - labels: level: + + - name: log-files + clients: + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push + positions: + filename: /tmp/var-logfile.yaml + target_config: + sync_period: 10s + scrape_configs: + + - job_name: var-logs + static_configs: + - targets: [localhost] + labels: + job_name: 'aptitude' + __path__: /var/log/apt/*.log + - targets: [localhost] + labels: + job_name: 'dpkg' + __path__: /var/log/dpkg.log + - targets: [localhost] + labels: + job_name: 'messages' + __path__: /var/log/messages + - targets: [localhost] + labels: + job_name: 'syslog' + __path__: /var/log/syslog + - targets: [localhost] + labels: + job_name: 'kernlog' + __path__: /var/log/kern.log + - targets: [localhost] + labels: + job_name: 'auth-log' + __path__: /var/log/auth.log + - targets: [localhost] + labels: + job_name: 'boot-log' + __path__: /var/log/boot.log + - targets: [localhost] + labels: + job_name: 'daemon-log' + __path__: /var/log/daemon.log + - targets: [localhost] + labels: + job_name: 'faillog-log' + __path__: /var/log/faillog + - targets: [localhost] + labels: + job_name: 'lastlog-log' + __path__: /var/log/lastlog + + relabel_configs: + - target_label: hostname + replacement: "${HOSTNAME}" + - target_label: job + replacement: log-files + + # - job_name: varlogs # static_configs: # - targets: [localhost] @@ -238,10 +298,12 @@ data: source_labels: - __meta_kubernetes_namespace target_label: namespace + # - target_label: job + # # source_labels: + # # - __meta_kubernetes_namespace + # replacement: loki/ingester_grafana-agent - 
target_label: job - # source_labels: - # - __meta_kubernetes_namespace - replacement: loki/ingester_grafana-agent + replacement: grafana-agent - action: replace source_labels: - __meta_kubernetes_pod_name From d189dfe0ee94a7e809dea19f1e0cf2616e173483 Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 13:39:55 +0930 Subject: [PATCH 41/61] chore(versions): update image versions alertmanager,grafana !1 --- values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/values.yaml b/values.yaml index 418bff3..a558dac 100644 --- a/values.yaml +++ b/values.yaml @@ -14,14 +14,14 @@ nfc_monitoring: alert_manager: image: name: quay.io/prometheus/alertmanager - tag: 'v0.25.0' + tag: 'v0.26.0' namespace: alerting grafana: image: name: grafana/grafana - tag: '10.0.5' + tag: '10.1.2' # '10.0.5' namespace: grafana replicas: 1 From 836cc111e092046988aba7b9b1494b1432313add Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 14:02:36 +0930 Subject: [PATCH 42/61] feat(prometheus): add affinity to values !1 --- templates/Prometheus-prometheus.yaml | 2 ++ values.yaml | 29 +++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/templates/Prometheus-prometheus.yaml b/templates/Prometheus-prometheus.yaml index 25a08af..919ef7d 100644 --- a/templates/Prometheus-prometheus.yaml +++ b/templates/Prometheus-prometheus.yaml @@ -47,4 +47,6 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} + affinity: + {{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }} version: 2.42.0 diff --git a/values.yaml b/values.yaml index a558dac..8f075cb 100644 --- a/values.yaml +++ b/values.yaml @@ -75,11 +75,38 @@ nfc_monitoring: prometheus: + image: name: prom/prometheus - tag: 'v2.42.0' + tag: 'v2.47.0' + namespace: monitoring + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: 
node-role.kubernetes.io/worker + operator: Exists + weight: 100 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/storage + operator: DoesNotExist + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - prometheus + topologyKey: kubernetes.io/hostname + weight: 10 + prometheus_adaptor: image: From c742eea38d8586f3de42db377f953b35c593966c Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 14:09:23 +0930 Subject: [PATCH 43/61] feat(prometheus_adaptor): add affinity to values !1 --- templates/Deployment-prometheus-adapter.yaml | 2 ++ values.yaml | 29 +++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/templates/Deployment-prometheus-adapter.yaml b/templates/Deployment-prometheus-adapter.yaml index 9edd8bc..3cbab58 100644 --- a/templates/Deployment-prometheus-adapter.yaml +++ b/templates/Deployment-prometheus-adapter.yaml @@ -35,6 +35,8 @@ spec: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} spec: + affinity: + {{- toYaml .Values.nfc_monitoring.prometheus_adaptor.affinity | nindent 8 }} automountServiceAccountToken: true containers: - args: diff --git a/values.yaml b/values.yaml index 8f075cb..6982795 100644 --- a/values.yaml +++ b/values.yaml @@ -109,12 +109,39 @@ nfc_monitoring: prometheus_adaptor: + image: name: registry.k8s.io/prometheus-adapter/prometheus-adapter tag: 'v0.11.1' + namespace: monitoring - + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + weight: 100 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/storage + operator: DoesNotExist + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + 
matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - prometheus + topologyKey: kubernetes.io/hostname + weight: 10 + + additions: ceph: From 7add74fbb1cae8613df3a7405889198c78b091f0 Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 15:28:00 +0930 Subject: [PATCH 44/61] refactor: yaml object ordering !1 --- templates/Prometheus-prometheus.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/templates/Prometheus-prometheus.yaml b/templates/Prometheus-prometheus.yaml index 919ef7d..d526374 100644 --- a/templates/Prometheus-prometheus.yaml +++ b/templates/Prometheus-prometheus.yaml @@ -11,6 +11,8 @@ metadata: name: k8s namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}" spec: + affinity: + {{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }} alerting: alertmanagers: - apiVersion: v2 @@ -47,6 +49,6 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - affinity: + {{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }} version: 2.42.0 From 83fb30b4ed757d50b9f53bf3eba3de9cc38e0b55 Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 15:28:58 +0930 Subject: [PATCH 45/61] feat(prometheus): add storage to values !1 --- templates/Prometheus-prometheus.yaml | 2 +- values.yaml | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/templates/Prometheus-prometheus.yaml b/templates/Prometheus-prometheus.yaml index d526374..66dacd0 100644 --- a/templates/Prometheus-prometheus.yaml +++ b/templates/Prometheus-prometheus.yaml @@ -49,6 +49,6 @@ spec: serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} - + storage: {{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }} version: 2.42.0 diff --git a/values.yaml b/values.yaml index 6982795..769af8d 100644 --- a/values.yaml +++ b/values.yaml @@ -107,6 +107,15 @@ nfc_monitoring: topologyKey: 
kubernetes.io/hostname weight: 10 + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 40Gi + prometheus_adaptor: From 0d1d0a34d8738ae408226479e0249f5cd1873551 Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 15:30:15 +0930 Subject: [PATCH 46/61] feat(grafana): add affinity to values !1 --- templates/Grafana-Grafana.yaml | 2 ++ values.yaml | 38 +++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/templates/Grafana-Grafana.yaml b/templates/Grafana-Grafana.yaml index f0aca0f..07e5784 100644 --- a/templates/Grafana-Grafana.yaml +++ b/templates/Grafana-Grafana.yaml @@ -53,6 +53,8 @@ spec: app.kubernetes.io/version: {{ $.Chart.Version }} spec: + affinity: + {{- toYaml .Values.nfc_monitoring.grafana.affinity | nindent 12 }} containers: - name: grafana image: "{{ .Values.nfc_monitoring.grafana.image.name }}:{{ .Values.nfc_monitoring.grafana.image.tag }}" diff --git a/values.yaml b/values.yaml index 769af8d..9ad1ef0 100644 --- a/values.yaml +++ b/values.yaml @@ -19,15 +19,43 @@ nfc_monitoring: grafana: - image: - name: grafana/grafana - tag: '10.1.2' # '10.0.5' - namespace: grafana - replicas: 1 admin_user: admin admin_password: admin + image: + name: grafana/grafana + tag: '10.1.2' # '10.0.5' + + namespace: grafana + + replicas: 1 + + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + weight: 100 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/storage + operator: DoesNotExist + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - prometheus + topologyKey: kubernetes.io/hostname + weight: 10 + grafana_agent: image: From 89f0feae08bde0e1545c492b58dd1fbdb9ac3a45 Mon Sep 17 
00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 18:42:38 +0930 Subject: [PATCH 47/61] fix(grafana): dont use operator readiness prob operator defines endpoint :3000/api/health, which fails with invalid argument !1 --- templates/Grafana-Grafana.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/templates/Grafana-Grafana.yaml b/templates/Grafana-Grafana.yaml index 07e5784..737d0b4 100644 --- a/templates/Grafana-Grafana.yaml +++ b/templates/Grafana-Grafana.yaml @@ -71,6 +71,14 @@ spec: name: grafana-http readinessProbe: failureThreshold: 3 + httpGet: + path: /login + port: 3000 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 3 resources: limits: cpu: 2000m From e2540a955f08b7dcafa159abb773f095b920738b Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 18:43:54 +0930 Subject: [PATCH 48/61] feat(servicemonitor): for prometheus, use pod name for instance !1 --- templates/ServiceMonitor-prometheus.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/templates/ServiceMonitor-prometheus.yaml b/templates/ServiceMonitor-prometheus.yaml index 824d709..e17d8fb 100644 --- a/templates/ServiceMonitor-prometheus.yaml +++ b/templates/ServiceMonitor-prometheus.yaml @@ -14,6 +14,13 @@ spec: endpoints: - interval: 30s port: web + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_name + targetLabel: instance - interval: 30s port: reloader-web selector: From 3a4ca30e68c85758fbfa1325a7fc6c17c81926ad Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 23:02:56 +0930 Subject: [PATCH 49/61] feat(servicemonitor): ceph use cluster name for instance !1 --- templates/ServiceMonitor-ceph.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/ServiceMonitor-ceph.yaml b/templates/ServiceMonitor-ceph.yaml index 71989d6..b57b31b 100644 --- a/templates/ServiceMonitor-ceph.yaml +++ b/templates/ServiceMonitor-ceph.yaml @@ -22,7 +22,7 @@ 
spec: regex: (.*) replacement: $1 sourceLabels: - - __meta_kubernetes_pod_node_name + - __meta_kubernetes_pod_label_app_kubernetes_io_part_of targetLabel: instance - sourceLabels: - __metrics_path__ From 4198ac78da4733b36b94587dce9fd0a19664106d Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 23:03:37 +0930 Subject: [PATCH 50/61] docs: add blurb on values !1 --- docs/projects/kubernetes_monitoring/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/projects/kubernetes_monitoring/index.md b/docs/projects/kubernetes_monitoring/index.md index bbc4e72..3410336 100644 --- a/docs/projects/kubernetes_monitoring/index.md +++ b/docs/projects/kubernetes_monitoring/index.md @@ -43,6 +43,7 @@ This helm chart started off with components from multiple open-source projects. > enable calico metrics with `kubectl patch felixconfiguration default --type merge --patch '{"spec":{"prometheusMetricsEnabled": true}}'` +- Customization of components through values.yaml. _See values.yaml for more details._ ## Installation From 944d6153312703f41f5d579b3326684c0f7312a7 Mon Sep 17 00:00:00 2001 From: Jon Date: Sat, 23 Sep 2023 23:11:38 +0930 Subject: [PATCH 51/61] fix(ceph): PromRule CephPGImbalance adjusted to group by node balancing is done by hostname not osd. !1 --- templates/prometheusRule-ceph.yaml | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/templates/prometheusRule-ceph.yaml b/templates/prometheusRule-ceph.yaml index 61c279b..e6a9a28 100644 --- a/templates/prometheusRule-ceph.yaml +++ b/templates/prometheusRule-ceph.yaml @@ -257,10 +257,25 @@ spec: description: "OSD {{ `{{` }} $labels.ceph_daemon }} on {{ `{{` }} $labels.hostname }} deviates by more than 30% from average PG count." 
summary: "PGs are not balanced across OSDs" expr: | - abs( - ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / - on (job) group_left avg(ceph_osd_numpg > 0) by (job) - ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + # abs( + # ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / + # on (job) group_left avg(ceph_osd_numpg > 0) by (hostname) + # ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + < + scalar( + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + - + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) * 0.3 )) + ) + or + (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + > + scalar( + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + + + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) * 0.3 )) + ) for: "5m" labels: oid: "1.3.6.1.4.1.50495.1.2.1.4.5" From 82cd91a8596afca592292840cd6073ba5cb1e18b Mon Sep 17 00:00:00 2001 From: Jon Date: Sun, 24 Sep 2023 00:07:25 +0930 Subject: [PATCH 52/61] docs: added missing attribution !1 --- docs/projects/kubernetes_monitoring/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/projects/kubernetes_monitoring/index.md b/docs/projects/kubernetes_monitoring/index.md index 3410336..e81007c 100644 --- a/docs/projects/kubernetes_monitoring/index.md +++ b/docs/projects/kubernetes_monitoring/index.md @@ -24,6 +24,8 @@ Intentionally we have kept this helm chart to the monitoring components only. Ke This helm chart started off with components from multiple open-source projects. 
As such attribution is warrented, so head on over to thier projects and give them a star. Projects used to create the building blocks of this helm chart are +- [ceph](https://github.com/ceph/ceph) _mixin alerts/rules_ + - [Kube prometheus](https://github.com/prometheus-operator/kube-prometheus) - [prometheus adaptor](https://github.com/kubernetes-sigs/prometheus-adapter) From 8f53b0fa07eef6153e4583f6fe86f3f7fd5027f4 Mon Sep 17 00:00:00 2001 From: Jon Date: Sun, 24 Sep 2023 06:38:04 +0930 Subject: [PATCH 53/61] feat(grafana): sidecar for loading dashboards from configmap !1 --- templates/ClusterRole-grafana-SideCar.yaml | 18 ++++ .../ClusterRoleBinding-Grafana-SideCar.yaml | 22 +++++ templates/ConfigMap-GrafanaProvisioning.yaml | 31 +++++++ templates/Grafana-Grafana.yaml | 85 ++++++++++++------- values.yaml | 12 +++ 5 files changed, 137 insertions(+), 31 deletions(-) create mode 100644 templates/ClusterRole-grafana-SideCar.yaml create mode 100644 templates/ClusterRoleBinding-Grafana-SideCar.yaml create mode 100644 templates/ConfigMap-GrafanaProvisioning.yaml diff --git a/templates/ClusterRole-grafana-SideCar.yaml b/templates/ClusterRole-grafana-SideCar.yaml new file mode 100644 index 0000000..42bf948 --- /dev/null +++ b/templates/ClusterRole-grafana-SideCar.yaml @@ -0,0 +1,18 @@ +{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: Grafana-Sidecar +rules: + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "watch", "list"] +{{- end }} diff --git a/templates/ClusterRoleBinding-Grafana-SideCar.yaml b/templates/ClusterRoleBinding-Grafana-SideCar.yaml new file mode 100644 index 
0000000..367f863 --- /dev/null +++ b/templates/ClusterRoleBinding-Grafana-SideCar.yaml @@ -0,0 +1,22 @@ +{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}} +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: Grafana-Sidecar +roleRef: + kind: ClusterRole + name: Grafana-Sidecar + apiGroup: rbac.authorization.k8s.io +subjects: + - kind: ServiceAccount + name: grafana + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" +{{- end }} diff --git a/templates/ConfigMap-GrafanaProvisioning.yaml b/templates/ConfigMap-GrafanaProvisioning.yaml new file mode 100644 index 0000000..6b35943 --- /dev/null +++ b/templates/ConfigMap-GrafanaProvisioning.yaml @@ -0,0 +1,31 @@ +{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}} +--- +# Provisioning config +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: provisioning-config + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" +data: + provisioning.yaml: |- + apiVersion: 1 + providers: + - name: 'configmap-dashboard-provider' + orgId: 1 + folder: '' + folderUid: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true +{{- end }} diff --git a/templates/Grafana-Grafana.yaml b/templates/Grafana-Grafana.yaml index 737d0b4..8ff2743 100644 --- a/templates/Grafana-Grafana.yaml +++ 
b/templates/Grafana-Grafana.yaml @@ -55,6 +55,9 @@ spec: spec: affinity: {{- toYaml .Values.nfc_monitoring.grafana.affinity | nindent 12 }} + {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }} + automountServiceAccountToken: true + {{ end }} containers: - name: grafana image: "{{ .Values.nfc_monitoring.grafana.image.name }}:{{ .Values.nfc_monitoring.grafana.image.tag }}" @@ -95,9 +98,33 @@ spec: - mountPath: /etc/grafana/provisioning/plugins name: plugin-config readOnly: false + {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }} + - mountPath: /etc/grafana/provisioning/dashboards + name: provisioning-config + - mountPath: /var/lib/grafana/dashboards + name: dashboards + + - image: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.name }}:{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.tag}}" + name: k8s-sidecar + env: + - name: LABEL + value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_name }}" + - name: LABEL_VALUE + value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_value }}" + - name: FOLDER + value: /var/lib/grafana/dashboards + - name: NAMESPACE + value: grafana + - name: RESOURCE + value: configmap + volumeMounts: + - mountPath: /var/lib/grafana/dashboards + name: dashboards + {{ end }} + securityContext: + fsGroup: 65534 volumes: - - name: grafana-storage - emptyDir: {} + #- name: grafana-storage - name: plugin-config configMap: # Provide the name of the ConfigMap you want to mount. 
@@ -106,35 +133,31 @@ spec: items: - key: "oncall-app.yaml" path: "oncall-app.yaml" - # - volumeClaimTemplates: - # - metadata: - # name: data - # labels: - # app.kubernetes.io/name: loki - # app.kubernetes.io/component: logging - # app.kubernetes.io/part-of: {{ $.Chart.Name }} - # app.kubernetes.io/version: {{ $.Chart.Version }} - # app.kubernetes.io/managed-by: {{ $.Release.Service }} - # spec: - # accessModes: - # - "ReadWriteOnce" - # resources: - # requests: - # storage: "5Gi" + {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }} + - name: dashboards + emptyDir: {} + - name: provisioning-config + configMap: + name: provisioning-config + {{ end }} + - name: grafana-storage + ephemeral: + volumeClaimTemplate: + metadata: + annotations: + pv.beta.kubernetes.io/gid: "65534" + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/component: graphing + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + spec: + accessModes: + - "ReadWriteMany" + resources: + requests: + storage: "5Gi" serviceAccountName: grafana nodeSelector: kubernetes.io/os: linux - - # persistentVolumeClaim: - # metadata: - # labels: - # app.kubernetes.io/component: graphing - # app.kubernetes.io/instance: k8s - # app.kubernetes.io/name: grafana - # app.kubernetes.io/managed-by: {{ $.Release.Service }} - # app.kubernetes.io/part-of: {{ $.Chart.Name }} - # app.kubernetes.io/version: {{ $.Chart.Version }} - # spec: - # resources: - # requests: - # storage: "5Gi" \ No newline at end of file diff --git a/values.yaml b/values.yaml index 9ad1ef0..01fc324 100644 --- a/values.yaml +++ b/values.yaml @@ -195,6 +195,18 @@ nfc_monitoring: matchLabels: app: rook-ceph-mgr + # Add sidcar to grafana pod to load dashboards from configMap + dashboard_sidecar: + + enabled: true + + image: + name: ghcr.io/kiwigrid/k8s-sidecar + tag: '1.24.5' + + label_name: grafana_dashboard + label_value: 
"1" + loki_instance: image: From a26f887fa41ef0eb0ff6ade8584335e228442d6a Mon Sep 17 00:00:00 2001 From: Jon Date: Mon, 25 Sep 2023 16:38:32 +0930 Subject: [PATCH 54/61] fix(grafana): use named pvc so it's reused templated pvc was using a different name all the time. this caused leftover pvcs that were unused. name set so they can be reused !1 --- templates/Grafana-Grafana.yaml | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/templates/Grafana-Grafana.yaml b/templates/Grafana-Grafana.yaml index 8ff2743..db1d568 100644 --- a/templates/Grafana-Grafana.yaml +++ b/templates/Grafana-Grafana.yaml @@ -120,7 +120,7 @@ spec: volumeMounts: - mountPath: /var/lib/grafana/dashboards name: dashboards - {{ end }} + {{ end }} securityContext: fsGroup: 65534 volumes: @@ -141,23 +141,23 @@ spec: name: provisioning-config {{ end }} - name: grafana-storage - ephemeral: - volumeClaimTemplate: - metadata: - annotations: - pv.beta.kubernetes.io/gid: "65534" - labels: - app.kubernetes.io/name: grafana - app.kubernetes.io/component: graphing - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/version: {{ $.Chart.Version }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - spec: - accessModes: - - "ReadWriteMany" - resources: - requests: - storage: "5Gi" + persistentVolumeClaim: + claimName: grafana-pvc serviceAccountName: grafana nodeSelector: kubernetes.io/os: linux + + persistentVolumeClaim: + metadata: + annotations: + pv.beta.kubernetes.io/gid: "65534" + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/component: graphing + app.kubernetes.io/part-of: {{ $.Chart.Name }} + spec: + accessModes: + - "ReadWriteMany" + resources: + requests: + storage: "5Gi" From e3648324f234c82202999e3198ba3a493384c1e4 Mon Sep 17 00:00:00 2001 From: Jon Date: Mon, 25 Sep 2023 16:46:21 +0930 Subject: [PATCH 55/61] feat(grafana_agent): attach hostname to logs and metrics !1 --- templates/ConfigMap-GrafanaAgent.yaml | 4
++++ 1 file changed, 4 insertions(+) diff --git a/templates/ConfigMap-GrafanaAgent.yaml b/templates/ConfigMap-GrafanaAgent.yaml index 1debdec..3582f43 100644 --- a/templates/ConfigMap-GrafanaAgent.yaml +++ b/templates/ConfigMap-GrafanaAgent.yaml @@ -47,6 +47,8 @@ data: target_label: syslog_identifier - target_label: "job_name" replacement: "journal" + - target_label: hostname + replacement: "${HOSTNAME}" pipeline_stages: - json: @@ -320,6 +322,8 @@ data: target_label: __path__ - target_label: "job_name" replacement: "kubernetes_sd" + - target_label: hostname + replacement: "${HOSTNAME}" From b7cfebf92b3983f70df3439a516e0e355a505332 Mon Sep 17 00:00:00 2001 From: Jon Date: Mon, 25 Sep 2023 16:49:18 +0930 Subject: [PATCH 56/61] fix(kubestateproxy): bump proxy cpu limit !1 --- templates/Deployment-kubeStateMetrics.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/Deployment-kubeStateMetrics.yaml b/templates/Deployment-kubeStateMetrics.yaml index cc60acc..36479c3 100644 --- a/templates/Deployment-kubeStateMetrics.yaml +++ b/templates/Deployment-kubeStateMetrics.yaml @@ -70,7 +70,7 @@ spec: cpu: 100m memory: 40Mi requests: - cpu: 20m + cpu: 50m memory: 20Mi securityContext: allowPrivilegeEscalation: false From 817b83865555cbfa49dad03b2beffa2a93c45124 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 26 Sep 2023 01:30:06 +0930 Subject: [PATCH 57/61] refactor(grafana_agent): clean up config file !1 --- templates/ConfigMap-GrafanaAgent.yaml | 497 +++++++++++--------------- 1 file changed, 203 insertions(+), 294 deletions(-) diff --git a/templates/ConfigMap-GrafanaAgent.yaml b/templates/ConfigMap-GrafanaAgent.yaml index 3582f43..623ea72 100644 --- a/templates/ConfigMap-GrafanaAgent.yaml +++ b/templates/ConfigMap-GrafanaAgent.yaml @@ -17,314 +17,223 @@ data: wal_directory: /tmp/wal logs: - # Choose a directory to save the last read position of log files at. - # This directory will be created if it doesn't already exist. 
positions_directory: "/tmp" - + configs: - - name: journal + - name: node-logs clients: - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push - scrape_configs: - - - job_name: systemd-journal - journal: - labels: - job: node-journal - path: /host/root/run/log/journal - json: true - relabel_configs: - - - source_labels: - - __journal__systemd_unit - target_label: systemd_unit - - source_labels: - - __journal__hostname - target_label: node - - source_labels: - - __journal_syslog_identifier - target_label: syslog_identifier - - target_label: "job_name" - replacement: "journal" - - target_label: hostname - replacement: "${HOSTNAME}" + backoff_config: + min_period: 10s + max_period: 5m + max_retries: 10 - pipeline_stages: - - json: - expressions: - pid: _PID - userId: _UID - application: _COMM - priority: PRIORITY + limits_config: + readline_rate_enabled: true + readline_rate: 250 + readline_burst: 750 + readline_rate_drop: false + max_streams: 500 - - labels: - application: - #level: - pid: - userId: - priority: - - template: - source: level - template: '{{"{{"}} ToLower .Value {{"}}"}}' - - match: - selector: '{priority="7"}' - stages: - - template: - source: level - template: 'debug' - - match: - selector: '{priority="6"}' - stages: - - template: - source: level - template: 'info' - - match: - selector: '{priority="5"}' - stages: - - template: - source: level - template: 'notice' - - match: - selector: '{priority="4"}' - stages: - - template: - source: level - template: 'warning' - - match: - selector: '{priority="3"}' - stages: - - template: - source: level - template: 'error' - - match: - selector: '{priority="2"}' - stages: - - template: - source: level - template: 'crit' - - match: - selector: '{priority="1"}' - stages: - - template: - source: level - template: 'alert' - - match: - 
selector: '{priority="0"}' - stages: - - template: - source: level - template: 'emerg' - - labels: - level: - - - - name: log-files - clients: - - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push - positions: - filename: /tmp/var-logfile.yaml - target_config: - sync_period: 10s scrape_configs: - - job_name: var-logs - static_configs: - - targets: [localhost] - labels: - job_name: 'aptitude' - __path__: /var/log/apt/*.log - - targets: [localhost] - labels: - job_name: 'dpkg' - __path__: /var/log/dpkg.log - - targets: [localhost] - labels: - job_name: 'messages' - __path__: /var/log/messages - - targets: [localhost] - labels: - job_name: 'syslog' - __path__: /var/log/syslog - - targets: [localhost] - labels: - job_name: 'kernlog' - __path__: /var/log/kern.log - - targets: [localhost] - labels: - job_name: 'auth-log' - __path__: /var/log/auth.log - - targets: [localhost] - labels: - job_name: 'boot-log' - __path__: /var/log/boot.log - - targets: [localhost] - labels: - job_name: 'daemon-log' - __path__: /var/log/daemon.log - - targets: [localhost] - labels: - job_name: 'faillog-log' - __path__: /var/log/faillog - - targets: [localhost] - labels: - job_name: 'lastlog-log' - __path__: /var/log/lastlog - - relabel_configs: - - target_label: hostname - replacement: "${HOSTNAME}" - - target_label: job - replacement: log-files - - - # - job_name: varlogs - # static_configs: - # - targets: [localhost] - # labels: - # # cluster: dev - # job: 'container logs' - # __path__: /var/log/pods/*/*/*.log - # pipeline_stages: - # - json: - # expressions: - # namespace: namespace - # level: level - # - regex: - # source: filename - # # expression: '/var/log/pods/(.+)/(?P.+)/(*.log)' - # # expression: '/var/log/pods/.+/(?P\\S+?)' - # #expression: '/var/log/pods/.+/(?P\\S+?)/*.log' - # #expression: 
'/var/log/pods/.+/(?P\S+?)/*.log' - # expression: '/var/log/pods/.+/(?P\S+?)/.+log' - # - regex: - # source: filename - # expression: '/var\/log\/pods\/.*_(?P\S+?)_.*\/.+\/.+log' - # - regex: - # source: filename - # expression: '/var/log/pods/(?P\S+?)_.*/.+/.+log' - # - regex: - # #source: message - # expression: '\s?level=(?P\S+)\s+' - # - regex: - # #source: message - # expression: '\s?(?Pdebug|info|warn|error)\s' - # - template: - # source: level - # template: '{{"{{"}} ToLower .Value {{"}}"}}' - # - labels: - # container: - # level: - # namespace: - # pod: - #relabel_configs: - - # - source_labels: [__filename__] - # separator: '/' - # regex: '/var/log/pods/*/(.*)/*.log' - # replacement: '${1}' - # target_label: pod - # - source_labels: [__filename__] - # replacement: '/var/log/pods/*/${1}/*.log' - # target_label: pod - # - replacement: /var/log/pods/*/${1}/*.log - # separator: / - # source_labels: - # - __meta_kubernetes_pod_uid - # - __meta_kubernetes_pod_container_name - # target_label: pod - - - - # - targets: [localhost] - # labels: - # job: varlogs - # __path__: /var/logs/* - # - targets: [localhost] - # labels: - # job: varlogs - # __path__: /host/root/var/log/* - # - targets: [localhost] - # labels: - # job: varlogs - # __path__: /host/root/var/logs/* - # - targets: [localhost] - # labels: - # job: varlogs - # __path__: /host/root/var/log/containers/* - # - targets: [localhost] - # labels: - # job: varlogs - # __path__: /host/root/var/log/*/* - - - - - - - - name: kubernetes - clients: - - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push - # basic_auth: - # username: YOUR_LOKI_USERNAME - # password: YOUR_LOKI_PASSWORD - external_labels: - #cluster: dev-cluster - positions: - filename: /tmp/kub-positions.yaml - target_config: - sync_period: 10s - scrape_configs: - - job_name: 
pod-logs - - kubernetes_sd_configs: - - role: pod - - pipeline_stages: - - cri: {} - - - regex: - #source: msg - expression: '(\s|\t|level=)?(?Ptrace|debug|info|warn|error|TRACE|DEBUG|INFO|WARN|ERROR)(\s|\t)' - - - labels: - level: - - relabel_configs: + - job_name: systemd-journal + journal: + path: /host/root/run/log/journal + json: true + max_age: 24h + relabel_configs: + - source_labels: - - __meta_kubernetes_pod_node_name - target_label: __host__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - # - target_label: job - # # source_labels: - # # - __meta_kubernetes_namespace - # replacement: loki/ingester_grafana-agent - - target_label: job - replacement: grafana-agent - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: replace - source_labels: - - __meta_kubernetes_pod_container_name - target_label: container - - replacement: /var/log/pods/*$1/*.log - separator: / - source_labels: - - __meta_kubernetes_pod_uid - - __meta_kubernetes_pod_container_name - target_label: __path__ + - __journal__systemd_unit + target_label: systemd_unit + - source_labels: + - __journal__hostname + target_label: node + - source_labels: + - __journal_syslog_identifier + target_label: syslog_identifier + - target_label: "job" + replacement: "systemd-journal" - target_label: "job_name" - replacement: "kubernetes_sd" + replacement: "journal-logs" - target_label: hostname replacement: "${HOSTNAME}" + pipeline_stages: + - json: + expressions: + pid: _PID + userId: _UID + application: _COMM + priority: PRIORITY + + - labels: + application: + #level: + pid: + userId: + priority: + - template: + source: level + template: '{{"{{"}} ToLower .Value {{"}}"}}' + - match: + selector: '{priority="7"}' + stages: + - template: + source: level + template: 'debug' + - match: + selector: '{priority="6"}' + stages: + - template: + source: level + 
template: 'info' + - match: + selector: '{priority="5"}' + stages: + - template: + source: level + template: 'notice' + - match: + selector: '{priority="4"}' + stages: + - template: + source: level + template: 'warning' + - match: + selector: '{priority="3"}' + stages: + - template: + source: level + template: 'error' + - match: + selector: '{priority="2"}' + stages: + - template: + source: level + template: 'crit' + - match: + selector: '{priority="1"}' + stages: + - template: + source: level + template: 'alert' + - match: + selector: '{priority="0"}' + stages: + - template: + source: level + template: 'emerg' + - labels: + level: + + + - job_name: var-logs + static_configs: + - targets: [localhost] + labels: + job_name: 'aptitude' + __path__: /var/log/apt/*.log + - targets: [localhost] + labels: + job_name: 'dpkg' + __path__: /var/log/dpkg.log + - targets: [localhost] + labels: + job_name: 'messages' + __path__: /var/log/messages + - targets: [localhost] + labels: + job_name: 'syslog' + __path__: /var/log/syslog + - targets: [localhost] + labels: + job_name: 'kernlog' + __path__: /var/log/kern.log + - targets: [localhost] + labels: + job_name: 'auth-log' + __path__: /var/log/auth.log + - targets: [localhost] + labels: + job_name: 'boot-log' + __path__: /var/log/boot.log + - targets: [localhost] + labels: + job_name: 'daemon-log' + __path__: /var/log/daemon.log + - targets: [localhost] + labels: + job_name: 'faillog-log' + __path__: /var/log/faillog + - targets: [localhost] + labels: + job_name: 'lastlog-log' + __path__: /var/log/lastlog + + relabel_configs: + - target_label: hostname + replacement: "${HOSTNAME}" + - target_label: job + replacement: log-files + - target_label: node + replacement: "${HOSTNAME}" + + pipeline_stages: + - timestamp: + format: RFC3339 + source: time + + + + - job_name: pod-logs + kubernetes_sd_configs: + - role: pod + + pipeline_stages: + - cri: {} + + - regex: + expression: 
'(\s|\t|level=)?(?P<level>trace|debug|info|warn|error|TRACE|DEBUG|INFO|WARN|ERROR)(\s|\t)' + + - labels: + level: + + relabel_configs: + - source_labels: + - __meta_kubernetes_pod_node_name + target_label: __host__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - target_label: job + replacement: kubernetes_sd + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: replace + source_labels: + - __meta_kubernetes_pod_container_name + target_label: container + - replacement: /var/log/pods/*$1/*.log + separator: / + source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + - target_label: "job_name" + replacement: "kubernetes-logs" + - target_label: hostname + replacement: "${HOSTNAME}" + - target_label: node + source_labels: + - __meta_kubernetes_pod_node_name integrations: From 899c6a3d786e147024bd63464e89637a7180b909 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 26 Sep 2023 06:25:19 +0930 Subject: [PATCH 58/61] refactor(prometheus): adjusted role/rolebinding manifest to loop !1 --- .../Role-SpecificNamespaces-prometheus.yaml | 251 +----------------- ...Binding-SpecificNamespaces-prometheus.yaml | 232 +--------------- values.yaml | 17 ++ 3 files changed, 25 insertions(+), 475 deletions(-) diff --git a/templates/Role-SpecificNamespaces-prometheus.yaml b/templates/Role-SpecificNamespaces-prometheus.yaml index 959efde..8f3447f 100644 --- a/templates/Role-SpecificNamespaces-prometheus.yaml +++ b/templates/Role-SpecificNamespaces-prometheus.yaml @@ -1,6 +1,8 @@ --- apiVersion: rbac.authorization.k8s.io/v1 items: + +{{ range .Values.nfc_monitoring.prometheus.monitor_namespaces }} - apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: @@ -12,7 +14,7 @@ items: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name:
prometheus-k8s - namespace: default + namespace: {{ . | quote }} rules: - apiGroups: - "" @@ -40,251 +42,6 @@ items: - get - list - watch -- apiVersion: rbac.authorization.k8s.io/v1 - kind: Role - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: kube-system - rules: - - apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch -- apiVersion: rbac.authorization.k8s.io/v1 - kind: Role - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - rules: - - apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch - - - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: Role - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: ceph - rules: - - 
apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: Role - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.loki.namespace }} - rules: - - apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: Role - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: grafana - rules: - - apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch - - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: Role - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - 
app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} - rules: - - apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch - - apiGroups: - - extensions - resources: - - ingresses - verbs: - - get - - list - - watch - - apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: - - get - - list - - watch - - - - - +{{ end }} kind: RoleList diff --git a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml index d5b4dcc..d7c5ceb 100644 --- a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml +++ b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml @@ -1,6 +1,7 @@ --- apiVersion: rbac.authorization.k8s.io/v1 items: +{{ range .Values.nfc_monitoring.prometheus.monitor_namespaces }} - apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: @@ -12,7 +13,7 @@ items: app.kubernetes.io/managed-by: {{ $.Release.Service }} app.kubernetes.io/version: {{ $.Chart.Version }} name: prometheus-k8s - namespace: default + namespace: {{ . 
| quote }} roleRef: apiGroup: rbac.authorization.k8s.io kind: Role @@ -20,232 +21,7 @@ items: subjects: - kind: ServiceAccount name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: kube-system - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - - - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: monitoring - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: ceph - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ 
.Values.nfc_monitoring.prometheus.namespace }} - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.loki.namespace }} - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: grafana - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - - - - - - - -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: kube-metrics - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - 
app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: kube-dashboard - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: olm - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: operators - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} -- apiVersion: rbac.authorization.k8s.io/v1 - kind: RoleBinding - metadata: - labels: - app.kubernetes.io/component: prometheus - app.kubernetes.io/instance: k8s - app.kubernetes.io/name: prometheus - app.kubernetes.io/part-of: {{ $.Chart.Name }} - 
app.kubernetes.io/managed-by: {{ $.Release.Service }} - app.kubernetes.io/version: {{ $.Chart.Version }} - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} - roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s - subjects: - - kind: ServiceAccount - name: prometheus-k8s - namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} - - - - - - - - - - - + namespace: {{ $.Values.nfc_monitoring.prometheus.namespace }} +{{ end }} kind: RoleBindingList diff --git a/values.yaml b/values.yaml index 01fc324..f979bf4 100644 --- a/values.yaml +++ b/values.yaml @@ -135,6 +135,23 @@ nfc_monitoring: topologyKey: kubernetes.io/hostname weight: 10 + # List of namespaces that prometheus is to monitor + # used to create Roles and RoleBindings + monitor_namespaces: + - alerting + - default + # - ceph + - grafana + - monitoring + # - kube-dashboard + # - kube-metrics + - kube-policy + - kube-system + - logging + # - mariadb + # - olm + # - operators + storage: volumeClaimTemplate: spec: From c8ea92987318f10a5c14af59ac4d45bc8a549061 Mon Sep 17 00:00:00 2001 From: Jon Date: Tue, 26 Sep 2023 06:27:20 +0930 Subject: [PATCH 59/61] feat(kyverno): add clusterpolicy role and rolebinding cluster policy creates the role and rolebindings for prometheus to monitor the ns !1 --- templates/ClusterPolicy-Prometheus-Role.yaml | 75 +++++++++++++++++++ .../ClusterPolicy-Prometheus-RoleBinding.yaml | 53 +++++++++++++ values.yaml | 10 ++- 3 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 templates/ClusterPolicy-Prometheus-Role.yaml create mode 100644 templates/ClusterPolicy-Prometheus-RoleBinding.yaml diff --git a/templates/ClusterPolicy-Prometheus-Role.yaml b/templates/ClusterPolicy-Prometheus-Role.yaml new file mode 100644 index 0000000..bf01441 --- /dev/null +++ b/templates/ClusterPolicy-Prometheus-Role.yaml @@ -0,0 +1,75 @@ +{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }} +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name:
+apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: add-prometheus-role + annotations: + policies.kyverno.io/title: Add Prometheus Role + policies.kyverno.io/category: Monitoring + policies.kyverno.io/subject: RoleBinding + policies.kyverno.io/minversion: 1.6.0 + policies.kyverno.io/description: >- + This policy is responsible for ensuring that a Role for the prometheus + monitoring instances is created to enable monitoring of the namespace in + question. + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + background: true + generateExisting: true + rules: + - name: generate-prometheus-role + match: + any: + - resources: + kinds: + - Namespace + generate: + synchronize: true + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + name: prometheus-k8s + namespace: "{{ `{{` }}request.object.metadata.name }}" + data: + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + +{{ end }} diff --git a/templates/ClusterPolicy-Prometheus-RoleBinding.yaml b/templates/ClusterPolicy-Prometheus-RoleBinding.yaml new file mode 100644 index 0000000..2ec1e33 --- /dev/null +++ b/templates/ClusterPolicy-Prometheus-RoleBinding.yaml @@ -0,0 +1,53 @@ +{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }} +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: 
add-prometheus-role-binding + annotations: + policies.kyverno.io/title: Add Prometheus RoleBinding + policies.kyverno.io/category: Monitoring + policies.kyverno.io/subject: RoleBinding + policies.kyverno.io/minversion: 1.6.0 + policies.kyverno.io/description: >- + This policy is responsible for ensuring that a RoleBinding for the prometheus + monitoring instances is created to enable monitoring of the namespace in + question. + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + background: true + generateExisting: true + rules: + - name: generate-prometheus-binding + match: + any: + - resources: + kinds: + - Namespace + generate: + synchronize: true + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: prometheus-k8s + namespace: "{{ `{{` }}request.object.metadata.name }}" + data: + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}" +{{ end }} diff --git a/values.yaml b/values.yaml index f979bf4..b7a4793 100644 --- a/values.yaml +++ b/values.yaml @@ -72,8 +72,9 @@ nfc_monitoring: name: grafana/loki tag: 2.7.4 - namespace: loki + namespace: logging + # service name and port are used for the connection to your loki instance service_name: loki-gateway service_port: 80 @@ -152,6 +153,11 @@ nfc_monitoring: # - olm # - operators + # Deploy a generate policy for kyverno to create Role and RoleBindings + # for the prometheus service account so it can monitor + # new/existing namespaces + kyverno_role_policy: 
true + storage: volumeClaimTemplate: spec: @@ -212,7 +218,7 @@ nfc_monitoring: matchLabels: app: rook-ceph-mgr - # Add sidcar to grafana pod to load dashboards from configMap + # Add sidecar to grafana pod to load dashboards from configMap dashboard_sidecar: enabled: true From e7480105f7d0397bb20656a1d9c623461e408a6b Mon Sep 17 00:00:00 2001 From: Jon Date: Wed, 27 Sep 2023 14:39:43 +0930 Subject: [PATCH 60/61] feat(grafana_dashboard): nfc custom, cluster overview !1 --- files/dashboard-summary.json | 1281 +++++++++++++++++ .../GrafanaDashboard-cluster-summary.yaml | 19 + 2 files changed, 1300 insertions(+) create mode 100644 files/dashboard-summary.json create mode 100644 templates/GrafanaDashboard-cluster-summary.yaml diff --git a/files/dashboard-summary.json b/files/dashboard-summary.json new file mode 100644 index 0000000..90c0a0a --- /dev/null +++ b/files/dashboard-summary.json @@ -0,0 +1,1281 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 27, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "title": "Cluster Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total cluster CPU Cores not being utilised", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores Available", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total cluster CPU Cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(machine_cpu_cores)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total memory for complete cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemAvailable_bytes{node!=\"\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory Available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total cluster CPU Cores not being utilised", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 4 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU Cores Available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total memory for complete cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 4 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + 
"disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemAvailable_bytes{node!=\"\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "Available w/o cached", + "range": false, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "sum(node_memory_Cached_bytes{node!=\"\"})", + "hide": false, + "instant": false, + "legendFormat": "Available", + "range": true, + "refId": "B" + } + ], + "title": "Memory Available", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total memory for complete cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 4 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemTotal_bytes{node!=\"\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory Size", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 6, + "panels": [], + "title": "Ceph", + "type": "row" + 
}, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [], + "mappings": [ + { + "id": 0, + "options": { + "0": { + "text": "HEALTHY" + }, + "1": { + "text": "WARNING" + }, + "2": { + "text": "ERROR" + } + }, + "type": "value" + }, + { + "id": 1, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#9ac48a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 2 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 3, + "interval": "1m", + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "ceph_health_status{}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Ceph health status", + "transparent": true, + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 2, + "panels": [], + "title": "Monitoring", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + 
"overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 12 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum(delta(loki_log_messages_total[1m]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Incoming log messages", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs received in last 24 hours.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 12 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(delta(loki_log_messages_total[1d]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": 
"A", + "useBackend": false + } + ], + "title": "Daily log messages", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 12 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(loki_log_messages_total)", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Log Messages Stored", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many metrics per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 12 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + 
"values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[1m]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Incoming Metric Samples P/M", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many metrics in the last 24 hours", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 12 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[5y]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Daily Metric Samples", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many metrics are stored 
in the database", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 12 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(prometheus_tsdb_head_samples_appended_total[5y]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Metric Samples Stored", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "sum(delta(loki_log_messages_total[1m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Incoming", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": 
"none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[1m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Incoming Metric Samples P/M", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "No Fuss Computing", + "Overview", + "Kubernetes" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Cluster Overview", + "uid": "no-fuss-computing-k8s-overview", + "version": 3, + "weekStart": "" +} \ No newline at end of file diff --git a/templates/GrafanaDashboard-cluster-summary.yaml b/templates/GrafanaDashboard-cluster-summary.yaml new file mode 100644 index 0000000..2cfcf66 --- /dev/null +++ b/templates/GrafanaDashboard-cluster-summary.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: cluster-summary + namespace: grafana +spec: + allowCrossNamespaceImport: false + folder: No Fuss Monitoring + resyncPeriod: 30s + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + {{ $Dashboard := .Files.Get "files/dashboard-summary.json" | fromJson }} + json: >- + {{ $Dashboard | toRawJson }} + From d4e529aec9c9d9d7a0982c061f44f853de740fd6 Mon Sep 17 00:00:00 2001 From: Jon Date: Wed, 27 Sep 2023 14:39:58 +0930 Subject: [PATCH 61/61] docs: add more features !1 --- docs/projects/kubernetes_monitoring/index.md | 81 ++++++++++++++++++-- 1 file changed, 76 insertions(+), 5 deletions(-) diff --git a/docs/projects/kubernetes_monitoring/index.md b/docs/projects/kubernetes_monitoring/index.md index e81007c..ea7e20d 100644 --- a/docs/projects/kubernetes_monitoring/index.md +++ 
b/docs/projects/kubernetes_monitoring/index.md @@ -16,6 +16,10 @@ What this chart is: - Data visualization +- Ready for logging long term storage integration, in particular Loki + +- Ready for metrics long term storage integration, **Still to be developed, see [GL-#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1)** + What this chart is not: - A complete monitoring stack inclusive of long term storage. @@ -33,19 +37,86 @@ This helm chart started off with components from multiple open-source projects. ## Features -- Prometheus as the metrics collector - -- Grafana-Agent as the daemonset for node and node-exporter metrics and also as the logging injestor for loki +- Alert Manager - Grafana as the visualization frontend + - Dashboards + + - Ceph + + - Cluster Overview + + - Node Exporter Full + + - Alert Manager + + - Sidecar to load dashboards from ConfigMaps _Optional_ + +- Grafana-Agent + + - daemonset for cluster nodes for exposing metrics and an injecter to loki for node and container logs. + +- Loki integration for storing logs + +- [Mixins](https://github.com/monitoring-mixins/website) Compatible _(partial)_, Tested with: + + - alert-manager + + - ceph + + - coreDNS + + - Kubernetes + + - loki _Loki [helm chart](https://artifacthub.io/packages/helm/grafana/loki) included, see [loki repo](https://github.com/grafana/loki/tree/f4ab1e3e89ac66e1848764dc17826abde929fdc5/production/loki-mixin-compiled) for dashboards_ + + - Node-Exporter + + - Prometheus + + - Promtail + +- Prometheus as the metrics collector + - Service monitors - - Calico, _if Selected_ + - API Server + + - CAdvisor + + - Calico, _Optional_ > enable calico metrics with `kubectl patch felixconfiguration default --type merge --patch '{"spec":{"prometheusMetricsEnabled": true}}'` -- Customization of components through values.yaml.
_See values.yaml for more details._ + - Ceph _Optional_ + + - CoreDNS + + - Grafana + + - Kube-Controller-Manager + + - Kube-Scheduler + + - Kubelet + + - Kube-state-metrics + + - Node + + - Node-Exporter + + - Prometheus + + - Prometheus-Adapter + +- kyverno policies _(optional, set in values.yaml)_ + + - auto deploy policy for prometheus role and rolebinding to access other namespaces + +- `values.yaml` customization of components. _See values.yaml for more details._ + ## Installation