diff --git a/.cz.yaml b/.cz.yaml new file mode 100644 index 0000000..62b2ada --- /dev/null +++ b/.cz.yaml @@ -0,0 +1,7 @@ +commitizen: + bump_message: "build(version): bump version $current_version \u2192 $new_version" + changelog_incremental: false + name: cz_conventional_commits + tag_format: $major.$minor.$patch$prerelease + update_changelog_on_bump: true + version: 0.0.1 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..511c223 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,35 @@ +variables: + MY_PROJECT_ID: "50510268" + GIT_SYNC_URL: "https://$GITHUB_USERNAME_ROBOT:$GITHUB_TOKEN_ROBOT@github.com/NoFussComputing/kubernetes_monitoring.git" + + + PAGES_ENVIRONMENT_PATH: projects/kubernetes_monitoring/ + +include: + - project: nofusscomputing/projects/gitlab-ci + ref: development + file: + - template/automagic.gitlab-ci.yaml + #- template: Jobs/Container-Scanning.gitlab-ci.yml # see https://gitlab.com/gitlab-org/gitlab/-/issues/381665 + + +Website.Submodule.Deploy: + extends: .submodule_update_trigger + variables: + SUBMODULE_UPDATE_TRIGGER_PROJECT: nofusscomputing/infrastructure/website + environment: + url: https://nofusscomputing.com/$PAGES_ENVIRONMENT_PATH + name: Documentation + rules: + - if: # condition_dev_branch_push + $CI_COMMIT_BRANCH == "development" && + $CI_PIPELINE_SOURCE == "push" + exists: + - '{docs/**,pages/**}/*.md' + changes: + paths: + - '{docs/**,pages/**}/*.md' + compare_to: 'master' + when: always + + - when: never diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..de6bfdd --- /dev/null +++ b/.gitmodules @@ -0,0 +1,8 @@ +[submodule "gitlab-ci"] + path = gitlab-ci + url = https://gitlab.com/nofusscomputing/projects/gitlab-ci.git + branch = development +[submodule "website-template"] + path = website-template + url = https://gitlab.com/nofusscomputing/infrastructure/website-template.git + branch = development diff --git a/.helmignore b/.helmignore new file mode 100644 index 0000000..bda285e --- /dev/null +++ b/.helmignore @@ -0,0 +1,30 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ + +# git repo files +monitoring-mixins/ +ceph-mixins/ +gitlab-ci/ +website-template/ +docs/ diff --git a/Chart.yaml b/Chart.yaml new file mode 100644 index 0000000..8948a1e --- /dev/null +++ b/Chart.yaml @@ -0,0 +1,26 @@ +apiVersion: v2 +name: nfc-monitoring +description: | + This helm chart installs the required components for full stack monitoring. + + The purpose of this chart is to provide all components required to monitor kubernetes within its entirety. + Storage of the monitoring data is not part of this chart nor will it be included. For this reason this chart can be added and removed without loosing your monitoring data. + + For this chart to function correctly the following operators are required: + + - [grafana](https://grafana-operator.github.io/grafana-operator/docs/installation/helm/) + + - [prometheus](https://prometheus-operator.dev/docs/user-guides/getting-started/) + + Full Docs available at http://nofusscomputing.com/projects/kubernetes_monitoring. + +type: application + + +version: 0.1.0 + + +appVersion: "0.1.0" + + +dependencies: [] diff --git a/README.md b/README.md index 0732e82..cb9ad14 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,10 @@ All contributions for this project must conducted from [Gitlab](https://gitlab.c For further details on contributing please refer to the [contribution guide](CONTRIBUTING.md). +## Documentation + +Documentation for this project can be found at + ## Other This repo is release under this [license](LICENSE) diff --git a/docs/articles/index.md b/docs/articles/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/contact.md b/docs/contact.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/operations/index.md b/docs/operations/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/projects/index.md b/docs/projects/index.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/projects/kubernetes_monitoring/index.md b/docs/projects/kubernetes_monitoring/index.md new file mode 100644 index 0000000..ea7e20d --- /dev/null +++ b/docs/projects/kubernetes_monitoring/index.md @@ -0,0 +1,151 @@ +--- +title: Kubernetes Monitoring Helm Chart +description: How to use No Fuss Computings kubernetes monitoring helm chart for full stack monitoring. +date: 2023-09-19 +template: project.html +about: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring +--- + +This Helm chart deploys the components needed for full stack monitoring. + +What this chart is: + +- Collectors for metrics and logging + +- Distribution / Routing of metrics/logging + +- Data visualization + +- Ready for logging long term storag integration, in particular Loki + +- Ready for metrics long term storage integration, **Still to be developed, see [GL-#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1)** + +What this chart is not: + +- A complete monitoring stack inclusive of long term storage. + +Intentionally we have kept this helm chart to the monitoring components only. Keeping the monitoring components seperate from storage is advantageous. for all intents and purposes this helm chart is ephemeral. + +This helm chart started off with components from multiple open-source projects. As such attribution is warrented, so head on over to thier projects and give them a star. Projects used to create the building blocks of this helm chart are + +- [ceph](https://github.com/ceph/ceph) _mixin alerts/rules_ + +- [Kube prometheus](https://github.com/prometheus-operator/kube-prometheus) + +- [prometheus adaptor](https://github.com/kubernetes-sigs/prometheus-adapter) + + +## Features + +- Alert Manager + +- Grafana as the visualization frontend + + - Dashboards + + - Ceph + + - Cluster Overview + + - Node Exporter Full + + - Alert Manager + + - Sidecar to load dashboards from ConfigMaps _Optional_ + +- Grafana-Agent + + - daemonset for cluster nodes for exposing metrics and an injecter to loki for node and container logs. + +- Loki integration for storing logs + +- [Mixins](https://github.com/monitoring-mixins/website) Compatible _(partial)_, Tested with: + + - alert-manager + + - ceph + + - coreDNS + + - Kubernetes + + - loki _Loki [helm chart](https://artifacthub.io/packages/helm/grafana/loki) included, see [loki repo](https://github.com/grafana/loki/tree/f4ab1e3e89ac66e1848764dc17826abde929fdc5/production/loki-mixin-compiled) for dashboards_ + + - Node-Exporter + + - Prometheus + + - Promtail + +- Prometheus as the metrics collector + +- Service monitors + + - API Server + + - CAdvisor + + - Calico, _Optional_ + + > enable calico metrics with `kubectl patch felixconfiguration default --type merge --patch '{"spec":{"prometheusMetricsEnabled": true}}'` + + - Ceph _Optional_ + + - CoreDNS + + - Grafana + + - Kube-Controler-Manager + + - Kube-Scheduler + + - Kubelet + + - Kube-state-metrics + + - Node + + - Node-Exporter + + - Prometheus + + - Prometheus-Adaptor + +- kyverno policies _(optional, set in values.yaml)_ + + - auto deploy policy for prometheus role and rolebinding to access other namespaces + +- `values.yaml` customization of components. _See values.yaml for more details._ + + +## Installation + +This chart as the following **dependencies:** + +- [Grafana Operator](https://artifacthub.io/packages/olm/community-operators/grafana-operator) + +- [Prometheus Operator](https://artifacthub.io/packages/olm/community-operators/prometheus) + +These dependencies must be present before installing this chart. to install run the following commands: + +``` bash + +git clone -b development https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring.git + +helm upgrade -i nfc_monitoring kubernetes_monitoring/ + +``` + + +## Template Values + +The values file included in this helm chart is below. + +!!! note + This values file is from the development branch and as such may not reflect the current version you have deployed. If you require a specific version, head on over to the git repository and select the git tag that matches the version you are after. + +``` yaml title="values.yaml" linenums="1" + +--8<-- "values.yaml" + +``` diff --git a/docs/tags.md b/docs/tags.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/task-doc-template.md b/docs/task-doc-template.md new file mode 100644 index 0000000..ade9fcd --- /dev/null +++ b/docs/task-doc-template.md @@ -0,0 +1,87 @@ + + +short summary of the task file + +## {Task Name} + +- **Name**: + +- **Description**: + +- **Module**: + +- **Arguments**: + + - + +- **Conditional**: + +- **Tags**: + + - + +## {Task Name} + +- **Name**: + +- **Description**: + +- **Module**: + +- **Arguments**: + + - + +- **Registers**: + +- **Conditional**: + +- **Tags**: + + - + + +## Variables + +The following variables can be customized in this task file: + +```yaml +variable_name: "default_value" +``` + +- `variable_name`: Description of the variable. + +## Tags + +The tasks in this task file are tagged with the following tags: + +- + +## Usage + +To use this Ansible task file, you can include it in your playbook or role and provide values for the required variables. Here's an example of how you can use this task file: + +1. Create a playbook (e.g., `your_playbook.yaml`) and define the necessary variables: + +```yaml +--- + +- hosts: your_hosts + vars: + variable_name: "value" + + tasks: + - include_tasks: path/to/task_file.yaml +``` + +2. Create a separate file for the task file (e.g., `task_file.yaml`) and copy the content of the task file into it. + +3. Run the playbook: + +```shell +ansible-playbook your_playbook.yaml +``` + +Make sure to replace the placeholder values (`variable_name`, `value`) with the appropriate values for your setup. + +Note: You may need to adjust the playbook structure and additional tasks based on your specific requirements and the tasks you want to execute. \ No newline at end of file diff --git a/files/dashboard-summary.json b/files/dashboard-summary.json new file mode 100644 index 0000000..90c0a0a --- /dev/null +++ b/files/dashboard-summary.json @@ -0,0 +1,1281 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 27, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 7, + "title": "Cluster Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total cluster CPU Cores not being utilised", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores Available", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total cluster CPU Cores", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(machine_cpu_cores)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total memory for complete cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": -1, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 1, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemAvailable_bytes{node!=\"\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory Available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total cluster CPU Cores not being utilised", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "hidden", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 4 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", + "format": "time_series", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU Cores Available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total memory for complete cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 4 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemAvailable_bytes{node!=\"\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "Available w/o cached", + "range": false, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "sum(node_memory_Cached_bytes{node!=\"\"})", + "hide": false, + "instant": false, + "legendFormat": "Available", + "range": true, + "refId": "B" + } + ], + "title": "Memory Available", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "Total memory for complete cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 4 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(node_memory_MemTotal_bytes{node!=\"\"})", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Memory Size", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 6, + "panels": [], + "title": "Ceph", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [], + "mappings": [ + { + "id": 0, + "options": { + "0": { + "text": "HEALTHY" + }, + "1": { + "text": "WARNING" + }, + "2": { + "text": "ERROR" + } + }, + "type": "value" + }, + { + "id": 1, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#9ac48a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 2 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 8 + }, + "id": 3, + "interval": "1m", + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "ceph_health_status{}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Ceph health status", + "transparent": true, + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 2, + "panels": [], + "title": "Monitoring", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 12 + }, + "id": 12, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum(delta(loki_log_messages_total[1m]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Incoming log messages", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs received in last 24 hours.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 12 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(delta(loki_log_messages_total[1d]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Daily log messages", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 12 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(loki_log_messages_total)", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Log Messages Stored", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many metrics per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 12, + "y": 12 + }, + "id": 13, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[1m]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Incoming Metric Samples P/M", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many metrics in the last 24 hours", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 16, + "y": 12 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[5y]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Daily Metric Samples", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many metrics are stored in the database", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 12 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.1.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "disableTextWrap": false, + "editorMode": "code", + "exemplar": false, + "expr": "sum(increase(prometheus_tsdb_head_samples_appended_total[5y]))", + "format": "table", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A", + "useBackend": false + } + ], + "title": "Metric Samples Stored", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "sum(delta(loki_log_messages_total[1m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Incoming", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "description": "How many logs per minute are being received.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "purple", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 100, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "77e897ec-8a32-4b71-9439-8f12ec42e920" + }, + "editorMode": "code", + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[1m]))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Incoming Metric Samples P/M", + "type": "timeseries" + } + ], + "refresh": "5s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "No Fuss Computing", + "Overview", + "Kubernetes" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Cluster Overview", + "uid": "no-fuss-computing-k8s-overview", + "version": 3, + "weekStart": "" +} \ No newline at end of file diff --git a/gitlab-ci b/gitlab-ci new file mode 160000 index 0000000..a5a9fa4 --- /dev/null +++ b/gitlab-ci @@ -0,0 +1 @@ +Subproject commit a5a9fa44374107657b2587ce52607d96a825be56 diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..355d4fb --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,30 @@ +INHERIT: website-template/mkdocs.yml + +docs_dir: 'docs' + +repo_name: Kubernetes Monitoring Helm Chart +repo_url: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring +edit_uri: '/-/ide/project/nofusscomputing/projects/kubernetes_monitoring/edit/development/-/docs/' + +nav: +- Home: index.md + +- Articles: + + - articles/index.md + +- Projects: + + - projects/index.md + + - Kubernetes Monitoring: + + - projects/kubernetes_monitoring/index.md + + +- Operations: + + - operations/index.md + +- Contact Us: contact.md + diff --git a/templates/APIService-prometheus-adapter.yaml b/templates/APIService-prometheus-adapter.yaml new file mode 100644 index 0000000..04d7bbf --- /dev/null +++ b/templates/APIService-prometheus-adapter.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: apiregistration.k8s.io/v1 +kind: APIService +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: v1beta1.metrics.k8s.io +spec: + group: metrics.k8s.io + groupPriorityMinimum: 100 + insecureSkipTLSVerify: true + service: + name: prometheus-adapter + namespace: monitoring + version: v1beta1 + versionPriority: 100 diff --git a/templates/AlertManager-k8s.yaml b/templates/AlertManager-k8s.yaml new file mode 100644 index 0000000..1c2f654 --- /dev/null +++ b/templates/AlertManager-k8s.yaml @@ -0,0 +1,39 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: Alertmanager +metadata: + labels: + app.kubernetes.io/instance: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +spec: + image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}" + nodeSelector: + kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + replicas: 3 + resources: + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 4m + memory: 100Mi + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: alertmanager-main + version: 0.25.0 diff --git a/templates/ClusterPolicy-Prometheus-Role.yaml b/templates/ClusterPolicy-Prometheus-Role.yaml new file mode 100644 index 0000000..bf01441 --- /dev/null +++ b/templates/ClusterPolicy-Prometheus-Role.yaml @@ -0,0 +1,75 @@ +{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }} +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: add-prometheus-role + annotations: + policies.kyverno.io/title: Add Prometheus Role + policies.kyverno.io/category: Monitoring + policies.kyverno.io/subject: RoleBinding + policies.kyverno.io/minversion: 1.6.0 + policies.kyverno.io/description: >- + This policy is responsible for ensuring that a Role for the prometheus + monitoring instances is created to enable monitoring of the namespace in + question. + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + background: true + generateExisting: true + rules: + - name: generate-prometheus-role + match: + any: + - resources: + kinds: + - Namespace + generate: + synchronize: true + apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + name: prometheus-k8s + namespace: "{{ `{{` }}request.object.metadata.name }}" + data: + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch + +{{ end }} diff --git a/templates/ClusterPolicy-Prometheus-RoleBinding.yaml b/templates/ClusterPolicy-Prometheus-RoleBinding.yaml new file mode 100644 index 0000000..2ec1e33 --- /dev/null +++ b/templates/ClusterPolicy-Prometheus-RoleBinding.yaml @@ -0,0 +1,53 @@ +{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }} +--- +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: add-prometheus-role-binding + annotations: + policies.kyverno.io/title: Add Prometheus RoleBinding + policies.kyverno.io/category: Monitoring + policies.kyverno.io/subject: RoleBinding + policies.kyverno.io/minversion: 1.6.0 + policies.kyverno.io/description: >- + This policy is responsible for ensuring that a RoleBinding for the prometheus + monitoring instances is created to enable monitoring of the namespace in + question. + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + background: true + generateExisting: true + rules: + - name: generate-prometheus-binding + match: + any: + - resources: + kinds: + - Namespace + generate: + synchronize: true + apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + name: prometheus-k8s + namespace: "{{ `{{` }}request.object.metadata.name }}" + data: + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}" +{{ end }} diff --git a/templates/ClusterRole-GrafanaAgent.yaml b/templates/ClusterRole-GrafanaAgent.yaml new file mode 100644 index 0000000..a5886ab --- /dev/null +++ b/templates/ClusterRole-GrafanaAgent.yaml @@ -0,0 +1,42 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: grafana-agent + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +rules: +- apiGroups: + - "" + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + - events + verbs: + - get + - list + - watch +- nonResourceURLs: + - /metrics + verbs: + - get +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/templates/ClusterRole-aggregated-metrics-reader.yaml b/templates/ClusterRole-aggregated-metrics-reader.yaml new file mode 100644 index 0000000..bc2afd0 --- /dev/null +++ b/templates/ClusterRole-aggregated-metrics-reader.yaml @@ -0,0 +1,26 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + rbac.authorization.k8s.io/aggregate-to-admin: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" + rbac.authorization.k8s.io/aggregate-to-view: "true" + name: system:aggregated-metrics-reader + namespace: monitoring +rules: +- apiGroups: + - metrics.k8s.io + resources: + - pods + - nodes + verbs: + - get + - list + - watch diff --git a/templates/ClusterRole-binding-delegator-prometheus-adaptor.yaml b/templates/ClusterRole-binding-delegator-prometheus-adaptor.yaml new file mode 100644 index 0000000..5648792 --- /dev/null +++ b/templates/ClusterRole-binding-delegator-prometheus-adaptor.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: resource-metrics:system:auth-delegator + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/templates/ClusterRole-binding-hpa-custom-metrics-prometheus-adaptor.yaml b/templates/ClusterRole-binding-hpa-custom-metrics-prometheus-adaptor.yaml new file mode 100644 index 0000000..4785870 --- /dev/null +++ b/templates/ClusterRole-binding-hpa-custom-metrics-prometheus-adaptor.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: hpa-controller-custom-metrics + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: custom-metrics-server-resources +subjects: +- kind: ServiceAccount + name: horizontal-pod-autoscaler + namespace: kube-system diff --git a/templates/ClusterRole-binding-prometheus.yaml b/templates/ClusterRole-binding-prometheus.yaml new file mode 100644 index 0000000..4d6bc54 --- /dev/null +++ b/templates/ClusterRole-binding-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-adapter +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/templates/ClusterRole-grafana-SideCar.yaml b/templates/ClusterRole-grafana-SideCar.yaml new file mode 100644 index 0000000..42bf948 --- /dev/null +++ b/templates/ClusterRole-grafana-SideCar.yaml @@ -0,0 +1,18 @@ +{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: Grafana-Sidecar +rules: + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "watch", "list"] +{{- end }} diff --git a/templates/ClusterRole-kube-monitor-proxy.yaml b/templates/ClusterRole-kube-monitor-proxy.yaml new file mode 100644 index 0000000..e7b364a --- /dev/null +++ b/templates/ClusterRole-kube-monitor-proxy.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-monitor-proxy + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +rules: +- apiGroups: ["authentication.k8s.io"] + resources: + - tokenreviews + verbs: ["create"] +- apiGroups: ["authorization.k8s.io"] + resources: + - subjectaccessreviews + verbs: ["create"] diff --git a/templates/ClusterRole-kubeStateMetrics-1.yaml b/templates/ClusterRole-kubeStateMetrics-1.yaml new file mode 100644 index 0000000..be81244 --- /dev/null +++ b/templates/ClusterRole-kubeStateMetrics-1.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/templates/ClusterRole-kubeStateMetrics.yaml b/templates/ClusterRole-kubeStateMetrics.yaml new file mode 100644 index 0000000..15e72ba --- /dev/null +++ b/templates/ClusterRole-kubeStateMetrics.yaml @@ -0,0 +1,132 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - serviceaccounts + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch +- apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - list + - watch +- apiGroups: + - discovery.k8s.io + resources: + - endpointslices + verbs: + - list + - watch +- apiGroups: + - storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch +- apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + - ingressclasses + - ingresses + verbs: + - list + - watch +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - clusterroles + - rolebindings + - roles + verbs: + - list + - watch diff --git a/templates/ClusterRole-metrics-server-resources-prometheus-adaptor.yaml b/templates/ClusterRole-metrics-server-resources-prometheus-adaptor.yaml new file mode 100644 index 0000000..b5d028b --- /dev/null +++ b/templates/ClusterRole-metrics-server-resources-prometheus-adaptor.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: resource-metrics-server-resources +rules: +- apiGroups: + - metrics.k8s.io + resources: + - '*' + verbs: + - '*' diff --git a/templates/ClusterRole-prometheus-adapter.yaml b/templates/ClusterRole-prometheus-adapter.yaml new file mode 100644 index 0000000..87cd65f --- /dev/null +++ b/templates/ClusterRole-prometheus-adapter.yaml @@ -0,0 +1,24 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter +rules: +- apiGroups: + - "" + resources: + - nodes + - namespaces + - pods + - services + verbs: + - get + - list + - watch diff --git a/templates/ClusterRole-prometheus.yaml b/templates/ClusterRole-prometheus.yaml new file mode 100644 index 0000000..151fe77 --- /dev/null +++ b/templates/ClusterRole-prometheus.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s +rules: +- apiGroups: + - "" + resources: + - nodes/metrics + verbs: + - get +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/templates/ClusterRoleBinding-Grafana-Agent.yaml b/templates/ClusterRoleBinding-Grafana-Agent.yaml new file mode 100644 index 0000000..403db42 --- /dev/null +++ b/templates/ClusterRoleBinding-Grafana-Agent.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: grafana-agent +subjects: +- kind: ServiceAccount + name: grafana-agent + namespace: monitoring diff --git a/templates/ClusterRoleBinding-Grafana-SideCar.yaml b/templates/ClusterRoleBinding-Grafana-SideCar.yaml new file mode 100644 index 0000000..367f863 --- /dev/null +++ b/templates/ClusterRoleBinding-Grafana-SideCar.yaml @@ -0,0 +1,22 @@ +{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}} +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: Grafana-Sidecar +roleRef: + kind: ClusterRole + name: Grafana-Sidecar + apiGroup: rbac.authorization.k8s.io +subjects: + - kind: ServiceAccount + name: grafana + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" +{{- end }} diff --git a/templates/ClusterRoleBinding-kube-monitor-proxy.yaml b/templates/ClusterRoleBinding-kube-monitor-proxy.yaml new file mode 100644 index 0000000..0e4c1dd --- /dev/null +++ b/templates/ClusterRoleBinding-kube-monitor-proxy.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-monitor-proxy + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-monitor-proxy +subjects: +- kind: ServiceAccount + name: kube-monitor-proxy + namespace: monitoring diff --git a/templates/ClusterRoleBinding-prometheus.yaml b/templates/ClusterRoleBinding-prometheus.yaml new file mode 100644 index 0000000..b54fa91 --- /dev/null +++ b/templates/ClusterRoleBinding-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} diff --git a/templates/ConfigMap-Grafana.yaml b/templates/ConfigMap-Grafana.yaml new file mode 100644 index 0000000..eeb28f4 --- /dev/null +++ b/templates/ConfigMap-Grafana.yaml @@ -0,0 +1,130 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-config + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +data: + prometheus.yaml: |- + { + "apiVersion": 1, + "datasources": [ + { + "access":"proxy", + "editable": true, + "name": "prometheus", + "orgId": 1, + "type": "prometheus", + "url": "http://prometheus-config-map.monitoring.svc:9090", + "version": 1 + } + ] + } + + oncall-app.yaml: |- + apiVersion: 1 + apps: + - type: grafana-oncall-app + name: grafana-oncall-app + disabled: false + jsonData: + #user: admin + grafanaUrl: http://grafana.monitoring.svc:3000 + onCallApiUrl: http://oncall.monitoring.svc:8080 + secureJsonData: + #password: admin + onCallInvitationToken: yaD3a6dkYV0CFinTlMPb42Mi9 + #http://127.0.0.1:8080/integrations/v1/alertmanager/yaD3a6dkYV0CFinTlMPb42Mi9/ + + +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana-alertmanager + namespace: monitoring +data: + alertmanager.yaml: |- + { + "template_files": {}, + "template_file_provenances": {}, + "alertmanager_config": { + "route": { + "receiver": "Grafana Alerting 😊", + "group_by": [ + "grafana_folder", + "alertname" + ], + "routes": [ + { + "receiver": "Grafana Alerting 😊", + "object_matchers": [ + [ + "severity", + "!=", + "" + ], + [ + "severity", + "=", + "critical" + ], + [ + "severity", + "=", + "info" + ], + [ + "severity", + "=", + "warning" + ] + ], + "continue": true, + "group_wait": "1s", + "group_interval": "2s", + "repeat_interval": "15m" + } + ], + "group_wait": "1s", + "group_interval": "2s", + "repeat_interval": "3m" + }, + "templates": null, + "receivers": [ + { + "name": "Grafana Alerting 😊", + "grafana_managed_receiver_configs": [ + { + "uid": "4dqvJSfVkz", + "name": "Grafana Alerting 😊", + "type": "webhook", + "disableResolveMessage": false, + "settings": { + "httpMethod": "POST", + "url": "http://host.docker.internal:8080/integrations/v1/grafana_alerting/dXStYoK9Z15VZW8R8AtfyIwtu/" + }, + "secureFields": {} + } + ] + } + ] + } + } + +--- \ No newline at end of file diff --git a/templates/ConfigMap-GrafanaAgent.yaml b/templates/ConfigMap-GrafanaAgent.yaml new file mode 100644 index 0000000..623ea72 --- /dev/null +++ b/templates/ConfigMap-GrafanaAgent.yaml @@ -0,0 +1,284 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana-agent + namespace: monitoring +data: + agent.yaml: | + metrics: + wal_directory: /tmp/wal + + logs: + positions_directory: "/tmp" + + configs: + + - name: node-logs + clients: + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push + backoff_config: + min_period: 10s + max_period: 5m + max_retries: 10 + + limits_config: + readline_rate_enabled: true + readline_rate: 250 + readline_burst: 750 + readline_rate_drop: false + max_streams: 500 + + scrape_configs: + + - job_name: systemd-journal + journal: + path: /host/root/run/log/journal + json: true + max_age: 24h + relabel_configs: + + - source_labels: + - __journal__systemd_unit + target_label: systemd_unit + - source_labels: + - __journal__hostname + target_label: node + - source_labels: + - __journal_syslog_identifier + target_label: syslog_identifier + - target_label: "job" + replacement: "systemd-journal" + - target_label: "job_name" + replacement: "journal-logs" + - target_label: hostname + replacement: "${HOSTNAME}" + + pipeline_stages: + - json: + expressions: + pid: _PID + userId: _UID + application: _COMM + priority: PRIORITY + + - labels: + application: + #level: + pid: + userId: + priority: + - template: + source: level + template: '{{"{{"}} ToLower .Value {{"}}"}}' + - match: + selector: '{priority="7"}' + stages: + - template: + source: level + template: 'debug' + - match: + selector: '{priority="6"}' + stages: + - template: + source: level + template: 'info' + - match: + selector: '{priority="5"}' + stages: + - template: + source: level + template: 'notice' + - match: + selector: '{priority="4"}' + stages: + - template: + source: level + template: 'warning' + - match: + selector: '{priority="3"}' + stages: + - template: + source: level + template: 'error' + - match: + selector: '{priority="2"}' + stages: + - template: + source: level + template: 'crit' + - match: + selector: '{priority="1"}' + stages: + - template: + source: level + template: 'alert' + - match: + selector: '{priority="0"}' + stages: + - template: + source: level + template: 'emerg' + - labels: + level: + + + - job_name: var-logs + static_configs: + - targets: [localhost] + labels: + job_name: 'aptitude' + __path__: /var/log/apt/*.log + - targets: [localhost] + labels: + job_name: 'dpkg' + __path__: /var/log/dpkg.log + - targets: [localhost] + labels: + job_name: 'messages' + __path__: /var/log/messages + - targets: [localhost] + labels: + job_name: 'syslog' + __path__: /var/log/syslog + - targets: [localhost] + labels: + job_name: 'kernlog' + __path__: /var/log/kern.log + - targets: [localhost] + labels: + job_name: 'auth-log' + __path__: /var/log/auth.log + - targets: [localhost] + labels: + job_name: 'boot-log' + __path__: /var/log/boot.log + - targets: [localhost] + labels: + job_name: 'daemon-log' + __path__: /var/log/daemon.log + - targets: [localhost] + labels: + job_name: 'faillog-log' + __path__: /var/log/faillog + - targets: [localhost] + labels: + job_name: 'lastlog-log' + __path__: /var/log/lastlog + + relabel_configs: + - target_label: hostname + replacement: "${HOSTNAME}" + - target_label: job + replacement: log-files + - target_label: node + replacement: "${HOSTNAME}" + + pipeline_stages: + - timestamp: + format: RFC3339 + source: time + + + + - job_name: pod-logs + kubernetes_sd_configs: + - role: pod + + pipeline_stages: + - cri: {} + + - regex: + expression: '(\s|\t|level=)?(?Ptrace|debug|info|warn|error|TRACE|DEBUG|INFO|WARN|ERROR)(\s|\t)' + + - labels: + level: + + relabel_configs: + - source_labels: + - __meta_kubernetes_pod_node_name + target_label: __host__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - target_label: job + replacement: kubernetes_sd + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: replace + source_labels: + - __meta_kubernetes_pod_container_name + target_label: container + - replacement: /var/log/pods/*$1/*.log + separator: / + source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + - target_label: "job_name" + replacement: "kubernetes-logs" + - target_label: hostname + replacement: "${HOSTNAME}" + - target_label: node + source_labels: + - __meta_kubernetes_pod_node_name + + + integrations: + + node_exporter: + enabled: true + rootfs_path: /host/root + sysfs_path: /host/sys + procfs_path: /host/proc + udev_data_path: /host/root/run/udev/data + + # collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/) + filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)" + #filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$" + filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|ugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|ocfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$" + + + netclass_ignored_devices: "^(veth.*|cali.*|[a-f0-9]{15})$" + netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$" + + scrape_integration: true + + include_exporter_metrics: true + enable_collectors: + - uname + + + syslog_server.yaml: | + # REF: https://grafana.com/docs/loki/latest/send-data/promtail/configuration/#example-syslog-config + server: + http_listen_port: 9080 + grpc_listen_port: 0 + + positions: + filename: /tmp/syslog_server_positions.yaml + + clients: + - url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push + + scrape_configs: + - job_name: syslog + syslog: + listen_address: 0.0.0.0:1514 + labels: + job: "syslog" + relabel_configs: + - source_labels: ['__syslog_message_hostname'] + target_label: 'host' diff --git a/templates/ConfigMap-GrafanaProvisioning.yaml b/templates/ConfigMap-GrafanaProvisioning.yaml new file mode 100644 index 0000000..6b35943 --- /dev/null +++ b/templates/ConfigMap-GrafanaProvisioning.yaml @@ -0,0 +1,31 @@ +{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}} +--- +# Provisioning config +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: provisioning-config + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" +data: + provisioning.yaml: |- + apiVersion: 1 + providers: + - name: 'configmap-dashboard-provider' + orgId: 1 + folder: '' + folderUid: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: true +{{- end }} diff --git a/templates/ConfigMap-prometheus-adapter.yaml b/templates/ConfigMap-prometheus-adapter.yaml new file mode 100644 index 0000000..2f1a140 --- /dev/null +++ b/templates/ConfigMap-prometheus-adapter.yaml @@ -0,0 +1,72 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: adapter-config + namespace: monitoring +data: + config.yaml: |- + "resourceRules": + "cpu": + "containerLabel": "container" + "containerQuery": | + sum by (<<.GroupBy>>) ( + irate ( + container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[120s] + ) + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + 1 - irate( + node_cpu_seconds_total{mode="idle"}[60s] + ) + * on(namespace, pod) group_left(node) ( + node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>} + ) + ) + or sum by (<<.GroupBy>>) ( + 1 - irate( + windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m] + ) + ) + "resources": + "overrides": + "namespace": + "resource": "namespace" + "node": + "resource": "node" + "pod": + "resource": "pod" + "memory": + "containerLabel": "container" + "containerQuery": | + sum by (<<.GroupBy>>) ( + container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""} + ) + "nodeQuery": | + sum by (<<.GroupBy>>) ( + node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} + - + node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>} + ) + or sum by (<<.GroupBy>>) ( + windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>} + - + windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>} + ) + "resources": + "overrides": + "instance": + "resource": "node" + "namespace": + "resource": "namespace" + "pod": + "resource": "pod" + "window": "5m" diff --git a/templates/Daemonset-GrafanaAgent.yaml b/templates/Daemonset-GrafanaAgent.yaml new file mode 100644 index 0000000..6a77656 --- /dev/null +++ b/templates/Daemonset-GrafanaAgent.yaml @@ -0,0 +1,138 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + metricsJob: node-exporter + cadvisormetricsJob: cadvisor + nodeExportermetricsJob: node + name: grafana-agent + namespace: "{{ .Values.nfc_monitoring.grafana_agent.namespace }}" +spec: + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + metricsJob: node-exporter + cadvisormetricsJob: cadvisor + nodeExportermetricsJob: node + template: + metadata: + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + metricsJob: node-exporter + cadvisormetricsJob: cadvisor + nodeExportermetricsJob: node + spec: + automountServiceAccountToken: true + containers: + - args: + - --server.http.address=0.0.0.0:12345 + - --config.file=/etc/agent/agent.yaml + - --config.expand-env=true + name: grafana-agent + image: "{{ .Values.nfc_monitoring.grafana_agent.image.name }}:{{ .Values.nfc_monitoring.grafana_agent.image.tag }}" + #imagePullPolicy: Never + ports: + - containerPort: 12345 + name: grafana-metrics + protocol: TCP + resources: + limits: + cpu: 1000m + memory: 180Mi + requests: + cpu: 40m + memory: 180Mi + securityContext: + capabilities: + add: + - SYS_TIME + # drop: + # - ALL + readOnlyRootFilesystem: false + privileged: true + volumeMounts: + - mountPath: /host/sys + mountPropagation: HostToContainer + name: sys + readOnly: true + - mountPath: /host/proc + mountPropagation: HostToContainer + name: proc + readOnly: true + - mountPath: /host/root + mountPropagation: HostToContainer + name: rootfs + readOnly: true + - mountPath: /var/log + mountPropagation: HostToContainer + name: logs + readOnly: true + - name: config + mountPath: "/etc/agent" + readOnly: false + - name: temp + mountPath: "/tmp" + readOnly: false + - name: agent-data + mountPath: "/etc/agent/data" + readOnly: false + dnsPolicy: ClusterFirst + volumes: + - hostPath: + path: /sys + name: sys + - hostPath: + path: /proc + name: proc + - hostPath: + path: / + name: rootfs + - hostPath: + path: /var/log + name: logs + - name: config + configMap: + name: grafana-agent + items: + - key: "agent.yaml" + path: "agent.yaml" + - name: temp + emptyDir: {} + - name: agent-data + emptyDir: {} + + - name: var-run + hostPath: + path: /var/run + - name: containerd + hostPath: + path: /var/lib/contairnerd + - name: disk + hostPath: + path: /dev/disk + + nodeSelector: + kubernetes.io/os: linux + hostNetwork: true + hostPID: true + priorityClassName: system-cluster-critical + serviceAccountName: grafana-agent + tolerations: + - operator: Exists diff --git a/templates/Daemonset-kube-monitor-proxy.yaml b/templates/Daemonset-kube-monitor-proxy.yaml new file mode 100644 index 0000000..a283196 --- /dev/null +++ b/templates/Daemonset-kube-monitor-proxy.yaml @@ -0,0 +1,136 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + + nodeExportermetricsJob: node + name_kcm: kube-controller-manager + + name: kube-monitor-proxy + namespace: "{{ .Values.nfc_monitoring.kube_monitor_proxy.namespace }}" +spec: + selector: + matchLabels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + template: + metadata: + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + spec: + automountServiceAccountToken: true + hostNetwork: true + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + containers: + + - args: + - --logtostderr + - --v=10 + - --secure-listen-address=[$(IP)]:11257 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=https://127.0.0.1:10257/ + - --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + - --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + env: + - name: NODE-IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}" + name: kube-rbac-proxy-kube-ctrl-mgr + ports: + - containerPort: 10257 + #hostPort: 9100 + name: kube-ctrl-mgr + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + + + - args: + - --logtostderr + - --v=10 + - --secure-listen-address=[$(IP)]:11259 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=https://127.0.0.1:10259/ + - --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + - --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt + env: + - name: NODE-IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}" + name: kube-rbac-proxy-kube-scheduler + ports: + - containerPort: 10259 + #hostPort: 9100 + name: kube-scheduler + resources: + limits: + cpu: 200m + memory: 256Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + nodeSelector: + kubernetes.io/os: linux + hostPID: true + priorityClassName: system-cluster-critical + serviceAccountName: kube-monitor-proxy + tolerations: + - operator: Exists diff --git a/templates/Deployment-kubeStateMetrics.yaml b/templates/Deployment-kubeStateMetrics.yaml new file mode 100644 index 0000000..36479c3 --- /dev/null +++ b/templates/Deployment-kubeStateMetrics.yaml @@ -0,0 +1,112 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}" +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: kube-state-metrics + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + spec: + automountServiceAccountToken: true + containers: + - args: + - --host=127.0.0.1 + - --port=8081 + - --telemetry-host=127.0.0.1 + - --telemetry-port=8082 + image: "{{ .Values.nfc_monitoring.kube_state_metrics.image.name }}:{{ .Values.nfc_monitoring.kube_state_metrics.image.tag }}" + name: kube-state-metrics + resources: + limits: + cpu: 200m + memory: 250Mi + requests: + cpu: 10m + memory: 190Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 65534 + - args: + - --logtostderr + - --secure-listen-address=:8443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8081/ + image: quay.io/brancz/kube-rbac-proxy:v0.14.0 + name: kube-rbac-proxy-main + ports: + - containerPort: 8443 + name: https-main + resources: + limits: + cpu: 100m + memory: 40Mi + requests: + cpu: 50m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + - args: + - --logtostderr + - --secure-listen-address=:9443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8082/ + image: quay.io/brancz/kube-rbac-proxy:v0.14.0 + name: kube-rbac-proxy-self + ports: + - containerPort: 9443 + name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: kube-state-metrics diff --git a/templates/Deployment-prometheus-adapter.yaml b/templates/Deployment-prometheus-adapter.yaml new file mode 100644 index 0000000..3cbab58 --- /dev/null +++ b/templates/Deployment-prometheus-adapter.yaml @@ -0,0 +1,102 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: "{{ .Values.nfc_monitoring.prometheus_adaptor.namespace }}" +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + spec: + affinity: + {{- toYaml .Values.nfc_monitoring.prometheus_adaptor.affinity | nindent 8 }} + automountServiceAccountToken: true + containers: + - args: + - --cert-dir=/var/run/serving-cert + - --config=/etc/adapter/config.yaml + - --metrics-relist-interval=1m + - --prometheus-url=https://prometheus.monitoring.svc:9090/ + - --secure-port=6443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA + image: "{{ .Values.nfc_monitoring.prometheus_adaptor.image.name }}:{{ .Values.nfc_monitoring.prometheus_adaptor.image.tag }}" + livenessProbe: + failureThreshold: 5 + httpGet: + path: /livez + port: https + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 5 + name: prometheus-adapter + ports: + - containerPort: 6443 + name: https + readinessProbe: + failureThreshold: 5 + httpGet: + path: /readyz + port: https + scheme: HTTPS + initialDelaySeconds: 30 + periodSeconds: 5 + resources: + requests: + cpu: 102m + memory: 180Mi + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /tmp + name: tmpfs + readOnly: false + - mountPath: /var/run/serving-cert + name: volume-serving-cert + readOnly: false + - mountPath: /etc/adapter + name: config + readOnly: false + nodeSelector: + kubernetes.io/os: linux + securityContext: {} + serviceAccountName: prometheus-adapter + volumes: + - emptyDir: {} + name: tmpfs + - emptyDir: {} + name: volume-serving-cert + - configMap: + name: adapter-config + name: config diff --git a/templates/Grafana-Grafana.yaml b/templates/Grafana-Grafana.yaml new file mode 100644 index 0000000..db1d568 --- /dev/null +++ b/templates/Grafana-Grafana.yaml @@ -0,0 +1,163 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" +spec: + config: + log: + mode: "console" + auth: + disable_login_form: "false" + security: + admin_user: "{{ .Values.nfc_monitoring.grafana.admin_user }}" + admin_password: "{{ .Values.nfc_monitoring.grafana.admin_password }}" + deployment: + metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + spec: + replicas: {{ .Values.nfc_monitoring.grafana.replicas | int }} + selector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + + spec: + affinity: + {{- toYaml .Values.nfc_monitoring.grafana.affinity | nindent 12 }} + {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }} + automountServiceAccountToken: true + {{ end }} + containers: + - name: grafana + image: "{{ .Values.nfc_monitoring.grafana.image.name }}:{{ .Values.nfc_monitoring.grafana.image.tag }}" + env: + - name: GF_INSTALL_PLUGINS + value: agenty-flowcharting-panel 0.9.1, ddurieux-glpi-app 1.3.1, grafana-oncall-app + - name: GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS + value: grafana-oncall-app 1.1.38 + - name: GF_ANALYTICS_REPORTING_ENABLED + value: 'false' + imagePullPolicy: IfNotPresent + ports: + - containerPort: 3000 + name: grafana-http + readinessProbe: + failureThreshold: 3 + httpGet: + path: /login + port: 3000 + scheme: HTTP + initialDelaySeconds: 30 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 3 + resources: + limits: + cpu: 2000m + memory: 1Gi + requests: + cpu: 100m + memory: 100Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: false + volumeMounts: + - mountPath: /var/lib/grafana + name: grafana-storage + - mountPath: /etc/grafana/provisioning/plugins + name: plugin-config + readOnly: false + {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }} + - mountPath: /etc/grafana/provisioning/dashboards + name: provisioning-config + - mountPath: /var/lib/grafana/dashboards + name: dashboards + + - image: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.name }}:{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.tag}}" + name: k8s-sidecar + env: + - name: LABEL + value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_name }}" + - name: LABEL_VALUE + value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_value }}" + - name: FOLDER + value: /var/lib/grafana/dashboards + - name: NAMESPACE + value: grafana + - name: RESOURCE + value: configmap + volumeMounts: + - mountPath: /var/lib/grafana/dashboards + name: dashboards + {{ end }} + securityContext: + fsGroup: 65534 + volumes: + #- name: grafana-storage + - name: plugin-config + configMap: + # Provide the name of the ConfigMap you want to mount. + name: grafana-config + # An array of keys from the ConfigMap to create as files + items: + - key: "oncall-app.yaml" + path: "oncall-app.yaml" + {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }} + - name: dashboards + emptyDir: {} + - name: provisioning-config + configMap: + name: provisioning-config + {{ end }} + - name: grafana-storage + persistentVolumeClaim: + claimName: grafana-pvc + serviceAccountName: grafana + nodeSelector: + kubernetes.io/os: linux + + persistentVolumeClaim: + metadata: + annotations: + pv.beta.kubernetes.io/gid: "65534" + labels: + app.kubernetes.io/name: grafana + app.kubernetes.io/component: graphing + app.kubernetes.io/part-of: {{ $.Chart.Name }} + spec: + accessModes: + - "ReadWriteMany" + resources: + requests: + storage: "5Gi" diff --git a/templates/GrafanaDashboard-AlertManager.yaml b/templates/GrafanaDashboard-AlertManager.yaml new file mode 100644 index 0000000..cc9ef9e --- /dev/null +++ b/templates/GrafanaDashboard-AlertManager.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: alertmanager + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} +spec: + allowCrossNamespaceImport: true + folder: No Fuss Monitoring + resyncPeriod: 1d + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + grafanaCom: + id: 9578 + revision: 4 # as @ 19-09-23 \ No newline at end of file diff --git a/templates/GrafanaDashboard-Ceph.yaml b/templates/GrafanaDashboard-Ceph.yaml new file mode 100644 index 0000000..38d735d --- /dev/null +++ b/templates/GrafanaDashboard-Ceph.yaml @@ -0,0 +1,21 @@ +--- +{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}} +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: ceph + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} +spec: + allowCrossNamespaceImport: true + folder: No Fuss Monitoring + resyncPeriod: 1d + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + grafanaCom: + id: 2842 + revision: 17 # as @ 19-09-23 + +{{- end -}} \ No newline at end of file diff --git a/templates/GrafanaDashboard-cluster-summary.yaml b/templates/GrafanaDashboard-cluster-summary.yaml new file mode 100644 index 0000000..2cfcf66 --- /dev/null +++ b/templates/GrafanaDashboard-cluster-summary.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: cluster-summary + namespace: grafana +spec: + allowCrossNamespaceImport: false + folder: No Fuss Monitoring + resyncPeriod: 30s + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + {{ $Dashboard := .Files.Get "files/dashboard-summary.json" | fromJson }} + json: >- + {{ $Dashboard | toRawJson }} + diff --git a/templates/GrafanaDashboard-node-exporter-full.yaml b/templates/GrafanaDashboard-node-exporter-full.yaml new file mode 100644 index 0000000..b5b9abe --- /dev/null +++ b/templates/GrafanaDashboard-node-exporter-full.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: node-exporter + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} +spec: + allowCrossNamespaceImport: true + folder: No Fuss Monitoring + resyncPeriod: 1d + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + grafanaCom: + id: 1860 + revision: 32 # as @ 19-09-23 \ No newline at end of file diff --git a/templates/GrafanaDatasource-AlertManager.yaml b/templates/GrafanaDatasource-AlertManager.yaml new file mode 100644 index 0000000..b954b84 --- /dev/null +++ b/templates/GrafanaDatasource-AlertManager.yaml @@ -0,0 +1,39 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: alertmanager + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: alertmanager + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + #secrets: + # - credentials + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + allowCrossNamespaceImport: true + datasource: + name: alertmanager + type: alertmanager + access: proxy + # basicAuth: true + url: "http://alertmanager-main.{{ .Values.nfc_monitoring.alert_manager.namespace }}.svc:9093" + isDefault: false + # user: admin + jsonData: + "tlsSkipVerify": true + "timeInterval": "5s" + "implementation": prometheus + "handleGrafanaManagedAlerts": false + # "orgId": 1 + # secureJsonData: + # "password": admin + editable: true \ No newline at end of file diff --git a/templates/GrafanaDatasource-Prometheus.yaml b/templates/GrafanaDatasource-Prometheus.yaml new file mode 100644 index 0000000..3a44401 --- /dev/null +++ b/templates/GrafanaDatasource-Prometheus.yaml @@ -0,0 +1,38 @@ +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: prometheus + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + #secrets: + # - credentials + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + allowCrossNamespaceImport: true + datasource: + name: prometheus + type: prometheus + access: proxy + # basicAuth: true + url: "http://prometheus-k8s.{{ .Values.nfc_monitoring.prometheus.namespace }}.svc:9090" + isDefault: true + # user: admin + jsonData: + # "tlsSkipVerify": true + manageAlerts: true + "timeInterval": "5s" + "orgId": 1 + # secureJsonData: + # "password": admin + editable: true \ No newline at end of file diff --git a/templates/GrafanaDatasource-loki.yaml b/templates/GrafanaDatasource-loki.yaml new file mode 100644 index 0000000..0b2dfc7 --- /dev/null +++ b/templates/GrafanaDatasource-loki.yaml @@ -0,0 +1,41 @@ +--- +{{- if .Values.nfc_monitoring.loki.enabled | default false -}} + +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: loki + namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}" + labels: + aapp.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: loki + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + #secrets: + # - credentials + instanceSelector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + allowCrossNamespaceImport: true + datasource: + name: loki + type: loki + access: proxy + # basicAuth: true + url: "http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}" + isDefault: false + # user: admin + jsonData: + # "tlsSkipVerify": true + "timeInterval": "5s" + "orgId": 1 + # secureJsonData: + # "password": admin + editable: true + +{{- end -}} \ No newline at end of file diff --git a/templates/NetworkPolicy-kubeStateMetrics.yaml b/templates/NetworkPolicy-kubeStateMetrics.yaml new file mode 100644 index 0000000..a9b774f --- /dev/null +++ b/templates/NetworkPolicy-kubeStateMetrics.yaml @@ -0,0 +1,31 @@ +# apiVersion: networking.k8s.io/v1 +# kind: NetworkPolicy +# metadata: +# labels: +# app.kubernetes.io/component: exporter +# app.kubernetes.io/name: kube-state-metrics +# app.kubernetes.io/part-of: kube-prometheus +# app.kubernetes.io/version: 2.8.1 +# name: kube-state-metrics +# namespace: monitoring +# spec: +# egress: +# - {} +# ingress: +# - from: +# - podSelector: +# matchLabels: +# app.kubernetes.io/name: prometheus +# ports: +# - port: 8443 +# protocol: TCP +# - port: 9443 +# protocol: TCP +# podSelector: +# matchLabels: +# app.kubernetes.io/component: exporter +# app.kubernetes.io/name: kube-state-metrics +# app.kubernetes.io/part-of: kube-prometheus +# policyTypes: +# - Egress +# - Ingress diff --git a/templates/PodDisruptionBudget-alertmanager.yaml b/templates/PodDisruptionBudget-alertmanager.yaml new file mode 100644 index 0000000..43ed6fe --- /dev/null +++ b/templates/PodDisruptionBudget-alertmanager.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +spec: + maxUnavailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/PodDisruptionBudget-prometheus-adapter.yaml b/templates/PodDisruptionBudget-prometheus-adapter.yaml new file mode 100644 index 0000000..17203c2 --- /dev/null +++ b/templates/PodDisruptionBudget-prometheus-adapter.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/PodDisruptionBudget-prometheus.yaml b/templates/PodDisruptionBudget-prometheus.yaml new file mode 100644 index 0000000..7a5374d --- /dev/null +++ b/templates/PodDisruptionBudget-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + minAvailable: 1 + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/Prometheus-prometheus.yaml b/templates/Prometheus-prometheus.yaml new file mode 100644 index 0000000..66dacd0 --- /dev/null +++ b/templates/Prometheus-prometheus.yaml @@ -0,0 +1,54 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: k8s + namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}" +spec: + affinity: + {{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }} + alerting: + alertmanagers: + - apiVersion: v2 + name: alertmanager-main + namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}" + port: web + enableFeatures: [] + externalLabels: {} + image: quay.io/prometheus/prometheus:v2.42.0 + nodeSelector: + kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + podMonitorNamespaceSelector: {} + podMonitorSelector: {} + probeNamespaceSelector: {} + probeSelector: {} + replicas: 3 + resources: + requests: + memory: 400Mi + ruleNamespaceSelector: {} + ruleSelector: {} + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: prometheus-k8s + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + storage: + {{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }} + version: 2.42.0 diff --git a/templates/PrometheusRule-alertmanager.yaml b/templates/PrometheusRule-alertmanager.yaml new file mode 100644 index 0000000..95ef2a1 --- /dev/null +++ b/templates/PrometheusRule-alertmanager.yaml @@ -0,0 +1,141 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + prometheus: k8s + role: alert-rules + name: alertmanager-main-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} has only found {{ `{{` }} $value }} members of the {{ `{{` }}$labels.job}} cluster. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) + for: 15m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} failed to send {{ `{{` }} $value | humanizePercentage }} of notifications to {{ `{{` }} $labels.integration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have different configurations. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have been up for less than half of the last 5m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have restarted at least 5 times in the last 10m.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/templates/PrometheusRule-grafana-agent.yaml b/templates/PrometheusRule-grafana-agent.yaml new file mode 100644 index 0000000..640733c --- /dev/null +++ b/templates/PrometheusRule-grafana-agent.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: grafana-agent-promtail + name: grafana-agent + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: grafana_agent + rules: + # - annotations: + # description: "As Grafana Agent is being used, it's version is set as promtails" + - expr: | + agent_build_info + record: promtail_build_info diff --git a/templates/PrometheusRule-grafana.yaml b/templates/PrometheusRule-grafana.yaml new file mode 100644 index 0000000..b505e52 --- /dev/null +++ b/templates/PrometheusRule-grafana.yaml @@ -0,0 +1,35 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: grafana-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: GrafanaAlerts + rules: + - alert: GrafanaRequestsFailing + annotations: + message: '{{`{{`}} $labels.namespace }}/{{`{{`}} $labels.job }}/{{`{{`}} $labels.handler }} is experiencing {{`{{`}} $value | humanize }}% errors' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing + expr: | + 100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."} + / ignoring (status_code) + sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"}) + > 50 + for: 5m + labels: + severity: warning + - name: grafana_rules + rules: + - expr: | + sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m])) + record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m diff --git a/templates/PrometheusRule-kubePrometheus.yaml b/templates/PrometheusRule-kubePrometheus.yaml new file mode 100644 index 0000000..268491b --- /dev/null +++ b/templates/PrometheusRule-kubePrometheus.yaml @@ -0,0 +1,86 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: kube-prometheus-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + description: '{{ `{{` }} printf "%.4g" $value }}% of the {{ `{{` }} $labels.job }}/{{ `{{` }} $labels.service }} targets in {{ `{{` }} $labels.namespace }} namespace are down.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. + expr: vector(1) + labels: + severity: none + - alert: InfoInhibitor + annotations: + description: | + This is an alert that is used to inhibit info alerts. + By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with + other alerts. + This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a + severity of 'warning' or 'critical' starts firing on the same namespace. + This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor + summary: Info-level alert inhibition. + expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 + labels: + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{ `{{` }} $labels.device }}" changing its up status often on node-exporter {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping + summary: Network interface is often changing its status + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 diff --git a/templates/PrometheusRule-kubeStateMetrics.yaml b/templates/PrometheusRule-kubeStateMetrics.yaml new file mode 100644 index 0000000..6df5352 --- /dev/null +++ b/templates/PrometheusRule-kubeStateMetrics.yaml @@ -0,0 +1,67 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: kube-state-metrics-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: kube-state-metrics + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors + summary: kube-state-metrics is experiencing errors in list operations. + expr: | + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors + summary: kube-state-metrics is experiencing errors in watch operations. + expr: | + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m]))) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: | + stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: | + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1 + - + sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) + != 0 + for: 15m + labels: + severity: critical diff --git a/templates/PrometheusRule-kubernetesControlPlane.yaml b/templates/PrometheusRule-kubernetesControlPlane.yaml new file mode 100644 index 0000000..6845600 --- /dev/null +++ b/templates/PrometheusRule-kubernetesControlPlane.yaml @@ -0,0 +1,1441 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: kubernetes-monitoring-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: 'Pod {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }} ({{ `{{` }} $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: | + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: | + sum by (namespace, pod, cluster) ( + max by(namespace, pod, cluster) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown|Failed"} + ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) ( + 1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_deployment_spec_replicas{job="kube-state-metrics"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.statefulset }} update has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: | + ( + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.daemonset }} has not finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: | + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: pod/{{ `{{` }} $labels.pod }} in namespace {{ `{{` }} $labels.namespace }} on container {{ `{{` }} $labels.container}} has been in waiting state for longer than 1 hour. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: | + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ `{{` }} $value }} Pods of DaemonSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.daemonset }} are not scheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ `{{` }} $value }} Pods of DaemonSet {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.daemonset }} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobNotCompleted + annotations: + description: Job {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.job_name }} is taking more than {{ `{{` }} "43200" | humanizeDuration }} to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted + summary: Job did not complete in time + expr: | + time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"} + and + kube_job_status_active{job="kube-state-metrics"} > 0) > 43200 + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: | + kube_job_failed{job="kube-state-metrics"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched desired number of replicas. + expr: | + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: | + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Pods by {{ `{{` }} $value }} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + and + (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Pods by {{ `{{` }} $value | humanize }} bytes and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + and + (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0 + for: 10m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) + / + sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) + / + sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ `{{` }} $labels.namespace }} is using {{ `{{` }} $value | humanizePercentage }} of its {{ `{{` }} $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ `{{` }} $labels.namespace }} is using {{ `{{` }} $value | humanizePercentage }} of its {{ `{{` }} $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ `{{` }} $labels.namespace }} is using {{ `{{` }} $value | humanizePercentage }} of its {{ `{{` }} $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} throttling of CPU in namespace {{ `{{` }} $labels.namespace }} for container {{ `{{` }} $labels.container }} in pod {{ `{{` }} $labels.pod }}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: | + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: info + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} is only {{ `{{` }} $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} is expected to fill up within four days. Currently {{ `{{` }} $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: The PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} only has {{ `{{` }} $value | humanizePercentage }} free inodes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. + expr: | + ( + kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{job="kubelet", metrics_path="/metrics"} > 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ `{{` }} $labels.persistentvolumeclaim }} in Namespace {{ `{{` }} $labels.namespace }} is expected to run out of inodes within four days. Currently {{ `{{` }} $value | humanizePercentage }} of its inodes are free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. + expr: | + ( + kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{job="kubelet", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on(namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ `{{` }} $labels.persistentvolume }} has status {{ `{{` }} $labels.phase }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeVersionMismatch + annotations: + description: There are {{ `{{` }} $value }} different semantic versions of Kubernetes components running. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. + expr: | + count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ `{{` }} $labels.job }}/{{ `{{` }} $labels.instance }}' is experiencing {{ `{{` }} $value | humanizePercentage }} errors.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace) + / + sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) + > 0.01 + for: 15m + labels: + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + long: 1d + severity: warning + short: 2h + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + long: 3d + severity: warning + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + for: 5m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration + summary: Client certificate is about to expire. + expr: | + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + for: 5m + labels: + severity: critical + - alert: KubeAggregatedAPIErrors + annotations: + description: Kubernetes aggregated API {{ `{{` }} $labels.name }}/{{ `{{` }} $labels.namespace }} has reported errors. It has appeared unavailable {{ `{{` }} $value | humanize }} times averaged over the past 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors + summary: Kubernetes aggregated API has reported errors. + expr: | + sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4 + labels: + severity: warning + - alert: KubeAggregatedAPIDown + annotations: + description: Kubernetes aggregated API {{ `{{` }} $labels.name }}/{{ `{{` }} $labels.namespace }} has been only {{ `{{` }} $value | humanize }}% available over the last 10m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown + summary: Kubernetes aggregated API is down. + expr: | + (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning + - alert: KubeAPIDown + annotations: + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPITerminatedRequests + annotations: + description: The kubernetes apiserver has terminated {{ `{{` }} $value | humanizePercentage }} of its incoming requests. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests + summary: The kubernetes apiserver has terminated {{ `{{` }} $value | humanizePercentage }} of its incoming requests. + expr: | + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady + annotations: + description: '{{ `{{` }} $labels.node }} has been unready for more than 15 minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready + summary: Node is not ready. + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m + labels: + severity: warning + - alert: KubeNodeUnreachable + annotations: + description: '{{ `{{` }} $labels.node }} is unreachable and some workloads may be rescheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable + summary: Node is unreachable. + expr: | + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + description: Kubelet '{{ `{{` }} $labels.node }}' is running at {{ `{{` }} $value | humanizePercentage }} of its Pod capacity. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods + summary: Kubelet is running at capacity. + expr: | + count by(cluster, node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) + / + max by(cluster, node) ( + kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 + ) > 0.95 + for: 15m + labels: + severity: info + - alert: KubeNodeReadinessFlapping + annotations: + description: The readiness status of node {{ `{{` }} $labels.node }} has changed {{ `{{` }} $value }} times in the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping + summary: Node readiness status is flapping. + expr: | + sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 + for: 15m + labels: + severity: warning + - alert: KubeletPlegDurationHigh + annotations: + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ `{{` }} $value }} seconds on node {{ `{{` }} $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. + expr: | + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning + - alert: KubeletPodStartUpLatencyHigh + annotations: + description: Kubelet Pod startup 99th percentile latency is {{ `{{` }} $value }} seconds on node {{ `{{` }} $labels.node }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. + expr: | + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletClientCertificateExpiration + annotations: + description: Client certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. + expr: | + kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning + - alert: KubeletServerCertificateExpiration + annotations: + description: Server certificate for Kubelet on node {{ `{{` }} $labels.node }} expires in {{ `{{` }} $value | humanizeDuration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. + expr: | + kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical + - alert: KubeletClientCertificateRenewalErrors + annotations: + description: Kubelet on node {{ `{{` }} $labels.node }} has failed to renew its client certificate ({{ `{{` }} $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. + expr: | + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletServerCertificateRenewalErrors + annotations: + description: Kubelet on node {{ `{{` }} $labels.node }} has failed to renew its server certificate ({{ `{{` }} $value | humanize }} errors in the last 5 minutes). + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. + expr: | + increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: KubeletDown + annotations: + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown + annotations: + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-system-controller-manager + rules: + - alert: KubeControllerManagerDown + annotations: + description: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical + - name: kube-apiserver-burnrate.rules + rules: + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + - + ( + ( + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h])) + or + vector(0) + ) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h])) + + + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h])) + ) + ) + + + # errors + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + - + sum by (cluster) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h])) + ) + + + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + record: apiserver_request:burnrate6h + - name: kube-apiserver-histogram.rules + rules: + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + labels: + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile + - interval: 3m + name: kube-apiserver-availability.rules + rules: + - expr: | + avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + record: code:apiserver_request_total:increase30d + - expr: | + sum by (cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h])) + record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h + - expr: | + sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d + - expr: | + sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h])) + record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h + - expr: | + sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d + - expr: | + 1 - ( + ( + # write too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + ( + # read too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + ) + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d + - expr: | + 1 - ( + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + # too slow + ( + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d + - expr: | + 1 - ( + ( + # too slow + sum by (cluster) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by (cluster) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + + # errors + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: apiserver_request:availability30d + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - name: k8s.rules + rules: + - expr: | + sum by (cluster, namespace, pod, container) ( + irate(container_cpu_usage_seconds_total{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate + - expr: | + container_memory_working_set_bytes{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: | + container_memory_rss{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss + - expr: | + container_memory_cache{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache + - expr: | + container_memory_swap{job="cadvisor", metrics_path="/metrics/cadvisor", image!=""} + * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, + max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap + - expr: | + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_requests:sum + - expr: | + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum + - expr: | + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) + group_left() max by (namespace, pod, cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + - expr: | + sum by (namespace, cluster) ( + sum by (namespace, pod, cluster) ( + max by (namespace, pod, container, cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: job + record: namespace_workload_pod:kube_pod_owner:relabel + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - name: node.rules + rules: + - expr: | + topk by(cluster, namespace, pod) (1, + max by (cluster, node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (cluster, node) ( + node_cpu_seconds_total{mode="idle",job="node-exporter"} + * on (namespace, pod) group_left(node) + topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:) + ) + record: node:node_num_cpu:sum + - expr: | + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + record: :node_memory_MemAvailable_bytes:sum + - expr: | + avg by (cluster, node) ( + sum without (mode) ( + rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m]) + ) + ) + record: node:node_cpu_utilization:ratio_rate5m + - expr: | + avg by (cluster) ( + node:node_cpu_utilization:ratio_rate5m + ) + record: cluster:node_cpu:ratio_rate5m + - name: kubelet.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile diff --git a/templates/PrometheusRule-loki.yaml b/templates/PrometheusRule-loki.yaml new file mode 100644 index 0000000..13a424f --- /dev/null +++ b/templates/PrometheusRule-loki.yaml @@ -0,0 +1,112 @@ +--- +{{- if .Values.nfc_monitoring.loki.enabled | default false -}} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: logging + app.kubernetes.io/name: loki + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: loki + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + message: | + {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + message: | + {{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics. + expr: | + sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + message: | + {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + message: | + {{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + expr: | + sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + for: 5m + labels: + severity: warning + +{{- end -}} diff --git a/templates/PrometheusRule-nodeExporter.yaml b/templates/PrometheusRule-nodeExporter.yaml new file mode 100644 index 0000000..24bfbe2 --- /dev/null +++ b/templates/PrometheusRule-nodeExporter.yaml @@ -0,0 +1,318 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + prometheus: k8s + role: alert-rules + name: node-exporter-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: | + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: | + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: | + rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: | + rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ `{{` }} $value | humanizePercentage }} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: | + (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: | + node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock on {{ `{{` }} $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: | + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock on {{ `{{` }} $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: | + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ `{{` }} $labels.device }}' on {{ `{{` }} $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded + expr: | + node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array on {{ `{{` }} $labels.instance }} failed. Array '{{ `{{` }} $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array + expr: | + node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: | + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - name: node-exporter.rules + rules: + - expr: | + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + record: instance:node_num_cpu:sum + - expr: | + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) + ) + record: instance:node_cpu_utilisation:rate5m + - expr: | + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio + - expr: | + 1 - ( + ( + node_memory_MemAvailable_bytes{job="node-exporter"} + or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + + node_memory_Cached_bytes{job="node-exporter"} + + + node_memory_MemFree_bytes{job="node-exporter"} + + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio + - expr: | + rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m + - expr: | + rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m + - expr: | + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate5m + - expr: | + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate5m diff --git a/templates/PrometheusRule-prometheus.yaml b/templates/PrometheusRule-prometheus.yaml new file mode 100644 index 0000000..28e9b36 --- /dev/null +++ b/templates/PrometheusRule-prometheus.yaml @@ -0,0 +1,281 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + prometheus: k8s + role: alert-rules + name: prometheus-k8s-prometheus-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: prometheus + rules: + - alert: PrometheusBadConfig + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to reload its configuration. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig + summary: Failed Prometheus configuration reload. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Alert notification queue of Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is running full. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull + summary: Prometheus alert notification queue predicted to run full in less than 30m. + expr: | + # Without min_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) + > + min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers + annotations: + description: '{{ `{{` }} printf "%.1f" $value }}% errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to Alertmanager {{ `{{` }}$labels.alertmanager}}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers + summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. + expr: | + ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + * 100 + > 1 + for: 15m + labels: + severity: warning + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not connected to any Alertmanagers. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers + summary: Prometheus is not connected to any Alertmanagers. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} reload failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing + summary: Prometheus has issues reloading blocks from disk. + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} compaction failures over the last 3h. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing + summary: Prometheus has issues compacting blocks. + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not ingesting samples. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples + summary: Prometheus is not ingesting samples. + expr: | + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 + ) + ) + for: 10m + labels: + severity: warning + - alert: PrometheusDuplicateTimestamps + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps + summary: Prometheus is dropping samples with duplicate timestamps. + expr: | + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusOutOfOrderTimestamps + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with timestamps arriving out of order. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps + summary: Prometheus drops samples with out-of-order timestamps. + expr: | + rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 10m + labels: + severity: warning + - alert: PrometheusRemoteStorageFailures + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} failed to send {{ `{{` }} printf "%.1f" $value }}% of the samples to {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures + summary: Prometheus fails to send samples to remote storage. + expr: | + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + / + ( + (rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + + + (rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])) + ) + ) + * 100 + > 1 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteBehind + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write is {{ `{{` }} printf "%.1f" $value }}s behind for {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind + summary: Prometheus remote write is behind. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + > 120 + for: 15m + labels: + severity: critical + - alert: PrometheusRemoteWriteDesiredShards + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write desired shards calculation wants to run {{ `{{` }} $value }} shards for queue {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}, which is more than the max of {{ `{{` }} printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards + summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + ( + max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) + > + max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) + ) + for: 15m + labels: + severity: warning + - alert: PrometheusRuleFailures + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to evaluate {{ `{{` }} printf "%.0f" $value }} rules in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures + summary: Prometheus is failing rule evaluations. + expr: | + increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: critical + - alert: PrometheusMissingRuleEvaluations + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has missed {{ `{{` }} printf "%.0f" $value }} rule group evaluations in the last 5m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations + summary: Prometheus is missing rule evaluations due to slow rule group evaluation. + expr: | + increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusLabelLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit + summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusScrapeBodySizeLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit + summary: Prometheus has dropped some targets that exceeded body size limit. + expr: | + increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusScrapeSampleLimitHit + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit + summary: Prometheus has failed scrapes that have exceeded the configured sample limit. + expr: | + increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusTargetSyncFailure + annotations: + description: '{{ `{{` }} printf "%.0f" $value }} targets in Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} have failed to sync because invalid configuration was supplied.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure + summary: Prometheus has failed to sync targets. + expr: | + increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0 + for: 5m + labels: + severity: critical + - alert: PrometheusHighQueryLoad + annotations: + description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload + summary: Prometheus is reaching its maximum capacity serving concurrent requests. + expr: | + avg_over_time(prometheus_engine_queries{job="prometheus-k8s",namespace="monitoring"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.8 + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ `{{` }} printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to any Alertmanager.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: | + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical diff --git a/templates/Role-SpecificNamespaces-prometheus.yaml b/templates/Role-SpecificNamespaces-prometheus.yaml new file mode 100644 index 0000000..8f3447f --- /dev/null +++ b/templates/Role-SpecificNamespaces-prometheus.yaml @@ -0,0 +1,47 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +items: + +{{ range .Values.nfc_monitoring.prometheus.monitor_namespaces }} +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ . | quote }} + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch + - apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +{{ end }} + +kind: RoleList diff --git a/templates/RoleBinding-Config-prometheus.yaml b/templates/RoleBinding-Config-prometheus.yaml new file mode 100644 index 0000000..918804f --- /dev/null +++ b/templates/RoleBinding-Config-prometheus.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s-config + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s-config +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} diff --git a/templates/RoleBinding-SpecificNamespaces-prometheus.yaml b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml new file mode 100644 index 0000000..d7c5ceb --- /dev/null +++ b/templates/RoleBinding-SpecificNamespaces-prometheus.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +items: +{{ range .Values.nfc_monitoring.prometheus.monitor_namespaces }} +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ . | quote }} + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: {{ $.Values.nfc_monitoring.prometheus.namespace }} +{{ end }} + +kind: RoleBindingList diff --git a/templates/RoleBinding-prometheus-adapter-auth-reader.yaml b/templates/RoleBinding-prometheus-adapter-auth-reader.yaml new file mode 100644 index 0000000..be46bee --- /dev/null +++ b/templates/RoleBinding-prometheus-adapter-auth-reader.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/version: 0.11.1 + name: resource-metrics-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/templates/RoleConfig-prometheus.yaml b/templates/RoleConfig-prometheus.yaml new file mode 100644 index 0000000..3628555 --- /dev/null +++ b/templates/RoleConfig-prometheus.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s-config + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get diff --git a/templates/Secret-alertmanager.yaml b/templates/Secret-alertmanager.yaml new file mode 100644 index 0000000..4e710be --- /dev/null +++ b/templates/Secret-alertmanager.yaml @@ -0,0 +1,61 @@ +--- +apiVersion: v1 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +stringData: + alertmanager.yaml: |- + "global": + "resolve_timeout": "5m" + "inhibit_rules": + - "equal": + - "namespace" + - "alertname" + "source_matchers": + - "severity = critical" + "target_matchers": + - "severity =~ warning|info" + - "equal": + - "namespace" + - "alertname" + "source_matchers": + - "severity = warning" + "target_matchers": + - "severity = info" + - "equal": + - "namespace" + "source_matchers": + - "alertname = InfoInhibitor" + "target_matchers": + - "severity = info" + "receivers": + - "name": "Default" + - "name": "Watchdog" + - "name": "Critical" + - "name": "null" + "route": + "group_by": + - "namespace" + "group_interval": "5m" + "group_wait": "30s" + "receiver": "Default" + "repeat_interval": "12h" + "routes": + - "matchers": + - "alertname = Watchdog" + "receiver": "Watchdog" + - "matchers": + - "alertname = InfoInhibitor" + "receiver": "null" + - "matchers": + - "severity = critical" + "receiver": "Critical" +type: Opaque diff --git a/templates/Service-CalicoMetrics.yaml b/templates/Service-CalicoMetrics.yaml new file mode 100644 index 0000000..e81c30b --- /dev/null +++ b/templates/Service-CalicoMetrics.yaml @@ -0,0 +1,20 @@ +--- +{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}} + +apiVersion: v1 +kind: Service +metadata: + name: calico-metrics + namespace: kube-system + labels: + k8s-app: calico-node +spec: + clusterIP: None + selector: + k8s-app: calico-node + ports: + - port: 9091 + name: metrics + targetPort: 9091 + +{{- end -}} diff --git a/templates/Service-Grafana.yaml b/templates/Service-Grafana.yaml new file mode 100644 index 0000000..019adc1 --- /dev/null +++ b/templates/Service-Grafana.yaml @@ -0,0 +1,30 @@ +--- + +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + selector: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: {{ $.Chart.Name }} + #type: NodePort + #type: LoadBalancer + #clusterIP: None + ports: + - name: grafana + port: 3000 + targetPort: grafana-http + #nodePort: 3000 + #type: LoadBalancer + sessionAffinity: ClientIP diff --git a/templates/Service-GrafanaAgent.yaml b/templates/Service-GrafanaAgent.yaml new file mode 100644 index 0000000..fc070ab --- /dev/null +++ b/templates/Service-GrafanaAgent.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana-agent + namespace: monitoring + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} +spec: + selector: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} + ports: + - name: grafana-metrics + port: 12345 + targetPort: grafana-metrics + - name: kube-ctrl-mgr + port: 11257 + targetPort: kube-ctrl-mgr + #type: LoadBalancer + sessionAffinity: ClientIP diff --git a/templates/Service-alertmanager.yaml b/templates/Service-alertmanager.yaml new file mode 100644 index 0000000..974e77e --- /dev/null +++ b/templates/Service-alertmanager.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} +spec: + ports: + - name: web + port: 9093 + targetPort: web + - name: reloader-web + port: 8080 + targetPort: reloader-web + selector: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + sessionAffinity: ClientIP diff --git a/templates/Service-kube-monitor-proxy.yaml b/templates/Service-kube-monitor-proxy.yaml new file mode 100644 index 0000000..a23d9dd --- /dev/null +++ b/templates/Service-kube-monitor-proxy.yaml @@ -0,0 +1,30 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-monitor-proxy + namespace: monitoring + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name_kcm: kube-controller-manager + name_ks: kube-scheduler +spec: + selector: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/part-of: {{ $.Chart.Name }} + + ports: + - name: kube-ctrl-mgr + port: 10257 + targetPort: kube-ctrl-mgr + - name: kube-scheduler + port: 10259 + targetPort: kube-scheduler + sessionAffinity: ClientIP diff --git a/templates/Service-kubeStateMetrics.yaml b/templates/Service-kubeStateMetrics.yaml new file mode 100644 index 0000000..6e4e7b2 --- /dev/null +++ b/templates/Service-kubeStateMetrics.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: monitoring +spec: + clusterIP: None + ports: + - name: https-main + port: 8443 + targetPort: https-main + - name: https-self + port: 9443 + targetPort: https-self + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: {{ $.Chart.Name }} + diff --git a/templates/Service-prometheus-adapter.yaml b/templates/Service-prometheus-adapter.yaml new file mode 100644 index 0000000..406fd70 --- /dev/null +++ b/templates/Service-prometheus-adapter.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +spec: + ports: + - name: https + port: 443 + targetPort: 6443 + selector: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/Service-prometheus.yaml b/templates/Service-prometheus.yaml new file mode 100644 index 0000000..c44eef8 --- /dev/null +++ b/templates/Service-prometheus.yaml @@ -0,0 +1,27 @@ +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + ports: + - name: web + port: 9090 + targetPort: web + - name: reloader-web + port: 8080 + targetPort: reloader-web + selector: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + sessionAffinity: ClientIP diff --git a/templates/ServiceAccount-Grafana.yaml b/templates/ServiceAccount-Grafana.yaml new file mode 100644 index 0000000..41eeb9e --- /dev/null +++ b/templates/ServiceAccount-Grafana.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + name: grafana + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} diff --git a/templates/ServiceAccount-GrafanaAgent.yaml b/templates/ServiceAccount-GrafanaAgent.yaml new file mode 100644 index 0000000..32363be --- /dev/null +++ b/templates/ServiceAccount-GrafanaAgent.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + name: grafana-agent + namespace: monitoring + labels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} diff --git a/templates/ServiceAccount-alertmanager.yaml b/templates/ServiceAccount-alertmanager.yaml new file mode 100644 index 0000000..f8c73b3 --- /dev/null +++ b/templates/ServiceAccount-alertmanager.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager-main + namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }} diff --git a/templates/ServiceAccount-kube-monitor-proxy.yaml b/templates/ServiceAccount-kube-monitor-proxy.yaml new file mode 100644 index 0000000..b8e258f --- /dev/null +++ b/templates/ServiceAccount-kube-monitor-proxy.yaml @@ -0,0 +1,13 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-monitor-proxy + namespace: monitoring + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} diff --git a/templates/ServiceAccount-kubeStateMetrics.yaml b/templates/ServiceAccount-kubeStateMetrics.yaml new file mode 100644 index 0000000..c4f59ba --- /dev/null +++ b/templates/ServiceAccount-kubeStateMetrics.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: monitoring diff --git a/templates/ServiceAccount-prometheus-adapter.yaml b/templates/ServiceAccount-prometheus-adapter.yaml new file mode 100644 index 0000000..ff50c47 --- /dev/null +++ b/templates/ServiceAccount-prometheus-adapter.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: false +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring diff --git a/templates/ServiceAccount-prometheus.yaml b/templates/ServiceAccount-prometheus.yaml new file mode 100644 index 0000000..871071f --- /dev/null +++ b/templates/ServiceAccount-prometheus.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +automountServiceAccountToken: true +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} diff --git a/templates/ServiceMonitor-APIServer.yaml b/templates/ServiceMonitor-APIServer.yaml new file mode 100644 index 0000000..7d423d9 --- /dev/null +++ b/templates/ServiceMonitor-APIServer.yaml @@ -0,0 +1,78 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: apiserver + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: kube-apiserver + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|server).* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50) + sourceLabels: + - __name__ + - le + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes diff --git a/templates/ServiceMonitor-Cadvisor.yaml b/templates/ServiceMonitor-Cadvisor.yaml new file mode 100644 index 0000000..1e8be30 --- /dev/null +++ b/templates/ServiceMonitor-Cadvisor.yaml @@ -0,0 +1,52 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: cadvisor + app.kubernetes.io/component: exporter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: cadvisor + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + honorTimestamps: false + interval: 30s + metricRelabelings: + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ + - action: drop + regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);; + sourceLabels: + - __name__ + - pod + - namespace + - action: drop + regex: (container_blkio_device_usage_total);.+ + sourceLabels: + - __name__ + - container + path: /metrics/cadvisor + port: https-metrics + #jobName: cadvisor + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - targetLabel: "job" + replacement: "cadvisor" + scheme: https + tlsConfig: + insecureSkipVerify: true + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kubelet diff --git a/templates/ServiceMonitor-Calico.yaml b/templates/ServiceMonitor-Calico.yaml new file mode 100644 index 0000000..d40a74e --- /dev/null +++ b/templates/ServiceMonitor-Calico.yaml @@ -0,0 +1,36 @@ +--- +{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}} + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: calico + app.kubernetes.io/component: networking + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + k8s-app: calico-node + name: calico + namespace: kube-system +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: metrics + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: http + # tlsConfig: + # insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + k8s-app: calico-node + +{{- end -}} diff --git a/templates/ServiceMonitor-CoreDNS.yaml b/templates/ServiceMonitor-CoreDNS.yaml new file mode 100644 index 0000000..5c96bb3 --- /dev/null +++ b/templates/ServiceMonitor-CoreDNS.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: coredns + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: coredns + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + metricRelabelings: + - action: drop + regex: coredns_cache_misses_total + sourceLabels: + - __name__ + port: metrics + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kube-dns diff --git a/templates/ServiceMonitor-Grafana.yaml b/templates/ServiceMonitor-Grafana.yaml new file mode 100644 index 0000000..8a7bdb9 --- /dev/null +++ b/templates/ServiceMonitor-Grafana.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: grafana + namespace: {{ .Values.nfc_monitoring.grafana.namespace }} +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: grafana + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_label_app_kubernetes_io_instance + targetLabel: instance + scheme: http + # tlsConfig: + # insecureSkipVerify: true + targetLabels: + - cluster + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: graphing + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceMonitor-Kubelet.yaml b/templates/ServiceMonitor-Kubelet.yaml new file mode 100644 index 0000000..b921127 --- /dev/null +++ b/templates/ServiceMonitor-Kubelet.yaml @@ -0,0 +1,87 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: kubelet + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_endpoint_address_target_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/probes + port: https-metrics + relabelings: + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_endpoint_address_target_name + targetLabel: instance + scheme: https + tlsConfig: + insecureSkipVerify: true + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app.kubernetes.io/name: kubelet diff --git a/templates/ServiceMonitor-Node.yaml b/templates/ServiceMonitor-Node.yaml new file mode 100644 index 0000000..e1d32c3 --- /dev/null +++ b/templates/ServiceMonitor-Node.yaml @@ -0,0 +1,43 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + #cluster: dev + name: node + namespace: monitoring +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 5s + honorLabels: true + path: /metrics + port: grafana-metrics + scheme: http + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + - targetLabel: "job" + replacement: "node-exporter" + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: node + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceMonitor-ceph.yaml b/templates/ServiceMonitor-ceph.yaml new file mode 100644 index 0000000..b57b31b --- /dev/null +++ b/templates/ServiceMonitor-ceph.yaml @@ -0,0 +1,39 @@ +--- +{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}} + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/name: ceph + app.kubernetes.io/component: storage + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: ceph + namespace: "{{ .Values.nfc_monitoring.additions.ceph.namespace }}" +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: http-metrics + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_label_app_kubernetes_io_part_of + targetLabel: instance + - sourceLabels: + - __metrics_path__ + targetLabel: metrics_path + - targetLabel: "job" + replacement: "ceph" + scheme: http + targetLabels: + - cluster + selector: +{{ .Values.nfc_monitoring.additions.ceph.ServiceMonitor.selector | toYaml | indent 4 }} + + +{{- end -}} diff --git a/templates/ServiceMonitor-kube-controller-manager.yaml b/templates/ServiceMonitor-kube-controller-manager.yaml new file mode 100644 index 0000000..21d30f2 --- /dev/null +++ b/templates/ServiceMonitor-kube-controller-manager.yaml @@ -0,0 +1,82 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-controller-manager + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name_kcm: kube-controller-manager + name: kube-controller-manager + namespace: monitoring +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 10s + honorLabels: true + path: /metrics + port: kube-ctrl-mgr + scheme: https + # labels: + # job: kube-controller-manager + relabelings: + # - action: replace + # regex: (.*) + # replacement: $1 + # sourceLabels: + # - __meta_kubernetes_pod_node_name + # targetLabel: instance + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_service_label_name_kcm + targetLabel: job + metricRelabelings: + - action: drop + regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) + sourceLabels: + - __name__ + - action: drop + regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) + sourceLabels: + - __name__ + - action: drop + regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers) + sourceLabels: + - __name__ + - action: drop + regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) + sourceLabels: + - __name__ + - action: drop + regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) + sourceLabels: + - __name__ + - action: drop + regex: transformation_(transformation_latencies_microseconds|failures_total) + sourceLabels: + - __name__ + - action: drop + regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count) + sourceLabels: + - __name__ + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceMonitor-kube-scheduler.yaml b/templates/ServiceMonitor-kube-scheduler.yaml new file mode 100644 index 0000000..71149ce --- /dev/null +++ b/templates/ServiceMonitor-kube-scheduler.yaml @@ -0,0 +1,36 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: proxy + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-scheduler + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-scheduler + namespace: monitoring +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 10s + honorLabels: true + path: /metrics + port: kube-scheduler + scheme: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_service_label_name_ks + targetLabel: job + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-monitor-proxy + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceMonitor-kubeStateMetrics.yaml b/templates/ServiceMonitor-kubeStateMetrics.yaml new file mode 100644 index 0000000..20ca550 --- /dev/null +++ b/templates/ServiceMonitor-kubeStateMetrics.yaml @@ -0,0 +1,46 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: kube-state-metrics + namespace: monitoring +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + metricRelabelings: + - action: drop + regex: kube_endpoint_address_not_ready|kube_endpoint_address_available + sourceLabels: + - __name__ + port: https-main + relabelings: + - action: labeldrop + regex: (pod|service|endpoint|namespace) + scheme: https + scrapeTimeout: 30s + tlsConfig: + insecureSkipVerify: true + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https-self + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: app.kubernetes.io/name + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceMonitor-node-exporter.yaml b/templates/ServiceMonitor-node-exporter.yaml new file mode 100644 index 0000000..9620458 --- /dev/null +++ b/templates/ServiceMonitor-node-exporter.yaml @@ -0,0 +1,44 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: node-exporter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/version: {{ $.Chart.Version }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + name: node-exporter + namespace: monitoring +spec: + endpoints: + + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 5s + honorLabels: true + path: /integrations/node_exporter/metrics + port: grafana-metrics + scheme: http + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + - targetLabel: "job" + replacement: "node-exporter" + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: node + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/instance: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: grafana-agent + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceMonitor-prometheus-adapter.yaml b/templates/ServiceMonitor-prometheus-adapter.yaml new file mode 100644 index 0000000..0270e70 --- /dev/null +++ b/templates/ServiceMonitor-prometheus-adapter.yaml @@ -0,0 +1,39 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-adapter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: (apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*) + sourceLabels: + - __name__ + port: https + scheme: https + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_label_app_kubernetes_io_instance + targetLabel: instance + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/component: metrics-adapter + app.kubernetes.io/instance: main + app.kubernetes.io/name: prometheus-adapter + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/ServiceMonitor-prometheus.yaml b/templates/ServiceMonitor-prometheus.yaml new file mode 100644 index 0000000..e17d8fb --- /dev/null +++ b/templates/ServiceMonitor-prometheus.yaml @@ -0,0 +1,31 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: prometheus-k8s + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + endpoints: + - interval: 30s + port: web + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_name + targetLabel: instance + - interval: 30s + port: reloader-web + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/templates/prometheusRule-ceph.yaml b/templates/prometheusRule-ceph.yaml new file mode 100644 index 0000000..e6a9a28 --- /dev/null +++ b/templates/prometheusRule-ceph.yaml @@ -0,0 +1,688 @@ +--- +{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}} + +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: storage + app.kubernetes.io/name: ceph + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + prometheus: k8s + role: alert-rules + name: ceph-rules + namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} +spec: + groups: + - name: "cluster health" + rules: + - alert: "CephHealthError" + annotations: + description: "The cluster state has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information." + summary: "Ceph is in the ERROR state" + expr: "ceph_health_status == 2" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.2.1" + severity: "critical" + type: "ceph_default" + - alert: "CephHealthWarning" + annotations: + description: "The cluster state has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information." + summary: "Ceph is in the WARNING state" + expr: "ceph_health_status == 1" + for: "15m" + labels: + severity: "warning" + type: "ceph_default" + - name: "mon" + rules: + - alert: "CephMonDownQuorumAtRisk" + annotations: + description: "{{ `{{` }} $min := query \"floor(count(ceph_mon_metadata) / 2) + 1\" | first | value }}Quorum requires a majority of monitors (x {{ `{{` }} $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{ `{{` }}- range query \"(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)\" }} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" + summary: "Monitor quorum is at risk" + expr: | + ( + (ceph_health_detail{name="MON_DOWN"} == 1) * on() ( + count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1) + ) + ) == 1 + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.3.1" + severity: "critical" + type: "ceph_default" + - alert: "CephMonDown" + annotations: + description: | + {{ `{{` }} $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ `{{` }} $s := "" }}{{ `{{` }} if gt $down 1.0 }}{{ `{{` }} $s = "s" }}{{ `{{` }} end }}You have {{ `{{` }} $down }} monitor{{ `{{` }} $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{ `{{` }}- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }} + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down" + summary: "One or more monitors down" + expr: | + count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1) + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephMonDiskspaceCritical" + annotations: + description: "The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{ `{{` }}- range query \"ceph_mon_metadata\"}} - {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit" + summary: "Filesystem space on at least one monitor is critically low" + expr: "ceph_health_detail{name=\"MON_DISK_CRIT\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.3.2" + severity: "critical" + type: "ceph_default" + - alert: "CephMonDiskspaceLow" + annotations: + description: "The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{ `{{` }}- range query \"ceph_mon_metadata\"}} - {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low" + summary: "Drive space on at least one monitor is approaching full" + expr: "ceph_health_detail{name=\"MON_DISK_LOW\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephMonClockSkew" + annotations: + description: "Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew" + summary: "Clock skew detected among monitors" + expr: "ceph_health_detail{name=\"MON_CLOCK_SKEW\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - name: "osd" + rules: + - alert: "CephOSDDownHigh" + annotations: + description: "{{ `{{` }} $value | humanize }}% or {{ `{{` }} with query \"count(ceph_osd_up == 0)\" }}{{ `{{` }} . | first | value }}{{ `{{` }} end }} of {{ `{{` }} with query \"count(ceph_osd_up)\" }}{{ `{{` }} . | first | value }}{{ `{{` }} end }} OSDs are down (>= 10%). The following OSDs are down: {{ `{{` }}- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }}" + summary: "More than 10% of OSDs are down" + expr: "count(ceph_osd_up == 0) / count(ceph_osd_up) * 100 >= 10" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.1" + severity: "critical" + type: "ceph_default" + - alert: "CephOSDHostDown" + annotations: + description: "The following OSDs are down: {{ `{{` }}- range query \"(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0\" }} - {{ `{{` }} .Labels.hostname }} : {{ `{{` }} .Labels.ceph_daemon }} {{ `{{` }}- end }}" + summary: "An OSD host is offline" + expr: "ceph_health_detail{name=\"OSD_HOST_DOWN\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.8" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDDown" + annotations: + description: | + {{ `{{` }} $num := query "count(ceph_osd_up == 0)" | first | value }}{{ `{{` }} $s := "" }}{{ `{{` }} if gt $num 1.0 }}{{ `{{` }} $s = "s" }}{{ `{{` }} end }}{{ `{{` }} $num }} OSD{{ `{{` }} $s }} down for over 5mins. The following OSD{{ `{{` }} $s }} {{ `{{` }} if eq $s "" }}is{{ `{{` }} else }}are{{ `{{` }} end }} down: {{ `{{` }}- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - {{ `{{` }} .Labels.ceph_daemon }} on {{ `{{` }} .Labels.hostname }} {{ `{{` }}- end }} + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down" + summary: "An OSD has been marked down" + expr: "ceph_health_detail{name=\"OSD_DOWN\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.2" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDNearFull" + annotations: + description: "One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull" + summary: "OSD(s) running low on free space (NEARFULL)" + expr: "ceph_health_detail{name=\"OSD_NEARFULL\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.3" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDFull" + annotations: + description: "An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full" + summary: "OSD full, writes blocked" + expr: "ceph_health_detail{name=\"OSD_FULL\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.6" + severity: "critical" + type: "ceph_default" + - alert: "CephOSDBackfillFull" + annotations: + description: "An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull" + summary: "OSD(s) too full for backfill operations" + expr: "ceph_health_detail{name=\"OSD_BACKFILLFULL\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTooManyRepairs" + annotations: + description: "Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs" + summary: "OSD reports a high number of read errors" + expr: "ceph_health_detail{name=\"OSD_TOO_MANY_REPAIRS\"} == 1" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTimeoutsPublicNetwork" + annotations: + description: "OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs." + summary: "Network issues delaying OSD heartbeats (public network)" + expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_FRONT\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDTimeoutsClusterNetwork" + annotations: + description: "OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs." + summary: "Network issues delaying OSD heartbeats (cluster network)" + expr: "ceph_health_detail{name=\"OSD_SLOW_PING_TIME_BACK\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDInternalDiskSizeMismatch" + annotations: + description: "One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch" + summary: "OSD size inconsistency error" + expr: "ceph_health_detail{name=\"BLUESTORE_DISK_SIZE_MISMATCH\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDeviceFailurePredicted" + annotations: + description: "The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info '. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#id2" + summary: "Device(s) predicted to fail soon" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDeviceFailurePredictionTooHigh" + annotations: + description: "The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany" + summary: "Too many devices are predicted to fail, unable to resolve" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH_TOOMANY\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.7" + severity: "critical" + type: "ceph_default" + - alert: "CephDeviceFailureRelocationIncomplete" + annotations: + description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use" + summary: "Device failure is predicted, but unable to relocate data" + expr: "ceph_health_detail{name=\"DEVICE_HEALTH_IN_USE\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephOSDFlapping" + annotations: + description: "OSD {{ `{{` }} $labels.ceph_daemon }} on {{ `{{` }} $labels.hostname }} was marked down and back up {{ `{{` }} $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s)." + documentation: "https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds" + summary: "Network issues are causing OSDs to flap (mark each other down)" + expr: "(rate(ceph_osd_up[5m]) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.4" + severity: "warning" + type: "ceph_default" + - alert: "CephOSDReadErrors" + annotations: + description: "An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors" + summary: "Device read errors detected" + expr: "ceph_health_detail{name=\"BLUESTORE_SPURIOUS_READ_ERRORS\"} == 1" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGImbalance" + annotations: + description: "OSD {{ `{{` }} $labels.ceph_daemon }} on {{ `{{` }} $labels.hostname }} deviates by more than 30% from average PG count." + summary: "PGs are not balanced across OSDs" + expr: | + # abs( + # ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) / + # on (job) group_left avg(ceph_osd_numpg > 0) by (hostname) + # ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30 + (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + < + scalar( + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + - + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) * 0.3 )) + ) + or + (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + > + scalar( + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) + + + (avg by (job) (sum by(hostname) ((ceph_osd_numpg > 0) * on(ceph_daemon) group_left(hostname) ceph_osd_metadata)) * 0.3 )) + ) + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.4.5" + severity: "warning" + type: "ceph_default" + - name: "mds" + rules: + - alert: "CephFilesystemDamaged" + annotations: + description: "Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" + summary: "CephFS filesystem is damaged." + expr: "ceph_health_detail{name=\"MDS_DAMAGE\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.1" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemOffline" + annotations: + description: "All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down" + summary: "CephFS filesystem is offline" + expr: "ceph_health_detail{name=\"MDS_ALL_DOWN\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.3" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemDegraded" + annotations: + description: "One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded" + summary: "CephFS filesystem is degraded" + expr: "ceph_health_detail{name=\"FS_DEGRADED\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.4" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemMDSRanksLow" + annotations: + description: "The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max" + summary: "Ceph MDS daemon count is lower than configured" + expr: "ceph_health_detail{name=\"MDS_UP_LESS_THAN_MAX\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephFilesystemInsufficientStandby" + annotations: + description: "The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby" + summary: "Ceph filesystem standby daemons too few" + expr: "ceph_health_detail{name=\"MDS_INSUFFICIENT_STANDBY\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephFilesystemFailureNoStandby" + annotations: + description: "An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds" + summary: "MDS daemon failed, no further standby available" + expr: "ceph_health_detail{name=\"FS_WITH_FAILED_MDS\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.5" + severity: "critical" + type: "ceph_default" + - alert: "CephFilesystemReadOnly" + annotations: + description: "The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support." + documentation: "https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages" + summary: "CephFS filesystem in read only mode due to write error(s)" + expr: "ceph_health_detail{name=\"MDS_HEALTH_READ_ONLY\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.5.2" + severity: "critical" + type: "ceph_default" + - name: "mgr" + rules: + - alert: "CephMgrModuleCrash" + annotations: + description: "One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash" + summary: "A manager module has recently crashed" + expr: "ceph_health_detail{name=\"RECENT_MGR_MODULE_CRASH\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.6.1" + severity: "critical" + type: "ceph_default" + - alert: "CephMgrPrometheusModuleInactive" + annotations: + description: "The mgr/prometheus module at {{ `{{` }} $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'." + summary: "The mgr/prometheus module is not available" + expr: "up{job=\"ceph\"} == 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.6.2" + severity: "critical" + type: "ceph_default" + - name: "pgs" + rules: + - alert: "CephPGsInactive" + annotations: + description: "{{ `{{` }} $value }} PGs have been inactive for more than 5 minutes in pool {{ `{{` }} $labels.name }}. Inactive placement groups are not able to serve read/write requests." + summary: "One or more placement groups are inactive" + expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.1" + severity: "critical" + type: "ceph_default" + - alert: "CephPGsUnclean" + annotations: + description: "{{ `{{` }} $value }} PGs have been unclean for more than 15 minutes in pool {{ `{{` }} $labels.name }}. Unclean PGs have not recovered from a previous failure." + summary: "One or more placement groups are marked unclean" + expr: "ceph_pool_metadata * on(pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0" + for: "15m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.2" + severity: "warning" + type: "ceph_default" + - alert: "CephPGsDamaged" + annotations: + description: "During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use the 'ceph pg repair ' command." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged" + summary: "Placement group damaged, manual intervention needed" + expr: "ceph_health_detail{name=~\"PG_DAMAGED|OSD_SCRUB_ERRORS\"} == 1" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.4" + severity: "critical" + type: "ceph_default" + - alert: "CephPGRecoveryAtRisk" + annotations: + description: "Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full" + summary: "OSDs are too full for recovery" + expr: "ceph_health_detail{name=\"PG_RECOVERY_FULL\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.5" + severity: "critical" + type: "ceph_default" + - alert: "CephPGUnavailableBlockingIO" + annotations: + description: "Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability" + summary: "PG is unavailable, blocking I/O" + expr: "((ceph_health_detail{name=\"PG_AVAILABILITY\"} == 1) - scalar(ceph_health_detail{name=\"OSD_DOWN\"})) == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.3" + severity: "critical" + type: "ceph_default" + - alert: "CephPGBackfillAtRisk" + annotations: + description: "Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full" + summary: "Backfill operations are blocked due to lack of free space" + expr: "ceph_health_detail{name=\"PG_BACKFILL_FULL\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.7.6" + severity: "critical" + type: "ceph_default" + - alert: "CephPGNotScrubbed" + annotations: + description: "One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a 'clean' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub " + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed" + summary: "Placement group(s) have not been scrubbed" + expr: "ceph_health_detail{name=\"PG_NOT_SCRUBBED\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGsHighPerOSD" + annotations: + description: "The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).\n Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs" + summary: "Placement groups per OSD is too high" + expr: "ceph_health_detail{name=\"TOO_MANY_PGS\"} == 1" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPGNotDeepScrubbed" + annotations: + description: "One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed" + summary: "Placement group(s) have not been deep scrubbed" + expr: "ceph_health_detail{name=\"PG_NOT_DEEP_SCRUBBED\"} == 1" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - name: "nodes" + rules: + - alert: "CephNodeRootFilesystemFull" + annotations: + description: "Root volume is dangerously full: {{ `{{` }} $value | humanize }}% free." + summary: "Root filesystem is dangerously full" + expr: "node_filesystem_avail_bytes{mountpoint=\"/\"} / node_filesystem_size_bytes{mountpoint=\"/\"} * 100 < 5" + for: "5m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.1" + severity: "critical" + type: "ceph_default" + - alert: "CephNodeNetworkPacketDrops" + annotations: + description: "Node {{ `{{` }} $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ `{{` }} $labels.device }}." + summary: "One or more NICs reports packet drops" + expr: | + ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0050000000000000001 and ( + rate(node_network_receive_drop_total{device!="lo"}[1m]) + + rate(node_network_transmit_drop_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.2" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeNetworkPacketErrors" + annotations: + description: "Node {{ `{{` }} $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ `{{` }} $labels.device }}." + summary: "One or more NICs reports packet errors" + expr: | + ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) / ( + rate(node_network_receive_packets_total{device!="lo"}[1m]) + + rate(node_network_transmit_packets_total{device!="lo"}[1m]) + ) >= 0.0001 or ( + rate(node_network_receive_errs_total{device!="lo"}[1m]) + + rate(node_network_transmit_errs_total{device!="lo"}[1m]) + ) >= 10 + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.3" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeNetworkBondDegraded" + annotations: + summary: "Degraded Bond on Node {{ `{{` }} $labels.instance }}" + description: "Bond {{ `{{` }} $labels.master }} is degraded on Node {{ `{{` }} $labels.instance }}." + expr: | + node_bonding_slaves - node_bonding_active != 0 + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephNodeDiskspaceWarning" + annotations: + description: "Mountpoint {{ `{{` }} $labels.mountpoint }} on {{ `{{` }} $labels.nodename }} will be full in less than 5 days based on the 48 hour trailing fill rate." + summary: "Host filesystem free space is getting low" + expr: "predict_linear(node_filesystem_free_bytes{device=~\"/.*\"}[2d], 3600 * 24 * 5) *on(instance) group_left(nodename) node_uname_info < 0" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.8.4" + severity: "warning" + type: "ceph_default" + - alert: "CephNodeInconsistentMTU" + annotations: + description: "Node {{ `{{` }} $labels.instance }} has a different MTU size ({{ `{{` }} $value }}) than the median of devices named {{ `{{` }} $labels.device }}." + summary: "MTU settings across Ceph hosts are inconsistent" + expr: "node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( max by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0) == scalar( min by (device) (node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) != quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!=\"lo\"} > 0)) )" + labels: + severity: "warning" + type: "ceph_default" + - name: "pools" + rules: + - alert: "CephPoolGrowthWarning" + annotations: + description: "Pool '{{ `{{` }} $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours." + summary: "Pool growth rate may soon exceed capacity" + expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.9.2" + severity: "warning" + type: "ceph_default" + - alert: "CephPoolBackfillFull" + annotations: + description: "A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity." + summary: "Free space in a pool is too low for recovery/backfill" + expr: "ceph_health_detail{name=\"POOL_BACKFILLFULL\"} > 0" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephPoolFull" + annotations: + description: "A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{ `{{` }}- range query \"topk(5, sort_desc(ceph_pool_percent_used * on(pool_id) group_right ceph_pool_metadata))\" }} - {{ `{{` }} .Labels.name }} at {{ `{{` }} .Value }}% {{ `{{` }}- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes )" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full" + summary: "Pool is full - writes are blocked" + expr: "ceph_health_detail{name=\"POOL_FULL\"} > 0" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.9.1" + severity: "critical" + type: "ceph_default" + - alert: "CephPoolNearFull" + annotations: + description: "A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota max_bytes ). Also ensure that the balancer is active." + summary: "One or more Ceph pools are nearly full" + expr: "ceph_health_detail{name=\"POOL_NEAR_FULL\"} > 0" + for: "5m" + labels: + severity: "warning" + type: "ceph_default" + - name: "healthchecks" + rules: + - alert: "CephSlowOps" + annotations: + description: "{{ `{{` }} $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" + summary: "OSD operations are slow to complete" + expr: "ceph_healthcheck_slow_ops > 0" + for: "30s" + labels: + severity: "warning" + type: "ceph_default" + - alert: "CephDaemonSlowOps" + for: "30s" + expr: "ceph_daemon_health_metrics{type=\"SLOW_OPS\"} > 0" + labels: + severity: 'warning' + type: 'ceph_default' + annotations: + summary: "{{ `{{` }} $labels.ceph_daemon }} operations are slow to complete" + description: "{{ `{{` }} $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)" + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops" + - name: "cephadm" + rules: + - alert: "CephadmUpgradeFailed" + annotations: + description: "The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs, to understand the nature of the issue" + summary: "Ceph version upgrade has failed" + expr: "ceph_health_detail{name=\"UPGRADE_EXCEPTION\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.11.2" + severity: "critical" + type: "ceph_default" + - alert: "CephadmDaemonFailed" + annotations: + description: "A daemon managed by cephadm is no longer active. Determine, which daemon is down with 'ceph health detail'. you may start daemons with the 'ceph orch daemon start '" + summary: "A ceph daemon managed by cephadm is down" + expr: "ceph_health_detail{name=\"CEPHADM_FAILED_DAEMON\"} > 0" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.11.1" + severity: "critical" + type: "ceph_default" + - alert: "CephadmPaused" + annotations: + description: "Cluster management has been paused manually. This will prevent the orchestrator from service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'" + documentation: "https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused" + summary: "Orchestration tasks via cephadm are PAUSED" + expr: "ceph_health_detail{name=\"CEPHADM_PAUSED\"} > 0" + for: "1m" + labels: + severity: "warning" + type: "ceph_default" + - name: "PrometheusServer" + rules: + - alert: "PrometheusJobMissing" + annotations: + description: "The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance." + summary: "The scrape job for Ceph is missing from Prometheus" + expr: "absent(up{job=\"ceph\"})" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.12.1" + severity: "critical" + type: "ceph_default" + - name: "rados" + rules: + - alert: "CephObjectMissing" + annotations: + description: "The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound" + summary: "Object(s) marked UNFOUND" + expr: "(ceph_health_detail{name=\"OBJECT_UNFOUND\"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1" + for: "30s" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.10.1" + severity: "critical" + type: "ceph_default" + - name: "generic" + rules: + - alert: "CephDaemonCrash" + annotations: + description: "One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive ' command." + documentation: "https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash" + summary: "One or more Ceph daemons have crashed, and are pending acknowledgement" + expr: "ceph_health_detail{name=\"RECENT_CRASH\"} == 1" + for: "1m" + labels: + oid: "1.3.6.1.4.1.50495.1.2.1.1.2" + severity: "critical" + type: "ceph_default" + +{{- end -}} \ No newline at end of file diff --git a/templates/serviceMonitor-alertmanager.yaml b/templates/serviceMonitor-alertmanager.yaml new file mode 100644 index 0000000..c7f08fc --- /dev/null +++ b/templates/serviceMonitor-alertmanager.yaml @@ -0,0 +1,29 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} + app.kubernetes.io/managed-by: {{ $.Release.Service }} + app.kubernetes.io/version: {{ $.Chart.Version }} + name: alertmanager + namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}" + +spec: + endpoints: + - interval: 30s + port: web + - interval: 30s + port: reloader-web + namespaceSelector: + matchNames: + - "{{ .Values.nfc_monitoring.alert_manager.namespace }}" + selector: + matchLabels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/instance: main + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: {{ $.Chart.Name }} diff --git a/values.yaml b/values.yaml new file mode 100644 index 0000000..b7a4793 --- /dev/null +++ b/values.yaml @@ -0,0 +1,282 @@ +--- + +# All values within this helm chart values.yaml file are under namespace `nfc_monitoring`. +# this provides the opportunity to include this helm chart as a dependency without +# variable collision + +nfc_monitoring: + + kubernetes: + cluster_dns_name: cluster.local + networking: calico + + + alert_manager: + image: + name: quay.io/prometheus/alertmanager + tag: 'v0.26.0' + namespace: alerting + + + grafana: + + admin_user: admin + admin_password: admin + + image: + name: grafana/grafana + tag: '10.1.2' # '10.0.5' + + namespace: grafana + + replicas: 1 + + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + weight: 100 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/storage + operator: DoesNotExist + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - prometheus + topologyKey: kubernetes.io/hostname + weight: 10 + + + grafana_agent: + image: + name: grafana/agent + tag: 'v0.36.1' + namespace: monitoring + + + loki: + + enabled: false + + image: + name: grafana/loki + tag: 2.7.4 + + namespace: logging + + # service name and port are used for the connection to your loki instance + service_name: loki-gateway + service_port: 80 + + ServiceMonitor: + selector: + matchLabels: + app.kubernetes.io/name: loki + app.kubernetes.io/component: logging + + + kube_monitor_proxy: + namespace: monitoring + + + kube_rbac_proxy: + # This image is used as part of kube-monitor-proxy. + image: + name: quay.io/brancz/kube-rbac-proxy + tag: 'v0.14.2' + + + kube_state_metrics: + image: + name: registry.k8s.io/kube-state-metrics/kube-state-metrics + tag: 'v2.8.1' + namespace: monitoring + + + prometheus: + + image: + name: prom/prometheus + tag: 'v2.47.0' + + namespace: monitoring + + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + weight: 100 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/storage + operator: DoesNotExist + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - prometheus + topologyKey: kubernetes.io/hostname + weight: 10 + + # List of namespaces that prometheus is to monitor + # used to create Roles and RoleBindings + monitor_namespaces: + - alerting + - default + # - ceph + - grafana + - monitoring + # - kube-dashboard + # - kube-metrics + - kube-policy + - kube-system + - logging + # - mariadb + # - olm + # - operators + + # Deploy a generate policy for kyverno to create Role and RoleBindings + # for the prometheus service account so it can monitor + # new/existing namespaces + kyverno_role_policy: true + + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 40Gi + + + prometheus_adaptor: + + image: + name: registry.k8s.io/prometheus-adapter/prometheus-adapter + tag: 'v0.11.1' + + namespace: monitoring + + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - preference: + matchExpressions: + - key: node-role.kubernetes.io/worker + operator: Exists + weight: 100 + - preference: + matchExpressions: + - key: node-role.kubernetes.io/storage + operator: DoesNotExist + weight: 100 + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - prometheus + topologyKey: kubernetes.io/hostname + weight: 10 + + + additions: + + ceph: + + enabled: true + + namespace: ceph + + PrometheusRules: true + + ServiceMonitor: + + selector: + matchLabels: + app: rook-ceph-mgr + + # Add sidecar to grafana pod to load dashboards from configMap + dashboard_sidecar: + + enabled: true + + image: + name: ghcr.io/kiwigrid/k8s-sidecar + tag: '1.24.5' + + label_name: grafana_dashboard + label_value: "1" + + +loki_instance: + image: + name: grafana/loki + tag: 2.7.4 + # tag: 2.9.0 + namespace: loki + + +oncall_instance: + image: + name: grafana/oncall + tag: v1.1.40 + + +# oncall: + +# # image: +# # # Grafana OnCall docker image repository +# # repository: grafana/oncall +# # tag: v1.1.38 +# # pullPolicy: Always + +# service: +# enabled: false +# type: LoadBalancer +# port: 8080 +# annotations: {} + +# engine: +# replicaCount: 1 +# resources: +# limits: +# cpu: 100m +# memory: 128Mi +# requests: +# cpu: 100m +# memory: 128Mi + +# celery: +# replicaCount: 1 +# resources: +# limits: +# cpu: 100m +# memory: 128Mi +# requests: +# cpu: 100m +# memory: 128Mi +# database: +# type: none diff --git a/website-template b/website-template new file mode 160000 index 0000000..992b548 --- /dev/null +++ b/website-template @@ -0,0 +1 @@ +Subproject commit 992b54805b8b6c78a3d2a5ea7de71c7be2b070c8