Compare commits
70 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 014156eb05 | |||
| 376cf3ae9f | |||
| c76b753618 | |||
| 6dd413a0d4 | |||
| ffc0f84669 | |||
| 5add73c411 | |||
| 239837ecbf | |||
| ac812c2c7b | |||
| 9b12fcf5bc | |||
| d4e529aec9 | |||
| e7480105f7 | |||
| c8ea929873 | |||
| 899c6a3d78 | |||
| 817b838655 | |||
| b7cfebf92b | |||
| e3648324f2 | |||
| a26f887fa4 | |||
| 8f53b0fa07 | |||
| 82cd91a859 | |||
| 944d615331 | |||
| 4198ac78da | |||
| 3a4ca30e68 | |||
| e2540a955f | |||
| 89f0feae08 | |||
| 0d1d0a34d8 | |||
| 83fb30b4ed | |||
| 7add74fbb1 | |||
| c742eea38d | |||
| 836cc111e0 | |||
| d189dfe0ee | |||
| 2331bcbba3 | |||
| 8b966b0f0a | |||
| 3281cd6552 | |||
| 3f2204cf31 | |||
| cbac640af3 | |||
| 64d612047f | |||
| 4e742ee2d5 | |||
| 4ced83b90d | |||
| a28126555f | |||
| aba3cb22a9 | |||
| 63c0381bd1 | |||
| 9e3f33ae56 | |||
| 1dc85b4518 | |||
| a615b4922f | |||
| 6ffcc76acf | |||
| 8f49ab08b4 | |||
| 6c8298b0bb | |||
| 16965c7304 | |||
| fe2cc1352a | |||
| 38730de0a7 | |||
| 9880c5d988 | |||
| b5d7a24d28 | |||
| fe28c6a24c | |||
| 59316081cd | |||
| 8699d66347 | |||
| bf44961e02 | |||
| 8631e56028 | |||
| 53151ac6b9 | |||
| 762a9fa387 | |||
| 81136232f7 | |||
| 534377edbd | |||
| bb05212755 | |||
| 955515de35 | |||
| e0bc34c12f | |||
| 3865028541 | |||
| 5d808ee753 | |||
| 8fff819081 | |||
| 286d2827ad | |||
| 0feb8794b2 | |||
| df126a576d |
7
.cz.yaml
Normal file
7
.cz.yaml
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
commitizen:
|
||||||
|
bump_message: "build(version): bump version $current_version \u2192 $new_version"
|
||||||
|
changelog_incremental: false
|
||||||
|
name: cz_conventional_commits
|
||||||
|
tag_format: $major.$minor.$patch$prerelease
|
||||||
|
update_changelog_on_bump: true
|
||||||
|
version: 0.1.0
|
||||||
35
.gitlab-ci.yml
Normal file
35
.gitlab-ci.yml
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
variables:
|
||||||
|
MY_PROJECT_ID: "50510268"
|
||||||
|
GIT_SYNC_URL: "https://$GITHUB_USERNAME_ROBOT:$GITHUB_TOKEN_ROBOT@github.com/NoFussComputing/kubernetes_monitoring.git"
|
||||||
|
|
||||||
|
|
||||||
|
PAGES_ENVIRONMENT_PATH: projects/kubernetes_monitoring/
|
||||||
|
|
||||||
|
include:
|
||||||
|
- project: nofusscomputing/projects/gitlab-ci
|
||||||
|
ref: development
|
||||||
|
file:
|
||||||
|
- template/automagic.gitlab-ci.yaml
|
||||||
|
#- template: Jobs/Container-Scanning.gitlab-ci.yml # see https://gitlab.com/gitlab-org/gitlab/-/issues/381665
|
||||||
|
|
||||||
|
|
||||||
|
Website.Submodule.Deploy:
|
||||||
|
extends: .submodule_update_trigger
|
||||||
|
variables:
|
||||||
|
SUBMODULE_UPDATE_TRIGGER_PROJECT: nofusscomputing/infrastructure/website
|
||||||
|
environment:
|
||||||
|
url: https://nofusscomputing.com/$PAGES_ENVIRONMENT_PATH
|
||||||
|
name: Documentation
|
||||||
|
rules:
|
||||||
|
- if: # condition_dev_branch_push
|
||||||
|
$CI_COMMIT_BRANCH == "development" &&
|
||||||
|
$CI_PIPELINE_SOURCE == "push"
|
||||||
|
exists:
|
||||||
|
- '{docs/**,pages/**}/*.md'
|
||||||
|
changes:
|
||||||
|
paths:
|
||||||
|
- '{docs/**,pages/**}/*.md'
|
||||||
|
compare_to: 'master'
|
||||||
|
when: always
|
||||||
|
|
||||||
|
- when: never
|
||||||
8
.gitmodules
vendored
Normal file
8
.gitmodules
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[submodule "gitlab-ci"]
|
||||||
|
path = gitlab-ci
|
||||||
|
url = https://gitlab.com/nofusscomputing/projects/gitlab-ci.git
|
||||||
|
branch = development
|
||||||
|
[submodule "website-template"]
|
||||||
|
path = website-template
|
||||||
|
url = https://gitlab.com/nofusscomputing/infrastructure/website-template.git
|
||||||
|
branch = development
|
||||||
30
.helmignore
Normal file
30
.helmignore
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
# Patterns to ignore when building packages.
|
||||||
|
# This supports shell glob matching, relative path matching, and
|
||||||
|
# negation (prefixed with !). Only one pattern per line.
|
||||||
|
.DS_Store
|
||||||
|
# Common VCS dirs
|
||||||
|
.git/
|
||||||
|
.gitignore
|
||||||
|
.bzr/
|
||||||
|
.bzrignore
|
||||||
|
.hg/
|
||||||
|
.hgignore
|
||||||
|
.svn/
|
||||||
|
# Common backup files
|
||||||
|
*.swp
|
||||||
|
*.bak
|
||||||
|
*.tmp
|
||||||
|
*.orig
|
||||||
|
*~
|
||||||
|
# Various IDEs
|
||||||
|
.project
|
||||||
|
.idea/
|
||||||
|
*.tmproj
|
||||||
|
.vscode/
|
||||||
|
|
||||||
|
# git repo files
|
||||||
|
monitoring-mixins/
|
||||||
|
ceph-mixins/
|
||||||
|
gitlab-ci/
|
||||||
|
website-template/
|
||||||
|
docs/
|
||||||
83
CHANGELOG.md
Normal file
83
CHANGELOG.md
Normal file
@ -0,0 +1,83 @@
|
|||||||
|
## 0.1.0 (2023-09-27)
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
|
||||||
|
- **kubestateproxy**: [b7cfebf9](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/b7cfebf92b3983f70df3439a516e0e355a505332) - bump proxy cpu limit [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana**: [a26f887f](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/a26f887fa41ef0eb0ff6ade8584335e228442d6a) - use named pvc so it's reused [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **ceph**: [944d6153](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/944d6153312703f41f5d579b3326684c0f7312a7) - PromRule CephPGImbalance adjusted to group by node [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana**: [89f0feae](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/89f0feae08bde0e1545c492b58dd1fbdb9ac3a45) - dont use operator readiness prob [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
|
||||||
|
### Code Refactor
|
||||||
|
|
||||||
|
- **dashboard**: [ffc0f846](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/ffc0f8466912b5598d7df9078ce30c8a3424fc34) - nfc cluster overview to use source variable [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) ]
|
||||||
|
- **prometheus**: [899c6a3d](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/899c6a3d786e147024bd63464e89637a7180b909) - adjusted role/rolbinding manifest to loop [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana_agent**: [817b8386](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/817b83865555cbfa49dad03b2beffa2a93c45124) - clean up config file [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [7add74fb](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/7add74fbb1cae8613df3a7405889198c78b091f0) - yaml object ordering [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **prometheus**: [3281cd65](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3281cd6552339ee16c7ae94ae9676f26c3d8000e) - use values ns var [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [64d61204](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/64d612047f35349d149c8c0d4d1732908284ca9f) - use loki ns var [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
|
||||||
|
### Continious Integration
|
||||||
|
|
||||||
|
- [0feb8794](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/0feb8794b2bcc41357ff8d8bf30f1856eb193e7d) - typo in github url [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
|
||||||
|
### Documentaton / Guides
|
||||||
|
|
||||||
|
- [c76b7536](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/c76b753618b19c8fe088bc828247eb16db564c60) - update [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) ]
|
||||||
|
- **metrics_storage**: [6dd413a0](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/6dd413a0d42cdaf53bd3034645be60eb0b388afe) - added docs on how to configure external metrics routing [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
|
||||||
|
- [d4e529ae](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/d4e529aec9c9d9d7a0982c061f44f853de740fd6) - add more features [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [82cd91a8](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/82cd91a8596afca592292840cd6073ba5cb1e18b) - added missing attribution [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [4198ac78](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/4198ac78da4733b36b94587dce9fd0a19664106d) - add blurb on values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [6ffcc76a](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/6ffcc76acf44e3fa124d21a5f2c3b6d169d342b3) - added initial docs [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
- **grafana**: [5add73c4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/5add73c411a595848d4c0714bab9aac3dc034118) - DataSources configurable in values.yaml [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
|
||||||
|
- **prometheus**: [239837ec](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/239837ecbf8f5b233fcec418d063cd16b930a4f1) - ability to add additional config settings [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
|
||||||
|
- **prometheus**: [ac812c2c](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/ac812c2c7ba6de7dd674088c5c5e2ebe29dc14a6) - thanos sidecar option added [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
|
||||||
|
- **grafana_dashboard**: [e7480105](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e7480105f7d0397bb20656a1d9c623461e408a6b) - nfc custom, cluster overview [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **kyverno**: [c8ea9298](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/c8ea92987318f10a5c14af59ac4d45bc8a549061) - add clusterpolicy role and rolebinding [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana_agent**: [e3648324](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e3648324f234c82202999e3198ba3a493384c1e4) - attach hostname to logs and metrics [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana**: [8f53b0fa](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8f53b0fa07eef6153e4583f6fe86f3f7fd5027f4) - sidecar for loading dashboards from configmap [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **servicemonitor**: [3a4ca30e](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3a4ca30e68c85758fbfa1325a7fc6c17c81926ad) - ceph use cluster name for instance [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **servicemonitor**: [e2540a95](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e2540a955f08b7dcafa159abb773f095b920738b) - for prometheus, use pod name for instance [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana**: [0d1d0a34](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/0d1d0a34d8738ae408226479e0249f5cd1873551) - add affinity to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **prometheus**: [83fb30b4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/83fb30b4ed757d50b9f53bf3eba3de9cc38e0b55) - add storage to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **prometheus_adaptor**: [c742eea3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/c742eea38d8586f3de42db377f953b35c593966c) - add affinity to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **prometheus**: [836cc111](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/836cc111e092046988aba7b9b1494b1432313add) - add affinity to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **logging**: [2331bcbb](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/2331bcbba35d561b22b35f7aa8c19cea087999fd) - add log files from /var/log [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana-agent**: [8b966b0f](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8b966b0f0ac33571128c959ed02d985cc805668f) - expand env vars in config [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **deployment**: [3f2204cf](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3f2204cf3132deea4e4750ec7cf97e512986423e) - grafana-agent dnspoly clusterfirst [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **promtail**: [cbac640a](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/cbac640af3ffdb16c6b78ecfcf86205b7804e0fd) - filter devices to not include temp/virtual [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [4e742ee2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/4e742ee2d5dab3e2781d62e075f4a8adcf9c6b2a) - node scraper set to 5s [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **loik**: [4ced83b9](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/4ced83b90d98e4805550cb2b5fda55c705e8c8bb) - be able to specify full loki url
|
||||||
|
- **loki**: [a2812655](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/a28126555f594aa805016c5948151e860e5c93ad) - add prometheus alerts and rules for mixins [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **loki**: [aba3cb22](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/aba3cb22a97d180185eef2350988720542cb1298) - removed service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [63c0381b](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/63c0381bd15b077537667979ec8e3dd2e6503090) - Calico service and monitor added [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **node**: [9e3f33ae](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/9e3f33ae5679e92b1d5097fb808943c59f31aa3b) - added dashboard [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **ceph**: [1dc85b45](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/1dc85b45180b56c7ed8f2a27246f0771f8672939) - added dashboard [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **alertmanager**: [a615b492](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/a615b4922f59024c17056f6f6bc802ff2188c787) - added dashboard [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana**: [8f49ab08](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8f49ab08b4759921b6577384b5456940d5026288) - loki data source [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [6c8298b0](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/6c8298b0bb57e00f7aa7ee0deae7d9175c9a5347) - loki service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [16965c73](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/16965c73046c4a4d463b5a36e246920fcad3f3fe) - ceph prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [fe2cc135](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/fe2cc1352a1bd5dd49f2d3a3ac64b12916ad9dbf) - ceph service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [9880c5d9](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/9880c5d988fce19aa59aa143c3c2c6e9cf5df88b) - Kubeprometheus Prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [b5d7a24d](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/b5d7a24d28f9f418d9bca5e47d64c18cd9309986) - Kubernetes control plane Prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [fe28c6a2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/fe28c6a24cc786fbcc818f21f6dd971b7776110a) - Node-Exporter Prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [59316081](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/59316081cd02676b25e88c79c99bc142918b8c07) - Kubelet service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [8699d663](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8699d66347106ee85ac775acc625f069494b02b7) - CoreDNS service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [bf44961e](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/bf44961e02aa076f4dc0a5241359372d0e391b37) - CAdvisor service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **metrics**: [8631e560](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8631e56028e05b7c8aab6d239114d1c3b3304862) - api service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana**: [53151ac6](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/53151ac6b960dff80e6168486ff3a0359745347c) - alert manager data source [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [762a9fa3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/762a9fa387be7e0cf9919075fd94705940a92a0b) - proxy deployment for kube scheduler controller metrics [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [81136232](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/81136232f773429b2e26ecedc79a2a660b2d9abc) - kubestate metrics deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [534377ed](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/534377edbdafab9c3939288330fc3e845445c898) - alert manager deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **service_monitor**: [bb052127](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/bb0521275581f44f6000829316f0660cde52c5ec) - use grafana agent for node node-exporter [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **service_monitor**: [955515de](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/955515de351d3b725bfbe8b3e7cb5f8edd328cb5) - use grafana agent for node exporter [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [e0bc34c1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e0bc34c12ff88b8e75806dc238b00e1ef464129b) - grafana agent deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- **grafana**: [38650285](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3865028541d6023f380745a2549654c651431a1b) - prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [5d808ee7](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/5d808ee753afff17373dd8c4232b00bbf8904f16) - prometheus adaptor deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [8fff8190](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8fff819081bf0b274f0afe7fe9f5cee6cd85ef18) - prometheus deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [286d2827](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/286d2827ad1c30f95814494425bcda96a8a9874d) - grafana deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
- [df126a57](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/df126a576d6fb81ae7c4caa07154502b5d119f0f) - initial template, docs and ci [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
|
||||||
|
|
||||||
|
## 0.0.1 (2023-09-27)
|
||||||
26
Chart.yaml
Normal file
26
Chart.yaml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
apiVersion: v2
|
||||||
|
name: nfc-monitoring
|
||||||
|
description: |
|
||||||
|
This helm chart installs the required components for full stack monitoring.
|
||||||
|
|
||||||
|
The purpose of this chart is to provide all components required to monitor kubernetes within its entirety.
|
||||||
|
Storage of the monitoring data is not part of this chart nor will it be included. For this reason this chart can be added and removed without loosing your monitoring data.
|
||||||
|
|
||||||
|
For this chart to function correctly the following operators are required:
|
||||||
|
|
||||||
|
- [grafana](https://grafana-operator.github.io/grafana-operator/docs/installation/helm/)
|
||||||
|
|
||||||
|
- [prometheus](https://prometheus-operator.dev/docs/user-guides/getting-started/)
|
||||||
|
|
||||||
|
Full Docs available at http://nofusscomputing.com/projects/kubernetes_monitoring.
|
||||||
|
|
||||||
|
type: application
|
||||||
|
|
||||||
|
|
||||||
|
version: 0.1.0
|
||||||
|
|
||||||
|
|
||||||
|
appVersion: "0.1.0"
|
||||||
|
|
||||||
|
|
||||||
|
dependencies: []
|
||||||
@ -49,6 +49,10 @@ All contributions for this project must conducted from [Gitlab](https://gitlab.c
|
|||||||
For further details on contributing please refer to the [contribution guide](CONTRIBUTING.md).
|
For further details on contributing please refer to the [contribution guide](CONTRIBUTING.md).
|
||||||
|
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
Documentation for this project can be found at <http://nofusscomputing.com/projects/kubernetes_monitoring>
|
||||||
|
|
||||||
## Other
|
## Other
|
||||||
|
|
||||||
This repo is release under this [license](LICENSE)
|
This repo is release under this [license](LICENSE)
|
||||||
|
|||||||
0
docs/articles/index.md
Normal file
0
docs/articles/index.md
Normal file
0
docs/contact.md
Normal file
0
docs/contact.md
Normal file
0
docs/index.md
Normal file
0
docs/index.md
Normal file
0
docs/operations/index.md
Normal file
0
docs/operations/index.md
Normal file
0
docs/projects/index.md
Normal file
0
docs/projects/index.md
Normal file
199
docs/projects/kubernetes_monitoring/index.md
Normal file
199
docs/projects/kubernetes_monitoring/index.md
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
---
|
||||||
|
title: Kubernetes Monitoring Helm Chart
|
||||||
|
description: How to use No Fuss Computings kubernetes monitoring helm chart for full stack monitoring.
|
||||||
|
date: 2023-09-19
|
||||||
|
template: project.html
|
||||||
|
about: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring
|
||||||
|
---
|
||||||
|
|
||||||
|
This Helm chart deploys the components needed for full stack monitoring.
|
||||||
|
|
||||||
|
What this chart is:
|
||||||
|
|
||||||
|
- Collectors for metrics and logging
|
||||||
|
|
||||||
|
- Distribution / Routing of metrics/logging
|
||||||
|
|
||||||
|
- Data visualization
|
||||||
|
|
||||||
|
- Ready for logging long term storag integration, in particular Loki
|
||||||
|
|
||||||
|
- Ready for metrics long term storage integration
|
||||||
|
|
||||||
|
What this chart is not:
|
||||||
|
|
||||||
|
- A complete monitoring stack inclusive of long term storage.
|
||||||
|
|
||||||
|
Intentionally we have kept this helm chart to the monitoring components only. Keeping the monitoring components seperate from storage is advantageous. for all intents and purposes this helm chart is ephemeral.
|
||||||
|
|
||||||
|
This helm chart started off with components from multiple open-source projects. As such attribution is warrented, so head on over to thier projects and give them a star. Projects used to create the building blocks of this helm chart are
|
||||||
|
|
||||||
|
- [ceph](https://github.com/ceph/ceph) _mixin alerts/rules_
|
||||||
|
|
||||||
|
- [Kube prometheus](https://github.com/prometheus-operator/kube-prometheus)
|
||||||
|
|
||||||
|
- [prometheus adaptor](https://github.com/kubernetes-sigs/prometheus-adapter)
|
||||||
|
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- Alert Manager
|
||||||
|
|
||||||
|
- Grafana as the visualization frontend
|
||||||
|
|
||||||
|
- Dashboards
|
||||||
|
|
||||||
|
- Ceph
|
||||||
|
|
||||||
|
- Cluster Overview
|
||||||
|
|
||||||
|
- Node Exporter Full
|
||||||
|
|
||||||
|
- Alert Manager
|
||||||
|
|
||||||
|
- Sidecar to load dashboards from ConfigMaps _Optional_
|
||||||
|
|
||||||
|
- Grafana-Agent
|
||||||
|
|
||||||
|
- daemonset for cluster nodes for exposing metrics and an injecter to loki for node and container logs.
|
||||||
|
|
||||||
|
- Loki integration for storing logs
|
||||||
|
|
||||||
|
- [Mixins](https://github.com/monitoring-mixins/website) Compatible _(partial)_, Tested with:
|
||||||
|
|
||||||
|
- alert-manager
|
||||||
|
|
||||||
|
- ceph
|
||||||
|
|
||||||
|
- coreDNS
|
||||||
|
|
||||||
|
- Kubernetes
|
||||||
|
|
||||||
|
- Loki [helm chart](https://artifacthub.io/packages/helm/grafana/loki) included dashboards, see [loki repo](https://github.com/grafana/loki/tree/f4ab1e3e89ac66e1848764dc17826abde929fdc5/production/loki-mixin-compiled) for dashboards_
|
||||||
|
|
||||||
|
- Node-Exporter
|
||||||
|
|
||||||
|
- Prometheus
|
||||||
|
|
||||||
|
- Promtail
|
||||||
|
|
||||||
|
- Prometheus
|
||||||
|
|
||||||
|
- Thanos sidecar _Optional_
|
||||||
|
|
||||||
|
- Service monitors
|
||||||
|
|
||||||
|
- API Server
|
||||||
|
|
||||||
|
- CAdvisor
|
||||||
|
|
||||||
|
- Calico, _Optional_
|
||||||
|
|
||||||
|
> enable calico metrics with `kubectl patch felixconfiguration default --type merge --patch '{"spec":{"prometheusMetricsEnabled": true}}'`
|
||||||
|
|
||||||
|
- Ceph _Optional_
|
||||||
|
|
||||||
|
- CoreDNS
|
||||||
|
|
||||||
|
- Grafana
|
||||||
|
|
||||||
|
- Kube-Controler-Manager
|
||||||
|
|
||||||
|
- Kube-Scheduler
|
||||||
|
|
||||||
|
- Kubelet
|
||||||
|
|
||||||
|
- Kube-state-metrics
|
||||||
|
|
||||||
|
- Node
|
||||||
|
|
||||||
|
- Node-Exporter
|
||||||
|
|
||||||
|
- Prometheus
|
||||||
|
|
||||||
|
- Prometheus-Adaptor
|
||||||
|
|
||||||
|
- Thanos
|
||||||
|
|
||||||
|
- kyverno policies _(optional, set in values.yaml)_
|
||||||
|
|
||||||
|
- auto deploy policy for prometheus role and rolebinding to access other namespaces
|
||||||
|
|
||||||
|
- `values.yaml` customization of components. _See values.yaml for more details._
|
||||||
|
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
This chart as the following **dependencies:**
|
||||||
|
|
||||||
|
- [Grafana Operator](https://artifacthub.io/packages/olm/community-operators/grafana-operator)
|
||||||
|
|
||||||
|
- [Prometheus Operator](https://artifacthub.io/packages/olm/community-operators/prometheus)
|
||||||
|
|
||||||
|
These dependencies must be present before installing this chart. to install run the following commands:
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
|
||||||
|
git clone -b development https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring.git
|
||||||
|
|
||||||
|
helm upgrade -i nfc_monitoring kubernetes_monitoring/
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
## Metrics Storage
|
||||||
|
|
||||||
|
There are two different ways that this chart porvides to route your metrics to long-term storage.
|
||||||
|
|
||||||
|
- Prometheus `remote_write`
|
||||||
|
|
||||||
|
- Thanos Side Car container
|
||||||
|
|
||||||
|
|
||||||
|
### Prometheus Remote Write
|
||||||
|
|
||||||
|
Using this method will have prometheus pushing it's metrics to another service to store its metrics. This option for example could be used to push metrics to Grafana Mimir. To configure add the following to the values.yaml file at path `nfc_monitoring.prometheus.additional`:
|
||||||
|
|
||||||
|
``` yaml
|
||||||
|
|
||||||
|
remoteWrite:
|
||||||
|
- name: mimir
|
||||||
|
url: http://mimir-gateway.metrics.svc.cluster.local/api/v1/push
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
Ensure that you set the url to the correct url.
|
||||||
|
|
||||||
|
|
||||||
|
### Thanos Side Car
|
||||||
|
|
||||||
|
This method will have a Thanos container deployed within the prometheus pod. This container will then read and upload the Prometheus containers tsdb to the configured storage. To configure add the following to the values.yaml
|
||||||
|
|
||||||
|
- `monitoring.thanos.sidecar.enabled` set to bool `true`
|
||||||
|
|
||||||
|
- `nfc_monitoring.thanos.sidecar.config` updated to include the sidecar config.
|
||||||
|
|
||||||
|
For example to configure the Thanos sideCar to upload Prometheus metrics to S3 storage use (updating values to suit your environment):
|
||||||
|
|
||||||
|
``` yaml
|
||||||
|
type: S3
|
||||||
|
config:
|
||||||
|
bucket: "thanos-metrics"
|
||||||
|
endpoint: "rook-ceph-rgw-earth.ceph.svc:80"
|
||||||
|
access_key: "7J5NM2MNCDB4T4Y9OKJ5"
|
||||||
|
secret_key: "t9r69RzZdWEBL3NCKiUIpDk6j5625xc6HucusiGG"
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
## Template Values
|
||||||
|
|
||||||
|
The values file included in this helm chart is below.
|
||||||
|
|
||||||
|
!!! note
|
||||||
|
This values file is from the development branch and as such may not reflect the current version you have deployed. If you require a specific version, head on over to the git repository and select the git tag that matches the version you are after.
|
||||||
|
|
||||||
|
``` yaml title="values.yaml" linenums="1"
|
||||||
|
|
||||||
|
--8<-- "values.yaml"
|
||||||
|
|
||||||
|
```
|
||||||
0
docs/tags.md
Normal file
0
docs/tags.md
Normal file
87
docs/task-doc-template.md
Normal file
87
docs/task-doc-template.md
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
|
||||||
|
|
||||||
|
short summary of the task file
|
||||||
|
|
||||||
|
## {Task Name}
|
||||||
|
|
||||||
|
- **Name**:
|
||||||
|
|
||||||
|
- **Description**:
|
||||||
|
|
||||||
|
- **Module**:
|
||||||
|
|
||||||
|
- **Arguments**:
|
||||||
|
|
||||||
|
-
|
||||||
|
|
||||||
|
- **Conditional**:
|
||||||
|
|
||||||
|
- **Tags**:
|
||||||
|
|
||||||
|
-
|
||||||
|
|
||||||
|
## {Task Name}
|
||||||
|
|
||||||
|
- **Name**:
|
||||||
|
|
||||||
|
- **Description**:
|
||||||
|
|
||||||
|
- **Module**:
|
||||||
|
|
||||||
|
- **Arguments**:
|
||||||
|
|
||||||
|
-
|
||||||
|
|
||||||
|
- **Registers**:
|
||||||
|
|
||||||
|
- **Conditional**:
|
||||||
|
|
||||||
|
- **Tags**:
|
||||||
|
|
||||||
|
-
|
||||||
|
|
||||||
|
|
||||||
|
## Variables
|
||||||
|
|
||||||
|
The following variables can be customized in this task file:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
variable_name: "default_value"
|
||||||
|
```
|
||||||
|
|
||||||
|
- `variable_name`: Description of the variable.
|
||||||
|
|
||||||
|
## Tags
|
||||||
|
|
||||||
|
The tasks in this task file are tagged with the following tags:
|
||||||
|
|
||||||
|
-
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
To use this Ansible task file, you can include it in your playbook or role and provide values for the required variables. Here's an example of how you can use this task file:
|
||||||
|
|
||||||
|
1. Create a playbook (e.g., `your_playbook.yaml`) and define the necessary variables:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
---
|
||||||
|
|
||||||
|
- hosts: your_hosts
|
||||||
|
vars:
|
||||||
|
variable_name: "value"
|
||||||
|
|
||||||
|
tasks:
|
||||||
|
- include_tasks: path/to/task_file.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Create a separate file for the task file (e.g., `task_file.yaml`) and copy the content of the task file into it.
|
||||||
|
|
||||||
|
3. Run the playbook:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
ansible-playbook your_playbook.yaml
|
||||||
|
```
|
||||||
|
|
||||||
|
Make sure to replace the placeholder values (`variable_name`, `value`) with the appropriate values for your setup.
|
||||||
|
|
||||||
|
Note: You may need to adjust the playbook structure and additional tasks based on your specific requirements and the tasks you want to execute.
|
||||||
1301
files/dashboard-summary.json
Normal file
1301
files/dashboard-summary.json
Normal file
File diff suppressed because it is too large
Load Diff
1
gitlab-ci
Submodule
1
gitlab-ci
Submodule
Submodule gitlab-ci added at a5a9fa4437
30
mkdocs.yml
Normal file
30
mkdocs.yml
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
INHERIT: website-template/mkdocs.yml
|
||||||
|
|
||||||
|
docs_dir: 'docs'
|
||||||
|
|
||||||
|
repo_name: Kubernetes Monitoring Helm Chart
|
||||||
|
repo_url: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring
|
||||||
|
edit_uri: '/-/ide/project/nofusscomputing/projects/kubernetes_monitoring/edit/development/-/docs/'
|
||||||
|
|
||||||
|
nav:
|
||||||
|
- Home: index.md
|
||||||
|
|
||||||
|
- Articles:
|
||||||
|
|
||||||
|
- articles/index.md
|
||||||
|
|
||||||
|
- Projects:
|
||||||
|
|
||||||
|
- projects/index.md
|
||||||
|
|
||||||
|
- Kubernetes Monitoring:
|
||||||
|
|
||||||
|
- projects/kubernetes_monitoring/index.md
|
||||||
|
|
||||||
|
|
||||||
|
- Operations:
|
||||||
|
|
||||||
|
- operations/index.md
|
||||||
|
|
||||||
|
- Contact Us: contact.md
|
||||||
|
|
||||||
21
templates/APIService-prometheus-adapter.yaml
Normal file
21
templates/APIService-prometheus-adapter.yaml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
---
|
||||||
|
apiVersion: apiregistration.k8s.io/v1
|
||||||
|
kind: APIService
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: v1beta1.metrics.k8s.io
|
||||||
|
spec:
|
||||||
|
group: metrics.k8s.io
|
||||||
|
groupPriorityMinimum: 100
|
||||||
|
insecureSkipTLSVerify: true
|
||||||
|
service:
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: monitoring
|
||||||
|
version: v1beta1
|
||||||
|
versionPriority: 100
|
||||||
39
templates/AlertManager-k8s.yaml
Normal file
39
templates/AlertManager-k8s.yaml
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: Alertmanager
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/component: alert-router
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: main
|
||||||
|
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||||
|
spec:
|
||||||
|
image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}"
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/os: linux
|
||||||
|
podMetadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alert-router
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
replicas: 3
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 100Mi
|
||||||
|
requests:
|
||||||
|
cpu: 4m
|
||||||
|
memory: 100Mi
|
||||||
|
securityContext:
|
||||||
|
fsGroup: 2000
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 1000
|
||||||
|
serviceAccountName: alertmanager-main
|
||||||
|
version: 0.25.0
|
||||||
75
templates/ClusterPolicy-Prometheus-Role.yaml
Normal file
75
templates/ClusterPolicy-Prometheus-Role.yaml
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }}
|
||||||
|
---
|
||||||
|
apiVersion: kyverno.io/v1
|
||||||
|
kind: ClusterPolicy
|
||||||
|
metadata:
|
||||||
|
name: add-prometheus-role
|
||||||
|
annotations:
|
||||||
|
policies.kyverno.io/title: Add Prometheus Role
|
||||||
|
policies.kyverno.io/category: Monitoring
|
||||||
|
policies.kyverno.io/subject: RoleBinding
|
||||||
|
policies.kyverno.io/minversion: 1.6.0
|
||||||
|
policies.kyverno.io/description: >-
|
||||||
|
This policy is responsible for ensuring that a Role for the prometheus
|
||||||
|
monitoring instances is created to enable monitoring of the namespace in
|
||||||
|
question.
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
spec:
|
||||||
|
background: true
|
||||||
|
generateExisting: true
|
||||||
|
rules:
|
||||||
|
- name: generate-prometheus-role
|
||||||
|
match:
|
||||||
|
any:
|
||||||
|
- resources:
|
||||||
|
kinds:
|
||||||
|
- Namespace
|
||||||
|
generate:
|
||||||
|
synchronize: true
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: "{{ `{{` }}request.object.metadata.name }}"
|
||||||
|
data:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- services
|
||||||
|
- endpoints
|
||||||
|
- pods
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- extensions
|
||||||
|
resources:
|
||||||
|
- ingresses
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- networking.k8s.io
|
||||||
|
resources:
|
||||||
|
- ingresses
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
|
||||||
|
{{ end }}
|
||||||
53
templates/ClusterPolicy-Prometheus-RoleBinding.yaml
Normal file
53
templates/ClusterPolicy-Prometheus-RoleBinding.yaml
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }}
|
||||||
|
---
|
||||||
|
apiVersion: kyverno.io/v1
|
||||||
|
kind: ClusterPolicy
|
||||||
|
metadata:
|
||||||
|
name: add-prometheus-role-binding
|
||||||
|
annotations:
|
||||||
|
policies.kyverno.io/title: Add Prometheus RoleBinding
|
||||||
|
policies.kyverno.io/category: Monitoring
|
||||||
|
policies.kyverno.io/subject: RoleBinding
|
||||||
|
policies.kyverno.io/minversion: 1.6.0
|
||||||
|
policies.kyverno.io/description: >-
|
||||||
|
This policy is responsible for ensuring that a RoleBinding for the prometheus
|
||||||
|
monitoring instances is created to enable monitoring of the namespace in
|
||||||
|
question.
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
spec:
|
||||||
|
background: true
|
||||||
|
generateExisting: true
|
||||||
|
rules:
|
||||||
|
- name: generate-prometheus-binding
|
||||||
|
match:
|
||||||
|
any:
|
||||||
|
- resources:
|
||||||
|
kinds:
|
||||||
|
- Namespace
|
||||||
|
generate:
|
||||||
|
synchronize: true
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: "{{ `{{` }}request.object.metadata.name }}"
|
||||||
|
data:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: Role
|
||||||
|
name: prometheus-k8s
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}"
|
||||||
|
{{ end }}
|
||||||
42
templates/ClusterRole-GrafanaAgent.yaml
Normal file
42
templates/ClusterRole-GrafanaAgent.yaml
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
name: grafana-agent
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- nodes
|
||||||
|
- nodes/proxy
|
||||||
|
- services
|
||||||
|
- endpoints
|
||||||
|
- pods
|
||||||
|
- events
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- nonResourceURLs:
|
||||||
|
- /metrics
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- apiGroups:
|
||||||
|
- authentication.k8s.io
|
||||||
|
resources:
|
||||||
|
- tokenreviews
|
||||||
|
verbs:
|
||||||
|
- create
|
||||||
|
- apiGroups:
|
||||||
|
- authorization.k8s.io
|
||||||
|
resources:
|
||||||
|
- subjectaccessreviews
|
||||||
|
verbs:
|
||||||
|
- create
|
||||||
26
templates/ClusterRole-aggregated-metrics-reader.yaml
Normal file
26
templates/ClusterRole-aggregated-metrics-reader.yaml
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
rbac.authorization.k8s.io/aggregate-to-admin: "true"
|
||||||
|
rbac.authorization.k8s.io/aggregate-to-edit: "true"
|
||||||
|
rbac.authorization.k8s.io/aggregate-to-view: "true"
|
||||||
|
name: system:aggregated-metrics-reader
|
||||||
|
namespace: monitoring
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- metrics.k8s.io
|
||||||
|
resources:
|
||||||
|
- pods
|
||||||
|
- nodes
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: resource-metrics:system:auth-delegator
|
||||||
|
namespace: monitoring
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: system:auth-delegator
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: monitoring
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: hpa-controller-custom-metrics
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: custom-metrics-server-resources
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: horizontal-pod-autoscaler
|
||||||
|
namespace: kube-system
|
||||||
20
templates/ClusterRole-binding-prometheus.yaml
Normal file
20
templates/ClusterRole-binding-prometheus.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: monitoring
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: prometheus-adapter
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: monitoring
|
||||||
18
templates/ClusterRole-grafana-SideCar.yaml
Normal file
18
templates/ClusterRole-grafana-SideCar.yaml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}}
|
||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: Grafana-Sidecar
|
||||||
|
rules:
|
||||||
|
- apiGroups: [""]
|
||||||
|
resources: ["configmaps"]
|
||||||
|
verbs: ["get", "watch", "list"]
|
||||||
|
{{- end }}
|
||||||
21
templates/ClusterRole-kube-monitor-proxy.yaml
Normal file
21
templates/ClusterRole-kube-monitor-proxy.yaml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
name: kube-monitor-proxy
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
rules:
|
||||||
|
- apiGroups: ["authentication.k8s.io"]
|
||||||
|
resources:
|
||||||
|
- tokenreviews
|
||||||
|
verbs: ["create"]
|
||||||
|
- apiGroups: ["authorization.k8s.io"]
|
||||||
|
resources:
|
||||||
|
- subjectaccessreviews
|
||||||
|
verbs: ["create"]
|
||||||
20
templates/ClusterRole-kubeStateMetrics-1.yaml
Normal file
20
templates/ClusterRole-kubeStateMetrics-1.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: kube-state-metrics
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: kube-state-metrics
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: kube-state-metrics
|
||||||
|
namespace: monitoring
|
||||||
132
templates/ClusterRole-kubeStateMetrics.yaml
Normal file
132
templates/ClusterRole-kubeStateMetrics.yaml
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: kube-state-metrics
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- configmaps
|
||||||
|
- secrets
|
||||||
|
- nodes
|
||||||
|
- pods
|
||||||
|
- services
|
||||||
|
- serviceaccounts
|
||||||
|
- resourcequotas
|
||||||
|
- replicationcontrollers
|
||||||
|
- limitranges
|
||||||
|
- persistentvolumeclaims
|
||||||
|
- persistentvolumes
|
||||||
|
- namespaces
|
||||||
|
- endpoints
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- apps
|
||||||
|
resources:
|
||||||
|
- statefulsets
|
||||||
|
- daemonsets
|
||||||
|
- deployments
|
||||||
|
- replicasets
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- batch
|
||||||
|
resources:
|
||||||
|
- cronjobs
|
||||||
|
- jobs
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- autoscaling
|
||||||
|
resources:
|
||||||
|
- horizontalpodautoscalers
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- authentication.k8s.io
|
||||||
|
resources:
|
||||||
|
- tokenreviews
|
||||||
|
verbs:
|
||||||
|
- create
|
||||||
|
- apiGroups:
|
||||||
|
- authorization.k8s.io
|
||||||
|
resources:
|
||||||
|
- subjectaccessreviews
|
||||||
|
verbs:
|
||||||
|
- create
|
||||||
|
- apiGroups:
|
||||||
|
- policy
|
||||||
|
resources:
|
||||||
|
- poddisruptionbudgets
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- certificates.k8s.io
|
||||||
|
resources:
|
||||||
|
- certificatesigningrequests
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- discovery.k8s.io
|
||||||
|
resources:
|
||||||
|
- endpointslices
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- storage.k8s.io
|
||||||
|
resources:
|
||||||
|
- storageclasses
|
||||||
|
- volumeattachments
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- admissionregistration.k8s.io
|
||||||
|
resources:
|
||||||
|
- mutatingwebhookconfigurations
|
||||||
|
- validatingwebhookconfigurations
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- networking.k8s.io
|
||||||
|
resources:
|
||||||
|
- networkpolicies
|
||||||
|
- ingressclasses
|
||||||
|
- ingresses
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- coordination.k8s.io
|
||||||
|
resources:
|
||||||
|
- leases
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- rbac.authorization.k8s.io
|
||||||
|
resources:
|
||||||
|
- clusterrolebindings
|
||||||
|
- clusterroles
|
||||||
|
- rolebindings
|
||||||
|
- roles
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
@ -0,0 +1,18 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: resource-metrics-server-resources
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- metrics.k8s.io
|
||||||
|
resources:
|
||||||
|
- '*'
|
||||||
|
verbs:
|
||||||
|
- '*'
|
||||||
24
templates/ClusterRole-prometheus-adapter.yaml
Normal file
24
templates/ClusterRole-prometheus-adapter.yaml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-adapter
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- nodes
|
||||||
|
- namespaces
|
||||||
|
- pods
|
||||||
|
- services
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
23
templates/ClusterRole-prometheus.yaml
Normal file
23
templates/ClusterRole-prometheus.yaml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRole
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- nodes/metrics
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- nonResourceURLs:
|
||||||
|
- /metrics
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
20
templates/ClusterRoleBinding-Grafana-Agent.yaml
Normal file
20
templates/ClusterRoleBinding-Grafana-Agent.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: grafana-agent
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: grafana-agent
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: grafana-agent
|
||||||
|
namespace: monitoring
|
||||||
22
templates/ClusterRoleBinding-Grafana-SideCar.yaml
Normal file
22
templates/ClusterRoleBinding-Grafana-SideCar.yaml
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}}
|
||||||
|
---
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: Grafana-Sidecar
|
||||||
|
roleRef:
|
||||||
|
kind: ClusterRole
|
||||||
|
name: Grafana-Sidecar
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: grafana
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
|
||||||
|
{{- end }}
|
||||||
20
templates/ClusterRoleBinding-kube-monitor-proxy.yaml
Normal file
20
templates/ClusterRoleBinding-kube-monitor-proxy.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
name: kube-monitor-proxy
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: kube-monitor-proxy
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: kube-monitor-proxy
|
||||||
|
namespace: monitoring
|
||||||
20
templates/ClusterRoleBinding-prometheus.yaml
Normal file
20
templates/ClusterRoleBinding-prometheus.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: ClusterRoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: ClusterRole
|
||||||
|
name: prometheus-k8s
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
130
templates/ConfigMap-Grafana.yaml
Normal file
130
templates/ConfigMap-Grafana.yaml
Normal file
@ -0,0 +1,130 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: grafana-config
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
data:
|
||||||
|
prometheus.yaml: |-
|
||||||
|
{
|
||||||
|
"apiVersion": 1,
|
||||||
|
"datasources": [
|
||||||
|
{
|
||||||
|
"access":"proxy",
|
||||||
|
"editable": true,
|
||||||
|
"name": "prometheus",
|
||||||
|
"orgId": 1,
|
||||||
|
"type": "prometheus",
|
||||||
|
"url": "http://prometheus-config-map.monitoring.svc:9090",
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
oncall-app.yaml: |-
|
||||||
|
apiVersion: 1
|
||||||
|
apps:
|
||||||
|
- type: grafana-oncall-app
|
||||||
|
name: grafana-oncall-app
|
||||||
|
disabled: false
|
||||||
|
jsonData:
|
||||||
|
#user: admin
|
||||||
|
grafanaUrl: http://grafana.monitoring.svc:3000
|
||||||
|
onCallApiUrl: http://oncall.monitoring.svc:8080
|
||||||
|
secureJsonData:
|
||||||
|
#password: admin
|
||||||
|
onCallInvitationToken: yaD3a6dkYV0CFinTlMPb42Mi9
|
||||||
|
#http://127.0.0.1:8080/integrations/v1/alertmanager/yaD3a6dkYV0CFinTlMPb42Mi9/
|
||||||
|
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: grafana-alertmanager
|
||||||
|
namespace: monitoring
|
||||||
|
data:
|
||||||
|
alertmanager.yaml: |-
|
||||||
|
{
|
||||||
|
"template_files": {},
|
||||||
|
"template_file_provenances": {},
|
||||||
|
"alertmanager_config": {
|
||||||
|
"route": {
|
||||||
|
"receiver": "Grafana Alerting 😊",
|
||||||
|
"group_by": [
|
||||||
|
"grafana_folder",
|
||||||
|
"alertname"
|
||||||
|
],
|
||||||
|
"routes": [
|
||||||
|
{
|
||||||
|
"receiver": "Grafana Alerting 😊",
|
||||||
|
"object_matchers": [
|
||||||
|
[
|
||||||
|
"severity",
|
||||||
|
"!=",
|
||||||
|
""
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"severity",
|
||||||
|
"=",
|
||||||
|
"critical"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"severity",
|
||||||
|
"=",
|
||||||
|
"info"
|
||||||
|
],
|
||||||
|
[
|
||||||
|
"severity",
|
||||||
|
"=",
|
||||||
|
"warning"
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"continue": true,
|
||||||
|
"group_wait": "1s",
|
||||||
|
"group_interval": "2s",
|
||||||
|
"repeat_interval": "15m"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"group_wait": "1s",
|
||||||
|
"group_interval": "2s",
|
||||||
|
"repeat_interval": "3m"
|
||||||
|
},
|
||||||
|
"templates": null,
|
||||||
|
"receivers": [
|
||||||
|
{
|
||||||
|
"name": "Grafana Alerting 😊",
|
||||||
|
"grafana_managed_receiver_configs": [
|
||||||
|
{
|
||||||
|
"uid": "4dqvJSfVkz",
|
||||||
|
"name": "Grafana Alerting 😊",
|
||||||
|
"type": "webhook",
|
||||||
|
"disableResolveMessage": false,
|
||||||
|
"settings": {
|
||||||
|
"httpMethod": "POST",
|
||||||
|
"url": "http://host.docker.internal:8080/integrations/v1/grafana_alerting/dXStYoK9Z15VZW8R8AtfyIwtu/"
|
||||||
|
},
|
||||||
|
"secureFields": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
---
|
||||||
284
templates/ConfigMap-GrafanaAgent.yaml
Normal file
284
templates/ConfigMap-GrafanaAgent.yaml
Normal file
@ -0,0 +1,284 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: grafana-agent
|
||||||
|
namespace: monitoring
|
||||||
|
data:
|
||||||
|
agent.yaml: |
|
||||||
|
metrics:
|
||||||
|
wal_directory: /tmp/wal
|
||||||
|
|
||||||
|
logs:
|
||||||
|
positions_directory: "/tmp"
|
||||||
|
|
||||||
|
configs:
|
||||||
|
|
||||||
|
- name: node-logs
|
||||||
|
clients:
|
||||||
|
- url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push
|
||||||
|
backoff_config:
|
||||||
|
min_period: 10s
|
||||||
|
max_period: 5m
|
||||||
|
max_retries: 10
|
||||||
|
|
||||||
|
limits_config:
|
||||||
|
readline_rate_enabled: true
|
||||||
|
readline_rate: 250
|
||||||
|
readline_burst: 750
|
||||||
|
readline_rate_drop: false
|
||||||
|
max_streams: 500
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
|
||||||
|
- job_name: systemd-journal
|
||||||
|
journal:
|
||||||
|
path: /host/root/run/log/journal
|
||||||
|
json: true
|
||||||
|
max_age: 24h
|
||||||
|
relabel_configs:
|
||||||
|
|
||||||
|
- source_labels:
|
||||||
|
- __journal__systemd_unit
|
||||||
|
target_label: systemd_unit
|
||||||
|
- source_labels:
|
||||||
|
- __journal__hostname
|
||||||
|
target_label: node
|
||||||
|
- source_labels:
|
||||||
|
- __journal_syslog_identifier
|
||||||
|
target_label: syslog_identifier
|
||||||
|
- target_label: "job"
|
||||||
|
replacement: "systemd-journal"
|
||||||
|
- target_label: "job_name"
|
||||||
|
replacement: "journal-logs"
|
||||||
|
- target_label: hostname
|
||||||
|
replacement: "${HOSTNAME}"
|
||||||
|
|
||||||
|
pipeline_stages:
|
||||||
|
- json:
|
||||||
|
expressions:
|
||||||
|
pid: _PID
|
||||||
|
userId: _UID
|
||||||
|
application: _COMM
|
||||||
|
priority: PRIORITY
|
||||||
|
|
||||||
|
- labels:
|
||||||
|
application:
|
||||||
|
#level:
|
||||||
|
pid:
|
||||||
|
userId:
|
||||||
|
priority:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: '{{"{{"}} ToLower .Value {{"}}"}}'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="7"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'debug'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="6"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'info'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="5"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'notice'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="4"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'warning'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="3"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'error'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="2"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'crit'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="1"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'alert'
|
||||||
|
- match:
|
||||||
|
selector: '{priority="0"}'
|
||||||
|
stages:
|
||||||
|
- template:
|
||||||
|
source: level
|
||||||
|
template: 'emerg'
|
||||||
|
- labels:
|
||||||
|
level:
|
||||||
|
|
||||||
|
|
||||||
|
- job_name: var-logs
|
||||||
|
static_configs:
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'aptitude'
|
||||||
|
__path__: /var/log/apt/*.log
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'dpkg'
|
||||||
|
__path__: /var/log/dpkg.log
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'messages'
|
||||||
|
__path__: /var/log/messages
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'syslog'
|
||||||
|
__path__: /var/log/syslog
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'kernlog'
|
||||||
|
__path__: /var/log/kern.log
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'auth-log'
|
||||||
|
__path__: /var/log/auth.log
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'boot-log'
|
||||||
|
__path__: /var/log/boot.log
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'daemon-log'
|
||||||
|
__path__: /var/log/daemon.log
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'faillog-log'
|
||||||
|
__path__: /var/log/faillog
|
||||||
|
- targets: [localhost]
|
||||||
|
labels:
|
||||||
|
job_name: 'lastlog-log'
|
||||||
|
__path__: /var/log/lastlog
|
||||||
|
|
||||||
|
relabel_configs:
|
||||||
|
- target_label: hostname
|
||||||
|
replacement: "${HOSTNAME}"
|
||||||
|
- target_label: job
|
||||||
|
replacement: log-files
|
||||||
|
- target_label: node
|
||||||
|
replacement: "${HOSTNAME}"
|
||||||
|
|
||||||
|
pipeline_stages:
|
||||||
|
- timestamp:
|
||||||
|
format: RFC3339
|
||||||
|
source: time
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
- job_name: pod-logs
|
||||||
|
kubernetes_sd_configs:
|
||||||
|
- role: pod
|
||||||
|
|
||||||
|
pipeline_stages:
|
||||||
|
- cri: {}
|
||||||
|
|
||||||
|
- regex:
|
||||||
|
expression: '(\s|\t|level=)?(?P<level>trace|debug|info|warn|error|TRACE|DEBUG|INFO|WARN|ERROR)(\s|\t)'
|
||||||
|
|
||||||
|
- labels:
|
||||||
|
level:
|
||||||
|
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
target_label: __host__
|
||||||
|
- action: labelmap
|
||||||
|
regex: __meta_kubernetes_pod_label_(.+)
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_namespace
|
||||||
|
target_label: namespace
|
||||||
|
- target_label: job
|
||||||
|
replacement: kubernetes_sd
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_name
|
||||||
|
target_label: pod
|
||||||
|
- action: replace
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_container_name
|
||||||
|
target_label: container
|
||||||
|
- replacement: /var/log/pods/*$1/*.log
|
||||||
|
separator: /
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_uid
|
||||||
|
- __meta_kubernetes_pod_container_name
|
||||||
|
target_label: __path__
|
||||||
|
- target_label: "job_name"
|
||||||
|
replacement: "kubernetes-logs"
|
||||||
|
- target_label: hostname
|
||||||
|
replacement: "${HOSTNAME}"
|
||||||
|
- target_label: node
|
||||||
|
source_labels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
|
||||||
|
|
||||||
|
integrations:
|
||||||
|
|
||||||
|
node_exporter:
|
||||||
|
enabled: true
|
||||||
|
rootfs_path: /host/root
|
||||||
|
sysfs_path: /host/sys
|
||||||
|
procfs_path: /host/proc
|
||||||
|
udev_data_path: /host/root/run/udev/data
|
||||||
|
|
||||||
|
# collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/)
|
||||||
|
filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)"
|
||||||
|
#filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$"
|
||||||
|
filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|ugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|ocfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
|
||||||
|
|
||||||
|
|
||||||
|
netclass_ignored_devices: "^(veth.*|cali.*|[a-f0-9]{15})$"
|
||||||
|
netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$"
|
||||||
|
|
||||||
|
scrape_integration: true
|
||||||
|
|
||||||
|
include_exporter_metrics: true
|
||||||
|
enable_collectors:
|
||||||
|
- uname
|
||||||
|
|
||||||
|
|
||||||
|
syslog_server.yaml: |
|
||||||
|
# REF: https://grafana.com/docs/loki/latest/send-data/promtail/configuration/#example-syslog-config
|
||||||
|
server:
|
||||||
|
http_listen_port: 9080
|
||||||
|
grpc_listen_port: 0
|
||||||
|
|
||||||
|
positions:
|
||||||
|
filename: /tmp/syslog_server_positions.yaml
|
||||||
|
|
||||||
|
clients:
|
||||||
|
- url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: syslog
|
||||||
|
syslog:
|
||||||
|
listen_address: 0.0.0.0:1514
|
||||||
|
labels:
|
||||||
|
job: "syslog"
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: ['__syslog_message_hostname']
|
||||||
|
target_label: 'host'
|
||||||
31
templates/ConfigMap-GrafanaProvisioning.yaml
Normal file
31
templates/ConfigMap-GrafanaProvisioning.yaml
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}}
|
||||||
|
---
|
||||||
|
# Provisioning config
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: provisioning-config
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
|
||||||
|
data:
|
||||||
|
provisioning.yaml: |-
|
||||||
|
apiVersion: 1
|
||||||
|
providers:
|
||||||
|
- name: 'configmap-dashboard-provider'
|
||||||
|
orgId: 1
|
||||||
|
folder: ''
|
||||||
|
folderUid: ''
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
updateIntervalSeconds: 10
|
||||||
|
allowUiUpdates: false
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards
|
||||||
|
foldersFromFilesStructure: true
|
||||||
|
{{- end }}
|
||||||
72
templates/ConfigMap-prometheus-adapter.yaml
Normal file
72
templates/ConfigMap-prometheus-adapter.yaml
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: adapter-config
|
||||||
|
namespace: monitoring
|
||||||
|
data:
|
||||||
|
config.yaml: |-
|
||||||
|
"resourceRules":
|
||||||
|
"cpu":
|
||||||
|
"containerLabel": "container"
|
||||||
|
"containerQuery": |
|
||||||
|
sum by (<<.GroupBy>>) (
|
||||||
|
irate (
|
||||||
|
container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[120s]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
"nodeQuery": |
|
||||||
|
sum by (<<.GroupBy>>) (
|
||||||
|
1 - irate(
|
||||||
|
node_cpu_seconds_total{mode="idle"}[60s]
|
||||||
|
)
|
||||||
|
* on(namespace, pod) group_left(node) (
|
||||||
|
node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
or sum by (<<.GroupBy>>) (
|
||||||
|
1 - irate(
|
||||||
|
windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
"resources":
|
||||||
|
"overrides":
|
||||||
|
"namespace":
|
||||||
|
"resource": "namespace"
|
||||||
|
"node":
|
||||||
|
"resource": "node"
|
||||||
|
"pod":
|
||||||
|
"resource": "pod"
|
||||||
|
"memory":
|
||||||
|
"containerLabel": "container"
|
||||||
|
"containerQuery": |
|
||||||
|
sum by (<<.GroupBy>>) (
|
||||||
|
container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""}
|
||||||
|
)
|
||||||
|
"nodeQuery": |
|
||||||
|
sum by (<<.GroupBy>>) (
|
||||||
|
node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>}
|
||||||
|
-
|
||||||
|
node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}
|
||||||
|
)
|
||||||
|
or sum by (<<.GroupBy>>) (
|
||||||
|
windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>}
|
||||||
|
-
|
||||||
|
windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>}
|
||||||
|
)
|
||||||
|
"resources":
|
||||||
|
"overrides":
|
||||||
|
"instance":
|
||||||
|
"resource": "node"
|
||||||
|
"namespace":
|
||||||
|
"resource": "namespace"
|
||||||
|
"pod":
|
||||||
|
"resource": "pod"
|
||||||
|
"window": "5m"
|
||||||
138
templates/Daemonset-GrafanaAgent.yaml
Normal file
138
templates/Daemonset-GrafanaAgent.yaml
Normal file
@ -0,0 +1,138 @@
|
|||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
metricsJob: node-exporter
|
||||||
|
cadvisormetricsJob: cadvisor
|
||||||
|
nodeExportermetricsJob: node
|
||||||
|
name: grafana-agent
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.grafana_agent.namespace }}"
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
metricsJob: node-exporter
|
||||||
|
cadvisormetricsJob: cadvisor
|
||||||
|
nodeExportermetricsJob: node
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
metricsJob: node-exporter
|
||||||
|
cadvisormetricsJob: cadvisor
|
||||||
|
nodeExportermetricsJob: node
|
||||||
|
spec:
|
||||||
|
automountServiceAccountToken: true
|
||||||
|
containers:
|
||||||
|
- args:
|
||||||
|
- --server.http.address=0.0.0.0:12345
|
||||||
|
- --config.file=/etc/agent/agent.yaml
|
||||||
|
- --config.expand-env=true
|
||||||
|
name: grafana-agent
|
||||||
|
image: "{{ .Values.nfc_monitoring.grafana_agent.image.name }}:{{ .Values.nfc_monitoring.grafana_agent.image.tag }}"
|
||||||
|
#imagePullPolicy: Never
|
||||||
|
ports:
|
||||||
|
- containerPort: 12345
|
||||||
|
name: grafana-metrics
|
||||||
|
protocol: TCP
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 1000m
|
||||||
|
memory: 180Mi
|
||||||
|
requests:
|
||||||
|
cpu: 40m
|
||||||
|
memory: 180Mi
|
||||||
|
securityContext:
|
||||||
|
capabilities:
|
||||||
|
add:
|
||||||
|
- SYS_TIME
|
||||||
|
# drop:
|
||||||
|
# - ALL
|
||||||
|
readOnlyRootFilesystem: false
|
||||||
|
privileged: true
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /host/sys
|
||||||
|
mountPropagation: HostToContainer
|
||||||
|
name: sys
|
||||||
|
readOnly: true
|
||||||
|
- mountPath: /host/proc
|
||||||
|
mountPropagation: HostToContainer
|
||||||
|
name: proc
|
||||||
|
readOnly: true
|
||||||
|
- mountPath: /host/root
|
||||||
|
mountPropagation: HostToContainer
|
||||||
|
name: rootfs
|
||||||
|
readOnly: true
|
||||||
|
- mountPath: /var/log
|
||||||
|
mountPropagation: HostToContainer
|
||||||
|
name: logs
|
||||||
|
readOnly: true
|
||||||
|
- name: config
|
||||||
|
mountPath: "/etc/agent"
|
||||||
|
readOnly: false
|
||||||
|
- name: temp
|
||||||
|
mountPath: "/tmp"
|
||||||
|
readOnly: false
|
||||||
|
- name: agent-data
|
||||||
|
mountPath: "/etc/agent/data"
|
||||||
|
readOnly: false
|
||||||
|
dnsPolicy: ClusterFirst
|
||||||
|
volumes:
|
||||||
|
- hostPath:
|
||||||
|
path: /sys
|
||||||
|
name: sys
|
||||||
|
- hostPath:
|
||||||
|
path: /proc
|
||||||
|
name: proc
|
||||||
|
- hostPath:
|
||||||
|
path: /
|
||||||
|
name: rootfs
|
||||||
|
- hostPath:
|
||||||
|
path: /var/log
|
||||||
|
name: logs
|
||||||
|
- name: config
|
||||||
|
configMap:
|
||||||
|
name: grafana-agent
|
||||||
|
items:
|
||||||
|
- key: "agent.yaml"
|
||||||
|
path: "agent.yaml"
|
||||||
|
- name: temp
|
||||||
|
emptyDir: {}
|
||||||
|
- name: agent-data
|
||||||
|
emptyDir: {}
|
||||||
|
|
||||||
|
- name: var-run
|
||||||
|
hostPath:
|
||||||
|
path: /var/run
|
||||||
|
- name: containerd
|
||||||
|
hostPath:
|
||||||
|
path: /var/lib/contairnerd
|
||||||
|
- name: disk
|
||||||
|
hostPath:
|
||||||
|
path: /dev/disk
|
||||||
|
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/os: linux
|
||||||
|
hostNetwork: true
|
||||||
|
hostPID: true
|
||||||
|
priorityClassName: system-cluster-critical
|
||||||
|
serviceAccountName: grafana-agent
|
||||||
|
tolerations:
|
||||||
|
- operator: Exists
|
||||||
136
templates/Daemonset-kube-monitor-proxy.yaml
Normal file
136
templates/Daemonset-kube-monitor-proxy.yaml
Normal file
@ -0,0 +1,136 @@
|
|||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: DaemonSet
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
|
||||||
|
nodeExportermetricsJob: node
|
||||||
|
name_kcm: kube-controller-manager
|
||||||
|
|
||||||
|
name: kube-monitor-proxy
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.kube_monitor_proxy.namespace }}"
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
spec:
|
||||||
|
automountServiceAccountToken: true
|
||||||
|
hostNetwork: true
|
||||||
|
affinity:
|
||||||
|
nodeAffinity:
|
||||||
|
requiredDuringSchedulingIgnoredDuringExecution:
|
||||||
|
nodeSelectorTerms:
|
||||||
|
- matchExpressions:
|
||||||
|
- key: node-role.kubernetes.io/control-plane
|
||||||
|
operator: Exists
|
||||||
|
containers:
|
||||||
|
|
||||||
|
- args:
|
||||||
|
- --logtostderr
|
||||||
|
- --v=10
|
||||||
|
- --secure-listen-address=[$(IP)]:11257
|
||||||
|
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||||
|
- --upstream=https://127.0.0.1:10257/
|
||||||
|
- --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
- --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
env:
|
||||||
|
- name: NODE-IP
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
apiVersion: v1
|
||||||
|
fieldPath: status.hostIP
|
||||||
|
- name: IP
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: status.podIP
|
||||||
|
image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}"
|
||||||
|
name: kube-rbac-proxy-kube-ctrl-mgr
|
||||||
|
ports:
|
||||||
|
- containerPort: 10257
|
||||||
|
#hostPort: 9100
|
||||||
|
name: kube-ctrl-mgr
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 200m
|
||||||
|
memory: 256Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 20Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
runAsGroup: 65532
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 65532
|
||||||
|
|
||||||
|
|
||||||
|
- args:
|
||||||
|
- --logtostderr
|
||||||
|
- --v=10
|
||||||
|
- --secure-listen-address=[$(IP)]:11259
|
||||||
|
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||||
|
- --upstream=https://127.0.0.1:10259/
|
||||||
|
- --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
- --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
env:
|
||||||
|
- name: NODE-IP
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
apiVersion: v1
|
||||||
|
fieldPath: status.hostIP
|
||||||
|
- name: IP
|
||||||
|
valueFrom:
|
||||||
|
fieldRef:
|
||||||
|
fieldPath: status.podIP
|
||||||
|
image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}"
|
||||||
|
name: kube-rbac-proxy-kube-scheduler
|
||||||
|
ports:
|
||||||
|
- containerPort: 10259
|
||||||
|
#hostPort: 9100
|
||||||
|
name: kube-scheduler
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 200m
|
||||||
|
memory: 256Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 20Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
runAsGroup: 65532
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 65532
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/os: linux
|
||||||
|
hostPID: true
|
||||||
|
priorityClassName: system-cluster-critical
|
||||||
|
serviceAccountName: kube-monitor-proxy
|
||||||
|
tolerations:
|
||||||
|
- operator: Exists
|
||||||
112
templates/Deployment-kubeStateMetrics.yaml
Normal file
112
templates/Deployment-kubeStateMetrics.yaml
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: kube-state-metrics
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}"
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
annotations:
|
||||||
|
kubectl.kubernetes.io/default-container: kube-state-metrics
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
spec:
|
||||||
|
automountServiceAccountToken: true
|
||||||
|
containers:
|
||||||
|
- args:
|
||||||
|
- --host=127.0.0.1
|
||||||
|
- --port=8081
|
||||||
|
- --telemetry-host=127.0.0.1
|
||||||
|
- --telemetry-port=8082
|
||||||
|
image: "{{ .Values.nfc_monitoring.kube_state_metrics.image.name }}:{{ .Values.nfc_monitoring.kube_state_metrics.image.tag }}"
|
||||||
|
name: kube-state-metrics
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 200m
|
||||||
|
memory: 250Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 190Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
runAsUser: 65534
|
||||||
|
- args:
|
||||||
|
- --logtostderr
|
||||||
|
- --secure-listen-address=:8443
|
||||||
|
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||||
|
- --upstream=http://127.0.0.1:8081/
|
||||||
|
image: quay.io/brancz/kube-rbac-proxy:v0.14.0
|
||||||
|
name: kube-rbac-proxy-main
|
||||||
|
ports:
|
||||||
|
- containerPort: 8443
|
||||||
|
name: https-main
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 100m
|
||||||
|
memory: 40Mi
|
||||||
|
requests:
|
||||||
|
cpu: 50m
|
||||||
|
memory: 20Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
runAsGroup: 65532
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 65532
|
||||||
|
- args:
|
||||||
|
- --logtostderr
|
||||||
|
- --secure-listen-address=:9443
|
||||||
|
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
|
||||||
|
- --upstream=http://127.0.0.1:8082/
|
||||||
|
image: quay.io/brancz/kube-rbac-proxy:v0.14.0
|
||||||
|
name: kube-rbac-proxy-self
|
||||||
|
ports:
|
||||||
|
- containerPort: 9443
|
||||||
|
name: https-self
|
||||||
|
resources:
|
||||||
|
limits:
|
||||||
|
cpu: 20m
|
||||||
|
memory: 40Mi
|
||||||
|
requests:
|
||||||
|
cpu: 10m
|
||||||
|
memory: 20Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
runAsGroup: 65532
|
||||||
|
runAsNonRoot: true
|
||||||
|
runAsUser: 65532
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/os: linux
|
||||||
|
serviceAccountName: kube-state-metrics
|
||||||
102
templates/Deployment-prometheus-adapter.yaml
Normal file
102
templates/Deployment-prometheus-adapter.yaml
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.prometheus_adaptor.namespace }}"
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
strategy:
|
||||||
|
rollingUpdate:
|
||||||
|
maxSurge: 1
|
||||||
|
maxUnavailable: 1
|
||||||
|
template:
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
spec:
|
||||||
|
affinity:
|
||||||
|
{{- toYaml .Values.nfc_monitoring.prometheus_adaptor.affinity | nindent 8 }}
|
||||||
|
automountServiceAccountToken: true
|
||||||
|
containers:
|
||||||
|
- args:
|
||||||
|
- --cert-dir=/var/run/serving-cert
|
||||||
|
- --config=/etc/adapter/config.yaml
|
||||||
|
- --metrics-relist-interval=1m
|
||||||
|
- --prometheus-url=https://prometheus.monitoring.svc:9090/
|
||||||
|
- --secure-port=6443
|
||||||
|
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
|
||||||
|
image: "{{ .Values.nfc_monitoring.prometheus_adaptor.image.name }}:{{ .Values.nfc_monitoring.prometheus_adaptor.image.tag }}"
|
||||||
|
livenessProbe:
|
||||||
|
failureThreshold: 5
|
||||||
|
httpGet:
|
||||||
|
path: /livez
|
||||||
|
port: https
|
||||||
|
scheme: HTTPS
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 5
|
||||||
|
name: prometheus-adapter
|
||||||
|
ports:
|
||||||
|
- containerPort: 6443
|
||||||
|
name: https
|
||||||
|
readinessProbe:
|
||||||
|
failureThreshold: 5
|
||||||
|
httpGet:
|
||||||
|
path: /readyz
|
||||||
|
port: https
|
||||||
|
scheme: HTTPS
|
||||||
|
initialDelaySeconds: 30
|
||||||
|
periodSeconds: 5
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
cpu: 102m
|
||||||
|
memory: 180Mi
|
||||||
|
securityContext:
|
||||||
|
allowPrivilegeEscalation: false
|
||||||
|
capabilities:
|
||||||
|
drop:
|
||||||
|
- ALL
|
||||||
|
readOnlyRootFilesystem: true
|
||||||
|
terminationMessagePolicy: FallbackToLogsOnError
|
||||||
|
volumeMounts:
|
||||||
|
- mountPath: /tmp
|
||||||
|
name: tmpfs
|
||||||
|
readOnly: false
|
||||||
|
- mountPath: /var/run/serving-cert
|
||||||
|
name: volume-serving-cert
|
||||||
|
readOnly: false
|
||||||
|
- mountPath: /etc/adapter
|
||||||
|
name: config
|
||||||
|
readOnly: false
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/os: linux
|
||||||
|
securityContext: {}
|
||||||
|
serviceAccountName: prometheus-adapter
|
||||||
|
volumes:
|
||||||
|
- emptyDir: {}
|
||||||
|
name: tmpfs
|
||||||
|
- emptyDir: {}
|
||||||
|
name: volume-serving-cert
|
||||||
|
- configMap:
|
||||||
|
name: adapter-config
|
||||||
|
name: config
|
||||||
163
templates/Grafana-Grafana.yaml
Normal file
163
templates/Grafana-Grafana.yaml
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
---
# Grafana instance (grafana-operator v1beta1), its Deployment spec and PVC.
apiVersion: grafana.integreatly.org/v1beta1
kind: Grafana
metadata:
  labels:
    app.kubernetes.io/component: graphing
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: grafana
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
  name: grafana
  namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
spec:
  config:
    log:
      mode: "console"
    auth:
      disable_login_form: "false"
    security:
      admin_user: "{{ .Values.nfc_monitoring.grafana.admin_user }}"
      admin_password: "{{ .Values.nfc_monitoring.grafana.admin_password }}"
  deployment:
    metadata:
      labels:
        app.kubernetes.io/component: graphing
        app.kubernetes.io/instance: k8s
        app.kubernetes.io/name: grafana
        app.kubernetes.io/managed-by: {{ $.Release.Service }}
        app.kubernetes.io/part-of: {{ $.Chart.Name }}
        app.kubernetes.io/version: {{ $.Chart.Version }}
    spec:
      replicas: {{ .Values.nfc_monitoring.grafana.replicas | int }}
      selector:
        matchLabels:
          app.kubernetes.io/component: graphing
          app.kubernetes.io/instance: k8s
          app.kubernetes.io/name: grafana
          app.kubernetes.io/managed-by: {{ $.Release.Service }}
          app.kubernetes.io/part-of: {{ $.Chart.Name }}
      strategy:
        rollingUpdate:
          maxSurge: 1
          maxUnavailable: 1
      template:
        metadata:
          labels:
            app.kubernetes.io/component: graphing
            app.kubernetes.io/instance: k8s
            app.kubernetes.io/name: grafana
            app.kubernetes.io/managed-by: {{ $.Release.Service }}
            app.kubernetes.io/part-of: {{ $.Chart.Name }}
            app.kubernetes.io/version: {{ $.Chart.Version }}
        spec:
          affinity:
            {{- toYaml .Values.nfc_monitoring.grafana.affinity | nindent 12 }}
          {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }}
          # The sidecar watches ConfigMaps via the API, so it needs a token.
          automountServiceAccountToken: true
          {{ end }}
          containers:
            - name: grafana
              image: "{{ .Values.nfc_monitoring.grafana.image.name }}:{{ .Values.nfc_monitoring.grafana.image.tag }}"
              env:
                - name: GF_INSTALL_PLUGINS
                  value: agenty-flowcharting-panel 0.9.1, ddurieux-glpi-app 1.3.1, grafana-oncall-app
                - name: GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS
                  # NOTE(review): this env var normally takes plugin IDs only;
                  # the trailing "1.1.38" looks like a stray version — confirm.
                  value: grafana-oncall-app 1.1.38
                - name: GF_ANALYTICS_REPORTING_ENABLED
                  value: 'false'
              imagePullPolicy: IfNotPresent
              ports:
                - containerPort: 3000
                  name: grafana-http
              readinessProbe:
                failureThreshold: 3
                httpGet:
                  path: /login
                  port: 3000
                  scheme: HTTP
                initialDelaySeconds: 30
                periodSeconds: 10
                successThreshold: 1
                timeoutSeconds: 3
              resources:
                limits:
                  cpu: 2000m
                  memory: 1Gi
                requests:
                  cpu: 100m
                  memory: 100Mi
              securityContext:
                allowPrivilegeEscalation: false
                readOnlyRootFilesystem: false
              volumeMounts:
                - mountPath: /var/lib/grafana
                  name: grafana-storage
                - mountPath: /etc/grafana/provisioning/plugins
                  name: plugin-config
                  readOnly: false
                {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }}
                - mountPath: /etc/grafana/provisioning/dashboards
                  name: provisioning-config
                - mountPath: /var/lib/grafana/dashboards
                  name: dashboards

            - image: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.name }}:{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.tag}}"
              name: k8s-sidecar
              env:
                - name: LABEL
                  value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_name }}"
                - name: LABEL_VALUE
                  value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_value }}"
                - name: FOLDER
                  value: /var/lib/grafana/dashboards
                - name: NAMESPACE
                  # NOTE(review): hard-coded; presumably should track
                  # .Values.nfc_monitoring.grafana.namespace — confirm.
                  value: grafana
                - name: RESOURCE
                  value: configmap
              volumeMounts:
                - mountPath: /var/lib/grafana/dashboards
                  name: dashboards
            {{ end }}
          securityContext:
            fsGroup: 65534
          volumes:
            # - name: grafana-storage
            - name: plugin-config
              configMap:
                # Name of the ConfigMap to mount.
                name: grafana-config
                # Keys from the ConfigMap to expose as files.
                items:
                  - key: "oncall-app.yaml"
                    path: "oncall-app.yaml"
            {{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }}
            - name: dashboards
              emptyDir: {}
            - name: provisioning-config
              configMap:
                name: provisioning-config
            {{ end }}
            - name: grafana-storage
              persistentVolumeClaim:
                claimName: grafana-pvc
          serviceAccountName: grafana
          nodeSelector:
            kubernetes.io/os: linux

  persistentVolumeClaim:
    metadata:
      annotations:
        pv.beta.kubernetes.io/gid: "65534"
      labels:
        app.kubernetes.io/name: grafana
        app.kubernetes.io/component: graphing
        app.kubernetes.io/part-of: {{ $.Chart.Name }}
    spec:
      accessModes:
        - "ReadWriteMany"
      resources:
        requests:
          storage: "5Gi"
18
templates/GrafanaDashboard-AlertManager.yaml
Normal file
18
templates/GrafanaDashboard-AlertManager.yaml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
---
# Alertmanager dashboard, imported from grafana.com by ID.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: alertmanager
  namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
  allowCrossNamespaceImport: true
  folder: No Fuss Monitoring
  resyncPeriod: 1d
  instanceSelector:
    matchLabels:
      app.kubernetes.io/component: graphing
      app.kubernetes.io/instance: k8s
      app.kubernetes.io/name: grafana
  grafanaCom:
    id: 9578
    revision: 4 # as @ 19-09-23
21
templates/GrafanaDashboard-Ceph.yaml
Normal file
21
templates/GrafanaDashboard-Ceph.yaml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
---
{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}}
# Ceph cluster dashboard, imported from grafana.com by ID.
# Only rendered when the ceph addition is enabled.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: ceph
  namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
  allowCrossNamespaceImport: true
  folder: No Fuss Monitoring
  resyncPeriod: 1d
  instanceSelector:
    matchLabels:
      app.kubernetes.io/component: graphing
      app.kubernetes.io/instance: k8s
      app.kubernetes.io/name: grafana
  grafanaCom:
    id: 2842
    revision: 17 # as @ 19-09-23

{{- end -}}
19
templates/GrafanaDashboard-cluster-summary.yaml
Normal file
19
templates/GrafanaDashboard-cluster-summary.yaml
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
---
# Cluster summary dashboard, embedded from a JSON file shipped in the chart.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: cluster-summary
  # FIX: was hard-coded to "grafana"; use the configured namespace for
  # consistency with every other GrafanaDashboard template in this chart.
  namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
  allowCrossNamespaceImport: false
  folder: No Fuss Monitoring
  resyncPeriod: 30s
  instanceSelector:
    matchLabels:
      app.kubernetes.io/component: graphing
      app.kubernetes.io/instance: k8s
      app.kubernetes.io/name: grafana
  {{ $Dashboard := .Files.Get "files/dashboard-summary.json" | fromJson }}
  json: >-
    {{ $Dashboard | toRawJson }}

18
templates/GrafanaDashboard-node-exporter-full.yaml
Normal file
18
templates/GrafanaDashboard-node-exporter-full.yaml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
---
# "Node Exporter Full" dashboard, imported from grafana.com by ID.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
  name: node-exporter
  namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
  allowCrossNamespaceImport: true
  folder: No Fuss Monitoring
  resyncPeriod: 1d
  instanceSelector:
    matchLabels:
      app.kubernetes.io/component: graphing
      app.kubernetes.io/instance: k8s
      app.kubernetes.io/name: grafana
  grafanaCom:
    id: 1860
    revision: 32 # as @ 19-09-23
28
templates/GrafanaDatasources.yaml
Normal file
28
templates/GrafanaDatasources.yaml
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
---
# One GrafanaDatasource per entry in .Values...grafana.DataSources; the
# whole values entry is passed through verbatim as the datasource body.
# NOTE(review): a *List kind may not be unrolled by all apply tooling;
# confirm the operator/cluster accepts GrafanaDatasourceList as written.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasourceList
items:
{{ range .Values.nfc_monitoring.grafana.DataSources }}
  - apiVersion: grafana.integreatly.org/v1beta1
    kind: GrafanaDatasource
    metadata:
      name: {{ .name }}
      namespace: "{{ $.Values.nfc_monitoring.grafana.namespace }}"
      labels:
        app.kubernetes.io/component: dashboard
        app.kubernetes.io/instance: k8s
        app.kubernetes.io/name: grafana
        app.kubernetes.io/managed-by: {{ $.Release.Service }}
        app.kubernetes.io/part-of: {{ $.Chart.Name }}
        app.kubernetes.io/version: {{ $.Chart.Version }}
    spec:
      instanceSelector:
        matchLabels:
          app.kubernetes.io/component: graphing
          app.kubernetes.io/instance: k8s
          app.kubernetes.io/name: grafana
      allowCrossNamespaceImport: true
      datasource:
        {{ toYaml . | nindent 8 }}

{{ end }}
31
templates/NetworkPolicy-kubeStateMetrics.yaml
Normal file
31
templates/NetworkPolicy-kubeStateMetrics.yaml
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# Commented-out NetworkPolicy for kube-state-metrics (kept for reference;
# restricts ingress to Prometheus pods on the metrics ports).
# apiVersion: networking.k8s.io/v1
# kind: NetworkPolicy
# metadata:
#   labels:
#     app.kubernetes.io/component: exporter
#     app.kubernetes.io/name: kube-state-metrics
#     app.kubernetes.io/part-of: kube-prometheus
#     app.kubernetes.io/version: 2.8.1
#   name: kube-state-metrics
#   namespace: monitoring
# spec:
#   egress:
#     - {}
#   ingress:
#     - from:
#         - podSelector:
#             matchLabels:
#               app.kubernetes.io/name: prometheus
#       ports:
#         - port: 8443
#           protocol: TCP
#         - port: 9443
#           protocol: TCP
#   podSelector:
#     matchLabels:
#       app.kubernetes.io/component: exporter
#       app.kubernetes.io/name: kube-state-metrics
#       app.kubernetes.io/part-of: kube-prometheus
#   policyTypes:
#     - Egress
#     - Ingress
21
templates/PodDisruptionBudget-alertmanager.yaml
Normal file
21
templates/PodDisruptionBudget-alertmanager.yaml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
---
# Allow at most one alertmanager-main pod to be voluntarily disrupted.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
  name: alertmanager-main
  namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
spec:
  maxUnavailable: 1
  selector:
    matchLabels:
      app.kubernetes.io/component: alert-router
      app.kubernetes.io/instance: main
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: {{ $.Chart.Name }}
21
templates/PodDisruptionBudget-prometheus-adapter.yaml
Normal file
21
templates/PodDisruptionBudget-prometheus-adapter.yaml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
---
# Keep at least one prometheus-adapter pod available during disruptions.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  labels:
    app.kubernetes.io/component: metrics-adapter
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: prometheus-adapter
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
  name: prometheus-adapter
  # NOTE(review): hard-coded while sibling PDBs use a values-driven
  # namespace — confirm against the adapter Deployment's namespace.
  namespace: monitoring
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app.kubernetes.io/component: metrics-adapter
      app.kubernetes.io/instance: main
      app.kubernetes.io/name: prometheus-adapter
      app.kubernetes.io/part-of: {{ $.Chart.Name }}
20
templates/PodDisruptionBudget-prometheus.yaml
Normal file
20
templates/PodDisruptionBudget-prometheus.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
# Keep at least one prometheus-k8s pod available during disruptions.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
  name: prometheus-k8s
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
  minAvailable: 1
  selector:
    matchLabels:
      app.kubernetes.io/component: prometheus
      app.kubernetes.io/instance: k8s
      app.kubernetes.io/part-of: {{ $.Chart.Name }}
64
templates/Prometheus-prometheus.yaml
Normal file
64
templates/Prometheus-prometheus.yaml
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
---
# Prometheus CR (prometheus-operator). Selectors are left empty ({}) so all
# monitors/rules/probes in all namespaces are picked up.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
  name: k8s
  namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}"
spec:
  affinity:
    {{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }}
  alerting:
    alertmanagers:
      - apiVersion: v2
        name: alertmanager-main
        namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
        port: web
  enableFeatures: []
  externalLabels: {}
  image: quay.io/prometheus/prometheus:v2.42.0
  nodeSelector:
    kubernetes.io/os: linux
  podMetadata:
    labels:
      app.kubernetes.io/component: prometheus
      app.kubernetes.io/instance: k8s
      app.kubernetes.io/name: prometheus
      app.kubernetes.io/part-of: {{ $.Chart.Name }}
      app.kubernetes.io/managed-by: {{ $.Release.Service }}
      app.kubernetes.io/version: {{ $.Chart.Version }}
  podMonitorNamespaceSelector: {}
  podMonitorSelector: {}
  probeNamespaceSelector: {}
  probeSelector: {}
  replicas: 3
  resources:
    requests:
      memory: 400Mi
  ruleNamespaceSelector: {}
  ruleSelector: {}
  securityContext:
    fsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
  serviceAccountName: prometheus-k8s
  serviceMonitorNamespaceSelector: {}
  serviceMonitorSelector: {}
  # FIX: this block previously rendered the *affinity* values under
  # `storage:` (copy-paste of the affinity block above), which is not a
  # valid StorageSpec. Render the storage values instead, and only when
  # they are supplied.
  {{- with .Values.nfc_monitoring.prometheus.storage }}
  storage:
    {{- toYaml . | nindent 4 }}
  {{- end }}
  {{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
  thanos:
    image: "{{ .Values.nfc_monitoring.thanos.image.name }}:{{ .Values.nfc_monitoring.thanos.image.tag }}"
    objectStorageConfig:
      key: thanos.yaml
      name: thanos-sidecar-config
  {{ end }}
  version: 2.42.0
  {{ if .Values.nfc_monitoring.prometheus.additional }}
  {{ toYaml .Values.nfc_monitoring.prometheus.additional | nindent 2 }}
  {{ end }}
141
templates/PrometheusRule-alertmanager.yaml
Normal file
141
templates/PrometheusRule-alertmanager.yaml
Normal file
@ -0,0 +1,141 @@
|
|||||||
|
---
# Alerting rules for Alertmanager itself (from kube-prometheus upstream).
# NOTE(review): the two AlertmanagerClusterFailedToSendAlerts rules share a
# name and split on `integration=~`.*`` / `integration!~`.*`` — upstream uses
# that regex as a critical-integrations placeholder; exprs also hard-code
# namespace="monitoring" — confirm against the deployed namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
    prometheus: k8s
    role: alert-rules
  name: alertmanager-main-rules
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
  groups:
    - name: alertmanager.rules
      rules:
        - alert: AlertmanagerFailedReload
          annotations:
            description: Configuration has failed to load for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
            summary: Reloading an Alertmanager configuration has failed.
          expr: |
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0
          for: 10m
          labels:
            severity: critical
        - alert: AlertmanagerMembersInconsistent
          annotations:
            description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} has only found {{ `{{` }} $value }} members of the {{ `{{` }}$labels.job}} cluster.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
            summary: A member of an Alertmanager cluster has not found all other cluster members.
          expr: |
            # Without max_over_time, failed scrapes could create false negatives, see
            # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])
            < on (namespace,service) group_left
            count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]))
          for: 15m
          labels:
            severity: critical
        - alert: AlertmanagerFailedToSendAlerts
          annotations:
            description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} failed to send {{ `{{` }} $value | humanizePercentage }} of notifications to {{ `{{` }} $labels.integration }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
            summary: An Alertmanager instance failed to send notifications.
          expr: |
            (
              rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m])
            /
              rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m])
            )
            > 0.01
          for: 5m
          labels:
            severity: warning
        - alert: AlertmanagerClusterFailedToSendAlerts
          annotations:
            description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
            summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
          expr: |
            min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
            /
              rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m])
            )
            > 0.01
          for: 5m
          labels:
            severity: critical
        - alert: AlertmanagerClusterFailedToSendAlerts
          annotations:
            description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
            summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
          expr: |
            min by (namespace,service, integration) (
              rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
            /
              rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m])
            )
            > 0.01
          for: 5m
          labels:
            severity: warning
        - alert: AlertmanagerConfigInconsistent
          annotations:
            description: Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have different configurations.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
            summary: Alertmanager instances within the same cluster have different configurations.
          expr: |
            count by (namespace,service) (
              count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"})
            )
            != 1
          for: 20m
          labels:
            severity: critical
        - alert: AlertmanagerClusterDown
          annotations:
            description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have been up for less than half of the last 5m.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
            summary: Half or more of the Alertmanager instances within the same cluster are down.
          expr: |
            (
              count by (namespace,service) (
                avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5
              )
            /
              count by (namespace,service) (
                up{job="alertmanager-main",namespace="monitoring"}
              )
            )
            >= 0.5
          for: 5m
          labels:
            severity: critical
        - alert: AlertmanagerClusterCrashlooping
          annotations:
            description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have restarted at least 5 times in the last 10m.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
            summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
          expr: |
            (
              count by (namespace,service) (
                changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4
              )
            /
              count by (namespace,service) (
                up{job="alertmanager-main",namespace="monitoring"}
              )
            )
            >= 0.5
          for: 5m
          labels:
            severity: critical
23
templates/PrometheusRule-grafana-agent.yaml
Normal file
23
templates/PrometheusRule-grafana-agent.yaml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
---
# Recording rule aliasing Grafana Agent build info as promtail build info,
# so promtail-oriented dashboards keep working.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: grafana-agent
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    prometheus: k8s
    role: grafana-agent-promtail
  name: grafana-agent
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
  groups:
    - name: grafana_agent
      rules:
        # - annotations:
        #     description: "As Grafana Agent is being used, it's version is set as promtails"
        - expr: |
            agent_build_info
          record: promtail_build_info
35
templates/PrometheusRule-grafana.yaml
Normal file
35
templates/PrometheusRule-grafana.yaml
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
---
# Grafana alerting + recording rules (from kube-prometheus upstream).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: grafana
    app.kubernetes.io/name: grafana
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    prometheus: k8s
    role: alert-rules
  name: grafana-rules
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
  groups:
    - name: GrafanaAlerts
      rules:
        - alert: GrafanaRequestsFailing
          annotations:
            message: '{{`{{`}} $labels.namespace }}/{{`{{`}} $labels.job }}/{{`{{`}} $labels.handler }} is experiencing {{`{{`}} $value | humanize }}% errors'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing
          expr: |
            100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
            / ignoring (status_code)
            sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
            > 50
          for: 5m
          labels:
            severity: warning
    - name: grafana_rules
      rules:
        - expr: |
            sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
          record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
86
templates/PrometheusRule-kubePrometheus.yaml
Normal file
86
templates/PrometheusRule-kubePrometheus.yaml
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
---
# General-purpose kube-prometheus rules: target health, Watchdog heartbeat,
# info-alert inhibition, interface flapping, and node recording rules.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kube-prometheus
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/version: {{ $.Chart.Version }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
    prometheus: k8s
    role: alert-rules
  name: kube-prometheus-rules
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
  groups:
    - name: general.rules
      rules:
        - alert: TargetDown
          annotations:
            description: '{{ `{{` }} printf "%.4g" $value }}% of the {{ `{{` }} $labels.job }}/{{ `{{` }} $labels.service }} targets in {{ `{{` }} $labels.namespace }} namespace are down.'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
            summary: One or more targets are unreachable.
          expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
          for: 10m
          labels:
            severity: warning
        - alert: Watchdog
          annotations:
            description: |
              This is an alert meant to ensure that the entire alerting pipeline is functional.
              This alert is always firing, therefore it should always be firing in Alertmanager
              and always fire against a receiver. There are integrations with various notification
              mechanisms that send a notification when this alert is not firing. For example the
              "DeadMansSnitch" integration in PagerDuty.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
            summary: An alert that should always be firing to certify that Alertmanager is working properly.
          expr: vector(1)
          labels:
            severity: none
        - alert: InfoInhibitor
          annotations:
            description: |
              This is an alert that is used to inhibit info alerts.
              By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
              other alerts.
              This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
              severity of 'warning' or 'critical' starts firing on the same namespace.
              This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
            summary: Info-level alert inhibition.
          expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
          labels:
            severity: none
    - name: node-network
      rules:
        - alert: NodeNetworkInterfaceFlapping
          annotations:
            description: Network interface "{{ `{{` }} $labels.device }}" changing its up status often on node-exporter {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }}
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
            summary: Network interface is often changing its status
          expr: |
            changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
          for: 2m
          labels:
            severity: warning
    - name: kube-prometheus-node-recording.rules
      rules:
        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
          record: instance:node_cpu:rate:sum
        - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
          record: instance:node_network_receive_bytes:rate:sum
        - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
          record: instance:node_network_transmit_bytes:rate:sum
        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
          record: instance:node_cpu:ratio
        - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
          record: cluster:node_cpu:sum_rate5m
        - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
          record: cluster:node_cpu:ratio
    - name: kube-prometheus-general.rules
      rules:
        - expr: count without(instance, pod, node) (up == 1)
          record: count:up1
        - expr: count without(instance, pod, node) (up == 0)
          record: count:up0
67
templates/PrometheusRule-kubeStateMetrics.yaml
Normal file
67
templates/PrometheusRule-kubeStateMetrics.yaml
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
prometheus: k8s
|
||||||
|
role: alert-rules
|
||||||
|
name: kube-state-metrics-rules
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: kube-state-metrics
|
||||||
|
rules:
|
||||||
|
- alert: KubeStateMetricsListErrors
|
||||||
|
annotations:
|
||||||
|
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
|
||||||
|
summary: kube-state-metrics is experiencing errors in list operations.
|
||||||
|
expr: |
|
||||||
|
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
|
||||||
|
/
|
||||||
|
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
|
||||||
|
> 0.01
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: KubeStateMetricsWatchErrors
|
||||||
|
annotations:
|
||||||
|
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
|
||||||
|
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||||
|
expr: |
|
||||||
|
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
|
||||||
|
/
|
||||||
|
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
|
||||||
|
> 0.01
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: KubeStateMetricsShardingMismatch
|
||||||
|
annotations:
|
||||||
|
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
|
||||||
|
summary: kube-state-metrics sharding is misconfigured.
|
||||||
|
expr: |
|
||||||
|
stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: KubeStateMetricsShardsMissing
|
||||||
|
annotations:
|
||||||
|
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
|
||||||
|
summary: kube-state-metrics shards are missing.
|
||||||
|
expr: |
|
||||||
|
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
|
||||||
|
-
|
||||||
|
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
|
||||||
|
!= 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
1441
templates/PrometheusRule-kubernetesControlPlane.yaml
Normal file
1441
templates/PrometheusRule-kubernetesControlPlane.yaml
Normal file
File diff suppressed because it is too large
Load Diff
112
templates/PrometheusRule-loki.yaml
Normal file
112
templates/PrometheusRule-loki.yaml
Normal file
@ -0,0 +1,112 @@
|
|||||||
|
---
|
||||||
|
{{- if .Values.nfc_monitoring.loki.enabled | default false -}}
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: logging
|
||||||
|
app.kubernetes.io/name: loki
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
prometheus: k8s
|
||||||
|
role: alert-rules
|
||||||
|
name: loki
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: loki_rules
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:loki_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job))
|
||||||
|
record: cluster_job:loki_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
|
||||||
|
by (cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
|
||||||
|
record: cluster_job:loki_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, job, route))
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||||
|
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
|
||||||
|
route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
|
||||||
|
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, cluster, namespace, job, route))
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||||
|
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
|
||||||
|
namespace, job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
|
||||||
|
job, route)
|
||||||
|
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
|
||||||
|
|
||||||
|
- name: loki_alerts
|
||||||
|
rules:
|
||||||
|
- alert: LokiRequestErrors
|
||||||
|
annotations:
|
||||||
|
message: |
|
||||||
|
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
|
||||||
|
expr: |
|
||||||
|
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
|
||||||
|
/
|
||||||
|
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
|
||||||
|
> 10
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: LokiRequestPanics
|
||||||
|
annotations:
|
||||||
|
message: |
|
||||||
|
{{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics.
|
||||||
|
expr: |
|
||||||
|
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: LokiRequestLatency
|
||||||
|
annotations:
|
||||||
|
message: |
|
||||||
|
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
|
||||||
|
expr: |
|
||||||
|
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: LokiTooManyCompactorsRunning
|
||||||
|
annotations:
|
||||||
|
message: |
|
||||||
|
{{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
|
||||||
|
expr: |
|
||||||
|
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
|
||||||
|
{{- end -}}
|
||||||
318
templates/PrometheusRule-nodeExporter.yaml
Normal file
318
templates/PrometheusRule-nodeExporter.yaml
Normal file
@ -0,0 +1,318 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: node-exporter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
prometheus: k8s
|
||||||
|
role: alert-rules
|
||||||
|
name: node-exporter-rules
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: node-exporter
|
||||||
|
rules:
|
||||||
|
- alert: NodeFilesystemSpaceFillingUp
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||||
|
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||||
|
and
|
||||||
|
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeFilesystemSpaceFillingUp
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up fast.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||||
|
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||||
|
and
|
||||||
|
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: NodeFilesystemAlmostOutOfSpace
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||||
|
summary: Filesystem has less than 5% space left.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeFilesystemAlmostOutOfSpace
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||||
|
summary: Filesystem has less than 3% space left.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: NodeFilesystemFilesFillingUp
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||||
|
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||||
|
and
|
||||||
|
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeFilesystemFilesFillingUp
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up fast.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||||
|
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||||
|
and
|
||||||
|
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: NodeFilesystemAlmostOutOfFiles
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||||
|
summary: Filesystem has less than 5% inodes left.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeFilesystemAlmostOutOfFiles
|
||||||
|
annotations:
|
||||||
|
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||||
|
summary: Filesystem has less than 3% inodes left.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||||
|
and
|
||||||
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
|
)
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: NodeNetworkReceiveErrs
|
||||||
|
annotations:
|
||||||
|
description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||||
|
summary: Network interface is reporting many receive errors.
|
||||||
|
expr: |
|
||||||
|
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeNetworkTransmitErrs
|
||||||
|
annotations:
|
||||||
|
description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||||
|
summary: Network interface is reporting many transmit errors.
|
||||||
|
expr: |
|
||||||
|
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeHighNumberConntrackEntriesUsed
|
||||||
|
annotations:
|
||||||
|
description: '{{ `{{` }} $value | humanizePercentage }} of conntrack entries are used.'
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||||
|
summary: Number of conntrack are getting close to the limit.
|
||||||
|
expr: |
|
||||||
|
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeTextFileCollectorScrapeError
|
||||||
|
annotations:
|
||||||
|
description: Node Exporter text file collector failed to scrape.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||||
|
summary: Node Exporter text file collector failed to scrape.
|
||||||
|
expr: |
|
||||||
|
node_textfile_scrape_error{job="node-exporter"} == 1
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeClockSkewDetected
|
||||||
|
annotations:
|
||||||
|
description: Clock on {{ `{{` }} $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||||
|
summary: Clock skew detected.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||||
|
and
|
||||||
|
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||||
|
)
|
||||||
|
or
|
||||||
|
(
|
||||||
|
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||||
|
and
|
||||||
|
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||||
|
)
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeClockNotSynchronising
|
||||||
|
annotations:
|
||||||
|
description: Clock on {{ `{{` }} $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||||
|
summary: Clock not synchronising.
|
||||||
|
expr: |
|
||||||
|
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||||
|
and
|
||||||
|
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeRAIDDegraded
|
||||||
|
annotations:
|
||||||
|
description: RAID array '{{ `{{` }} $labels.device }}' on {{ `{{` }} $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||||
|
summary: RAID Array is degraded
|
||||||
|
expr: |
|
||||||
|
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: NodeRAIDDiskFailure
|
||||||
|
annotations:
|
||||||
|
description: At least one device in RAID array on {{ `{{` }} $labels.instance }} failed. Array '{{ `{{` }} $labels.device }}' needs attention and possibly a disk swap.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||||
|
summary: Failed device in RAID array
|
||||||
|
expr: |
|
||||||
|
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeFileDescriptorLimit
|
||||||
|
annotations:
|
||||||
|
description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||||
|
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||||
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeFileDescriptorLimit
|
||||||
|
annotations:
|
||||||
|
description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||||
|
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||||
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- name: node-exporter.rules
|
||||||
|
rules:
|
||||||
|
- expr: |
|
||||||
|
count without (cpu, mode) (
|
||||||
|
node_cpu_seconds_total{job="node-exporter",mode="idle"}
|
||||||
|
)
|
||||||
|
record: instance:node_num_cpu:sum
|
||||||
|
- expr: |
|
||||||
|
1 - avg without (cpu) (
|
||||||
|
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
|
||||||
|
)
|
||||||
|
record: instance:node_cpu_utilisation:rate5m
|
||||||
|
- expr: |
|
||||||
|
(
|
||||||
|
node_load1{job="node-exporter"}
|
||||||
|
/
|
||||||
|
instance:node_num_cpu:sum{job="node-exporter"}
|
||||||
|
)
|
||||||
|
record: instance:node_load1_per_cpu:ratio
|
||||||
|
- expr: |
|
||||||
|
1 - (
|
||||||
|
(
|
||||||
|
node_memory_MemAvailable_bytes{job="node-exporter"}
|
||||||
|
or
|
||||||
|
(
|
||||||
|
node_memory_Buffers_bytes{job="node-exporter"}
|
||||||
|
+
|
||||||
|
node_memory_Cached_bytes{job="node-exporter"}
|
||||||
|
+
|
||||||
|
node_memory_MemFree_bytes{job="node-exporter"}
|
||||||
|
+
|
||||||
|
node_memory_Slab_bytes{job="node-exporter"}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
/
|
||||||
|
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||||
|
)
|
||||||
|
record: instance:node_memory_utilisation:ratio
|
||||||
|
- expr: |
|
||||||
|
rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||||
|
record: instance:node_vmstat_pgmajfault:rate5m
|
||||||
|
- expr: |
|
||||||
|
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||||
|
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||||
|
- expr: |
|
||||||
|
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||||
|
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum without (device) (
|
||||||
|
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||||
|
)
|
||||||
|
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum without (device) (
|
||||||
|
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||||
|
)
|
||||||
|
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum without (device) (
|
||||||
|
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||||
|
)
|
||||||
|
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||||
|
- expr: |
|
||||||
|
sum without (device) (
|
||||||
|
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||||
|
)
|
||||||
|
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||||
281
templates/PrometheusRule-prometheus.yaml
Normal file
281
templates/PrometheusRule-prometheus.yaml
Normal file
@ -0,0 +1,281 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
prometheus: k8s
|
||||||
|
role: alert-rules
|
||||||
|
name: prometheus-k8s-prometheus-rules
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: prometheus
|
||||||
|
rules:
|
||||||
|
- alert: PrometheusBadConfig
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to reload its configuration.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
|
||||||
|
summary: Failed Prometheus configuration reload.
|
||||||
|
expr: |
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: PrometheusNotificationQueueRunningFull
|
||||||
|
annotations:
|
||||||
|
description: Alert notification queue of Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is running full.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
|
||||||
|
summary: Prometheus alert notification queue predicted to run full in less than 30m.
|
||||||
|
expr: |
|
||||||
|
# Without min_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
(
|
||||||
|
predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
|
||||||
|
>
|
||||||
|
min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||||
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
|
||||||
|
annotations:
|
||||||
|
description: '{{ `{{` }} printf "%.1f" $value }}% errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to Alertmanager {{ `{{` }}$labels.alertmanager}}.'
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
|
||||||
|
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||||
|
/
|
||||||
|
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||||
|
)
|
||||||
|
* 100
|
||||||
|
> 1
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusNotConnectedToAlertmanagers
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not connected to any Alertmanagers.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
|
||||||
|
summary: Prometheus is not connected to any Alertmanagers.
|
||||||
|
expr: |
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusTSDBReloadsFailing
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} reload failures over the last 3h.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
|
||||||
|
summary: Prometheus has issues reloading blocks from disk.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
|
||||||
|
for: 4h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusTSDBCompactionsFailing
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} compaction failures over the last 3h.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
|
||||||
|
summary: Prometheus has issues compacting blocks.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
|
||||||
|
for: 4h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusNotIngestingSamples
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not ingesting samples.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
|
||||||
|
summary: Prometheus is not ingesting samples.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
|
||||||
|
and
|
||||||
|
(
|
||||||
|
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0
|
||||||
|
or
|
||||||
|
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0
|
||||||
|
)
|
||||||
|
)
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusDuplicateTimestamps
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
|
||||||
|
summary: Prometheus is dropping samples with duplicate timestamps.
|
||||||
|
expr: |
|
||||||
|
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusOutOfOrderTimestamps
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with timestamps arriving out of order.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
|
||||||
|
summary: Prometheus drops samples with out-of-order timestamps.
|
||||||
|
expr: |
|
||||||
|
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusRemoteStorageFailures
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} failed to send {{ `{{` }} printf "%.1f" $value }}% of the samples to {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
|
||||||
|
summary: Prometheus fails to send samples to remote storage.
|
||||||
|
expr: |
|
||||||
|
(
|
||||||
|
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m]))
|
||||||
|
/
|
||||||
|
(
|
||||||
|
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus-k8s",namespace="monitoring"}[5m]))
|
||||||
|
+
|
||||||
|
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]))
|
||||||
|
)
|
||||||
|
)
|
||||||
|
* 100
|
||||||
|
> 1
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: PrometheusRemoteWriteBehind
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write is {{ `{{` }} printf "%.1f" $value }}s behind for {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
|
||||||
|
summary: Prometheus remote write is behind.
|
||||||
|
expr: |
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
(
|
||||||
|
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||||
|
- ignoring(remote_name, url) group_right
|
||||||
|
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||||
|
)
|
||||||
|
> 120
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: PrometheusRemoteWriteDesiredShards
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write desired shards calculation wants to run {{ `{{` }} $value }} shards for queue {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}, which is more than the max of {{ `{{` }} printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` $labels.instance | query | first | value }}.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
|
||||||
|
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
|
||||||
|
expr: |
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
(
|
||||||
|
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||||
|
>
|
||||||
|
max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
|
||||||
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusRuleFailures
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to evaluate {{ `{{` }} printf "%.0f" $value }} rules in the last 5m.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
|
||||||
|
summary: Prometheus is failing rule evaluations.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: PrometheusMissingRuleEvaluations
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has missed {{ `{{` }} printf "%.0f" $value }} rule group evaluations in the last 5m.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
|
||||||
|
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusTargetLimitHit
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
|
||||||
|
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusLabelLimitHit
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
|
||||||
|
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusScrapeBodySizeLimitHit
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
|
||||||
|
summary: Prometheus has dropped some targets that exceeded body size limit.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusScrapeSampleLimitHit
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
|
||||||
|
summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusTargetSyncFailure
|
||||||
|
annotations:
|
||||||
|
description: '{{ `{{` }} printf "%.0f" $value }} targets in Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} have failed to sync because invalid configuration was supplied.'
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
|
||||||
|
summary: Prometheus has failed to sync targets.
|
||||||
|
expr: |
|
||||||
|
increase(prometheus_target_sync_failed_total{job="prometheus-k8s",namespace="monitoring"}[30m]) > 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: PrometheusHighQueryLoad
|
||||||
|
annotations:
|
||||||
|
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
|
||||||
|
summary: Prometheus is reaching its maximum capacity serving concurrent requests.
|
||||||
|
expr: |
|
||||||
|
avg_over_time(prometheus_engine_queries{job="prometheus-k8s",namespace="monitoring"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.8
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
|
||||||
|
annotations:
|
||||||
|
description: '{{ `{{` }} printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to any Alertmanager.'
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
|
||||||
|
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
|
||||||
|
expr: |
|
||||||
|
min without (alertmanager) (
|
||||||
|
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m])
|
||||||
|
/
|
||||||
|
rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m])
|
||||||
|
)
|
||||||
|
* 100
|
||||||
|
> 3
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
47
templates/Role-SpecificNamespaces-prometheus.yaml
Normal file
47
templates/Role-SpecificNamespaces-prometheus.yaml
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
items:
|
||||||
|
|
||||||
|
{{ range .Values.nfc_monitoring.prometheus.monitor_namespaces }}
|
||||||
|
- apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: {{ . | quote }}
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- services
|
||||||
|
- endpoints
|
||||||
|
- pods
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- extensions
|
||||||
|
resources:
|
||||||
|
- ingresses
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiGroups:
|
||||||
|
- networking.k8s.io
|
||||||
|
resources:
|
||||||
|
- ingresses
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
kind: RoleList
|
||||||
21
templates/RoleBinding-Config-prometheus.yaml
Normal file
21
templates/RoleBinding-Config-prometheus.yaml
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s-config
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: Role
|
||||||
|
name: prometheus-k8s-config
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
27
templates/RoleBinding-SpecificNamespaces-prometheus.yaml
Normal file
27
templates/RoleBinding-SpecificNamespaces-prometheus.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
items:
|
||||||
|
{{ range .Values.nfc_monitoring.prometheus.monitor_namespaces }}
|
||||||
|
- apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: {{ . | quote }}
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: Role
|
||||||
|
name: prometheus-k8s
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: {{ $.Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
kind: RoleBindingList
|
||||||
18
templates/RoleBinding-prometheus-adapter-auth-reader.yaml
Normal file
18
templates/RoleBinding-prometheus-adapter-auth-reader.yaml
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/version: 0.11.1
|
||||||
|
name: resource-metrics-auth-reader
|
||||||
|
namespace: kube-system
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: Role
|
||||||
|
name: extension-apiserver-authentication-reader
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: monitoring
|
||||||
20
templates/RoleConfig-prometheus.yaml
Normal file
20
templates/RoleConfig-prometheus.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s-config
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
|
- configmaps
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
61
templates/Secret-alertmanager.yaml
Normal file
61
templates/Secret-alertmanager.yaml
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alert-router
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: alertmanager-main
|
||||||
|
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||||
|
stringData:
|
||||||
|
alertmanager.yaml: |-
|
||||||
|
"global":
|
||||||
|
"resolve_timeout": "5m"
|
||||||
|
"inhibit_rules":
|
||||||
|
- "equal":
|
||||||
|
- "namespace"
|
||||||
|
- "alertname"
|
||||||
|
"source_matchers":
|
||||||
|
- "severity = critical"
|
||||||
|
"target_matchers":
|
||||||
|
- "severity =~ warning|info"
|
||||||
|
- "equal":
|
||||||
|
- "namespace"
|
||||||
|
- "alertname"
|
||||||
|
"source_matchers":
|
||||||
|
- "severity = warning"
|
||||||
|
"target_matchers":
|
||||||
|
- "severity = info"
|
||||||
|
- "equal":
|
||||||
|
- "namespace"
|
||||||
|
"source_matchers":
|
||||||
|
- "alertname = InfoInhibitor"
|
||||||
|
"target_matchers":
|
||||||
|
- "severity = info"
|
||||||
|
"receivers":
|
||||||
|
- "name": "Default"
|
||||||
|
- "name": "Watchdog"
|
||||||
|
- "name": "Critical"
|
||||||
|
- "name": "null"
|
||||||
|
"route":
|
||||||
|
"group_by":
|
||||||
|
- "namespace"
|
||||||
|
"group_interval": "5m"
|
||||||
|
"group_wait": "30s"
|
||||||
|
"receiver": "Default"
|
||||||
|
"repeat_interval": "12h"
|
||||||
|
"routes":
|
||||||
|
- "matchers":
|
||||||
|
- "alertname = Watchdog"
|
||||||
|
"receiver": "Watchdog"
|
||||||
|
- "matchers":
|
||||||
|
- "alertname = InfoInhibitor"
|
||||||
|
"receiver": "null"
|
||||||
|
- "matchers":
|
||||||
|
- "severity = critical"
|
||||||
|
"receiver": "Critical"
|
||||||
|
type: Opaque
|
||||||
13
templates/Secret-prometheus-sidecar-thanos.yaml
Normal file
13
templates/Secret-prometheus-sidecar-thanos.yaml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: thanos-sidecar-config
|
||||||
|
namespace: monitoring
|
||||||
|
type: Opaque
|
||||||
|
stringData:
|
||||||
|
thanos.yaml: |-
|
||||||
|
{{ toYaml .Values.nfc_monitoring.thanos.sidecar.config | nindent 4 }}
|
||||||
|
|
||||||
|
{{ end }}
|
||||||
20
templates/Service-CalicoMetrics.yaml
Normal file
20
templates/Service-CalicoMetrics.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
---
|
||||||
|
{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}}
|
||||||
|
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: calico-metrics
|
||||||
|
namespace: kube-system
|
||||||
|
labels:
|
||||||
|
k8s-app: calico-node
|
||||||
|
spec:
|
||||||
|
clusterIP: None
|
||||||
|
selector:
|
||||||
|
k8s-app: calico-node
|
||||||
|
ports:
|
||||||
|
- port: 9091
|
||||||
|
name: metrics
|
||||||
|
targetPort: 9091
|
||||||
|
|
||||||
|
{{- end -}}
|
||||||
30
templates/Service-Grafana.yaml
Normal file
30
templates/Service-Grafana.yaml
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
---
|
||||||
|
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
#type: NodePort
|
||||||
|
#type: LoadBalancer
|
||||||
|
#clusterIP: None
|
||||||
|
ports:
|
||||||
|
- name: grafana
|
||||||
|
port: 3000
|
||||||
|
targetPort: grafana-http
|
||||||
|
#nodePort: 3000
|
||||||
|
#type: LoadBalancer
|
||||||
|
sessionAffinity: ClientIP
|
||||||
28
templates/Service-GrafanaAgent.yaml
Normal file
28
templates/Service-GrafanaAgent.yaml
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: grafana-agent
|
||||||
|
namespace: monitoring
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
ports:
|
||||||
|
- name: grafana-metrics
|
||||||
|
port: 12345
|
||||||
|
targetPort: grafana-metrics
|
||||||
|
- name: kube-ctrl-mgr
|
||||||
|
port: 11257
|
||||||
|
targetPort: kube-ctrl-mgr
|
||||||
|
#type: LoadBalancer
|
||||||
|
sessionAffinity: ClientIP
|
||||||
27
templates/Service-alertmanager.yaml
Normal file
27
templates/Service-alertmanager.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alert-router
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: alertmanager-main
|
||||||
|
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- name: web
|
||||||
|
port: 9093
|
||||||
|
targetPort: web
|
||||||
|
- name: reloader-web
|
||||||
|
port: 8080
|
||||||
|
targetPort: reloader-web
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: alert-router
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
sessionAffinity: ClientIP
|
||||||
30
templates/Service-kube-monitor-proxy.yaml
Normal file
30
templates/Service-kube-monitor-proxy.yaml
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: kube-monitor-proxy
|
||||||
|
namespace: monitoring
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name_kcm: kube-controller-manager
|
||||||
|
name_ks: kube-scheduler
|
||||||
|
spec:
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
|
||||||
|
ports:
|
||||||
|
- name: kube-ctrl-mgr
|
||||||
|
port: 10257
|
||||||
|
targetPort: kube-ctrl-mgr
|
||||||
|
- name: kube-scheduler
|
||||||
|
port: 10259
|
||||||
|
targetPort: kube-scheduler
|
||||||
|
sessionAffinity: ClientIP
|
||||||
28
templates/Service-kubeStateMetrics.yaml
Normal file
28
templates/Service-kubeStateMetrics.yaml
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: kube-state-metrics
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
clusterIP: None
|
||||||
|
ports:
|
||||||
|
- name: https-main
|
||||||
|
port: 8443
|
||||||
|
targetPort: https-main
|
||||||
|
- name: https-self
|
||||||
|
port: 9443
|
||||||
|
targetPort: https-self
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
|
||||||
23
templates/Service-prometheus-adapter.yaml
Normal file
23
templates/Service-prometheus-adapter.yaml
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- name: https
|
||||||
|
port: 443
|
||||||
|
targetPort: 6443
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
59
templates/Service-prometheus.yaml
Normal file
59
templates/Service-prometheus.yaml
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
spec:
|
||||||
|
ports:
|
||||||
|
- name: web
|
||||||
|
port: 9090
|
||||||
|
targetPort: web
|
||||||
|
- name: reloader-web
|
||||||
|
port: 8080
|
||||||
|
targetPort: reloader-web
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
sessionAffinity: ClientIP
|
||||||
|
|
||||||
|
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
|
||||||
|
---
|
||||||
|
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus-sidecar
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: thanos-sidecar
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: thanos-sidecar
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
|
spec:
|
||||||
|
clusterIP: None
|
||||||
|
ports:
|
||||||
|
- name: grpc
|
||||||
|
port: 10901
|
||||||
|
targetPort: 10901
|
||||||
|
- name: http
|
||||||
|
port: 10902
|
||||||
|
targetPort: 10902
|
||||||
|
selector:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
|
||||||
|
{{ end }}
|
||||||
14
templates/ServiceAccount-Grafana.yaml
Normal file
14
templates/ServiceAccount-Grafana.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
14
templates/ServiceAccount-GrafanaAgent.yaml
Normal file
14
templates/ServiceAccount-GrafanaAgent.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: grafana-agent
|
||||||
|
namespace: monitoring
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
14
templates/ServiceAccount-alertmanager.yaml
Normal file
14
templates/ServiceAccount-alertmanager.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: alert-router
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: alertmanager
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: alertmanager-main
|
||||||
|
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
|
||||||
13
templates/ServiceAccount-kube-monitor-proxy.yaml
Normal file
13
templates/ServiceAccount-kube-monitor-proxy.yaml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
name: kube-monitor-proxy
|
||||||
|
namespace: monitoring
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
14
templates/ServiceAccount-kubeStateMetrics.yaml
Normal file
14
templates/ServiceAccount-kubeStateMetrics.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: kube-state-metrics
|
||||||
|
namespace: monitoring
|
||||||
14
templates/ServiceAccount-prometheus-adapter.yaml
Normal file
14
templates/ServiceAccount-prometheus-adapter.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
automountServiceAccountToken: false
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: metrics-adapter
|
||||||
|
app.kubernetes.io/instance: main
|
||||||
|
app.kubernetes.io/name: prometheus-adapter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-adapter
|
||||||
|
namespace: monitoring
|
||||||
14
templates/ServiceAccount-prometheus.yaml
Normal file
14
templates/ServiceAccount-prometheus.yaml
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
automountServiceAccountToken: true
|
||||||
|
kind: ServiceAccount
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: prometheus
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: prometheus
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
78
templates/ServiceMonitor-APIServer.yaml
Normal file
78
templates/ServiceMonitor-APIServer.yaml
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: apiserver
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
name: kube-apiserver
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 30s
|
||||||
|
metricRelabelings:
|
||||||
|
- action: drop
|
||||||
|
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: transformation_(transformation_latencies_microseconds|failures_total)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_
queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: etcd_(debugging|disk|server).*
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: apiserver_admission_controller_admission_latencies_seconds_.*
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: apiserver_admission_step_admission_latencies_seconds_.*
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- le
|
||||||
|
port: https
|
||||||
|
scheme: https
|
||||||
|
tlsConfig:
|
||||||
|
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||||
|
serverName: kubernetes
|
||||||
|
jobLabel: component
|
||||||
|
namespaceSelector:
|
||||||
|
matchNames:
|
||||||
|
- default
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
component: apiserver
|
||||||
|
provider: kubernetes
|
||||||
52
templates/ServiceMonitor-Cadvisor.yaml
Normal file
52
templates/ServiceMonitor-Cadvisor.yaml
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: cadvisor
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
name: cadvisor
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
honorLabels: true
|
||||||
|
honorTimestamps: false
|
||||||
|
interval: 30s
|
||||||
|
metricRelabelings:
|
||||||
|
- action: drop
|
||||||
|
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- pod
|
||||||
|
- namespace
|
||||||
|
- action: drop
|
||||||
|
regex: (container_blkio_device_usage_total);.+
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- container
|
||||||
|
path: /metrics/cadvisor
|
||||||
|
port: https-metrics
|
||||||
|
#jobName: cadvisor
|
||||||
|
relabelings:
|
||||||
|
- sourceLabels:
|
||||||
|
- __metrics_path__
|
||||||
|
targetLabel: metrics_path
|
||||||
|
- targetLabel: "job"
|
||||||
|
replacement: "cadvisor"
|
||||||
|
scheme: https
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
namespaceSelector:
|
||||||
|
matchNames:
|
||||||
|
- kube-system
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: kubelet
|
||||||
36
templates/ServiceMonitor-Calico.yaml
Normal file
36
templates/ServiceMonitor-Calico.yaml
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
---
|
||||||
|
{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}}
|
||||||
|
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: calico
|
||||||
|
app.kubernetes.io/component: networking
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
k8s-app: calico-node
|
||||||
|
name: calico
|
||||||
|
namespace: kube-system
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 15s
|
||||||
|
port: metrics
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
targetLabel: instance
|
||||||
|
scheme: http
|
||||||
|
# tlsConfig:
|
||||||
|
# insecureSkipVerify: true
|
||||||
|
jobLabel: app.kubernetes.io/name
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
k8s-app: calico-node
|
||||||
|
|
||||||
|
{{- end -}}
|
||||||
28
templates/ServiceMonitor-CoreDNS.yaml
Normal file
28
templates/ServiceMonitor-CoreDNS.yaml
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: coredns
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
name: coredns
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 15s
|
||||||
|
metricRelabelings:
|
||||||
|
- action: drop
|
||||||
|
regex: coredns_cache_misses_total
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
port: metrics
|
||||||
|
jobLabel: app.kubernetes.io/name
|
||||||
|
namespaceSelector:
|
||||||
|
matchNames:
|
||||||
|
- kube-system
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
k8s-app: kube-dns
|
||||||
37
templates/ServiceMonitor-Grafana.yaml
Normal file
37
templates/ServiceMonitor-Grafana.yaml
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: grafana
|
||||||
|
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 15s
|
||||||
|
port: grafana
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_label_app_kubernetes_io_instance
|
||||||
|
targetLabel: instance
|
||||||
|
scheme: http
|
||||||
|
# tlsConfig:
|
||||||
|
# insecureSkipVerify: true
|
||||||
|
targetLabels:
|
||||||
|
- cluster
|
||||||
|
jobLabel: app.kubernetes.io/name
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: graphing
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: grafana
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
87
templates/ServiceMonitor-Kubelet.yaml
Normal file
87
templates/ServiceMonitor-Kubelet.yaml
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: kubelet
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
name: kubelet
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
honorLabels: true
|
||||||
|
interval: 30s
|
||||||
|
metricRelabelings:
|
||||||
|
- action: drop
|
||||||
|
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: transformation_(transformation_latencies_microseconds|failures_total)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_
queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
port: https-metrics
|
||||||
|
relabelings:
|
||||||
|
- sourceLabels:
|
||||||
|
- __metrics_path__
|
||||||
|
targetLabel: metrics_path
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_endpoint_address_target_name
|
||||||
|
targetLabel: instance
|
||||||
|
scheme: https
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
honorLabels: true
|
||||||
|
interval: 30s
|
||||||
|
path: /metrics/probes
|
||||||
|
port: https-metrics
|
||||||
|
relabelings:
|
||||||
|
- sourceLabels:
|
||||||
|
- __metrics_path__
|
||||||
|
targetLabel: metrics_path
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_endpoint_address_target_name
|
||||||
|
targetLabel: instance
|
||||||
|
scheme: https
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
namespaceSelector:
|
||||||
|
matchNames:
|
||||||
|
- kube-system
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/name: kubelet
|
||||||
43
templates/ServiceMonitor-Node.yaml
Normal file
43
templates/ServiceMonitor-Node.yaml
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: node
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
#cluster: dev
|
||||||
|
name: node
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 5s
|
||||||
|
honorLabels: true
|
||||||
|
path: /metrics
|
||||||
|
port: grafana-metrics
|
||||||
|
scheme: http
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
targetLabel: instance
|
||||||
|
- targetLabel: "job"
|
||||||
|
replacement: "node-exporter"
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
targetLabel: node
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
39
templates/ServiceMonitor-ceph.yaml
Normal file
39
templates/ServiceMonitor-ceph.yaml
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
---
|
||||||
|
{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}}
|
||||||
|
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/name: ceph
|
||||||
|
app.kubernetes.io/component: storage
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
name: ceph
|
||||||
|
namespace: "{{ .Values.nfc_monitoring.additions.ceph.namespace }}"
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 15s
|
||||||
|
port: http-metrics
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_label_app_kubernetes_io_part_of
|
||||||
|
targetLabel: instance
|
||||||
|
- sourceLabels:
|
||||||
|
- __metrics_path__
|
||||||
|
targetLabel: metrics_path
|
||||||
|
- targetLabel: "job"
|
||||||
|
replacement: "ceph"
|
||||||
|
scheme: http
|
||||||
|
targetLabels:
|
||||||
|
- cluster
|
||||||
|
selector:
|
||||||
|
{{ .Values.nfc_monitoring.additions.ceph.ServiceMonitor.selector | toYaml | indent 4 }}
|
||||||
|
|
||||||
|
|
||||||
|
{{- end -}}
|
||||||
82
templates/ServiceMonitor-kube-controller-manager.yaml
Normal file
82
templates/ServiceMonitor-kube-controller-manager.yaml
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-controller-manager
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name_kcm: kube-controller-manager
|
||||||
|
name: kube-controller-manager
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 10s
|
||||||
|
honorLabels: true
|
||||||
|
path: /metrics
|
||||||
|
port: kube-ctrl-mgr
|
||||||
|
scheme: https
|
||||||
|
# labels:
|
||||||
|
# job: kube-controller-manager
|
||||||
|
relabelings:
|
||||||
|
# - action: replace
|
||||||
|
# regex: (.*)
|
||||||
|
# replacement: $1
|
||||||
|
# sourceLabels:
|
||||||
|
# - __meta_kubernetes_pod_node_name
|
||||||
|
# targetLabel: instance
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_service_label_name_kcm
|
||||||
|
targetLabel: job
|
||||||
|
metricRelabelings:
|
||||||
|
- action: drop
|
||||||
|
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: transformation_(transformation_latencies_microseconds|failures_total)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_
queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
- action: drop
|
||||||
|
regex: etcd_(debugging|disk|request|server).*
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
36
templates/ServiceMonitor-kube-scheduler.yaml
Normal file
36
templates/ServiceMonitor-kube-scheduler.yaml
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: proxy
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-scheduler
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: kube-scheduler
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 10s
|
||||||
|
honorLabels: true
|
||||||
|
path: /metrics
|
||||||
|
port: kube-scheduler
|
||||||
|
scheme: https
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_service_label_name_ks
|
||||||
|
targetLabel: job
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-monitor-proxy
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
46
templates/ServiceMonitor-kubeStateMetrics.yaml
Normal file
46
templates/ServiceMonitor-kubeStateMetrics.yaml
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
name: kube-state-metrics
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
honorLabels: true
|
||||||
|
interval: 30s
|
||||||
|
metricRelabelings:
|
||||||
|
- action: drop
|
||||||
|
regex: kube_endpoint_address_not_ready|kube_endpoint_address_available
|
||||||
|
sourceLabels:
|
||||||
|
- __name__
|
||||||
|
port: https-main
|
||||||
|
relabelings:
|
||||||
|
- action: labeldrop
|
||||||
|
regex: (pod|service|endpoint|namespace)
|
||||||
|
scheme: https
|
||||||
|
scrapeTimeout: 30s
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 30s
|
||||||
|
port: https-self
|
||||||
|
scheme: https
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
jobLabel: app.kubernetes.io/name
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/name: kube-state-metrics
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
44
templates/ServiceMonitor-node-exporter.yaml
Normal file
44
templates/ServiceMonitor-node-exporter.yaml
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
---
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
labels:
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: node-exporter
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
|
name: node-exporter
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
|
||||||
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
|
interval: 5s
|
||||||
|
honorLabels: true
|
||||||
|
path: /integrations/node_exporter/metrics
|
||||||
|
port: grafana-metrics
|
||||||
|
scheme: http
|
||||||
|
relabelings:
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
targetLabel: instance
|
||||||
|
- targetLabel: "job"
|
||||||
|
replacement: "node-exporter"
|
||||||
|
- action: replace
|
||||||
|
regex: (.*)
|
||||||
|
replacement: $1
|
||||||
|
sourceLabels:
|
||||||
|
- __meta_kubernetes_pod_node_name
|
||||||
|
targetLabel: node
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app.kubernetes.io/instance: k8s
|
||||||
|
app.kubernetes.io/component: exporter
|
||||||
|
app.kubernetes.io/name: grafana-agent
|
||||||
|
app.kubernetes.io/part-of: {{ $.Chart.Name }}
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user