170 Commits

Author SHA1 Message Date
d25cee3c8c Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!28
2025-02-26 13:09:42 +00:00
05bd0cd6d4 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "6f8dfcba0b25313b59bc17b4c99d674fcedd207a)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "224ef831571458ad433a0143eec00df0f7e8b409)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 224ef831571458ad433a0143eec00df0f7e8b409

MR !28
2025-02-26 13:08:57 +00:00
ac6b7ef31e Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!25
2024-08-19 07:25:36 +00:00
6b44c1df69 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "58ffcabbfb503af3e57d9cb3ab43931b23dc4cd8)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "6f8dfcba0b25313b59bc17b4c99d674fcedd207a)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 6f8dfcba0b25313b59bc17b4c99d674fcedd207a

MR !25
2024-08-19 07:24:49 +00:00
d5deb022c5 Merge branch 'automated-tasks' into 'development'
chore(website-template): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!24
2024-08-01 06:09:00 +00:00
21c5e87c85 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "673441f83a7d943434252ee23899e3572cdfb141)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "58ffcabbfb503af3e57d9cb3ab43931b23dc4cd8)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 58ffcabbfb503af3e57d9cb3ab43931b23dc4cd8

MR !24
2024-08-01 06:08:17 +00:00
6ac6ced4bb chore(git): updated submodule website-template
Automation Data:
{
    "branch": "development",
    "current_commit": "92c4b16a14524e7b5b18171d4e21b72676c36fbf)",
    "name": "website-template",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/website-template",
    "remote_head": "8735f623dc5e1d9c9e46e50db03b4c41cb3d1efd)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/infrastructure/website-template.git"
}

Changes: Submodule path website-template: checked out 8735f623dc5e1d9c9e46e50db03b4c41cb3d1efd

MR !24
2024-06-30 18:03:35 +00:00
39f26c8f82 Merge branch 'automated-tasks' into 'development'
chore(website-template): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!23
2024-06-30 17:48:54 +00:00
13e60d0a76 chore(git): updated submodule website-template
Automation Data:
{
    "branch": "development",
    "current_commit": "f5a82d3604faca56756eec91acee28ff89defd1d)",
    "name": "website-template",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/website-template",
    "remote_head": "92c4b16a14524e7b5b18171d4e21b72676c36fbf)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/infrastructure/website-template.git"
}

Changes: Submodule path website-template: checked out 92c4b16a14524e7b5b18171d4e21b72676c36fbf

MR !23
2024-06-30 17:48:10 +00:00
22d3308464 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!22
2024-06-30 17:46:48 +00:00
a83648e2ac chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "a24f352ca3d82b8d0f02f5db20173fe2c3f71a4a)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "673441f83a7d943434252ee23899e3572cdfb141)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 673441f83a7d943434252ee23899e3572cdfb141

MR !22
2024-06-30 17:46:02 +00:00
b01c6bbb06 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!21
2024-03-16 11:35:32 +00:00
d6e21083c9 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "9afa68d1f3849e491fa8ca034749388808531b74)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "a24f352ca3d82b8d0f02f5db20173fe2c3f71a4a)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out a24f352ca3d82b8d0f02f5db20173fe2c3f71a4a

MR !21
2024-03-16 11:34:52 +00:00
bfa20b6a09 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!20
2024-03-14 12:47:17 +00:00
d6cf67b930 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "41eeb7badd582175b371cd4a5b2192decbcb0210)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "9afa68d1f3849e491fa8ca034749388808531b74)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 9afa68d1f3849e491fa8ca034749388808531b74

MR !20
2024-03-14 12:46:20 +00:00
5aea6f620e Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!19
2024-02-25 09:32:31 +00:00
3a31680ab5 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "6f80ea3af7fdc64e9998820a8800c288d7facbc6)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "41eeb7badd582175b371cd4a5b2192decbcb0210)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 41eeb7badd582175b371cd4a5b2192decbcb0210

MR !19
2024-02-25 09:31:52 +00:00
e22e98fd08 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!18
2024-02-24 06:31:22 +00:00
78ff4066ea chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "4f65bc1367585146490637dfc7c57c987216e652)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "6f80ea3af7fdc64e9998820a8800c288d7facbc6)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 6f80ea3af7fdc64e9998820a8800c288d7facbc6

MR !18
2024-02-24 06:30:29 +00:00
3b33cf43d8 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!17
2024-02-23 09:07:43 +00:00
e5bc593611 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "8094694d43449f1d17b763e215485b2950e6b6b4)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "4f65bc1367585146490637dfc7c57c987216e652)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 4f65bc1367585146490637dfc7c57c987216e652

MR !17
2024-02-23 09:06:59 +00:00
e4a98648e2 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!16
2024-02-23 02:52:07 +00:00
8d1ad238e4 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "34c81c98494b5ce448f4da4e645952439c897906)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "8094694d43449f1d17b763e215485b2950e6b6b4)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 8094694d43449f1d17b763e215485b2950e6b6b4

MR !16
2024-02-23 02:51:26 +00:00
2bfa45d5a3 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!15
2024-02-22 09:54:06 +00:00
a33a0514d7 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "c7c966bbee4fefc044d4c58e60dd5f10ec63862b)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "34c81c98494b5ce448f4da4e645952439c897906)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 34c81c98494b5ce448f4da4e645952439c897906

MR !15
2024-02-22 09:53:24 +00:00
ea59c866d6 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!14
2024-02-22 08:35:20 +00:00
2bdde14a5a chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "74ac15925c75ba0178ae21932b02b6a90a9169c3)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "c7c966bbee4fefc044d4c58e60dd5f10ec63862b)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out c7c966bbee4fefc044d4c58e60dd5f10ec63862b

MR !14
2024-02-22 08:34:39 +00:00
07be00f24c Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!13
2024-02-22 08:05:01 +00:00
452087a111 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "e046f9ea49a617ce91ff2eda53b897f798dfb810)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "74ac15925c75ba0178ae21932b02b6a90a9169c3)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 74ac15925c75ba0178ae21932b02b6a90a9169c3

MR !13
2024-02-22 08:04:18 +00:00
df7917aef8 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!12
2024-02-22 06:31:22 +00:00
dbacb1794c chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "d29064f1490073599518b629c7bf6585b48c8736)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "e046f9ea49a617ce91ff2eda53b897f798dfb810)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out e046f9ea49a617ce91ff2eda53b897f798dfb810

MR !12
2024-02-22 06:30:38 +00:00
eecb42a4e5 Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!11
2024-02-17 04:50:21 +00:00
d53d28314e chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "02252db664a428e83fb9ae24662b56b53e615989)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "d29064f1490073599518b629c7bf6585b48c8736)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out d29064f1490073599518b629c7bf6585b48c8736

MR !11
2024-02-17 04:49:41 +00:00
0a0d37e44d Merge branch 'automated-tasks' into 'development'
chore(gitlab-ci): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!10
2024-02-15 13:49:09 +00:00
65e247958c chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "bea398200f838736c6111a399478667df37435cb)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "02252db664a428e83fb9ae24662b56b53e615989)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out 02252db664a428e83fb9ae24662b56b53e615989

MR !10
2024-02-15 13:48:27 +00:00
fce97d5aa2 Merge branch 'automated-tasks' into 'development'
chore(website-template): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!9
2024-02-08 05:32:49 +00:00
b2d3cad87d chore(git): updated submodule website-template
Automation Data:
{
    "branch": "development",
    "current_commit": "2bcc17652babd4027e7245c6367841e2580ec317)",
    "name": "website-template",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/website-template",
    "remote_head": "f5a82d3604faca56756eec91acee28ff89defd1d)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/infrastructure/website-template.git"
}

Changes: Submodule path website-template: checked out f5a82d3604faca56756eec91acee28ff89defd1d

MR !9
2024-02-08 05:32:09 +00:00
Jon
496c7637c3 Merge branch 'feat-refine-deployment-options' into 'development'
feat: refine deployment options

See merge request nofusscomputing/projects/kubernetes_monitoring!8
2024-02-07 08:32:40 +00:00
Jon
99e503324d feat(grafana_dashboard): add cert manager dashboard
!8
2024-02-05 16:01:08 +09:30
Jon
486f2c4728 refactor(service_monitor): use job name prometheus for prometheus
required for built-in grafana dashboards to work

!8
2024-02-05 15:11:49 +09:30
Jon
a2c3daa44e fix(service_monitor): correct syntax for calico
!8
2024-02-05 14:17:27 +09:30
Jon
faf4abf6b3 feat(grafana_dashboard): add calico dashboard
!8
2024-02-05 14:16:28 +09:30
Jon
cd2b99dd3d fix(service_monitor): don't deploy calico unless enabled
!8
2024-02-05 14:15:53 +09:30
Jon
efd6d15dc4 fix(role_binding): use namespace lookup to build role bindings
!8
2024-02-05 13:37:10 +09:30
Jon
f08cba1dfb feat(grafana): update to latest version 10.3.1
!8
2024-02-05 13:36:30 +09:30
Jon
49bf414caa refactor(network_policy): move network policy to template
!8
2024-02-05 11:31:39 +09:30
Jon
36ee3a10ff feat(role): dynamically add roles to all available namespaces
!8
2024-02-05 10:38:28 +09:30
Jon
6a20b69910 docs(values): note remote write configured separately
!8
2024-02-05 10:25:52 +09:30
Jon
1fd5e49247 refactor(prometheus_rule): correct common rules name
!8
2024-02-04 20:26:39 +09:30
Jon
5323377852 fix(prometheus): use alertmanager instance name
!8
2024-02-04 20:24:46 +09:30
Jon
4e8f25ec3d refactor(prometheus_rule): move watchdog and info inhibitor to common rules file
these rules are for all metrics rules

!8
2024-02-04 20:22:58 +09:30
Jon
cb12f338f1 fix(prometheus_rule): don't deploy loki rules if not configured
!8
2024-02-04 20:12:45 +09:30
Jon
39af78c6ea fix(prometheus_rule): use instance name instead of hard coded value
!8
2024-02-04 20:05:28 +09:30
Jon
73f25cfaa2 feat(prometheus_rule): add node exporter absent alert for ALL nodes
!8
2024-02-04 19:58:15 +09:30
Jon
beaa4f4896 fix(calico): use the operator created ns for monitoring
!8
2024-02-04 17:33:16 +09:30
Jon
e8b4b5a00b fix(ingress): move from values to template for dynomagic setup
!8
2024-02-04 17:07:31 +09:30
Jon
490e497d15 refactor: remove k8s and use release name for instance
!8
2024-02-04 17:06:39 +09:30
Jon
38f08985f5 fix(values): fix service_monitor variable
!8
2024-02-04 16:07:49 +09:30
Jon
c7746122cd fix(prometheus): service monitor missing close
!8
2024-02-04 15:56:50 +09:30
Jon
57a1706590 chore: set values.yaml to have sensible defaults
!8
2024-02-04 15:55:16 +09:30
Jon
bdb555a4b5 feat: turn off k8s non-metrics server deployed features
!8
2024-02-04 15:51:28 +09:30
Jon
bdb3a09c2b feat(kube_monitor_proxy): don't deploy if not enabled
!8
2024-02-04 15:49:43 +09:30
Jon
9c35a4d140 feat(kube_state_metrics): don't deploy if not enabled
!8
2024-02-04 15:38:19 +09:30
Jon
bc4d72ff8e feat(service_monitor): don't deploy if not enabled
!8
2024-02-04 15:33:50 +09:30
Jon
e13d55e61e feat(grafana_agent): don't enable if not configured
!8
2024-02-04 15:08:53 +09:30
Jon
257da9cd38 fix(prometheus): configurable image and tag
!8
2024-02-04 15:07:13 +09:30
Jon
106f2e6ec8 feat(prometheus): configurable remote write
!8
2024-02-04 15:06:44 +09:30
Jon
f1c54567a7 feat(prometheus): configurable replicas
!8
2024-02-04 15:06:16 +09:30
Jon
cd2bceec3a feat(thanos_sidecar): if enabled, must also be configured for deploy to occur
!8
2024-02-04 15:04:24 +09:30
Jon
18649086b5 feat(alertmanager): instance name set to pod name
!8
2024-02-04 15:02:34 +09:30
Jon
e0cb8f57e2 feat(grafana): enabled/disabled configurable
!8
2024-02-04 15:02:00 +09:30
Jon
c5bb46f48a feat(alertmanager): configurable replicas
!8
2024-02-04 14:58:08 +09:30
103a529184 Merge branch 'automated-tasks' into 'development'
chore(website-template): Automated update of git sub-module

See merge request nofusscomputing/projects/kubernetes_monitoring!7
2024-02-02 13:12:16 +00:00
51a187bb75 chore(git): updated submodule gitlab-ci
Automation Data:
{
    "branch": "development",
    "current_commit": "a5a9fa44374107657b2587ce52607d96a825be56)",
    "name": "gitlab-ci",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/gitlab-ci",
    "remote_head": "bea398200f838736c6111a399478667df37435cb)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/projects/gitlab-ci.git"
}

Changes: Submodule path gitlab-ci: checked out bea398200f838736c6111a399478667df37435cb

MR !7
2024-02-02 13:11:36 +00:00
f613ba29cd chore(git): updated submodule website-template
Automation Data:
{
    "branch": "development",
    "current_commit": "992b54805b8b6c78a3d2a5ea7de71c7be2b070c8)",
    "name": "website-template",
    "path": "/builds/nofusscomputing/projects/kubernetes_monitoring/_automation_/website-template",
    "remote_head": "2bcc17652babd4027e7245c6367841e2580ec317)",
    "remote_name": "origin",
    "url": "https://gitlab.com/nofusscomputing/infrastructure/website-template.git"
}

Changes: Submodule path website-template: checked out 2bcc17652babd4027e7245c6367841e2580ec317

MR !7
2024-02-02 13:11:27 +00:00
Jon
3eafca6c41 feat(nfc_automation): add automation file
!5
2024-02-02 22:32:19 +09:30
f406e891b7 build(version): bump version 0.2.0rc0 → 0.2.0 2023-11-19 15:09:48 +00:00
Jon
61289121e2 Merge branch '2-fix-multiple-issues' into 'development'
fix: multiple

See merge request nofusscomputing/projects/kubernetes_monitoring!5
2023-11-19 14:17:02 +00:00
Jon
11563741a9 fix(ingress): underscore removed from ingress name
!5 nofusscomputing/infrastructure/configuration-management/inventory-production!3
2023-11-19 18:40:37 +09:30
Jon
33883ec640 fix(ingress): function bool not defined
!5 nofusscomputing/infrastructure/configuration-management/inventory-production!3
2023-11-19 18:31:28 +09:30
Jon
562bb7d6f5 feat(prometheus): added config for ingress
!5 nofusscomputing/infrastructure/configuration-management/inventory-production!3
2023-11-19 18:03:17 +09:30
Jon
26a78bdb33 feat(grafana): added config for ingress
!5 nofusscomputing/infrastructure/configuration-management/inventory-production!3
2023-11-19 18:02:38 +09:30
Jon
65534660dd feat(alert_manager): added config for ingress
!5 nofusscomputing/infrastructure/configuration-management/inventory-production!3
2023-11-19 18:00:22 +09:30
Jon
593d73e4c6 fix(grafana): PVC access mode default to ReadWriteOnce
option is also configurable

!5 nofusscomputing/infrastructure/configuration-management/project-production!4
2023-11-19 15:01:15 +09:30
Jon
b6d0b18ece fix(prometheus): use correct variable for storage values
!5 nofusscomputing/infrastructure/configuration-management/project-production!4
2023-11-19 13:24:49 +09:30
b438599db4 build(version): bump version 0.1.0 → 0.2.0rc0 2023-11-06 08:22:43 +00:00
Jon
6f83fb5406 Merge branch 'bits-and-bobs' into 'development'
chore: randomz

See merge request nofusscomputing/projects/kubernetes_monitoring!4
2023-11-06 08:19:53 +00:00
Jon
b047f33e27 feat(Grafana): Configure grafana from values.yaml
!4
2023-09-30 12:14:47 +09:30
Jon
0c296c173e feat(GrafanaAgent): Network Policy added
!4
2023-09-29 19:11:16 +09:30
Jon
7164bb2e0b feat(grafana): Network Policy added
!4
2023-09-29 19:10:53 +09:30
Jon
ae35e704e3 feat(prometheus): Network Policy added
!4
2023-09-29 19:10:31 +09:30
Jon
5c62055d2f feat(NetworkPolicy): configure from values.yaml
!4
2023-09-29 19:09:50 +09:30
Jon
03affbee80 feat(labels): labels moved to values.yaml
!4
2023-09-29 19:08:48 +09:30
Jon
65b61ecf54 feat(graph): add ceph size/used to dashboard
!4
2023-09-29 10:57:28 +09:30
Jon
cf38a4156a refactor(grafana_datasources): default prom set to thanos
!4
2023-09-28 09:09:53 +09:30
Jon
7afc6aa515 feat(prometheus): rule selector configurable from values.yaml
!4
2023-09-28 09:08:26 +09:30
Jon
2371adcbc5 feat(promtail): split node to separate promtail service monitor
this allows the mixins to work, specifically promtail

!4
2023-09-28 09:07:11 +09:30
Jon
f73a9e462e feat(grafana_agent): add prom rules
!4
2023-09-28 09:05:01 +09:30
Jon
5bb7197129 feat(prometheus): tsdb retention set
!4
2023-09-28 02:06:06 +09:30
Jon
be0161876d feat(thanos): prometheus rules added for sidecar
!4
2023-09-28 02:06:06 +09:30
014156eb05 build(version): bump version 0.0.1 → 0.1.0 2023-09-27 15:36:31 +00:00
Jon
376cf3ae9f Merge branch '1-metrics-long-term-storage' into 'development'
feat(metrics): long term storage injection

Closes #1

See merge request nofusscomputing/projects/kubernetes_monitoring!2
2023-09-27 15:31:08 +00:00
Jon
c76b753618 docs: update
!2
2023-09-28 00:53:48 +09:30
Jon
6dd413a0d4 docs(metrics_storage): added docs on how to configure external metrics routing
!2 closes #1
2023-09-28 00:48:00 +09:30
Jon
ffc0f84669 refactor(dashboard): nfc cluster overview to use source variable
!2
2023-09-28 00:33:43 +09:30
Jon
5add73c411 feat(grafana): DataSources configurable in values.yaml
!2 #1
2023-09-28 00:32:43 +09:30
Jon
239837ecbf feat(prometheus): ability to add additional config settings
settings are pertinent to the kubernetes prometheus object used by the prom operator.

!2 #1
2023-09-28 00:27:49 +09:30
Jon
ac812c2c7b feat(prometheus): thanos sidecar option added
!2 #1
2023-09-28 00:15:40 +09:30
Jon
9b12fcf5bc Merge branch 'genesis' into 'development'
chore: migrate chart to public repo

See merge request nofusscomputing/projects/kubernetes_monitoring!1
2023-09-27 05:24:50 +00:00
Jon
d4e529aec9 docs: add more features
!1
2023-09-27 14:47:19 +09:30
Jon
e7480105f7 feat(grafana_dashboard): nfc custom, cluster overview
!1
2023-09-27 14:39:43 +09:30
Jon
c8ea929873 feat(kyverno): add clusterpolicy role and rolebinding
cluster policy creates the role and rolebindings for prometheus to monitor the ns

!1
2023-09-26 06:27:20 +09:30
Jon
899c6a3d78 refactor(prometheus): adjusted role/rolebinding manifest to loop
!1
2023-09-26 06:25:19 +09:30
Jon
817b838655 refactor(grafana_agent): clean up config file
!1
2023-09-26 01:30:06 +09:30
Jon
b7cfebf92b fix(kubestateproxy): bump proxy cpu limit
!1
2023-09-25 16:49:18 +09:30
Jon
e3648324f2 feat(grafana_agent): attach hostname to logs and metrics
!1
2023-09-25 16:46:21 +09:30
Jon
a26f887fa4 fix(grafana): use named pvc so it's reused
templated pvc was using a different name all the time. this caused left-over
pvcs that were unused. name set so they can be reused

!1
2023-09-25 16:42:45 +09:30
Jon
8f53b0fa07 feat(grafana): sidecar for loading dashboards from configmap
!1
2023-09-25 16:42:45 +09:30
Jon
82cd91a859 docs: added missing attribution
!1
2023-09-25 16:42:45 +09:30
Jon
944d615331 fix(ceph): PromRule CephPGImbalance adjusted to group by node
balancing is done by hostname not osd.

!1
2023-09-25 16:42:45 +09:30
Jon
4198ac78da docs: add blurb on values
!1
2023-09-25 16:42:45 +09:30
Jon
3a4ca30e68 feat(servicemonitor): ceph use cluster name for instance
!1
2023-09-25 16:42:45 +09:30
Jon
e2540a955f feat(servicemonitor): for prometheus, use pod name for instance
!1
2023-09-25 16:42:45 +09:30
Jon
89f0feae08 fix(grafana): don't use operator readiness probe
operator defines endpoint :3000/api/health, which fails with invalid argument

!1
2023-09-25 16:42:45 +09:30
Jon
0d1d0a34d8 feat(grafana): add affinity to values
!1
2023-09-25 16:42:45 +09:30
Jon
83fb30b4ed feat(prometheus): add storage to values
!1
2023-09-25 16:42:45 +09:30
Jon
7add74fbb1 refactor: yaml object ordering
!1
2023-09-25 16:42:45 +09:30
Jon
c742eea38d feat(prometheus_adaptor): add affinity to values
!1
2023-09-25 16:42:45 +09:30
Jon
836cc111e0 feat(prometheus): add affinity to values
!1
2023-09-25 16:42:45 +09:30
Jon
d189dfe0ee chore(versions): update image versions alertmanager,grafana
!1
2023-09-25 16:42:45 +09:30
Jon
2331bcbba3 feat(logging): add log files from /var/log
!1
2023-09-25 16:42:45 +09:30
Jon
8b966b0f0a feat(grafana-agent): expand env vars in config
this allows using the hosts env vars in the config file

!1
2023-09-21 18:55:49 +09:30
Jon
3281cd6552 refactor(prometheus): use values ns var
!1
2023-09-21 12:53:34 +09:30
Jon
3f2204cf31 feat(deployment): grafana-agent dnspolicy clusterfirst
!1
2023-09-21 01:02:41 +09:30
Jon
cbac640af3 feat(promtail): filter devices to not include temp/virtual
!1
2023-09-21 01:02:00 +09:30
Jon
64d612047f refactor: use loki ns var
!1
2023-09-21 01:01:25 +09:30
Jon
4e742ee2d5 feat(metrics): node scraper set to 5s
!1
2023-09-21 01:00:53 +09:30
Jon
4ced83b90d feat(loki): be able to specify full loki url 2023-09-19 23:20:57 +09:30
Jon
a28126555f feat(loki): add prometheus alerts and rules for mixins
!1
2023-09-19 23:19:45 +09:30
Jon
aba3cb22a9 feat(loki): removed service monitor
a service monitor is included with loki

!1
2023-09-19 23:19:14 +09:30
Jon
63c0381bd1 feat(metrics): Calico service and monitor added
!1
2023-09-19 23:18:27 +09:30
Jon
9e3f33ae56 feat(node): added dashboard
!1
2023-09-19 19:02:32 +09:30
Jon
1dc85b4518 feat(ceph): added dashboard
!1
2023-09-19 19:02:17 +09:30
Jon
a615b4922f feat(alertmanager): added dashboard
!1
2023-09-19 19:02:04 +09:30
Jon
6ffcc76acf docs: added initial docs
!1
2023-09-19 17:01:02 +09:30
Jon
8f49ab08b4 feat(grafana): loki data source
to enable use feature flag in values

!1
2023-09-19 16:29:07 +09:30
Jon
6c8298b0bb feat(metrics): loki service monitor
to enable use feature flag in values

!1
2023-09-19 16:14:54 +09:30
Jon
16965c7304 feat(metrics): ceph prometheus rules
to enable use feature flag in values

!1
2023-09-19 16:11:21 +09:30
Jon
fe2cc1352a feat(metrics): ceph service monitor
to enable use feature flag in values

!1
2023-09-19 16:03:13 +09:30
Jon
38730de0a7 chore: initial chart and values yaml
!1
2023-09-19 15:25:36 +09:30
Jon
9880c5d988 feat(metrics): Kubeprometheus Prometheus rules
!1
2023-09-19 15:23:42 +09:30
Jon
b5d7a24d28 feat(metrics): Kubernetes control plane Prometheus rules
!1
2023-09-19 15:23:22 +09:30
Jon
fe28c6a24c feat(metrics): Node-Exporter Prometheus rules
!1
2023-09-19 15:22:55 +09:30
Jon
59316081cd feat(metrics): Kubelet service monitor
!1
2023-09-19 15:22:15 +09:30
Jon
8699d66347 feat(metrics): CoreDNS service monitor
!1
2023-09-19 15:22:03 +09:30
Jon
bf44961e02 feat(metrics): CAdvisor service monitor
!1
2023-09-19 15:21:50 +09:30
Jon
8631e56028 feat(metrics): api service monitor
!1
2023-09-19 15:21:25 +09:30
Jon
53151ac6b9 feat(grafana): alert manager data source
!1
2023-09-19 15:20:42 +09:30
Jon
762a9fa387 feat: proxy deployment for kube scheduler controller metrics
!1
2023-09-19 15:19:47 +09:30
Jon
81136232f7 feat: kubestate metrics deployment
!1
2023-09-19 15:17:44 +09:30
Jon
534377edbd feat: alert manager deployment
!1
2023-09-19 15:16:14 +09:30
Jon
bb05212755 feat(service_monitor): use grafana agent for node node-exporter
!1
2023-09-19 15:13:11 +09:30
Jon
955515de35 feat(service_monitor): use grafana agent for node exporter
!1
2023-09-19 15:12:55 +09:30
Jon
e0bc34c12f feat: grafana agent deployment
!1
2023-09-19 15:12:04 +09:30
Jon
3865028541 feat(grafana): prometheus rules
!1
2023-09-19 15:09:59 +09:30
Jon
5d808ee753 feat: prometheus adaptor deployment
!1
2023-09-19 15:09:20 +09:30
Jon
8fff819081 feat: prometheus deployment
!1
2023-09-19 15:07:49 +09:30
Jon
286d2827ad feat: grafana deployment
!1
2023-09-19 15:05:49 +09:30
Jon
0feb8794b2 ci: typo in github url
!1
2023-09-19 15:00:58 +09:30
Jon
df126a576d feat: initial template, docs and ci
!1
2023-09-19 14:58:58 +09:30
113 changed files with 9767 additions and 0 deletions

.cz.yaml Normal file

@@ -0,0 +1,7 @@
commitizen:
  bump_message: "build(version): bump version $current_version \u2192 $new_version"
  changelog_incremental: false
  name: cz_conventional_commits
  tag_format: $major.$minor.$patch$prerelease
  update_changelog_on_bump: true
  version: 0.2.0
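These are standard commitizen settings; the `build(version): bump version …` commits in the log above are the output of its bump command. A minimal sketch of that workflow, assuming the `commitizen` Python package is installed:

```bash
# from the repository root: reads .cz.yaml, computes the next version from
# conventional commits, tags it using tag_format, and (because
# update_changelog_on_bump is true) regenerates CHANGELOG.md
pip install commitizen
cz bump --changelog
```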

.gitlab-ci.yml Normal file

@@ -0,0 +1,35 @@
variables:
  MY_PROJECT_ID: "50510268"
  GIT_SYNC_URL: "https://$GITHUB_USERNAME_ROBOT:$GITHUB_TOKEN_ROBOT@github.com/NoFussComputing/kubernetes_monitoring.git"
  PAGES_ENVIRONMENT_PATH: projects/kubernetes_monitoring/

include:
  - project: nofusscomputing/projects/gitlab-ci
    ref: development
    file:
      - template/automagic.gitlab-ci.yaml
  #- template: Jobs/Container-Scanning.gitlab-ci.yml # see https://gitlab.com/gitlab-org/gitlab/-/issues/381665

Website.Submodule.Deploy:
  extends: .submodule_update_trigger
  variables:
    SUBMODULE_UPDATE_TRIGGER_PROJECT: nofusscomputing/infrastructure/website
  environment:
    url: https://nofusscomputing.com/$PAGES_ENVIRONMENT_PATH
    name: Documentation
  rules:
    - if: # condition_dev_branch_push
        $CI_COMMIT_BRANCH == "development" &&
        $CI_PIPELINE_SOURCE == "push"
      exists:
        - '{docs/**,pages/**}/*.md'
      changes:
        paths:
          - '{docs/**,pages/**}/*.md'
        compare_to: 'master'
      when: always
    - when: never
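Most of the pipeline logic lives in the included `automagic.gitlab-ci.yaml` template, so edits to this file are easiest to sanity-check with GitLab's CI lint before pushing. A sketch using the `glab` CLI (one option among several; the project's own workflow is not documented here, and the web UI's CI Lint tool works equally well):

```bash
# validate .gitlab-ci.yml for the current project, resolving includes
glab ci lint
```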

.gitmodules vendored Normal file

@@ -0,0 +1,8 @@
[submodule "gitlab-ci"]
	path = gitlab-ci
	url = https://gitlab.com/nofusscomputing/projects/gitlab-ci.git
	branch = development
[submodule "website-template"]
	path = website-template
	url = https://gitlab.com/nofusscomputing/infrastructure/website-template.git
	branch = development
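With both submodules pinned to their `development` branches, a fresh checkout needs them initialised; the automated `chore(git): updated submodule …` commits above are what move these pins forward. A typical clone, as a sketch (the clone URL is inferred from the merge-request paths above):

```bash
# clone the chart together with its submodules
git clone --recurse-submodules https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring.git

# or, in an existing checkout, sync submodules to their pinned commits
git submodule update --init --recursive
```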

.helmignore Normal file

@@ -0,0 +1,30 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
# git repo files
monitoring-mixins/
ceph-mixins/
gitlab-ci/
website-template/
docs/

.nfc_automation.yaml Normal file

@@ -0,0 +1,10 @@
---
role_git_conf:
  gitlab:
    submodule_branch: "development"
    default_branch: development
    mr_labels: ~"type::automation" ~"impact::0" ~"priority::0"
    auto_merge: true
    merge_request:
      patch_labels: '~"code review::not started"'

CHANGELOG.md Normal file

@@ -0,0 +1,119 @@
## 0.2.0 (2023-11-19)
### Bug Fixes
- **ingress**: [11563741](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/11563741a90dc81ddf043cc529fa13916bee9082) - underscore removed from ingress name [ [!5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/5) [!3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/3) ]
- **ingress**: [33883ec6](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/33883ec64025d8a2ab81fec50019d2605776ea07) - function bool not defined [ [!5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/5) [!3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/3) ]
- **grafana**: [593d73e4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/593d73e4c6e5c029d592adc8af5173c92d6a6fb0) - PVC access mode default to ReadWriteOnce [ [!5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/5) [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **prometheus**: [b6d0b18e](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/b6d0b18ece17dcd2733c6892cba7e8e83575fab0) - use correct variable for storage values [ [!5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/5) [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
### Features
- **prometheus**: [562bb7d6](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/562bb7d6f57f52b3cf4fcaf19a15a8eed492d15e) - added config for ingress [ [!5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/5) [!3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/3) ]
- **grafana**: [26a78bdb](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/26a78bdb3373cbed1d78785ded47e3dfae28c853) - added config for ingress [ [!5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/5) [!3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/3) ]
- **alert_manager**: [65534660](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/65534660dda9b10761bf6067abc3297bec75a182) - added config for ingress [ [!5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/5) [!3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/3) ]
## 0.2.0rc0 (2023-11-06)
### Code Refactor
- **grafana_datasources**: [cf38a415](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/cf38a4156adcbdda150fe309f1f7dd97a5a3bf07) - default prom set to thanos [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
### Features
- **Grafana**: [b047f33e](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/b047f33e275d2e908f7e8a220944573f2e367bf7) - Configure grafana from values.yaml [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **GrafanaAgent**: [0c296c17](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/0c296c173e786c77504fabba337b464bcde6c290) - Network Policy added [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **grafana**: [7164bb2e](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/7164bb2e0b5f2c5e7fb59bcb64321eabb651b08e) - Network Policy added [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **prometheus**: [ae35e704](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/ae35e704e33bee4d2fea3390a7965a087e30acce) - Network Policy added [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **NetworkPolicy**: [5c62055d](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/5c62055d2f2041b33a2ab5ff8023cf8bd6b08df1) - configure from values.yaml [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **labels**: [03affbee](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/03affbee809d9e36f0930ae49ab07191e2b645a2) - labels moved to values.yaml [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **graph**: [65b61ecf](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/65b61ecf545800bcbf4efc7e3175951b9c6d2965) - add ceph size/used to dashboard [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **prometheus**: [7afc6aa5](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/7afc6aa515db4a43ce8dc1a152f45f400fc68a39) - rule selector configurable from values.yaml [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **promtail**: [2371adcb](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/2371adcbc5d64a27ff72b0ac45f589ccef4eb400) - split node to separate promtail service monitor [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **grafana_agent**: [f73a9e46](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/f73a9e462e16dffbc0e17eaed3c0c78aff95d52b) - add prom rules [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **prometheus**: [5bb71971](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/5bb71971292b8f760e9b348652dc9df3c0dfa921) - tsdb retention set [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
- **thanos**: [be016187](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/be0161876d6b6868204af06255a01f08cc62c6ea) - prometheus rules added for sidecar [ [!4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/4) ]
## 0.1.0 (2023-09-27)
### Bug Fixes
- **kubestateproxy**: [b7cfebf9](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/b7cfebf92b3983f70df3439a516e0e355a505332) - bump proxy cpu limit [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana**: [a26f887f](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/a26f887fa41ef0eb0ff6ade8584335e228442d6a) - use named pvc so it's reused [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **ceph**: [944d6153](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/944d6153312703f41f5d579b3326684c0f7312a7) - PromRule CephPGImbalance adjusted to group by node [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana**: [89f0feae](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/89f0feae08bde0e1545c492b58dd1fbdb9ac3a45) - don't use operator readiness probe [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
### Code Refactor
- **dashboard**: [ffc0f846](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/ffc0f8466912b5598d7df9078ce30c8a3424fc34) - nfc cluster overview to use source variable [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) ]
- **prometheus**: [899c6a3d](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/899c6a3d786e147024bd63464e89637a7180b909) - adjusted role/rolebinding manifest to loop [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana_agent**: [817b8386](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/817b83865555cbfa49dad03b2beffa2a93c45124) - clean up config file [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [7add74fb](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/7add74fbb1cae8613df3a7405889198c78b091f0) - yaml object ordering [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **prometheus**: [3281cd65](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3281cd6552339ee16c7ae94ae9676f26c3d8000e) - use values ns var [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [64d61204](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/64d612047f35349d149c8c0d4d1732908284ca9f) - use loki ns var [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
### Continuous Integration
- [0feb8794](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/0feb8794b2bcc41357ff8d8bf30f1856eb193e7d) - typo in github url [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
### Documentation / Guides
- [c76b7536](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/c76b753618b19c8fe088bc828247eb16db564c60) - update [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) ]
- **metrics_storage**: [6dd413a0](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/6dd413a0d42cdaf53bd3034645be60eb0b388afe) - added docs on how to configure external metrics routing [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
- [d4e529ae](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/d4e529aec9c9d9d7a0982c061f44f853de740fd6) - add more features [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [82cd91a8](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/82cd91a8596afca592292840cd6073ba5cb1e18b) - added missing attribution [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [4198ac78](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/4198ac78da4733b36b94587dce9fd0a19664106d) - add blurb on values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [6ffcc76a](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/6ffcc76acf44e3fa124d21a5f2c3b6d169d342b3) - added initial docs [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
### Features
- **grafana**: [5add73c4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/5add73c411a595848d4c0714bab9aac3dc034118) - DataSources configurable in values.yaml [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
- **prometheus**: [239837ec](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/239837ecbf8f5b233fcec418d063cd16b930a4f1) - ability to add additional config settings [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
- **prometheus**: [ac812c2c](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/ac812c2c7ba6de7dd674088c5c5e2ebe29dc14a6) - thanos sidecar option added [ [!2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/2) [#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1) ]
- **grafana_dashboard**: [e7480105](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e7480105f7d0397bb20656a1d9c623461e408a6b) - nfc custom, cluster overview [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **kyverno**: [c8ea9298](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/c8ea92987318f10a5c14af59ac4d45bc8a549061) - add clusterpolicy role and rolebinding [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana_agent**: [e3648324](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e3648324f234c82202999e3198ba3a493384c1e4) - attach hostname to logs and metrics [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana**: [8f53b0fa](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8f53b0fa07eef6153e4583f6fe86f3f7fd5027f4) - sidecar for loading dashboards from configmap [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **servicemonitor**: [3a4ca30e](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3a4ca30e68c85758fbfa1325a7fc6c17c81926ad) - ceph use cluster name for instance [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **servicemonitor**: [e2540a95](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e2540a955f08b7dcafa159abb773f095b920738b) - for prometheus, use pod name for instance [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana**: [0d1d0a34](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/0d1d0a34d8738ae408226479e0249f5cd1873551) - add affinity to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **prometheus**: [83fb30b4](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/83fb30b4ed757d50b9f53bf3eba3de9cc38e0b55) - add storage to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **prometheus_adaptor**: [c742eea3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/c742eea38d8586f3de42db377f953b35c593966c) - add affinity to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **prometheus**: [836cc111](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/836cc111e092046988aba7b9b1494b1432313add) - add affinity to values [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **logging**: [2331bcbb](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/2331bcbba35d561b22b35f7aa8c19cea087999fd) - add log files from /var/log [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana-agent**: [8b966b0f](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8b966b0f0ac33571128c959ed02d985cc805668f) - expand env vars in config [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **deployment**: [3f2204cf](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3f2204cf3132deea4e4750ec7cf97e512986423e) - grafana-agent dnspolicy clusterfirst [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **promtail**: [cbac640a](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/cbac640af3ffdb16c6b78ecfcf86205b7804e0fd) - filter devices to not include temp/virtual [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [4e742ee2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/4e742ee2d5dab3e2781d62e075f4a8adcf9c6b2a) - node scraper set to 5s [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **loki**: [4ced83b9](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/4ced83b90d98e4805550cb2b5fda55c705e8c8bb) - be able to specify full loki url
- **loki**: [a2812655](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/a28126555f594aa805016c5948151e860e5c93ad) - add prometheus alerts and rules for mixins [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **loki**: [aba3cb22](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/aba3cb22a97d180185eef2350988720542cb1298) - removed service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [63c0381b](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/63c0381bd15b077537667979ec8e3dd2e6503090) - Calico service and monitor added [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **node**: [9e3f33ae](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/9e3f33ae5679e92b1d5097fb808943c59f31aa3b) - added dashboard [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **ceph**: [1dc85b45](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/1dc85b45180b56c7ed8f2a27246f0771f8672939) - added dashboard [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **alertmanager**: [a615b492](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/a615b4922f59024c17056f6f6bc802ff2188c787) - added dashboard [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana**: [8f49ab08](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8f49ab08b4759921b6577384b5456940d5026288) - loki data source [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [6c8298b0](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/6c8298b0bb57e00f7aa7ee0deae7d9175c9a5347) - loki service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [16965c73](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/16965c73046c4a4d463b5a36e246920fcad3f3fe) - ceph prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [fe2cc135](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/fe2cc1352a1bd5dd49f2d3a3ac64b12916ad9dbf) - ceph service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [9880c5d9](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/9880c5d988fce19aa59aa143c3c2c6e9cf5df88b) - Kubeprometheus Prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [b5d7a24d](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/b5d7a24d28f9f418d9bca5e47d64c18cd9309986) - Kubernetes control plane Prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [fe28c6a2](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/fe28c6a24cc786fbcc818f21f6dd971b7776110a) - Node-Exporter Prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [59316081](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/59316081cd02676b25e88c79c99bc142918b8c07) - Kubelet service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [8699d663](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8699d66347106ee85ac775acc625f069494b02b7) - CoreDNS service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [bf44961e](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/bf44961e02aa076f4dc0a5241359372d0e391b37) - CAdvisor service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **metrics**: [8631e560](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8631e56028e05b7c8aab6d239114d1c3b3304862) - api service monitor [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana**: [53151ac6](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/53151ac6b960dff80e6168486ff3a0359745347c) - alert manager data source [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [762a9fa3](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/762a9fa387be7e0cf9919075fd94705940a92a0b) - proxy deployment for kube scheduler controller metrics [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [81136232](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/81136232f773429b2e26ecedc79a2a660b2d9abc) - kubestate metrics deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [534377ed](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/534377edbdafab9c3939288330fc3e845445c898) - alert manager deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **service_monitor**: [bb052127](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/bb0521275581f44f6000829316f0660cde52c5ec) - use grafana agent for node node-exporter [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **service_monitor**: [955515de](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/955515de351d3b725bfbe8b3e7cb5f8edd328cb5) - use grafana agent for node exporter [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [e0bc34c1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/e0bc34c12ff88b8e75806dc238b00e1ef464129b) - grafana agent deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- **grafana**: [38650285](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/3865028541d6023f380745a2549654c651431a1b) - prometheus rules [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [5d808ee7](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/5d808ee753afff17373dd8c4232b00bbf8904f16) - prometheus adaptor deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [8fff8190](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/8fff819081bf0b274f0afe7fe9f5cee6cd85ef18) - prometheus deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [286d2827](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/286d2827ad1c30f95814494425bcda96a8a9874d) - grafana deployment [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
- [df126a57](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/commit/df126a576d6fb81ae7c4caa07154502b5d119f0f) - initial template, docs and ci [ [!1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/merge_requests/1) ]
## 0.0.1 (2023-09-27)

Chart.yaml Normal file

@@ -0,0 +1,26 @@
apiVersion: v2
name: nfc-monitoring
description: |
  This helm chart installs the required components for full stack monitoring.

  The purpose of this chart is to provide all components required to monitor kubernetes in its entirety.

  Storage of the monitoring data is not part of this chart nor will it be included. For this reason this chart can be added and removed without losing your monitoring data.

  For this chart to function correctly the following operators are required:

  - [grafana](https://grafana-operator.github.io/grafana-operator/docs/installation/helm/)
  - [prometheus](https://prometheus-operator.dev/docs/user-guides/getting-started/)

  Full Docs available at http://nofusscomputing.com/projects/kubernetes_monitoring.

type: application
version: 0.1.0
appVersion: "0.1.0"
dependencies: []

@@ -49,6 +49,10 @@ All contributions for this project must be conducted from [Gitlab](https://gitlab.c
For further details on contributing please refer to the [contribution guide](CONTRIBUTING.md).
## Documentation
Documentation for this project can be found at <http://nofusscomputing.com/projects/kubernetes_monitoring>
## Other
This repo is released under this [license](LICENSE)

docs/articles/index.md Normal file
docs/contact.md Normal file
docs/index.md Normal file
docs/operations/index.md Normal file
docs/projects/index.md Normal file

@@ -0,0 +1,201 @@
---
title: Kubernetes Monitoring Helm Chart
description: How to use No Fuss Computing's Kubernetes monitoring helm chart for full stack monitoring.
date: 2023-09-19
template: project.html
about: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring
---
This Helm chart deploys the components needed for full stack monitoring.
What this chart is:
- Collectors for metrics and logging
- Distribution / Routing of metrics/logging
- Data visualization
- Ready for long-term logging storage integration, in particular Loki
- Ready for long-term metrics storage integration
What this chart is not:
- A complete monitoring stack inclusive of long term storage.
Intentionally, we have kept this helm chart to the monitoring components only. Keeping the monitoring components separate from storage is advantageous: for all intents and purposes, this helm chart is ephemeral.
This helm chart started off with components from multiple open-source projects. As such, attribution is warranted, so head on over to their projects and give them a star. Projects used to create the building blocks of this helm chart are:
- [ceph](https://github.com/ceph/ceph) _mixin alerts/rules_
- [Kube prometheus](https://github.com/prometheus-operator/kube-prometheus)
- [prometheus adapter](https://github.com/kubernetes-sigs/prometheus-adapter)
## Features
- Alert Manager
- Grafana as the visualization frontend
- Dashboards
- Ceph
- Cluster Overview
- Node Exporter Full
- Alert Manager
- Sidecar to load dashboards from ConfigMaps _Optional_ (see the example after this list)
- Grafana-Agent
- daemonset for cluster nodes that exposes metrics and ships node and container logs to Loki
- Loki integration for storing logs
- [Mixins](https://github.com/monitoring-mixins/website) compatible _(partial)_, tested with:
- alert-manager
- ceph
- coreDNS
- Kubernetes
- Loki [helm chart](https://artifacthub.io/packages/helm/grafana/loki) _included dashboards, see [loki repo](https://github.com/grafana/loki/tree/f4ab1e3e89ac66e1848764dc17826abde929fdc5/production/loki-mixin-compiled) for dashboards_
- Node-Exporter
- Prometheus
- Promtail
- Prometheus
- Thanos sidecar _Optional_
- Service monitors
- API Server
- cAdvisor
- Calico, _Optional_
> Enable Calico metrics with `kubectl patch felixconfiguration default --type merge --patch '{"spec":{"prometheusMetricsEnabled": true}}'`
- Ceph _Optional_
- CoreDNS
- Grafana
- Kube-Controller-Manager
- Kube-Scheduler
- Kubelet
- Kube-state-metrics
- Node
- Node-Exporter
- Prometheus
- Prometheus-Adaptor
- Promtail
- Thanos
- Kyverno policies _(optional, set in values.yaml)_
- auto deploy policy for prometheus role and rolebinding to access other namespaces
- `values.yaml` customization of components. _See values.yaml for more details._
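As an example of the optional dashboard sidecar noted in the feature list: it watches for ConfigMaps carrying a configurable label (`nfc_monitoring.additions.dashboard_sidecar.label_name` / `label_value`) and loads the contained dashboard JSON into Grafana. Below is a minimal sketch of such a ConfigMap, assuming a configured label of `grafana_dashboard: "1"` and a hypothetical dashboard:

``` yaml
# Sketch only: a dashboard ConfigMap the sidecar could pick up.
# The label name/value below are assumptions; they must match
# nfc_monitoring.additions.dashboard_sidecar.label_name / label_value.
apiVersion: v1
kind: ConfigMap
metadata:
  name: example-dashboard        # hypothetical name
  namespace: grafana             # the namespace the sidecar watches
  labels:
    grafana_dashboard: "1"       # assumed label_name / label_value
data:
  example-dashboard.json: |
    { "title": "Example Dashboard", "panels": [], "schemaVersion": 38 }
```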
## Installation
This chart has the following **dependencies:**
- [Grafana Operator](https://artifacthub.io/packages/olm/community-operators/grafana-operator)
- [Prometheus Operator](https://artifacthub.io/packages/olm/community-operators/prometheus)
These dependencies must be present before installing this chart. To install, run the following commands:
``` bash
git clone -b development https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring.git
helm upgrade -i nfc_monitoring kubernetes_monitoring/
```
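If the operators are not already present, the commands below show one way they might be installed. This is a hedged sketch only, not this project's documented procedure: the chart locations and flags are assumptions, so prefer the operator links above for the authoritative steps.

``` bash
# Sketch only: install the two operator dependencies before this chart.
# Chart sources and flags are assumptions; verify against each operator's docs.

# Grafana Operator (assumed OCI chart published by the grafana-operator project)
helm upgrade -i grafana-operator \
  oci://ghcr.io/grafana/helm-charts/grafana-operator \
  --namespace grafana --create-namespace

# Prometheus Operator (here assumed via kube-prometheus-stack, with its bundled
# components disabled so that only the operator and CRDs are installed)
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm upgrade -i prometheus-operator prometheus-community/kube-prometheus-stack \
  --namespace monitoring --create-namespace \
  --set grafana.enabled=false \
  --set alertmanager.enabled=false \
  --set prometheus.enabled=false
```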
## Metrics Storage
This chart provides two ways to route your metrics to long-term storage:
- Prometheus `remote_write`
- Thanos Side Car container
### Prometheus Remote Write
With this method, Prometheus pushes its metrics to another service for storage; for example, this option could be used to push metrics to Grafana Mimir. To configure it, add the following to the values.yaml file at path `nfc_monitoring.prometheus.additional`:
``` yaml
remoteWrite:
  - name: mimir
    url: http://mimir-gateway.metrics.svc.cluster.local/api/v1/push
```
Ensure that you set the `url` to the correct endpoint for your environment.
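Put together, the relevant fragment of values.yaml would look something like the sketch below (the Mimir URL is the example value from above):

``` yaml
nfc_monitoring:
  prometheus:
    additional:
      remoteWrite:
        - name: mimir
          url: http://mimir-gateway.metrics.svc.cluster.local/api/v1/push
```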
### Thanos Side Car
This method deploys a Thanos container within the Prometheus pod. The container reads the Prometheus container's TSDB and uploads it to the configured storage. To configure it, update the following in values.yaml:
- `nfc_monitoring.thanos.sidecar.enabled` set to bool `true`
- `nfc_monitoring.thanos.sidecar.config` updated to include the sidecar config.
For example, to configure the Thanos sidecar to upload Prometheus metrics to S3 storage, use the following (updating values to suit your environment):
``` yaml
type: S3
config:
  bucket: "thanos-metrics"
  endpoint: "rook-ceph-rgw-earth.ceph.svc:80"
  access_key: "7J5NM2MNCDB4T4Y9OKJ5"
  secret_key: "t9r69RzZdWEBL3NCKiUIpDk6j5625xc6HucusiGG"
```
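For reference, the same settings expressed as a values.yaml fragment; a sketch assuming the object-storage configuration above is placed verbatim under the stated keys:

``` yaml
nfc_monitoring:
  thanos:
    sidecar:
      enabled: true
      config:
        type: S3
        config:
          bucket: "thanos-metrics"
          endpoint: "rook-ceph-rgw-earth.ceph.svc:80"
          access_key: "7J5NM2MNCDB4T4Y9OKJ5"
          secret_key: "t9r69RzZdWEBL3NCKiUIpDk6j5625xc6HucusiGG"
```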
## Template Values
The values file included in this helm chart is below.
!!! note
    This values file is from the development branch and as such may not reflect the current version you have deployed. If you require a specific version, head on over to the git repository and select the git tag that matches the version you are after.
``` yaml title="values.yaml" linenums="1"
--8<-- "values.yaml"
```

docs/tags.md Normal file

docs/task-doc-template.md Normal file

@@ -0,0 +1,87 @@
short summary of the task file
## {Task Name}
- **Name**:
- **Description**:
- **Module**:
- **Arguments**:
-
- **Conditional**:
- **Tags**:
-
## {Task Name}
- **Name**:
- **Description**:
- **Module**:
- **Arguments**:
-
- **Registers**:
- **Conditional**:
- **Tags**:
-
## Variables
The following variables can be customized in this task file:
```yaml
variable_name: "default_value"
```
- `variable_name`: Description of the variable.
## Tags
The tasks in this task file are tagged with the following tags:
-
## Usage
To use this Ansible task file, you can include it in your playbook or role and provide values for the required variables. Here's an example of how you can use this task file:
1. Create a playbook (e.g., `your_playbook.yaml`) and define the necessary variables:
```yaml
---
- hosts: your_hosts
  vars:
    variable_name: "value"
  tasks:
    - include_tasks: path/to/task_file.yaml
```
2. Create a separate file for the task file (e.g., `task_file.yaml`) and copy the content of the task file into it.
3. Run the playbook:
```shell
ansible-playbook your_playbook.yaml
```
Make sure to replace the placeholder values (`variable_name`, `value`) with the appropriate values for your setup.
Note: You may need to adjust the playbook structure and additional tasks based on your specific requirements and the tasks you want to execute.

files/dashboard-summary.json Normal file

File diff suppressed because it is too large.

gitlab-ci Submodule

Submodule gitlab-ci added at 224ef83157

mkdocs.yml Normal file

@@ -0,0 +1,30 @@
INHERIT: website-template/mkdocs.yml
docs_dir: 'docs'
repo_name: Kubernetes Monitoring Helm Chart
repo_url: https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring
edit_uri: '/-/ide/project/nofusscomputing/projects/kubernetes_monitoring/edit/development/-/docs/'
nav:
  - Home: index.md
  - Articles:
      - articles/index.md
  - Projects:
      - projects/index.md
      - Kubernetes Monitoring:
          - projects/kubernetes_monitoring/index.md
  - Operations:
      - operations/index.md
  - Contact Us: contact.md

@@ -0,0 +1,23 @@
{{ if false }}
# already on k3s
---
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
groupPriorityMinimum: 100
insecureSkipTLSVerify: true
service:
name: prometheus-adapter
namespace: monitoring
version: v1beta1
versionPriority: 100
{{ end }}

@@ -0,0 +1,38 @@
---
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.alert_manager.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: {{ $.Release.Name }}
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
spec:
image: "{{ .Values.nfc_monitoring.alert_manager.image.name }}:{{ .Values.nfc_monitoring.alert_manager.image.tag }}"
nodeSelector:
kubernetes.io/os: linux
podMetadata:
labels:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
replicas: {{ .Values.nfc_monitoring.alert_manager.replicas }}
resources:
limits:
cpu: 100m
memory: 100Mi
requests:
cpu: 4m
memory: 100Mi
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: alertmanager-main
version: 0.25.0

@@ -0,0 +1,74 @@
{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }}
---
apiVersion: kyverno.io/v1
kind: ClusterPolicy
metadata:
name: add-prometheus-role
annotations:
policies.kyverno.io/title: Add Prometheus Role
policies.kyverno.io/category: Monitoring
policies.kyverno.io/subject: RoleBinding
policies.kyverno.io/minversion: 1.6.0
policies.kyverno.io/description: >-
This policy is responsible for ensuring that a Role for the prometheus
monitoring instances is created to enable monitoring of the namespace in
question.
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
background: true
generateExisting: true
rules:
- name: generate-prometheus-role
match:
any:
- resources:
kinds:
- Namespace
generate:
synchronize: true
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
name: prometheus-{{ $.Release.Name }}
namespace: "{{ `{{` }}request.object.metadata.name }}"
data:
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 14 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
{{ end }}

@@ -0,0 +1,53 @@
{{ if .Values.nfc_monitoring.prometheus.kyverno_role_policy }}
---
apiVersion: kyverno.io/v1
kind: ClusterPolicy
metadata:
name: add-prometheus-role-binding
annotations:
policies.kyverno.io/title: Add Prometheus RoleBinding
policies.kyverno.io/category: Monitoring
policies.kyverno.io/subject: RoleBinding
policies.kyverno.io/minversion: 1.6.0
policies.kyverno.io/description: >-
This policy is responsible for ensuring that a RoleBinding for the prometheus
monitoring instances is created to enable monitoring of the namespace in
question.
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
background: true
generateExisting: true
rules:
- name: generate-prometheus-binding
match:
any:
- resources:
kinds:
- Namespace
generate:
synchronize: true
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
name: prometheus-{{ $.Release.Name }}
namespace: "{{ `{{` }}request.object.metadata.name }}"
data:
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 14 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-{{ $.Release.Name }}
subjects:
- kind: ServiceAccount
name: prometheus-{{ $.Release.Name }}
namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}"
{{ end }}

@@ -0,0 +1,41 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: grafana-agent
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
rules:
- apiGroups:
- ""
resources:
- nodes
- nodes/proxy
- services
- endpoints
- pods
- events
verbs:
- get
- list
- watch
- nonResourceURLs:
- /metrics
verbs:
- get
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create

@@ -0,0 +1,29 @@
{{ if false }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true"
name: system:aggregated-metrics-reader
namespace: monitoring
rules:
- apiGroups:
- metrics.k8s.io
resources:
- pods
- nodes
verbs:
- get
- list
- watch
# Already exists on k3s
{{ end }}

@@ -0,0 +1,20 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: resource-metrics:system:auth-delegator
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring

@@ -0,0 +1,19 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: hpa-controller-custom-metrics
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: custom-metrics-server-resources
subjects:
- kind: ServiceAccount
name: horizontal-pod-autoscaler
namespace: kube-system

@@ -0,0 +1,20 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-adapter
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-adapter
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring

@@ -0,0 +1,19 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: Grafana-Sidecar
rules:
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["get", "watch", "list"]
{{- end }}
{{- end }}

@@ -0,0 +1,24 @@
{{ if .Values.nfc_monitoring.kube_monitor_proxy.enabled }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-monitor-proxy
labels:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
rules:
- apiGroups: ["authentication.k8s.io"]
resources:
- tokenreviews
verbs: ["create"]
- apiGroups: ["authorization.k8s.io"]
resources:
- subjectaccessreviews
verbs: ["create"]
{{ end }}

@@ -0,0 +1,23 @@
{{ if .Values.nfc_monitoring.kube_state_metrics.enabled }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitoring
{{ end }}

@@ -0,0 +1,135 @@
{{ if .Values.nfc_monitoring.kube_state_metrics.enabled }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: kube-state-metrics
rules:
- apiGroups:
- ""
resources:
- configmaps
- secrets
- nodes
- pods
- services
- serviceaccounts
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs:
- list
- watch
- apiGroups:
- apps
resources:
- statefulsets
- daemonsets
- deployments
- replicasets
verbs:
- list
- watch
- apiGroups:
- batch
resources:
- cronjobs
- jobs
verbs:
- list
- watch
- apiGroups:
- autoscaling
resources:
- horizontalpodautoscalers
verbs:
- list
- watch
- apiGroups:
- authentication.k8s.io
resources:
- tokenreviews
verbs:
- create
- apiGroups:
- authorization.k8s.io
resources:
- subjectaccessreviews
verbs:
- create
- apiGroups:
- policy
resources:
- poddisruptionbudgets
verbs:
- list
- watch
- apiGroups:
- certificates.k8s.io
resources:
- certificatesigningrequests
verbs:
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- apiGroups:
- storage.k8s.io
resources:
- storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
- ingressclasses
- ingresses
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
- apiGroups:
- rbac.authorization.k8s.io
resources:
- clusterrolebindings
- clusterroles
- rolebindings
- roles
verbs:
- list
- watch
{{ end }}

@@ -0,0 +1,18 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: resource-metrics-server-resources
rules:
- apiGroups:
- metrics.k8s.io
resources:
- '*'
verbs:
- '*'

@@ -0,0 +1,23 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-adapter
rules:
- apiGroups:
- ""
resources:
- nodes
- namespaces
- pods
- services
verbs:
- get
- list
- watch

@@ -0,0 +1,22 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}
rules:
- apiGroups:
- ""
resources:
- nodes/metrics
verbs:
- get
- nonResourceURLs:
- /metrics
verbs:
- get

@@ -0,0 +1,19 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: grafana-agent
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: grafana-agent
subjects:
- kind: ServiceAccount
name: grafana-agent
namespace: monitoring

@@ -0,0 +1,23 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}}
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: Grafana-Sidecar
roleRef:
kind: ClusterRole
name: Grafana-Sidecar
apiGroup: rbac.authorization.k8s.io
subjects:
- kind: ServiceAccount
name: grafana
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
{{- end }}
{{- end }}

@@ -0,0 +1,20 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-monitor-proxy
labels:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-monitor-proxy
subjects:
- kind: ServiceAccount
name: kube-monitor-proxy
namespace: monitoring

@@ -0,0 +1,19 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-{{ $.Release.Name }}
subjects:
- kind: ServiceAccount
name: prometheus-{{ $.Release.Name }}
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}

@@ -0,0 +1,128 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-config
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
data:
prometheus.yaml: |-
{
"apiVersion": 1,
"datasources": [
{
"access":"proxy",
"editable": true,
"name": "prometheus",
"orgId": 1,
"type": "prometheus",
"url": "http://prometheus-config-map.monitoring.svc:9090",
"version": 1
}
]
}
oncall-app.yaml: |-
apiVersion: 1
apps:
- type: grafana-oncall-app
name: grafana-oncall-app
disabled: false
jsonData:
#user: admin
grafanaUrl: http://grafana.monitoring.svc:3000
onCallApiUrl: http://oncall.monitoring.svc:8080
secureJsonData:
#password: admin
onCallInvitationToken: yaD3a6dkYV0CFinTlMPb42Mi9
#http://127.0.0.1:8080/integrations/v1/alertmanager/yaD3a6dkYV0CFinTlMPb42Mi9/
---
apiVersion: v1
kind: ConfigMap
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: grafana-alertmanager
namespace: monitoring
data:
alertmanager.yaml: |-
{
"template_files": {},
"template_file_provenances": {},
"alertmanager_config": {
"route": {
"receiver": "Grafana Alerting 😊",
"group_by": [
"grafana_folder",
"alertname"
],
"routes": [
{
"receiver": "Grafana Alerting 😊",
"object_matchers": [
[
"severity",
"!=",
""
],
[
"severity",
"=",
"critical"
],
[
"severity",
"=",
"info"
],
[
"severity",
"=",
"warning"
]
],
"continue": true,
"group_wait": "1s",
"group_interval": "2s",
"repeat_interval": "15m"
}
],
"group_wait": "1s",
"group_interval": "2s",
"repeat_interval": "3m"
},
"templates": null,
"receivers": [
{
"name": "Grafana Alerting 😊",
"grafana_managed_receiver_configs": [
{
"uid": "4dqvJSfVkz",
"name": "Grafana Alerting 😊",
"type": "webhook",
"disableResolveMessage": false,
"settings": {
"httpMethod": "POST",
"url": "http://host.docker.internal:8080/integrations/v1/grafana_alerting/dXStYoK9Z15VZW8R8AtfyIwtu/"
},
"secureFields": {}
}
]
}
]
}
}
{{- end }}

@@ -0,0 +1,288 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: grafana-agent
namespace: monitoring
data:
agent.yaml: |
metrics:
wal_directory: /tmp/wal
{{ if .Values.nfc_monitoring.loki.config }}
logs:
positions_directory: "/tmp"
configs:
- name: node-logs
clients:
- url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push
backoff_config:
min_period: 10s
max_period: 5m
max_retries: 10
limits_config:
readline_rate_enabled: true
readline_rate: 250
readline_burst: 750
readline_rate_drop: false
max_streams: 500
scrape_configs:
- job_name: systemd-journal
journal:
path: /host/root/run/log/journal
json: true
max_age: 24h
relabel_configs:
- source_labels:
- __journal__systemd_unit
target_label: systemd_unit
- source_labels:
- __journal__hostname
target_label: node
- source_labels:
- __journal_syslog_identifier
target_label: syslog_identifier
- target_label: "job"
replacement: "systemd-journal"
- target_label: "job_name"
replacement: "journal-logs"
- target_label: hostname
replacement: "${HOSTNAME}"
pipeline_stages:
- json:
expressions:
pid: _PID
userId: _UID
application: _COMM
priority: PRIORITY
- labels:
application:
#level:
pid:
userId:
priority:
- template:
source: level
template: '{{"{{"}} ToLower .Value {{"}}"}}'
- match:
selector: '{priority="7"}'
stages:
- template:
source: level
template: 'debug'
- match:
selector: '{priority="6"}'
stages:
- template:
source: level
template: 'info'
- match:
selector: '{priority="5"}'
stages:
- template:
source: level
template: 'notice'
- match:
selector: '{priority="4"}'
stages:
- template:
source: level
template: 'warning'
- match:
selector: '{priority="3"}'
stages:
- template:
source: level
template: 'error'
- match:
selector: '{priority="2"}'
stages:
- template:
source: level
template: 'crit'
- match:
selector: '{priority="1"}'
stages:
- template:
source: level
template: 'alert'
- match:
selector: '{priority="0"}'
stages:
- template:
source: level
template: 'emerg'
- labels:
level:
- job_name: var-logs
static_configs:
- targets: [localhost]
labels:
job_name: 'aptitude'
__path__: /var/log/apt/*.log
- targets: [localhost]
labels:
job_name: 'dpkg'
__path__: /var/log/dpkg.log
- targets: [localhost]
labels:
job_name: 'messages'
__path__: /var/log/messages
- targets: [localhost]
labels:
job_name: 'syslog'
__path__: /var/log/syslog
- targets: [localhost]
labels:
job_name: 'kernlog'
__path__: /var/log/kern.log
- targets: [localhost]
labels:
job_name: 'auth-log'
__path__: /var/log/auth.log
- targets: [localhost]
labels:
job_name: 'boot-log'
__path__: /var/log/boot.log
- targets: [localhost]
labels:
job_name: 'daemon-log'
__path__: /var/log/daemon.log
- targets: [localhost]
labels:
job_name: 'faillog-log'
__path__: /var/log/faillog
- targets: [localhost]
labels:
job_name: 'lastlog-log'
__path__: /var/log/lastlog
relabel_configs:
- target_label: hostname
replacement: "${HOSTNAME}"
- target_label: job
replacement: log-files
- target_label: node
replacement: "${HOSTNAME}"
pipeline_stages:
- timestamp:
format: RFC3339
source: time
- job_name: pod-logs
kubernetes_sd_configs:
- role: pod
pipeline_stages:
- cri: {}
- regex:
expression: '(\s|\t|level=)?(?P<level>trace|debug|info|warn|error|TRACE|DEBUG|INFO|WARN|ERROR)(\s|\t)'
- labels:
level:
relabel_configs:
- source_labels:
- __meta_kubernetes_pod_node_name
target_label: __host__
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- target_label: job
replacement: kubernetes_sd
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: replace
source_labels:
- __meta_kubernetes_pod_container_name
target_label: container
- replacement: /var/log/pods/*$1/*.log
separator: /
source_labels:
- __meta_kubernetes_pod_uid
- __meta_kubernetes_pod_container_name
target_label: __path__
- target_label: "job_name"
replacement: "kubernetes-logs"
- target_label: hostname
replacement: "${HOSTNAME}"
- target_label: node
source_labels:
- __meta_kubernetes_pod_node_name
{{ end }}
integrations:
node_exporter:
enabled: true
rootfs_path: /host/root
sysfs_path: /host/sys
procfs_path: /host/proc
udev_data_path: /host/root/run/udev/data
# collector.filesystem.ignored-mount-points: ^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+|/run/containerd/io.containerd.+)($|/)
filesystem_mount_points_exclude: "^/(dev|proc|sys|var/lib/docker/.+|/run/containerd/io.containerd.+)($|/)"
#filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|shm|squashfs|sysfs|tracefs)$"
filesystem_fs_types_exclude: "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
netclass_ignored_devices: "^(veth.*|cali.*|[a-f0-9]{15})$"
netdev_device_exclude: "^(veth.*|cali.*|[a-f0-9]{15})$"
include_exporter_metrics: true
enable_collectors:
- uname
syslog_server.yaml: |
# REF: https://grafana.com/docs/loki/latest/send-data/promtail/configuration/#example-syslog-config
{{ if .Values.nfc_monitoring.loki.config }}
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/syslog_server_positions.yaml
clients:
- url: http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}/loki/api/v1/push
scrape_configs:
- job_name: syslog
syslog:
listen_address: 0.0.0.0:1514
labels:
job: "syslog"
relabel_configs:
- source_labels: ['__syslog_message_hostname']
target_label: 'host'
{{ end }}

@@ -0,0 +1,32 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled -}}
---
# Provisioning config
apiVersion: v1
kind: ConfigMap
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: provisioning-config
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
data:
provisioning.yaml: |-
apiVersion: 1
providers:
- name: 'configmap-dashboard-provider'
orgId: 1
folder: ''
folderUid: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: false
options:
path: /var/lib/grafana/dashboards
foldersFromFilesStructure: true
{{- end }}
{{- end }}

@@ -0,0 +1,71 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: adapter-config
namespace: monitoring
data:
config.yaml: |-
"resourceRules":
"cpu":
"containerLabel": "container"
"containerQuery": |
sum by (<<.GroupBy>>) (
irate (
container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="",pod!=""}[120s]
)
)
"nodeQuery": |
sum by (<<.GroupBy>>) (
1 - irate(
node_cpu_seconds_total{mode="idle"}[60s]
)
* on(namespace, pod) group_left(node) (
node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}
)
)
or sum by (<<.GroupBy>>) (
1 - irate(
windows_cpu_time_total{mode="idle", job="windows-exporter",<<.LabelMatchers>>}[4m]
)
)
"resources":
"overrides":
"namespace":
"resource": "namespace"
"node":
"resource": "node"
"pod":
"resource": "pod"
"memory":
"containerLabel": "container"
"containerQuery": |
sum by (<<.GroupBy>>) (
container_memory_working_set_bytes{<<.LabelMatchers>>,container!="",pod!=""}
)
"nodeQuery": |
sum by (<<.GroupBy>>) (
node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>}
-
node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}
)
or sum by (<<.GroupBy>>) (
windows_cs_physical_memory_bytes{job="windows-exporter",<<.LabelMatchers>>}
-
windows_memory_available_bytes{job="windows-exporter",<<.LabelMatchers>>}
)
"resources":
"overrides":
"instance":
"resource": "node"
"namespace":
"resource": "namespace"
"pod":
"resource": "pod"
"window": "5m"

@@ -0,0 +1,135 @@
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
name: grafana-agent
namespace: "{{ .Values.nfc_monitoring.grafana_agent.namespace }}"
spec:
selector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
template:
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 8 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
metricsJob: node-exporter
cadvisormetricsJob: cadvisor
nodeExportermetricsJob: node
spec:
automountServiceAccountToken: true
containers:
- args:
- --server.http.address=0.0.0.0:12345
- --config.file=/etc/agent/agent.yaml
- --config.expand-env=true
name: grafana-agent
image: "{{ .Values.nfc_monitoring.grafana_agent.image.name }}:{{ .Values.nfc_monitoring.grafana_agent.image.tag }}"
#imagePullPolicy: Never
ports:
- containerPort: 12345
name: grafana-metrics
protocol: TCP
resources:
limits:
cpu: 1000m
memory: 180Mi
requests:
cpu: 40m
memory: 180Mi
securityContext:
capabilities:
add:
- SYS_TIME
# drop:
# - ALL
readOnlyRootFilesystem: false
privileged: true
volumeMounts:
- mountPath: /host/sys
mountPropagation: HostToContainer
name: sys
readOnly: true
- mountPath: /host/proc
mountPropagation: HostToContainer
name: proc
readOnly: true
- mountPath: /host/root
mountPropagation: HostToContainer
name: rootfs
readOnly: true
- mountPath: /var/log
mountPropagation: HostToContainer
name: logs
readOnly: true
- name: config
mountPath: "/etc/agent"
readOnly: false
- name: temp
mountPath: "/tmp"
readOnly: false
- name: agent-data
mountPath: "/etc/agent/data"
readOnly: false
dnsPolicy: ClusterFirst
volumes:
- hostPath:
path: /sys
name: sys
- hostPath:
path: /proc
name: proc
- hostPath:
path: /
name: rootfs
- hostPath:
path: /var/log
name: logs
- name: config
configMap:
name: grafana-agent
items:
- key: "agent.yaml"
path: "agent.yaml"
- name: temp
emptyDir: {}
- name: agent-data
emptyDir: {}
- name: var-run
hostPath:
path: /var/run
- name: containerd
hostPath:
path: /var/lib/containerd
- name: disk
hostPath:
path: /dev/disk
nodeSelector:
kubernetes.io/os: linux
hostNetwork: true
hostPID: true
priorityClassName: system-cluster-critical
serviceAccountName: grafana-agent
tolerations:
- operator: Exists

@@ -0,0 +1,139 @@
{{ if .Values.nfc_monitoring.kube_monitor_proxy.enabled }}
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
labels:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
nodeExportermetricsJob: node
name_kcm: kube-controller-manager
name: kube-monitor-proxy
namespace: "{{ .Values.nfc_monitoring.kube_monitor_proxy.namespace }}"
spec:
selector:
matchLabels:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
template:
metadata:
labels:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
automountServiceAccountToken: true
hostNetwork: true
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
containers:
- args:
- --logtostderr
- --v=10
- --secure-listen-address=[$(IP)]:11257
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=https://127.0.0.1:10257/
- --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
env:
- name: NODE-IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.hostIP
- name: IP
valueFrom:
fieldRef:
fieldPath: status.podIP
image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}"
name: kube-rbac-proxy-kube-ctrl-mgr
ports:
- containerPort: 10257
#hostPort: 9100
name: kube-ctrl-mgr
resources:
limits:
cpu: 200m
memory: 256Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
- args:
- --logtostderr
- --v=10
- --secure-listen-address=[$(IP)]:11259
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=https://127.0.0.1:10259/
- --client-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- --upstream-ca-file=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt
env:
- name: NODE-IP
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: status.hostIP
- name: IP
valueFrom:
fieldRef:
fieldPath: status.podIP
image: "{{ .Values.nfc_monitoring.kube_rbac_proxy.image.name }}:{{ .Values.nfc_monitoring.kube_rbac_proxy.image.tag }}"
name: kube-rbac-proxy-kube-scheduler
ports:
- containerPort: 10259
#hostPort: 9100
name: kube-scheduler
resources:
limits:
cpu: 200m
memory: 256Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
nodeSelector:
kubernetes.io/os: linux
hostPID: true
priorityClassName: system-cluster-critical
serviceAccountName: kube-monitor-proxy
tolerations:
- operator: Exists
{{ end }}

@@ -0,0 +1,115 @@
{{ if .Values.nfc_monitoring.kube_state_metrics.enabled }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: kube-state-metrics
namespace: "{{ .Values.nfc_monitoring.kube_state_metrics.namespace }}"
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: kube-state-metrics
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
automountServiceAccountToken: true
containers:
- args:
- --host=127.0.0.1
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: "{{ .Values.nfc_monitoring.kube_state_metrics.image.name }}:{{ .Values.nfc_monitoring.kube_state_metrics.image.tag }}"
name: kube-state-metrics
resources:
limits:
cpu: 200m
memory: 250Mi
requests:
cpu: 10m
memory: 190Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsUser: 65534
- args:
- --logtostderr
- --secure-listen-address=:8443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8081/
image: quay.io/brancz/kube-rbac-proxy:v0.14.0
name: kube-rbac-proxy-main
ports:
- containerPort: 8443
name: https-main
resources:
limits:
cpu: 100m
memory: 40Mi
requests:
cpu: 50m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
- args:
- --logtostderr
- --secure-listen-address=:9443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305
- --upstream=http://127.0.0.1:8082/
image: quay.io/brancz/kube-rbac-proxy:v0.14.0
name: kube-rbac-proxy-self
ports:
- containerPort: 9443
name: https-self
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsGroup: 65532
runAsNonRoot: true
runAsUser: 65532
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: kube-state-metrics
{{ end }}

@@ -0,0 +1,99 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-adapter
namespace: "{{ .Values.nfc_monitoring.prometheus_adaptor.namespace }}"
spec:
replicas: 2
selector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
strategy:
rollingUpdate:
maxSurge: 1
maxUnavailable: 1
template:
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 8 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
affinity:
{{- toYaml .Values.nfc_monitoring.prometheus_adaptor.affinity | nindent 8 }}
automountServiceAccountToken: true
containers:
- args:
- --cert-dir=/var/run/serving-cert
- --config=/etc/adapter/config.yaml
- --metrics-relist-interval=1m
- --prometheus-url=https://prometheus.monitoring.svc:9090/
- --secure-port=6443
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,TLS_RSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_256_GCM_SHA384,TLS_RSA_WITH_AES_128_CBC_SHA,TLS_RSA_WITH_AES_256_CBC_SHA
image: "{{ .Values.nfc_monitoring.prometheus_adaptor.image.name }}:{{ .Values.nfc_monitoring.prometheus_adaptor.image.tag }}"
livenessProbe:
failureThreshold: 5
httpGet:
path: /livez
port: https
scheme: HTTPS
initialDelaySeconds: 30
periodSeconds: 5
name: prometheus-adapter
ports:
- containerPort: 6443
name: https
readinessProbe:
failureThreshold: 5
httpGet:
path: /readyz
port: https
scheme: HTTPS
initialDelaySeconds: 30
periodSeconds: 5
resources:
requests:
cpu: 102m
memory: 180Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
terminationMessagePolicy: FallbackToLogsOnError
volumeMounts:
- mountPath: /tmp
name: tmpfs
readOnly: false
- mountPath: /var/run/serving-cert
name: volume-serving-cert
readOnly: false
- mountPath: /etc/adapter
name: config
readOnly: false
nodeSelector:
kubernetes.io/os: linux
securityContext: {}
serviceAccountName: prometheus-adapter
volumes:
- emptyDir: {}
name: tmpfs
- emptyDir: {}
name: volume-serving-cert
- configMap:
name: adapter-config
name: config

@@ -0,0 +1,156 @@
---
{{ if .Values.nfc_monitoring.grafana.enabled -}}
apiVersion: grafana.integreatly.org/v1beta1
kind: Grafana
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: grafana
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
spec:
config:
{{ toYaml $.Values.nfc_monitoring.grafana.config | nindent 4 }}
deployment:
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 8 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
replicas: {{ .Values.nfc_monitoring.grafana.replicas | int }}
selector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 10 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
strategy:
rollingUpdate:
maxSurge: 1
maxUnavailable: 1
template:
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 12 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
affinity:
{{- toYaml .Values.nfc_monitoring.grafana.affinity | nindent 12 }}
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }}
automountServiceAccountToken: true
{{ end }}
containers:
- name: grafana
image: "{{ .Values.nfc_monitoring.grafana.image.name }}:{{ .Values.nfc_monitoring.grafana.image.tag }}"
env:
- name: GF_INSTALL_PLUGINS
value: agenty-flowcharting-panel 0.9.1, ddurieux-glpi-app 1.3.1, grafana-oncall-app
- name: GF_PLUGINS_ALLOW_LOADING_UNSIGNED_PLUGINS
value: grafana-oncall-app 1.1.38
- name: GF_ANALYTICS_REPORTING_ENABLED
value: 'false'
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
name: grafana-http
readinessProbe:
failureThreshold: 3
httpGet:
path: /login
port: 3000
scheme: HTTP
initialDelaySeconds: 30
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 3
resources:
limits:
cpu: 2000m
memory: 1Gi
requests:
cpu: 100m
memory: 100Mi
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: false
volumeMounts:
- mountPath: /var/lib/grafana
name: grafana-storage
- mountPath: /etc/grafana/provisioning/plugins
name: plugin-config
readOnly: false
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }}
- mountPath: /etc/grafana/provisioning/dashboards
name: provisioning-config
- mountPath: /var/lib/grafana/dashboards
name: dashboards
- image: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.name }}:{{ .Values.nfc_monitoring.additions.dashboard_sidecar.image.tag}}"
name: sidecar
env:
- name: LABEL
value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_name }}"
- name: LABEL_VALUE
value: "{{ .Values.nfc_monitoring.additions.dashboard_sidecar.label_value }}"
- name: FOLDER
value: /var/lib/grafana/dashboards
- name: NAMESPACE
value: grafana
- name: RESOURCE
value: configmap
volumeMounts:
- mountPath: /var/lib/grafana/dashboards
name: dashboards
{{ end }}
securityContext:
fsGroup: 65534
volumes:
#- name: grafana-storage
- name: plugin-config
configMap:
# Provide the name of the ConfigMap you want to mount.
name: grafana-config
# An array of keys from the ConfigMap to create as files
items:
- key: "oncall-app.yaml"
path: "oncall-app.yaml"
{{ if .Values.nfc_monitoring.additions.dashboard_sidecar.enabled }}
- name: dashboards
emptyDir: {}
- name: provisioning-config
configMap:
name: provisioning-config
{{ end }}
- name: grafana-storage
persistentVolumeClaim:
claimName: grafana-pvc
serviceAccountName: grafana
nodeSelector:
kubernetes.io/os: linux
persistentVolumeClaim:
metadata:
annotations:
pv.beta.kubernetes.io/gid: "65534"
labels:
app.kubernetes.io/name: grafana
app.kubernetes.io/component: graphing
app.kubernetes.io/part-of: {{ $.Chart.Name }}
spec:
accessModes:
- {{ .Values.nfc_monitoring.grafana.storage_accessModes | default "ReadWriteOnce" }}
resources:
requests:
storage: "5Gi"
{{- end }}

@@ -0,0 +1,21 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: alertmanager
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
allowCrossNamespaceImport: true
folder: No Fuss Monitoring
resyncPeriod: 1d
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
grafanaCom:
id: 9578
revision: 4 # as @ 19-09-23
{{- end }}

@@ -0,0 +1,23 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
---
{{- if .Values.nfc_monitoring.additions.ceph.enabled | default false -}}
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: ceph
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
allowCrossNamespaceImport: true
folder: No Fuss Monitoring
resyncPeriod: 1d
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
grafanaCom:
id: 2842
revision: 17 # as @ 19-09-23
{{- end -}}
{{- end }}

@@ -0,0 +1,25 @@
---
{{ if .Values.nfc_monitoring.grafana.enabled -}}
{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" }}
{{- if .Values.nfc_monitoring.prometheus.service_monitor.calico }}
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: calico-felix
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
allowCrossNamespaceImport: true
folder: No Fuss Monitoring
resyncPeriod: 1d
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
grafanaCom:
id: 12175
revision: 5 # as @ 2020-05-04T16:47:08
{{- end }}
{{ end }}
{{ end }}

@@ -0,0 +1,21 @@
---
{{ if .Values.nfc_monitoring.grafana.dashboards.cert_manager | default false -}}
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: cert-manager
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
allowCrossNamespaceImport: true
folder: No Fuss Monitoring
resyncPeriod: 1d
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
grafanaCom:
id: 11001
revision: 1 # as @ 2019-10-16T13:48:56
{{- end }}

@@ -0,0 +1,21 @@
---
{{ if .Values.nfc_monitoring.grafana.enabled -}}
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: cluster-summary
namespace: grafana
spec:
allowCrossNamespaceImport: false
folder: No Fuss Monitoring
resyncPeriod: 30s
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
{{ $Dashboard := .Files.Get "files/dashboard-summary.json" | fromJson }}
json: >-
{{ $Dashboard | toRawJson }}
{{- end }}

@@ -0,0 +1,21 @@
---
{{ if .Values.nfc_monitoring.grafana.enabled -}}
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDashboard
metadata:
name: node-exporter
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
allowCrossNamespaceImport: true
folder: No Fuss Monitoring
resyncPeriod: 1d
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
grafanaCom:
id: 1860
revision: 32 # as @ 19-09-23
{{- end }}

@@ -0,0 +1,28 @@
---
{{ if .Values.nfc_monitoring.grafana.enabled -}}
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasourceList
items:
{{ range .Values.nfc_monitoring.grafana.DataSources }}
- apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasource
metadata:
name: {{ .name }}
namespace: "{{ $.Values.nfc_monitoring.grafana.namespace }}"
labels:
app.kubernetes.io/component: dashboard
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
instanceSelector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 10 }}
allowCrossNamespaceImport: true
datasource:
{{ toYaml . | nindent 8 }}
{{ end }}
{{- end }}
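
The GrafanaDatasourceList above is rendered by ranging over .Values.nfc_monitoring.grafana.DataSources, so each entry supplies a name plus the datasource body that is passed straight through toYaml. A hedged values.yaml sketch of what such an entry could look like (the types, URLs and the Loki entry are illustrative assumptions):

nfc_monitoring:
  grafana:
    DataSources:
      - name: prometheus
        type: prometheus
        access: proxy
        url: http://prometheus-operated.monitoring.svc:9090   # assumption
        isDefault: true
      - name: loki                                            # assumption
        type: loki
        access: proxy
        url: http://loki-gateway.logging.svc:80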

View File

@ -0,0 +1,85 @@
{{ if .Values.nfc_monitoring.alert_manager.ingress.enabled -}}
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alertmanager
namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
annotations:
{{ toYaml $.Values.nfc_monitoring.alert_manager.ingress.annotations | nindent 4 }}
spec:
tls:
- hosts:
- {{ .Values.nfc_monitoring.alert_manager.ingress.hostname }}
secretName: certificate-tls-alert-manager
rules:
- host: {{ .Values.nfc_monitoring.alert_manager.ingress.hostname }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: alertmanager-{{ $.Release.Name }}
port:
name: web
{{ end }}
{{ if .Values.nfc_monitoring.grafana.enabled -}}
{{ if .Values.nfc_monitoring.grafana.ingress.enabled -}}
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
annotations:
{{ toYaml $.Values.nfc_monitoring.grafana.ingress.annotations | nindent 4 }}
spec:
tls:
- hosts:
- {{ .Values.nfc_monitoring.grafana.ingress.hostname }}
secretName: certificate-tls-grafana
rules:
- host: {{ .Values.nfc_monitoring.grafana.ingress.hostname }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: grafana
port:
name: grafana-http
{{ end }}
{{ end }}
{{ if .Values.nfc_monitoring.prometheus.ingress.enabled | default false -}}
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}"
annotations:
{{ toYaml $.Values.nfc_monitoring.prometheus.ingress.annotations | nindent 4 }}
spec:
tls:
- hosts:
- {{ .Values.nfc_monitoring.prometheus.ingress.hostname }}
secretName: certificate-tls-prometheus
rules:
- host: {{ .Values.nfc_monitoring.prometheus.ingress.hostname }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: prometheus-{{ $.Release.Name }}
port:
name: web
{{ end }}
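
Each of the three Ingress blocks above is driven by the same trio of values: an enabled flag, a hostname that is reused for both the TLS section and the host rule, and a free-form annotations map. A hedged values.yaml sketch (the hostnames and the cert-manager annotation are illustrative assumptions):

nfc_monitoring:
  alert_manager:
    ingress:
      enabled: true
      hostname: alertmanager.example.org
      annotations:
        cert-manager.io/cluster-issuer: selfsigned   # assumption
  grafana:
    ingress:
      enabled: true
      hostname: grafana.example.org
      annotations: {}
  prometheus:
    ingress:
      enabled: false
      hostname: prometheus.example.org
      annotations: {}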

View File

@ -0,0 +1,37 @@
{{ if .Values.nfc_monitoring.kube_state_metrics.enabled }}
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.8.1
name: kube-state-metrics
namespace: monitoring
spec:
egress:
- {}
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
ports:
- port: 8443
protocol: TCP
- port: 9443
protocol: TCP
podSelector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: kube-prometheus
policyTypes:
- Egress
- Ingress
{{ end }}

View File

@ -0,0 +1,313 @@
{{- if .Values.nfc_monitoring.network_policy.enabled -}}
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
{{ toYaml (get $.Values.nfc_monitoring .name ).labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: {{ .name | replace "_" "-" }}
namespace: {{ (get $.Values.nfc_monitoring .name ).namespace }}
spec:
- name: prometheus
policy:
egress: # TODO: add further egress restrictions. Is a variable lookup possible to obtain the values?
# - {}
- to: # Alert Manager
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: alerting
podSelector:
matchLabels:
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
ports:
- port: 9093
protocol: TCP
- to: # Ceph
- ipBlock:
cidr: 172.16.10.0/24
ports:
- port: 9283
protocol: TCP
- to: # Grafana
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: grafana
podSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: grafana
ports:
- port: 3000
protocol: TCP
- to: # Grafana Agent
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
podSelector:
matchLabels:
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/component: exporter
app.kubernetes.io/name: grafana-agent
ports:
- port: 12345
protocol: TCP
- to: # Kube DNS
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- port: 53
protocol: TCP
- port: 53
protocol: UDP
- to:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
# namespaceSelector:
# matchLabels:
# kubernetes.io/metadata.name: monitoring
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: metrics
ports: []
- {} # TODO: temporary allow-all rule. This MUST be removed once egress has been refactored.
ingress:
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus
# namespaceSelector:
# matchLabels:
# kubernetes.io/metadata.name: monitoring
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: metrics
ports: []
# - port: 8080
# protocol: TCP
# - port: 9090
# protocol: TCP
# - port: 10901
# protocol: TCP
- from:
- podSelector:
matchLabels:
app.kubernetes.io/name: grafana
namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: grafana
- podSelector:
matchLabels:
app.kubernetes.io/name: prometheus-adapter
namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
ports:
- port: 9090
protocol: TCP
- from: []
ports: []
policyTypes:
- Egress
- Ingress
podSelector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
{{ toYaml (get $.Values.nfc_monitoring .name ).labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: {{ .name | replace "_" "-" }}
namespace: {{ (get $.Values.nfc_monitoring .name ).namespace }}
spec:
- name: grafana
policy:
egress:
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: alerting
podSelector:
matchLabels:
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/component: alert-router
app.kubernetes.io/name: alertmanager
ports:
- port: 9093
protocol: TCP
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: logging
podSelector:
matchLabels:
app.kubernetes.io/component: gateway
app.kubernetes.io/instance: loki
app.kubernetes.io/name: loki
ports:
- port: 80 # Service Port
protocol: TCP
- port: 8080 # Pod Port
protocol: TCP
- to:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
podSelector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: prometheus
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: metrics
podSelector:
matchLabels:
app.kubernetes.io/component: query-layer
app.kubernetes.io/instance: thanos-query
app.kubernetes.io/name: thanos-query
ports:
- port: 9090
protocol: TCP
- to: [] # Requires internet access for plugins and dashboard downloading
ports:
- port: 443
protocol: TCP
- to: # Kube DNS
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- port: 53
protocol: TCP
- port: 53
protocol: UDP
ingress:
- from: []
ports:
- port: 3000
protocol: TCP
policyTypes:
- Egress
- Ingress
podSelector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 8 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
labels:
{{ toYaml (get $.Values.nfc_monitoring .name ).labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: {{ .name | replace "_" "-" }}
namespace: {{ (get $.Values.nfc_monitoring .name ).namespace }}
spec:
- name: grafana_agent
policy:
egress:
- to: # Logging
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: logging
podSelector:
matchLabels:
app.kubernetes.io/component: gateway
app.kubernetes.io/instance: loki
app.kubernetes.io/name: loki
ports:
- port: 80
protocol: TCP
- to: # Kube DNS
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: kube-system
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- port: 53
protocol: TCP
- port: 53
protocol: UDP
ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: monitoring
podSelector:
matchLabels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
ports:
- port: 12345
protocol: TCP
policyTypes:
- Egress
- Ingress
podSelector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 8 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
{{ end }}
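
The policy template above derives each NetworkPolicy's name, namespace and labels from the matching component block under .Values.nfc_monitoring (via get $.Values.nfc_monitoring .name), so every component entry must carry a namespace and a labels map for rendering to succeed. A hedged values.yaml sketch of the minimum this template appears to rely on (the exact key layout is an assumption):

nfc_monitoring:
  network_policy:
    enabled: true
  prometheus:
    namespace: monitoring
    labels:
      app.kubernetes.io/component: prometheus
      app.kubernetes.io/name: prometheus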

View File

@ -0,0 +1,19 @@
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.alert_manager.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: alertmanager-main
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
spec:
maxUnavailable: 1
selector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.alert_manager.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}

View File

@ -0,0 +1,19 @@
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-adapter
namespace: monitoring
spec:
minAvailable: 1
selector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}

View File

@ -0,0 +1,18 @@
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
minAvailable: 1
selector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}

View File

@ -0,0 +1,68 @@
---
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: {{ $.Release.Name }}
namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}"
spec:
affinity:
{{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }}
alerting:
alertmanagers:
- apiVersion: v2
name: alertmanager-{{ $.Release.Name }}
namespace: "{{ .Values.nfc_monitoring.alert_manager.namespace }}"
port: web
enableFeatures: []
externalLabels: {}
image: {{ .Values.nfc_monitoring.prometheus.image.name }}:{{ .Values.nfc_monitoring.prometheus.image.tag }}
nodeSelector:
kubernetes.io/os: linux
podMetadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
podMonitorNamespaceSelector: {}
podMonitorSelector: {}
probeNamespaceSelector: {}
probeSelector: {}
{{ if .Values.nfc_monitoring.prometheus.remotewrite }}
remoteWrite: {{ .Values.nfc_monitoring.prometheus.remotewrite | toYaml | nindent 4 }}
{{- end }}
replicas: {{ $.Values.nfc_monitoring.prometheus.replicas }}
resources:
requests:
memory: 400Mi
ruleNamespaceSelector: {}
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 1000
serviceAccountName: prometheus-{{ $.Release.Name }}
serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {}
storage:
{{- toYaml .Values.nfc_monitoring.prometheus.storage | nindent 4 }}
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
{{ if .Values.nfc_monitoring.thanos.sidecar.config }}
thanos:
image: "{{ .Values.nfc_monitoring.thanos.image.name }}:{{ .Values.nfc_monitoring.thanos.image.tag }}"
objectStorageConfig:
key: thanos.yaml
name: thanos-sidecar-config
{{ end }}
{{ end }}
version: 2.42.0
{{ if .Values.nfc_monitoring.prometheus.additional }}
{{ toYaml .Values.nfc_monitoring.prometheus.additional | nindent 2 }}
{{ end }}
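
The Prometheus custom resource above pulls almost everything from the values file: image coordinates, replica count, affinity, storage, optional remoteWrite targets, and a free-form additional map merged into the spec. A hedged values.yaml sketch matching the fields the template consumes (the image name and storage size are illustrative assumptions; the tag is aligned with the pinned version 2.42.0):

nfc_monitoring:
  prometheus:
    namespace: monitoring
    replicas: 2
    image:
      name: quay.io/prometheus/prometheus   # assumption
      tag: v2.42.0
    affinity: {}
    storage:
      volumeClaimTemplate:
        spec:
          resources:
            requests:
              storage: 10Gi                 # assumption
    remotewrite: []
    additional: {}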

View File

@ -0,0 +1,140 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.alert_manager.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: alertmanager-main-rules
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerFailedReload
annotations:
description: Configuration has failed to load for {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload
summary: Reloading an Alertmanager configuration has failed.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: AlertmanagerMembersInconsistent
annotations:
description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} has only found {{ `{{` }} $value }} members of the {{ `{{` }}$labels.job}} cluster.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent
summary: A member of an Alertmanager cluster has not found all other cluster members.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
< on (namespace,service) group_left
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]))
for: 15m
labels:
severity: critical
- alert: AlertmanagerFailedToSendAlerts
annotations:
description: Alertmanager {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod}} failed to send {{ `{{` }} $value | humanizePercentage }} of notifications to {{ `{{` }} $labels.integration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts
summary: An Alertmanager instance failed to send notifications.
expr: |
(
rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration=~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterFailedToSendAlerts
annotations:
description: The minimum notification failure rate to {{ `{{` }} $labels.integration }} sent from any instance in the {{ `{{` }}$labels.job}} cluster is {{ `{{` }} $value | humanizePercentage }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts
summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
expr: |
min by (namespace,service, integration) (
rate(alertmanager_notifications_failed_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m])
/
rate(alertmanager_notifications_total{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring", integration!~`.*`}[5m])
)
> 0.01
for: 5m
labels:
severity: warning
- alert: AlertmanagerConfigInconsistent
annotations:
description: Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have different configurations.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent
summary: Alertmanager instances within the same cluster have different configurations.
expr: |
count by (namespace,service) (
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"})
)
!= 1
for: 20m
labels:
severity: critical
- alert: AlertmanagerClusterDown
annotations:
description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have been up for less than half of the last 5m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown
summary: Half or more of the Alertmanager instances within the same cluster are down.
expr: |
(
count by (namespace,service) (
avg_over_time(up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[5m]) < 0.5
)
/
count by (namespace,service) (
up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}
)
)
>= 0.5
for: 5m
labels:
severity: critical
- alert: AlertmanagerClusterCrashlooping
annotations:
description: '{{ `{{` }} $value | humanizePercentage }} of Alertmanager instances within the {{ `{{` }}$labels.job}} cluster have restarted at least 5 times in the last 10m.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping
summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
expr: |
(
count by (namespace,service) (
changes(process_start_time_seconds{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}[10m]) > 4
)
/
count by (namespace,service) (
up{job="alertmanager-{{ $.Release.Name }}",namespace="monitoring"}
)
)
>= 0.5
for: 5m
labels:
severity: critical

View File

@ -0,0 +1,48 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-prometheus
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: common
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: common.rules
rules:
- alert: Watchdog
annotations:
description: |
This alert is meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example, the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager is working properly.
expr: vector(1)
labels:
severity: none
- alert: InfoInhibitor
annotations:
description: |
This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
other alerts.
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
severity of 'warning' or 'critical' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on(namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
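
The Watchdog and InfoInhibitor alerts above only do their job if Alertmanager is configured to route them and to use InfoInhibitor for inhibition. A hedged sketch of the matching Alertmanager configuration, following the upstream kube-prometheus pattern (the receiver names are assumptions):

route:
  routes:
    - matchers: [ 'alertname = "Watchdog"' ]
      receiver: watchdog     # wire this to a dead man's switch such as DeadMansSnitch
    - matchers: [ 'alertname = "InfoInhibitor"' ]
      receiver: "null"
receivers:
  - name: watchdog
  - name: "null"
inhibit_rules:
  - source_matchers: [ 'alertname = "InfoInhibitor"' ]
    target_matchers: [ 'severity = "info"' ]
    equal: [ "namespace" ]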

View File

@ -0,0 +1,107 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: grafana-agent
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: grafana_agent
rules:
# - annotations:
# description: "As Grafana Agent is being used, its version is exposed as promtail's"
- expr: |
agent_build_info
record: promtail_build_info
- name: promtail_rules
rules:
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
by (job)
record: job:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
record: job:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
record: job:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
record: job:promtail_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
/ sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
- name: promtail_alerts
rules:
- alert: PromtailRequestsErrors
annotations:
message: |
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
/
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
> 10
for: 15m
labels:
severity: critical
- alert: PromtailRequestLatency
annotations:
message: |
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
expr: |
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
for: 15m
labels:
severity: critical
- alert: PromtailFileMissing
annotations:
message: |
{{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
expr: |
promtail_file_bytes_total unless promtail_read_bytes_total
for: 15m
labels:
severity: warning

View File

@ -0,0 +1,38 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: grafana-rules
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: GrafanaAlerts
rules:
- alert: GrafanaRequestsFailing
annotations:
message: '{{`{{`}} $labels.namespace }}/{{`{{`}} $labels.job }}/{{`{{`}} $labels.handler }} is experiencing {{`{{`}} $value | humanize }}% errors'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing
expr: |
100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
/ ignoring (status_code)
sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
> 50
for: 5m
labels:
severity: warning
- name: grafana_rules
rules:
- expr: |
sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m
{{- end }}

View File

@ -0,0 +1,63 @@
{{ if .Values.nfc_monitoring.kube_monitor_proxy.enabled }}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-prometheus
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: kube-prometheus-rules
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{ `{{` }} printf "%.4g" $value }}% of the {{ `{{` }} $labels.job }}/{{ `{{` }} $labels.service }} targets in {{ `{{` }} $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, namespace, service)) > 10
for: 10m
labels:
severity: warning
- name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: Network interface "{{ `{{` }} $labels.device }}" changing its up status often on node-exporter {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.pod }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
summary: Network interface is often changing its status
expr: |
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning
- name: kube-prometheus-node-recording.rules
rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:
- expr: count without(instance, pod, node) (up == 1)
record: count:up1
- expr: count without(instance, pod, node) (up == 0)
record: count:up0
{{ end }}

View File

@ -0,0 +1,71 @@
{{ if .Values.nfc_monitoring.kube_state_metrics.enabled }}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: kube-state-metrics-rules
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: |
stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) != 0
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) - 1
-
sum( 2 ^ max by (shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) )
!= 0
for: 15m
labels:
severity: critical
{{ end }}

File diff suppressed because it is too large

View File

@ -0,0 +1,115 @@
---
{{- if .Values.nfc_monitoring.loki.enabled | default false -}}
{{ if .Values.nfc_monitoring.loki.config }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: logging
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: loki
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: loki
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: loki_rules
rules:
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job))
record: cluster_job:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
by (cluster, job)
record: cluster_job:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
record: cluster_job:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
record: cluster_job:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, job, route))
record: cluster_job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
/ sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
route)
record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, cluster, namespace, job, route))
record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
namespace, job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds:avg
- expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
job, route)
record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate
- name: loki_alerts
rules:
- alert: LokiRequestErrors
annotations:
message: |
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
message: |
{{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
message: |
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
expr: |
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
for: 15m
labels:
severity: critical
- alert: LokiTooManyCompactorsRunning
annotations:
message: |
{{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
for: 5m
labels:
severity: warning
{{ end }}
{{- end -}}

View File

@ -0,0 +1,329 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: node-exporter-rules
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: node-exporter
rules:
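# NOTE: `lookup` queries the live API server; under `helm template` or `--dry-run`
# it returns an empty result, so these per-node absent() alerts are only rendered
# during an actual install or upgrade.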
{{ range $index, $node := (lookup "v1" "Node" "" "").items }}
- alert: NodeExporterJobMissing-{{ $node.metadata.name }}
annotations:
summary: Node Exporter job missing for node {{ $node.metadata.name }}. (instance {{ `{{` }} $labels.instance }})
description: "Node Exporter job has disappeared\n Node = {{ $node.metadata.name }}\n Value = {{ `{{` }} $value }}\n LABELS = {{ `{{` }} $labels }}"
expr: absent(up{job="node-exporter", node="{{ $node.metadata.name }}"})
for: 0m
labels:
severity: critical
{{ end }}
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ `{{` }} $labels.device }} at {{ `{{` }} $labels.instance }} has only {{ `{{` }} printf "%.2f" $value }}% available inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ `{{` }} $labels.instance }} interface {{ `{{` }} $labels.device }} has encountered {{ `{{` }} printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ `{{` }} $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
summary: Number of conntrack entries is getting close to the limit.
expr: |
(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector failed to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: |
node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock on {{ `{{` }} $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
expr: |
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
node_timex_offset_seconds{job="node-exporter"} < -0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock on {{ `{{` }} $labels.instance }} is not synchronising. Ensure NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ `{{` }} $labels.device }}' on {{ `{{` }} $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded
expr: |
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array on {{ `{{` }} $labels.instance }} failed. Array '{{ `{{` }} $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array
expr: |
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
)
for: 15m
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ `{{` }} $labels.instance }} is currently at {{ `{{` }} printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
)
for: 15m
labels:
severity: critical
- name: node-exporter.rules
rules:
- expr: |
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
record: instance:node_cpu_utilisation:rate5m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate5m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate5m

View File

@ -0,0 +1,281 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: prometheus-{{ $.Release.Name }}-prometheus-rules
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: prometheus
rules:
- alert: PrometheusBadConfig
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to reload its configuration.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
summary: Failed Prometheus configuration reload.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{job="prometheus",namespace="monitoring"}[5m]) == 0
for: 10m
labels:
severity: critical
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Alert notification queue of Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is running full.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
summary: Prometheus alert notification queue predicted to run full in less than 30m.
expr: |
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{job="prometheus",namespace="monitoring"}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{job="prometheus",namespace="monitoring"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
annotations:
description: '{{ `{{` }} printf "%.1f" $value }}% errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to Alertmanager {{ `{{` }}$labels.alertmanager}}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
expr: |
(
rate(prometheus_notifications_errors_total{job="prometheus",namespace="monitoring"}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus",namespace="monitoring"}[5m])
)
* 100
> 1
for: 15m
labels:
severity: warning
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not connected to any Alertmanagers.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
summary: Prometheus is not connected to any Alertmanagers.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus",namespace="monitoring"}[5m]) < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} reload failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
summary: Prometheus has issues reloading blocks from disk.
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has detected {{ `{{` }}$value | humanize}} compaction failures over the last 3h.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
summary: Prometheus has issues compacting blocks.
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus",namespace="monitoring"}[3h]) > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is not ingesting samples.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
summary: Prometheus is not ingesting samples.
expr: |
(
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus",namespace="monitoring"}[5m]) <= 0
and
(
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus",namespace="monitoring"}) > 0
or
sum without(rule_group) (prometheus_rule_group_rules{job="prometheus",namespace="monitoring"}) > 0
)
)
for: 10m
labels:
severity: warning
- alert: PrometheusDuplicateTimestamps
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with different values but duplicated timestamp.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
summary: Prometheus is dropping samples with duplicate timestamps.
expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusOutOfOrderTimestamps
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} is dropping {{ `{{` }} printf "%.4g" $value }} samples/s with timestamps arriving out of order.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
summary: Prometheus drops samples with out-of-order timestamps.
expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 10m
labels:
severity: warning
- alert: PrometheusRemoteStorageFailures
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} failed to send {{ `{{` }} printf "%.1f" $value }}% of the samples to {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
summary: Prometheus fails to send samples to remote storage.
expr: |
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus",namespace="monitoring"}[5m]))
/
(
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="prometheus",namespace="monitoring"}[5m]))
+
(rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="prometheus",namespace="monitoring"}[5m]))
)
)
* 100
> 1
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteBehind
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write is {{ `{{` }} printf "%.1f" $value }}s behind for {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
summary: Prometheus remote write is behind.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus",namespace="monitoring"}[5m])
- ignoring(remote_name, url) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus",namespace="monitoring"}[5m])
)
> 120
for: 15m
labels:
severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} remote write desired shards calculation wants to run {{ `{{` }} $value }} shards for queue {{ `{{` }} $labels.remote_name}}:{{ `{{` }} $labels.url }}, which is more than the max of {{ `{{` }} printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus",namespace="monitoring"}` $labels.instance | query | first | value }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
summary: Prometheus remote write desired shards calculation wants to run more than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus",namespace="monitoring"}[5m])
>
max_over_time(prometheus_remote_storage_shards_max{job="prometheus",namespace="monitoring"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to evaluate {{ `{{` }} printf "%.0f" $value }} rules in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
summary: Prometheus is failing rule evaluations.
expr: |
increase(prometheus_rule_evaluation_failures_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: critical
- alert: PrometheusMissingRuleEvaluations
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has missed {{ `{{` }} printf "%.0f" $value }} rule group evaluations in the last 5m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
expr: |
increase(prometheus_rule_group_iterations_missed_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetLimitHit
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusLabelLimitHit
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has dropped {{ `{{` }} printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit.
expr: |
increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeBodySizeLimitHit
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
summary: Prometheus has dropped some targets that exceeded body size limit.
expr: |
increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusScrapeSampleLimitHit
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed {{ `{{` }} printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
summary: Prometheus has failed scrapes that have exceeded the configured sample limit.
expr: |
increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="prometheus",namespace="monitoring"}[5m]) > 0
for: 15m
labels:
severity: warning
- alert: PrometheusTargetSyncFailure
annotations:
description: '{{ `{{` }} printf "%.0f" $value }} targets in Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} have failed to sync because invalid configuration was supplied.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
summary: Prometheus has failed to sync targets.
expr: |
increase(prometheus_target_sync_failed_total{job="prometheus",namespace="monitoring"}[30m]) > 0
for: 5m
labels:
severity: critical
- alert: PrometheusHighQueryLoad
annotations:
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
summary: Prometheus is reaching its maximum capacity serving concurrent requests.
expr: |
avg_over_time(prometheus_engine_queries{job="prometheus",namespace="monitoring"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="prometheus",namespace="monitoring"}[5m]) > 0.8
for: 15m
labels:
severity: warning
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager
annotations:
description: '{{ `{{` }} printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} to any Alertmanager.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
expr: |
min without (alertmanager) (
rate(prometheus_notifications_errors_total{job="prometheus",namespace="monitoring",alertmanager!~``}[5m])
/
rate(prometheus_notifications_sent_total{job="prometheus",namespace="monitoring",alertmanager!~``}[5m])
)
* 100
> 3
for: 15m
labels:
severity: critical
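
A note on the doubled braces used throughout this file: Helm and Prometheus share the {{ }} template delimiters, so the chart escapes each opening pair as {{ `{{` }} to make Helm emit a literal {{ that Prometheus then expands at alert time. A minimal before/after sketch of the first annotation above:

# In the chart template (what Helm reads):
description: Prometheus {{ `{{` }}$labels.namespace}}/{{ `{{` }}$labels.pod}} has failed to reload its configuration.
# In the rendered PrometheusRule (what Prometheus reads):
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.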


@ -0,0 +1,58 @@
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
{{ if .Values.nfc_monitoring.thanos.sidecar.config }}
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
app.kubernetes.io/component: metrics
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: thanos
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
prometheus: {{ $.Release.Name }}
role: alert-rules
name: thanos-sidecar-rules
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
groups:
- name: thanos-sidecar
rules:
- alert: ThanosSidecarBucketOperationsFailed
annotations:
description: Thanos Sidecar {{ `{{` }}$labels.instance}} bucket operations are failing
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarbucketoperationsfailed
summary: Thanos Sidecar bucket operations are failing
expr: |
sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0
for: 5m
labels:
severity: critical
- alert: ThanosSidecarNoConnectionToStartedPrometheus
annotations:
description: Thanos Sidecar {{ `{{` }}$labels.instance}} is unhealthy.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarnoconnectiontostartedprometheus
summary: Thanos Sidecar cannot access Prometheus, even though Prometheus seems
healthy and has reloaded WAL.
expr: |
thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0
AND on (namespace, pod)
prometheus_tsdb_data_replay_duration_seconds != 0
for: 5m
labels:
severity: critical
- alert: ThanosSidecarIsDown
annotations:
description: ThanosSidecar has disappeared. Prometheus target for the component
cannot be discovered.
runbook_url: https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarisdown
summary: Thanos component has disappeared.
expr: |
absent(up{job=~".*thanos-sidecar.*"} == 1)
for: 5m
labels:
severity: critical
{{ end }}
{{ end }}
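
Both guards above must be truthy before any of these rules render. A hypothetical values.yaml excerpt that satisfies them; the key path comes from the guards, while the objstore layout follows the Thanos documentation and the bucket details are placeholders:

nfc_monitoring:
  thanos:
    sidecar:
      enabled: true
      config:
        type: S3          # Thanos objstore type
        config:
          bucket: thanos-metrics         # placeholder bucket
          endpoint: s3.example.internal  # placeholder endpoint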


@ -0,0 +1,46 @@
---
apiVersion: rbac.authorization.k8s.io/v1
items:
{{ range $index, $namespace := (lookup "v1" "Namespace" "" "").items }}
- apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}
namespace: {{ $namespace.metadata.name | quote }}
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
{{ end }}
kind: RoleList
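
Because the RoleList above is built with Helm's lookup function, it only renders against a live API server; a plain helm template run sees an empty namespace list and emits no items. One way to preview the per-namespace Roles with a recent Helm (release name and chart path are placeholders):

# lookup is a no-op during client-side rendering, so do a server-side dry run:
helm install my-release ./kubernetes_monitoring --dry-run=server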


@ -0,0 +1,20 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-k8s-config
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-{{ $.Release.Name }}-config
subjects:
- kind: ServiceAccount
name: prometheus-{{ $.Release.Name }}
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}


@ -0,0 +1,26 @@
---
apiVersion: rbac.authorization.k8s.io/v1
items:
{{ range $index, $namespace := (lookup "v1" "Namespace" "" "").items }}
- apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 6 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}
namespace: {{ $namespace.metadata.name | quote }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-{{ $.Release.Name }}
subjects:
- kind: ServiceAccount
name: prometheus-{{ $.Release.Name }}
namespace: {{ $.Values.nfc_monitoring.prometheus.namespace }}
{{ end }}
kind: RoleBindingList


@ -0,0 +1,18 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/version: 0.11.1
name: resource-metrics-auth-reader
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
name: prometheus-adapter
namespace: monitoring


@ -0,0 +1,19 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}-config
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
rules:
- apiGroups:
- ""
resources:
- configmaps
verbs:
- get


@ -0,0 +1,60 @@
---
apiVersion: v1
kind: Secret
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.alert_manager.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: alertmanager-main
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
stringData:
alertmanager.yaml: |-
"global":
"resolve_timeout": "5m"
"inhibit_rules":
- "equal":
- "namespace"
- "alertname"
"source_matchers":
- "severity = critical"
"target_matchers":
- "severity =~ warning|info"
- "equal":
- "namespace"
- "alertname"
"source_matchers":
- "severity = warning"
"target_matchers":
- "severity = info"
- "equal":
- "namespace"
"source_matchers":
- "alertname = InfoInhibitor"
"target_matchers":
- "severity = info"
"receivers":
- "name": "Default"
- "name": "Watchdog"
- "name": "Critical"
- "name": "null"
"route":
"group_by":
- "namespace"
"group_interval": "5m"
"group_wait": "30s"
"receiver": "Default"
"repeat_interval": "12h"
"routes":
- "matchers":
- "alertname = Watchdog"
"receiver": "Watchdog"
- "matchers":
- "alertname = InfoInhibitor"
"receiver": "null"
- "matchers":
- "severity = critical"
"receiver": "Critical"
type: Opaque
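
The route tree above sends Watchdog alerts to the Watchdog receiver, discards InfoInhibitor via the null receiver, escalates severity=critical to Critical, and lets Default catch everything else. After editing the tree, amtool (shipped with Alertmanager) can verify it; the file name below is simply whatever the Secret's key was saved as:

amtool check-config alertmanager.yaml
# Trace which receiver a given label set would reach:
amtool config routes test --config.file=alertmanager.yaml severity=critical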


@ -0,0 +1,21 @@
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
{{ if .Values.nfc_monitoring.thanos.sidecar.config }}
---
apiVersion: v1
kind: Secret
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: thanos-sidecar-config
namespace: monitoring
type: Opaque
stringData:
thanos.yaml: |-
{{ toYaml .Values.nfc_monitoring.thanos.sidecar.config | nindent 4 }}
{{ end }}
{{ end }}


@ -0,0 +1,20 @@
---
{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}}
apiVersion: v1
kind: Service
metadata:
name: calico-metrics
namespace: calico-system
labels:
k8s-app: calico-node
spec:
clusterIP: None
selector:
k8s-app: calico-node
ports:
- port: 9091
name: metrics
targetPort: 9091
{{- end -}}
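
The headless Service above assumes Felix is already listening on 9091, which is not Calico's default. If metrics are off, one documented way to enable them, assuming the cluster is managed with calicoctl:

calicoctl patch felixConfiguration default --patch '{"spec":{"prometheusMetricsEnabled": true}}'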


@ -0,0 +1,30 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
selector:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
#type: NodePort
#type: LoadBalancer
#clusterIP: None
ports:
- name: grafana
port: 3000
targetPort: grafana-http
#nodePort: 3000
#type: LoadBalancer
sessionAffinity: ClientIP
{{- end }}


@ -0,0 +1,26 @@
---
apiVersion: v1
kind: Service
metadata:
name: grafana-agent
namespace: monitoring
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
selector:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
ports:
- name: grafana-metrics
port: 12345
targetPort: grafana-metrics
- name: kube-ctrl-mgr
port: 11257
targetPort: kube-ctrl-mgr
#type: LoadBalancer
sessionAffinity: ClientIP


@ -0,0 +1,26 @@
---
apiVersion: v1
kind: Service
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.alert_manager.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: alertmanager-{{ $.Release.Name }}
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}
spec:
ports:
- name: web
port: 9093
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
app.kubernetes.io/component: alert-router
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: alertmanager
app.kubernetes.io/part-of: {{ $.Chart.Name }}
sessionAffinity: ClientIP


@ -0,0 +1,33 @@
{{ if .Values.nfc_monitoring.kube_monitor_proxy.enabled }}
---
apiVersion: v1
kind: Service
metadata:
name: kube-monitor-proxy
namespace: monitoring
labels:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name_kcm: kube-controller-manager
name_ks: kube-scheduler
spec:
selector:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/part-of: {{ $.Chart.Name }}
ports:
- name: kube-ctrl-mgr
port: 10257
targetPort: kube-ctrl-mgr
- name: kube-scheduler
port: 10259
targetPort: kube-scheduler
sessionAffinity: ClientIP
{{ end }}


@ -0,0 +1,30 @@
{{ if .Values.nfc_monitoring.kube_state_metrics.enabled }}
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: kube-state-metrics
namespace: monitoring
spec:
clusterIP: None
ports:
- name: https-main
port: 8443
targetPort: https-main
- name: https-self
port: 9443
targetPort: https-self
selector:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/part-of: {{ $.Chart.Name }}
{{ end }}


@ -0,0 +1,21 @@
---
apiVersion: v1
kind: Service
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-adapter
namespace: monitoring
spec:
ports:
- name: https
port: 443
targetPort: 6443
selector:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}


@ -0,0 +1,55 @@
---
apiVersion: v1
kind: Service
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
ports:
- name: web
port: 9090
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
sessionAffinity: ClientIP
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: prometheus-sidecar
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: thanos-sidecar
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: thanos-sidecar
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
clusterIP: None
ports:
- name: grpc
port: 10901
targetPort: 10901
- name: http
port: 10902
targetPort: 10902
selector:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
{{ end }}
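
With clusterIP: None the thanos-sidecar Service is headless, so DNS returns one record per Prometheus pod rather than a single virtual IP. A Thanos Query instance elsewhere in the cluster could then discover every sidecar through Thanos' dnssrv service discovery; the flag below assumes the chart's default monitoring namespace:

--store=dnssrv+_grpc._tcp.thanos-sidecar.monitoring.svc.cluster.local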


@ -0,0 +1,16 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
name: grafana
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
{{- end }}


@ -0,0 +1,13 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
name: grafana-agent
namespace: monitoring
labels:
{{ toYaml $.Values.nfc_monitoring.grafana_agent.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}


@ -0,0 +1,13 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.alert_manager.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: alertmanager-main
namespace: {{ .Values.nfc_monitoring.alert_manager.namespace | quote }}


@ -0,0 +1,16 @@
{{ if .Values.nfc_monitoring.kube_monitor_proxy.enabled }}
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-monitor-proxy
namespace: monitoring
labels:
app.kubernetes.io/component: proxy
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-monitor-proxy
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
{{ end }}


@ -0,0 +1,17 @@
{{ if .Values.nfc_monitoring.kube_state_metrics.enabled }}
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: exporter
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: kube-state-metrics
namespace: monitoring
{{ end }}


@ -0,0 +1,13 @@
---
apiVersion: v1
automountServiceAccountToken: false
kind: ServiceAccount
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus_adaptor.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-adapter
namespace: monitoring


@ -0,0 +1,13 @@
---
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.prometheus.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: prometheus-{{ $.Release.Name }}
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}


@ -0,0 +1,82 @@
{{ if .Values.nfc_monitoring.prometheus.service_monitor.apiserver }}
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: apiserver
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
name: kube-apiserver
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|object_counts|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|admission_quota_controller_depth|admission_quota_controller_longest_running_processor_microseconds|admission_quota_controller_queue_latency|admission_quota_controller_unfinished_work_seconds|admission_quota_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|APIServiceOpenAPIAggregationControllerQueue1_depth|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_retries|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_adds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|APIServiceRegistrationController_queue_latency|APIServiceRegistrationController_retries|APIServiceRegistrationController_unfinished_work_seconds|APIServiceRegistrationController_work_duration|autoregister_adds|autoregister_depth|autoregister_longest_running_processor_microseconds|autoregister_queue_latency|autoregister_retries|autoregister_unfinished_work_seconds|autoregister_work_duration|AvailableConditionController_adds|AvailableConditionController_depth|AvailableConditionController_longest_running_processor_microseconds|AvailableConditionController_queue_latency|AvailableConditionController_retries|AvailableConditionController_unfinished_work_seconds|AvailableConditionController_work_duration|crd_autoregistration_controller_adds|crd_autoregistration_controller_depth|crd_autoregistration_controller_longest_running_processor_microseconds|crd_autoregistration_controller_queue_latency|crd_autoregistration_controller_retries|crd_autoregistration_controller_unfinished_work_seconds|crd_autoregistration_controller_work_duration|crdEstablishing_adds|crdEstablishing_depth|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_queue_latency|crdEstablishing_retries|crdEstablishing_unfinished_work_seconds|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_finalizer_longest_running_processor_microseconds|crd_finalizer_queue_latency|crd_finalizer_retries|crd_finalizer_unfinished_work_seconds|crd_finalizer_work_duration|crd_naming_condition_controller_adds|crd_naming_condition_controller_depth|crd_naming_condition_controller_longest_running_processor_microseconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|crd_naming_condition_controller_unfinished_work_seconds|crd_naming_condition_controller_work_duration|crd_openapi_controller_adds|crd_openapi_controller_depth|crd_openapi_controller_longest_running_processor_microseconds|crd_openapi_controller_queue_latency|crd_openapi_controller_retries|crd_openapi_controller_unfinished_work_seconds|crd_openapi_controller_work_duration|DiscoveryController_adds|DiscoveryController_depth|DiscoveryController_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_retries|DiscoveryController_unfinished_work_seconds|DiscoveryController_work_duration|kubeproxy_sync_proxy_rules_latency_microseconds|non_structural_schema_condition_controller_adds|non_structural_schema_condition_controller_depth|non_structural_schema_condition_controller_longest_running_processor_microseconds|non_structural_schema_condition_controller_queue_latency|non_structural_schema_condition_controller_retries|non_structural_schema_condition_controller_unfinished_work_seconds|non_structural_schema_condition_controller_work_duration|rest_client_request_latency_seconds|storage_operation_errors_total|storage_operation_status_count)
sourceLabels:
- __name__
- action: drop
regex: etcd_(debugging|disk|server).*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_controller_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_admission_step_admission_latencies_seconds_.*
sourceLabels:
- __name__
- action: drop
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
sourceLabels:
- __name__
- le
port: https
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
jobLabel: component
namespaceSelector:
matchNames:
- default
selector:
matchLabels:
component: apiserver
provider: kubernetes
{{ end }}
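
Worth spelling out how the last drop rule works: Prometheus joins the sourceLabels values with ";" before testing the regex, so a high-resolution histogram bucket is reduced to a single string and matched as a whole:

# series: apiserver_request_duration_seconds_bucket{le="0.15", ...}
# string handed to the regex:
apiserver_request_duration_seconds_bucket;0.15
# matches the (0.15|0.25|...) alternation above, so the sample is dropped.

This thins the apiserver latency histogram to a few buckets while leaving the _sum and _count series untouched.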


@ -0,0 +1,56 @@
{{ if .Values.nfc_monitoring.prometheus.service_monitor.cadvisor }}
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: cadvisor
app.kubernetes.io/component: exporter
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
name: cadvisor
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
honorTimestamps: false
interval: 30s
metricRelabelings:
- action: drop
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
sourceLabels:
- __name__
- action: drop
regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
sourceLabels:
- __name__
- pod
- namespace
- action: drop
regex: (container_blkio_device_usage_total);.+
sourceLabels:
- __name__
- container
path: /metrics/cadvisor
port: https-metrics
#jobName: cadvisor
relabelings:
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path
- targetLabel: "job"
replacement: "cadvisor"
scheme: https
tlsConfig:
insecureSkipVerify: true
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
app.kubernetes.io/name: kubelet
{{ end }}
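
The relabelings above pin the job label to cadvisor and surface the scrape path as metrics_path. A typical query against the resulting series might look like this; the metric is standard cAdvisor output, not specific to this chart:

# CPU usage per pod, scraped via the kubelet's /metrics/cadvisor endpoint:
sum by (namespace, pod) (
  rate(container_cpu_usage_seconds_total{job="cadvisor"}[5m])
)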


@ -0,0 +1,39 @@
---
{{- if eq .Values.nfc_monitoring.kubernetes.networking "calico" -}}
{{ if .Values.nfc_monitoring.prometheus.service_monitor.calico }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: calico
app.kubernetes.io/component: networking
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
k8s-app: calico-node
name: calico
namespace: calico-system
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
port: metrics
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
scheme: http
# tlsConfig:
# insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
k8s-app: calico-node
{{- end -}}
{{ end }}


@ -0,0 +1,32 @@
{{ if .Values.nfc_monitoring.prometheus.service_monitor.coredns }}
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/name: coredns
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
name: coredns
namespace: monitoring
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
metricRelabelings:
- action: drop
regex: coredns_cache_misses_total
sourceLabels:
- __name__
port: metrics
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:
- kube-system
selector:
matchLabels:
k8s-app: kube-dns
{{ end }}


@ -0,0 +1,39 @@
{{ if .Values.nfc_monitoring.grafana.enabled -}}
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 4 }}
app.kubernetes.io/instance: {{ $.Release.Name }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
name: grafana
namespace: {{ .Values.nfc_monitoring.grafana.namespace }}
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
port: grafana
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_label_app_kubernetes_io_instance
targetLabel: instance
scheme: http
# tlsConfig:
# insecureSkipVerify: true
targetLabels:
- cluster
jobLabel: app.kubernetes.io/name
selector:
matchLabels:
{{ toYaml $.Values.nfc_monitoring.grafana.labels | nindent 6 }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/instance: {{ $.Release.Name }}
{{- end }}

Some files were not shown because too many files have changed in this diff.