Merge branch '1-metrics-long-term-storage' into 'development'

feat(metrics): long term storage injection

Closes #1

See merge request nofusscomputing/projects/kubernetes_monitoring!2
This commit is contained in:
2023-09-27 15:31:08 +00:00
11 changed files with 311 additions and 156 deletions

View File

@ -18,7 +18,7 @@ What this chart is:
- Ready for logging long term storage integration, in particular Loki
- Ready for metrics long term storage integration, **Still to be developed, see [GL-#1](https://gitlab.com/nofusscomputing/projects/kubernetes_monitoring/-/issues/1)**
- Ready for metrics long term storage integration
What this chart is not:
@ -69,7 +69,7 @@ This helm chart started off with components from multiple open-source projects.
- Kubernetes
- loki _Loki [helm chart](https://artifacthub.io/packages/helm/grafana/loki) included, see [loki repo](https://github.com/grafana/loki/tree/f4ab1e3e89ac66e1848764dc17826abde929fdc5/production/loki-mixin-compiled) for dashboards_
- Loki _[helm chart](https://artifacthub.io/packages/helm/grafana/loki) included, see [loki repo](https://github.com/grafana/loki/tree/f4ab1e3e89ac66e1848764dc17826abde929fdc5/production/loki-mixin-compiled) for dashboards_
- Node-Exporter
@ -77,7 +77,9 @@ This helm chart started off with components from multiple open-source projects.
- Promtail
- Prometheus as the metrics collector
- Prometheus
- Thanos sidecar _Optional_
- Service monitors
@ -111,6 +113,8 @@ This helm chart started off with components from multiple open-source projects.
- Prometheus-Adaptor
- Thanos
- kyverno policies _(optional, set in values.yaml)_
- auto deploy policy for prometheus role and rolebinding to access other namespaces
@ -136,6 +140,50 @@ helm upgrade -i nfc_monitoring kubernetes_monitoring/
```
## Metrics Storage
This chart provides two different ways to route your metrics to long-term storage:
- Prometheus `remote_write`
- Thanos Side Car container
### Prometheus Remote Write
With this method, Prometheus pushes its metrics to another service for storage. For example, this option could be used to push metrics to Grafana Mimir. To configure it, add the following to the values.yaml file at path `nfc_monitoring.prometheus.additional`:
``` yaml
remoteWrite:
- name: mimir
url: http://mimir-gateway.metrics.svc.cluster.local/api/v1/push
```
Ensure that you set the url to the correct url.
### Thanos Side Car
This method deploys a Thanos container within the Prometheus pod. This container reads the Prometheus container's TSDB and uploads it to the configured storage. To configure it, set the following in values.yaml:
- `nfc_monitoring.thanos.sidecar.enabled` set to bool `true`
- `nfc_monitoring.thanos.sidecar.config` updated to include the sidecar config.
For example, to configure the Thanos sidecar to upload Prometheus metrics to S3 storage, use the following (updating the values to suit your environment):
``` yaml
type: S3
config:
bucket: "thanos-metrics"
endpoint: "rook-ceph-rgw-earth.ceph.svc:80"
access_key: "<your-s3-access-key>"
secret_key: "<your-s3-secret-key>"
```
## Template Values

View File

@ -18,7 +18,7 @@
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 27,
"id": 66,
"links": [],
"liveNow": false,
"panels": [
@ -36,7 +36,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "Total cluster CPU Cores not being utilised",
"fieldConfig": {
@ -87,7 +87,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
@ -105,7 +105,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "Total cluster CPU Cores",
"fieldConfig": {
@ -156,7 +156,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
@ -174,7 +174,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "Total memory for complete cluster",
"fieldConfig": {
@ -251,7 +251,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -272,7 +272,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "Total cluster CPU Cores not being utilised",
"fieldConfig": {
@ -352,7 +352,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"editorMode": "code",
"exemplar": false,
@ -370,7 +370,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "Total memory for complete cluster",
"fieldConfig": {
@ -418,7 +418,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -435,7 +435,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(node_memory_Cached_bytes{node!=\"\"})",
@ -452,7 +452,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "Total memory for complete cluster",
"fieldConfig": {
@ -500,7 +500,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -534,7 +534,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "",
"fieldConfig": {
@ -617,7 +617,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"editorMode": "code",
"expr": "ceph_health_status{}",
@ -646,7 +646,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many logs per minute are being received.",
"fieldConfig": {
@ -694,7 +694,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
@ -716,7 +716,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many logs received in last 24 hours.",
"fieldConfig": {
@ -765,7 +765,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -787,7 +787,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many logs per minute are being received.",
"fieldConfig": {
@ -836,7 +836,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -858,7 +858,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many metrics per minute are being received.",
"fieldConfig": {
@ -906,7 +906,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -928,7 +928,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many metrics in the last 24 hours",
"fieldConfig": {
@ -977,7 +977,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -999,7 +999,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many metrics are stored in the database",
"fieldConfig": {
@ -1049,7 +1049,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "code",
@ -1071,7 +1071,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many logs per minute are being received.",
"fieldConfig": {
@ -1150,7 +1150,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(delta(loki_log_messages_total[1m]))",
@ -1165,7 +1165,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"description": "How many logs per minute are being received.",
"fieldConfig": {
@ -1244,7 +1244,7 @@
{
"datasource": {
"type": "prometheus",
"uid": "77e897ec-8a32-4b71-9439-8f12ec42e920"
"uid": "${PROMETHEUS}"
},
"editorMode": "code",
"expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[1m]))",
@ -1266,7 +1266,27 @@
"Kubernetes"
],
"templating": {
"list": []
"list": [
{
"current": {
"selected": true,
"text": "prometheus",
"value": "77e897ec-8a32-4b71-9439-8f12ec42e920"
},
"hide": 0,
"includeAll": false,
"label": "Prometheus",
"multi": false,
"name": "PROMETHEUS",
"options": [],
"query": "prometheus",
"queryValue": "",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"type": "datasource"
}
]
},
"time": {
"from": "now-3h",
@ -1276,6 +1296,6 @@
"timezone": "",
"title": "Cluster Overview",
"uid": "no-fuss-computing-k8s-overview",
"version": 3,
"version": 4,
"weekStart": ""
}

View File

@ -1,39 +0,0 @@
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasource
metadata:
name: alertmanager
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
labels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: alertmanager
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
#secrets:
# - credentials
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: grafana
allowCrossNamespaceImport: true
datasource:
name: alertmanager
type: alertmanager
access: proxy
# basicAuth: true
url: "http://alertmanager-main.{{ .Values.nfc_monitoring.alert_manager.namespace }}.svc:9093"
isDefault: false
# user: admin
jsonData:
"tlsSkipVerify": true
"timeInterval": "5s"
"implementation": prometheus
"handleGrafanaManagedAlerts": false
# "orgId": 1
# secureJsonData:
# "password": admin
editable: true

View File

@ -1,38 +0,0 @@
---
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasource
metadata:
name: prometheus
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
labels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
#secrets:
# - credentials
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: grafana
allowCrossNamespaceImport: true
datasource:
name: prometheus
type: prometheus
access: proxy
# basicAuth: true
url: "http://prometheus-k8s.{{ .Values.nfc_monitoring.prometheus.namespace }}.svc:9090"
isDefault: true
# user: admin
jsonData:
# "tlsSkipVerify": true
manageAlerts: true
"timeInterval": "5s"
"orgId": 1
# secureJsonData:
# "password": admin
editable: true

View File

@ -1,41 +0,0 @@
---
{{- if .Values.nfc_monitoring.loki.enabled | default false -}}
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasource
metadata:
name: loki
namespace: "{{ .Values.nfc_monitoring.grafana.namespace }}"
labels:
aapp.kubernetes.io/component: graphing
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: loki
app.kubernetes.io/managed-by: {{ $.Release.Service }}
app.kubernetes.io/part-of: {{ $.Chart.Name }}
app.kubernetes.io/version: {{ $.Chart.Version }}
spec:
#secrets:
# - credentials
instanceSelector:
matchLabels:
app.kubernetes.io/component: graphing
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: grafana
allowCrossNamespaceImport: true
datasource:
name: loki
type: loki
access: proxy
# basicAuth: true
url: "http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}"
isDefault: false
# user: admin
jsonData:
# "tlsSkipVerify": true
"timeInterval": "5s"
"orgId": 1
# secureJsonData:
# "password": admin
editable: true
{{- end -}}

View File

@ -0,0 +1,28 @@
---
# Renders one GrafanaDatasource per entry of the values list
# `nfc_monitoring.grafana.DataSources`, wrapped in a single list object.
#
# Each list entry is emitted verbatim as the `spec.datasource` mapping, so any
# field accepted by Grafana datasource provisioning may be set in values.yaml.
apiVersion: grafana.integreatly.org/v1beta1
kind: GrafanaDatasourceList
items:
{{ range .Values.nfc_monitoring.grafana.DataSources }}
  - apiVersion: grafana.integreatly.org/v1beta1
    kind: GrafanaDatasource
    metadata:
      # Quoted so a numeric or boolean-looking name stays a string.
      name: {{ .name | quote }}
      namespace: "{{ $.Values.nfc_monitoring.grafana.namespace }}"
      labels:
        app.kubernetes.io/component: dashboard
        app.kubernetes.io/instance: k8s
        app.kubernetes.io/name: grafana
        app.kubernetes.io/managed-by: {{ $.Release.Service }}
        app.kubernetes.io/part-of: {{ $.Chart.Name }}
        app.kubernetes.io/version: {{ $.Chart.Version }}
    spec:
      # Selects the Grafana instance this datasource is attached to.
      instanceSelector:
        matchLabels:
          app.kubernetes.io/component: graphing
          app.kubernetes.io/instance: k8s
          app.kubernetes.io/name: grafana
      allowCrossNamespaceImport: true
      datasource:
        # The default entries in values.yaml embed Go-template expressions in
        # their `url` fields. Helm does not template values.yaml, so the entry
        # is passed through `tpl` with the root context; otherwise the raw
        # expression text would be written into the datasource URL.
        {{- tpl (toYaml .) $ | nindent 8 }}
{{ end }}

View File

@ -51,4 +51,14 @@ spec:
serviceMonitorSelector: {}
storage:
{{- toYaml .Values.nfc_monitoring.prometheus.affinity | nindent 4 }}
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
thanos:
image: "{{ .Values.nfc_monitoring.thanos.image.name }}:{{ .Values.nfc_monitoring.thanos.image.tag }}"
objectStorageConfig:
key: thanos.yaml
name: thanos-sidecar-config
{{ end }}
version: 2.42.0
{{ if .Values.nfc_monitoring.prometheus.additional }}
{{ toYaml .Values.nfc_monitoring.prometheus.additional | nindent 2 }}
{{ end }}

View File

@ -0,0 +1,13 @@
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
---
# Object-storage configuration for the Thanos sidecar.
#
# The Prometheus resource references this Secret by name
# (`thanos-sidecar-config`) and key (`thanos.yaml`), so it must be created in
# the namespace Prometheus is deployed to rather than a fixed one.
apiVersion: v1
kind: Secret
metadata:
  name: thanos-sidecar-config
  namespace: "{{ .Values.nfc_monitoring.prometheus.namespace }}"
type: Opaque
stringData:
  # Contents come straight from the values key
  # `nfc_monitoring.thanos.sidecar.config`.
  thanos.yaml: |-
    {{- toYaml .Values.nfc_monitoring.thanos.sidecar.config | nindent 4 }}
{{ end }}

View File

@ -25,3 +25,35 @@ spec:
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: {{ $.Chart.Name }}
sessionAffinity: ClientIP
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
---
# Headless Service (clusterIP: None) publishing the Thanos sidecar's grpc and
# http ports. Only rendered when the sidecar is enabled in values.yaml.
kind: Service
apiVersion: v1
metadata:
  name: thanos-sidecar
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
  labels:
    app.kubernetes.io/name: thanos-sidecar
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/component: prometheus-sidecar
    app.kubernetes.io/version: {{ $.Chart.Version }}
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
    app.kubernetes.io/managed-by: {{ $.Release.Service }}
spec:
  # The selector targets the Prometheus pods, not a separate Thanos workload.
  selector:
    app.kubernetes.io/name: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/part-of: {{ $.Chart.Name }}
  clusterIP: None
  ports:
    - name: grpc
      port: 10901
      targetPort: 10901
    - name: http
      port: 10902
      targetPort: 10902
{{ end }}

View File

@ -29,3 +29,34 @@ spec:
app.kubernetes.io/instance: k8s
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: {{ $.Chart.Name }}
{{ if .Values.nfc_monitoring.thanos.sidecar.enabled }}
---
# ServiceMonitor that scrapes the Thanos sidecar through the `http` port of the
# `thanos-sidecar` Service. Only rendered when the sidecar is enabled.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app.kubernetes.io/component: prometheus-sidecar
    app.kubernetes.io/instance: thanos-sidecar
    app.kubernetes.io/name: thanos-sidecar
    # Taken from the configured image tag so the label always matches the
    # Thanos version actually deployed, instead of a hard-coded release.
    app.kubernetes.io/version: "{{ .Values.nfc_monitoring.thanos.image.tag }}"
  name: thanos-sidecar
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
  endpoints:
    - port: http
      relabelings:
        # Rewrite `instance` to "<namespace>/<pod>".
        - action: replace
          separator: /
          sourceLabels:
            - namespace
            - pod
          targetLabel: instance
  # Must match the labels set on the thanos-sidecar Service.
  selector:
    matchLabels:
      app.kubernetes.io/component: prometheus-sidecar
      app.kubernetes.io/instance: k8s
      app.kubernetes.io/name: thanos-sidecar
      app.kubernetes.io/part-of: {{ $.Chart.Name }}
{{ end }}

View File

@ -56,6 +56,67 @@ nfc_monitoring:
topologyKey: kubernetes.io/hostname
weight: 10
# Grafana datasources to create.
# Each list item is passed as the datasource definition of one
# GrafanaDatasource object; `name` is also used as that object's name.
# Type: list
# See: https://grafana.com/docs/grafana/latest/administration/provisioning/#data-sources
#
# NOTE(review): several `url` values below contain Go-template expressions.
# Helm does not template values.yaml itself, so the consuming template has to
# render them (e.g. with `tpl`) - confirm that it does.
# NOTE(review): the loki entry is listed unconditionally; confirm whether it
# should depend on `loki.enabled`.
DataSources:
- name: alertmanager
type: alertmanager
access: proxy
url: "http://alertmanager-main.{{ .Values.nfc_monitoring.alert_manager.namespace }}.svc:9093"
isDefault: false
jsonData:
tlsSkipVerify: true
timeInterval: "5s"
implementation: prometheus
handleGrafanaManagedAlerts: false
orgId: 1
editable: true
- name: loki
type: loki
access: proxy
url: "http://{{ .Values.nfc_monitoring.loki.service_name }}.{{ .Values.nfc_monitoring.loki.namespace }}.svc.{{ .Values.nfc_monitoring.kubernetes.cluster_dns_name }}:{{ .Values.nfc_monitoring.loki.service_port }}"
isDefault: false
jsonData:
orgId: 1
editable: true
# Example: Grafana Mimir as an additional Prometheus-type datasource.
# - name: mimir
# type: prometheus
# access: proxy
# url: "http://mimir-gateway.metrics.svc.cluster.local/prometheus"
# isDefault: false
# jsonData:
# manageAlerts: true
# orgId: 1
# prometheusType: Mimir
# editable: true
# The only datasource marked as default.
- name: prometheus
type: prometheus
access: proxy
url: "http://prometheus-k8s.{{ .Values.nfc_monitoring.prometheus.namespace }}.svc:9090"
isDefault: true
jsonData:
manageAlerts: true
orgId: 1
prometheusType: Prometheus
prometheusVersion: 2.42.0
editable: true
# NOTE(review): this URL is a fixed address (service `thanos-query` in
# namespace `metrics`), unlike the templated entries above - adjust to suit.
- name: thanos
type: prometheus
access: proxy
url: "http://thanos-query.metrics.svc:9090"
isDefault: false
jsonData:
manageAlerts: true
orgId: 1
prometheusType: Thanos
prometheusVersion: 0.31.0
editable: true
grafana_agent:
image:
@ -66,7 +127,7 @@ nfc_monitoring:
loki:
enabled: false
enabled: true
image:
name: grafana/loki
@ -138,6 +199,7 @@ nfc_monitoring:
# List of namespaces that prometheus is to monitor
# used to create Roles and RoleBindings
# type: list
monitor_namespaces:
- alerting
- default
@ -167,6 +229,15 @@ nfc_monitoring:
requests:
storage: 40Gi
# Additional settings for Prometheus.
# See: https://prometheus-operator.dev/docs/operator/api/#monitoring.coreos.com/v1.PrometheusSpec
# Type: dict
additional: {}
# remoteWrite:
# - name: mimir
# url: http://mimir-gateway.metrics.svc.cluster.local/api/v1/push
prometheus_adaptor:
@ -201,6 +272,26 @@ nfc_monitoring:
topologyKey: kubernetes.io/hostname
weight: 10
thanos:
  # Container image used for the Thanos sidecar.
  image:
    name: thanosio/thanos
    tag: v0.32.3

  # Prometheus thanos sidecar
  # see: https://thanos.io/tip/components/sidecar.md/
  # Type: Dict
  sidecar:
    # When true, the sidecar, its config Secret, Service and ServiceMonitor
    # are rendered.
    enabled: true

    # Object storage configuration, written as-is into the `thanos.yaml` key
    # of the `thanos-sidecar-config` Secret.
    config:
      type: S3
      config:
        bucket: "thanos-metrics"
        endpoint: "rook-ceph-rgw-earth.ceph.svc:80"
        # Credentials must not be committed to version control. Supply them
        # from a private values override (e.g. `helm upgrade -f secrets.yaml`).
        # Any key previously committed here should be treated as leaked and
        # rotated.
        access_key: ""
        secret_key: ""
additions: