diff --git a/templates/PrometheusRule-grafana.yaml b/templates/PrometheusRule-grafana.yaml
new file mode 100644
index 0000000..71f3d0d
--- /dev/null
+++ b/templates/PrometheusRule-grafana.yaml
@@ -0,0 +1,46 @@
+---
+# PrometheusRule for Grafana: one alert on the 5xx error ratio per handler,
+# plus the recording rule that the alert expression reads from.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  labels:
+    app.kubernetes.io/component: grafana
+    app.kubernetes.io/name: grafana
+    # Templated values are quoted so they always render as YAML strings.
+    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
+    # "+" (SemVer build metadata) is not a valid character in a label value.
+    app.kubernetes.io/version: {{ $.Chart.Version | replace "+" "_" | quote }}
+    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
+    prometheus: k8s
+    role: alert-rules
+  name: grafana-rules
+  namespace: monitoring
+spec:
+  groups:
+  - name: GrafanaAlerts
+    rules:
+    - alert: GrafanaRequestsFailing
+      annotations:
+        # The backtick-escaped braces make Helm emit literal double braces, so
+        # $labels and $value are left for Prometheus to expand at alert time.
+        message: '{{`{{`}} $labels.namespace }}/{{`{{`}} $labels.job }}/{{`{{`}} $labels.handler }} is experiencing {{`{{`}} $value | humanize }}% errors'
+        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing
+      # Share of requests answered with any 5xx status, per namespace/job/handler.
+      # Both sides are summed without status_code so that several distinct 5xx
+      # codes on one handler are added together instead of colliding in the
+      # vector match.
+      expr: |
+        100 * sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."})
+        /
+        sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
+        > 50
+      for: 5m
+      labels:
+        severity: warning
+  - name: grafana_rules
+    rules:
+    # Per-second request rate over 5m, kept per namespace/job/handler/status_code.
+    - expr: |
+        sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
+      record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m