---
# PrometheusRule "grafana-rules": one alerting rule for failing Grafana HTTP
# requests, plus the recording rule that the alert expression reads.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    # User-supplied labels; the guard renders nothing when the value is
    # empty or unset, so the mapping below stays valid YAML.
    {{- with $.Values.nfc_monitoring.grafana.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
    # Templated values are quoted so they always parse as strings.
    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
    # '+' (SemVer build metadata) is not allowed in a label value, so it is
    # replaced with '_'.
    app.kubernetes.io/version: {{ $.Chart.Version | replace "+" "_" | quote }}
    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
    # NOTE(review): presumably matched by the Prometheus ruleSelector;
    # confirm against the Prometheus custom resource.
    prometheus: k8s
    role: alert-rules
  name: grafana-rules
  namespace: {{ $.Values.nfc_monitoring.prometheus.namespace | quote }}
spec:
  groups:
    - name: GrafanaAlerts
      rules:
        # Fires with severity "warning" once, for 5m, more than 50% of the
        # requests of a namespace/job/handler returned a 5xx status code.
        # The datasource proxy and query handlers are excluded.
        - alert: GrafanaRequestsFailing
          annotations:
            # The opening braces are emitted through backtick-quoted template
            # strings so Helm prints them literally and leaves the $labels and
            # $value placeholders for Prometheus to expand.
            message: '{{`{{`}} $labels.namespace }}/{{`{{`}} $labels.job }}/{{`{{`}} $labels.handler }} is experiencing {{`{{`}} $value | humanize }}% errors'
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/grafana/grafanarequestsfailing
          expr: |
            100 * namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query", status_code=~"5.."}
            / ignoring (status_code)
            sum without (status_code) (namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m{handler!~"/api/datasources/proxy/:id.*|/api/ds/query|/api/tsdb/query"})
            > 50
          for: 5m
          labels:
            severity: warning
    - name: grafana_rules
      rules:
        # Recording rule: 5m request rate aggregated by namespace, job,
        # handler and status_code. The alert expression above reads it.
        - expr: |
            sum by (namespace, job, handler, status_code) (rate(grafana_http_request_duration_seconds_count[5m]))
          record: namespace_job_handler_statuscode:grafana_http_request_duration_seconds_count:rate5m