From f73a9e462e16dffbc0e17eaed3c0c78aff95d52b Mon Sep 17 00:00:00 2001
From: Jon
Date: Thu, 28 Sep 2023 09:05:01 +0930
Subject: [PATCH] feat(grafana_agent): add prom rules

!4
---
Review notes (text in this section is ignored when the patch is applied):

* Purpose: relabel the PrometheusRule with `role: alert-rules` and add two
  rule groups to it: `promtail_rules` (recording rules over
  `promtail_request_duration_seconds_*`, aggregated per job, per
  job/namespace and per job/status_code/namespace) and `promtail_alerts`
  (PromtailRequestsErrors, PromtailRequestLatency, PromtailFileMissing).
* `{{ `{{` }}` is the Helm escape that emits a literal `{{`, so the
  `$labels` / `$value` expressions reach Prometheus unrendered.
* PromtailRequestLatency alerts on a recording rule aggregated
  `by (le, job, status_code, namespace)`, so that series has no `route`
  label. Its message now uses `namespace`, `job` and `status_code`
  instead of `route`, which would always render empty.
* The template now ends with a trailing newline. Because the added content
  was edited, the post-image blob id on the `index` line below is stale.
* NOTE(review): line breaks and leading whitespace were reconstructed from
  a whitespace-collapsed copy of this patch. The indentation of the context
  lines is presumed to be 2-space with flush sequences -- confirm against
  the target file before applying.

 templates/PrometheusRule-grafana-agent.yaml | 86 ++++++++++++++++++++-
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/templates/PrometheusRule-grafana-agent.yaml b/templates/PrometheusRule-grafana-agent.yaml
index 640733c..6b42288 100644
--- a/templates/PrometheusRule-grafana-agent.yaml
+++ b/templates/PrometheusRule-grafana-agent.yaml
@@ -9,7 +9,7 @@ metadata:
     app.kubernetes.io/version: {{ $.Chart.Version }}
     app.kubernetes.io/managed-by: {{ $.Release.Service }}
     prometheus: k8s
-    role: grafana-agent-promtail
+    role: alert-rules
   name: grafana-agent
   namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
 spec:
@@ -21,3 +21,87 @@ spec:
     - expr: |
         agent_build_info
       record: promtail_build_info
+  - name: promtail_rules
+    rules:
+    - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
+        by (le, job))
+      record: job:promtail_request_duration_seconds:99quantile
+    - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
+        by (le, job))
+      record: job:promtail_request_duration_seconds:50quantile
+    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
+        by (job)
+      record: job:promtail_request_duration_seconds:avg
+    - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
+      record: job:promtail_request_duration_seconds_bucket:sum_rate
+    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
+      record: job:promtail_request_duration_seconds_sum:sum_rate
+    - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
+      record: job:promtail_request_duration_seconds_count:sum_rate
+    - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
+        by (le, job, namespace))
+      record: job_namespace:promtail_request_duration_seconds:99quantile
+    - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
+        by (le, job, namespace))
+      record: job_namespace:promtail_request_duration_seconds:50quantile
+    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
+        / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
+      record: job_namespace:promtail_request_duration_seconds:avg
+    - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
+      record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
+    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
+      record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
+    - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
+      record: job_namespace:promtail_request_duration_seconds_count:sum_rate
+    - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
+        by (le, job, status_code, namespace))
+      record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
+    - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
+        by (le, job, status_code, namespace))
+      record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
+    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
+        namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
+        status_code, namespace)
+      record: job_status_code_namespace:promtail_request_duration_seconds:avg
+    - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
+        namespace)
+      record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
+    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
+        namespace)
+      record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
+    - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
+        namespace)
+      record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
+
+  - name: promtail_alerts
+    rules:
+    - alert: PromtailRequestsErrors
+      annotations:
+        message: |
+          {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
+      expr: |
+        100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
+          /
+        sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
+          > 10
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PromtailRequestLatency
+      annotations:
+        message: |
+          {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.job }} (status_code={{ `{{` }} $labels.status_code }}) is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
+      expr: |
+        job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
+      for: 15m
+      labels:
+        severity: critical
+    - alert: PromtailFileMissing
+      annotations:
+        message: |
+          {{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
+      expr: |
+        promtail_file_bytes_total unless promtail_read_bytes_total
+      for: 15m
+      labels:
+        severity: warning