feat(grafana_agent): add prom rules

!4
This commit is contained in:
2023-09-28 09:05:01 +09:30
parent 5bb7197129
commit f73a9e462e

View File

@ -9,7 +9,7 @@ metadata:
app.kubernetes.io/version: {{ $.Chart.Version }}
app.kubernetes.io/managed-by: {{ $.Release.Service }}
prometheus: k8s
role: grafana-agent-promtail
role: alert-rules
name: grafana-agent
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
spec:
@ -21,3 +21,87 @@ spec:
- expr: |
agent_build_info
record: promtail_build_info
- name: promtail_rules
rules:
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
by (job)
record: job:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
record: job:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
record: job:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
record: job:promtail_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, namespace))
record: job_namespace:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
/ sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
by (le, job, status_code, namespace))
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
status_code, namespace)
record: job_status_code_namespace:promtail_request_duration_seconds:avg
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
namespace)
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
- name: promtail_alerts
rules:
- alert: PromtailRequestsErrors
annotations:
message: |
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
expr: |
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
/
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
> 10
for: 15m
labels:
severity: critical
- alert: PromtailRequestLatency
annotations:
message: |
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
expr: |
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
for: 15m
labels:
severity: critical
- alert: PromtailFileMissing
annotations:
message: |
{{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
expr: |
promtail_file_bytes_total unless promtail_read_bytes_total
for: 15m
labels:
severity: warning