@ -9,7 +9,7 @@ metadata:
|
||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||
prometheus: k8s
|
||||
role: grafana-agent-promtail
|
||||
role: alert-rules
|
||||
name: grafana-agent
|
||||
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||
spec:
|
||||
@ -21,3 +21,87 @@ spec:
|
||||
- expr: |
|
||||
agent_build_info
|
||||
record: promtail_build_info
|
||||
- name: promtail_rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:promtail_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job))
|
||||
record: job:promtail_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
|
||||
by (job)
|
||||
record: job:promtail_request_duration_seconds:avg
|
||||
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
|
||||
record: job:promtail_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
|
||||
record: job:promtail_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
|
||||
record: job:promtail_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, namespace))
|
||||
record: job_namespace:promtail_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, namespace))
|
||||
record: job_namespace:promtail_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
|
||||
/ sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds:avg
|
||||
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
|
||||
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
|
||||
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, status_code, namespace))
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
|
||||
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||
by (le, job, status_code, namespace))
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
|
||||
namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
|
||||
status_code, namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds:avg
|
||||
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
|
||||
namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
|
||||
namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
|
||||
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
|
||||
namespace)
|
||||
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
|
||||
|
||||
- name: promtail_alerts
|
||||
rules:
|
||||
- alert: PromtailRequestsErrors
|
||||
annotations:
|
||||
message: |
|
||||
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
|
||||
expr: |
|
||||
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
|
||||
/
|
||||
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
|
||||
> 10
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PromtailRequestLatency
|
||||
annotations:
|
||||
message: |
|
||||
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
|
||||
expr: |
|
||||
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: PromtailFileMissing
|
||||
annotations:
|
||||
message: |
|
||||
{{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
|
||||
expr: |
|
||||
promtail_file_bytes_total unless promtail_read_bytes_total
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
Reference in New Issue
Block a user