@ -9,7 +9,7 @@ metadata:
|
|||||||
app.kubernetes.io/version: {{ $.Chart.Version }}
|
app.kubernetes.io/version: {{ $.Chart.Version }}
|
||||||
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
app.kubernetes.io/managed-by: {{ $.Release.Service }}
|
||||||
prometheus: k8s
|
prometheus: k8s
|
||||||
role: grafana-agent-promtail
|
role: alert-rules
|
||||||
name: grafana-agent
|
name: grafana-agent
|
||||||
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
namespace: {{ .Values.nfc_monitoring.prometheus.namespace }}
|
||||||
spec:
|
spec:
|
||||||
@ -21,3 +21,87 @@ spec:
|
|||||||
- expr: |
|
- expr: |
|
||||||
agent_build_info
|
agent_build_info
|
||||||
record: promtail_build_info
|
record: promtail_build_info
|
||||||
|
- name: promtail_rules
|
||||||
|
rules:
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, job))
|
||||||
|
record: job:promtail_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, job))
|
||||||
|
record: job:promtail_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
|
||||||
|
by (job)
|
||||||
|
record: job:promtail_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
|
||||||
|
record: job:promtail_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
|
||||||
|
record: job:promtail_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
|
||||||
|
record: job:promtail_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, job, namespace))
|
||||||
|
record: job_namespace:promtail_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, job, namespace))
|
||||||
|
record: job_namespace:promtail_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
|
||||||
|
/ sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
|
||||||
|
record: job_namespace:promtail_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
|
||||||
|
record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
|
||||||
|
record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
|
||||||
|
record: job_namespace:promtail_request_duration_seconds_count:sum_rate
|
||||||
|
- expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, job, status_code, namespace))
|
||||||
|
record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
|
||||||
|
- expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
|
||||||
|
by (le, job, status_code, namespace))
|
||||||
|
record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
|
||||||
|
namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
|
||||||
|
status_code, namespace)
|
||||||
|
record: job_status_code_namespace:promtail_request_duration_seconds:avg
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
|
||||||
|
namespace)
|
||||||
|
record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
|
||||||
|
namespace)
|
||||||
|
record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
|
||||||
|
- expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
|
||||||
|
namespace)
|
||||||
|
record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate
|
||||||
|
|
||||||
|
- name: promtail_alerts
|
||||||
|
rules:
|
||||||
|
- alert: PromtailRequestsErrors
|
||||||
|
annotations:
|
||||||
|
message: |
|
||||||
|
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
|
||||||
|
expr: |
|
||||||
|
100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
|
||||||
|
/
|
||||||
|
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
|
||||||
|
> 10
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: PromtailRequestLatency
|
||||||
|
annotations:
|
||||||
|
message: |
|
||||||
|
{{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
|
||||||
|
expr: |
|
||||||
|
job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: PromtailFileMissing
|
||||||
|
annotations:
|
||||||
|
message: |
|
||||||
|
{{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
|
||||||
|
expr: |
|
||||||
|
promtail_file_bytes_total unless promtail_read_bytes_total
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
Reference in New Issue
Block a user