# Mirror of https://github.com/nofusscomputing/kubernetes.git
# Synced 2025-08-02 04:22:42 +00:00
# Listing at sync time: 116 lines, 5.5 KiB, YAML
---
# PrometheusRule (prometheus-operator CRD) defining three rule groups:
#   * grafana_agent   - republishes the agent's build info under Promtail's
#                       metric name,
#   * promtail_rules  - recording rules over promtail_request_duration_seconds
#                       at three aggregation levels,
#   * promtail_alerts - alerts on request error ratio, request latency and
#                       files that are matched but not read.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/instance: cluster
    # NOTE(review): presumably these two labels are what the Prometheus
    # ruleSelector matches on - confirm against the Prometheus resource.
    prometheus: prometheus
    role: alert-rules
  name: grafana-agent
spec:
  groups:
    - name: grafana_agent
      rules:
        # Grafana Agent is being used, so its build info (version) is
        # recorded under Promtail's metric name. This description is kept as
        # a comment, presumably because Prometheus recording rules do not
        # accept an `annotations` field.
        - expr: |
            agent_build_info
          record: promtail_build_info

    # Recording rules. Each aggregation level (job / job+namespace /
    # job+status_code+namespace) records the same six series: p99, p50,
    # average, and the per-second rates of the histogram's _bucket, _sum and
    # _count series. All rates use a 1m window.
    - name: promtail_rules
      rules:
        # --- aggregated by job ---
        - expr: >-
            histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job))
          record: job:promtail_request_duration_seconds:99quantile
        - expr: >-
            histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job))
          record: job:promtail_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
            / sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
          record: job:promtail_request_duration_seconds:avg
        - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
          record: job:promtail_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
          record: job:promtail_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
          record: job:promtail_request_duration_seconds_count:sum_rate

        # --- aggregated by job and namespace ---
        - expr: >-
            histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, namespace))
          record: job_namespace:promtail_request_duration_seconds:99quantile
        - expr: >-
            histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, namespace))
          record: job_namespace:promtail_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
            / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds:avg
        - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
          record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds_count:sum_rate

        # --- aggregated by job, status_code and namespace ---
        - expr: >-
            histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, status_code, namespace))
          record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
        - expr: >-
            histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, status_code, namespace))
          record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
            namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
            status_code, namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds:avg
        - expr: >-
            sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
            namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
            namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
            namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate

    - name: promtail_alerts
      rules:
        # Fires when more than 10% of requests (per namespace, job, route,
        # instance) ended with a 5xx or "failed" status_code for 15 minutes.
        # NOTE(review): the message and grouping rely on the metric carrying
        # a `route` label - confirm it is actually exported.
        - alert: PromtailRequestsErrors
          annotations:
            message: |
              {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
              /
            sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
              > 10
          for: 15m
          labels:
            severity: critical
        # Fires when the recorded p99 request latency stays above 1s for 15
        # minutes. The recorded series only carries the job, status_code and
        # namespace labels, so the message references status_code; a `route`
        # label is not present on this series and would render empty.
        - alert: PromtailRequestLatency
          annotations:
            message: |
              {{ $labels.job }} (status code {{ $labels.status_code }}) is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
          expr: |
            job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
          for: 15m
          labels:
            severity: critical
        # Fires when a promtail_file_bytes_total series has had no matching
        # promtail_read_bytes_total series (same label set) for 15 minutes.
        - alert: PromtailFileMissing
          annotations:
            message: |
              {{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} matches the glob but is not being tailed.
          expr: |
            promtail_file_bytes_total unless promtail_read_bytes_total
          for: 15m
          labels:
            severity: warning