Files
kubernetes_monitoring/templates/PrometheusRule-grafana-agent.yaml
2023-09-28 09:05:01 +09:30

107 lines
5.3 KiB
YAML

---
# PrometheusRule for Grafana Agent: one compatibility recording rule, the
# promtail request-duration recording rules, and the promtail alerts.
#
# This file is a Helm template. Values taken from the chart or release are
# piped through `quote` so the rendered YAML always carries them as strings.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: grafana-agent
    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
    app.kubernetes.io/version: {{ $.Chart.Version | quote }}
    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
    # NOTE(review): presumably matched by the Prometheus ruleSelector; confirm
    # against the Prometheus resource before renaming either label.
    prometheus: k8s
    role: alert-rules
  name: grafana-agent
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace | quote }}
spec:
  groups:
    - name: grafana_agent
      rules:
        # Re-export the Grafana Agent build info under the promtail metric
        # name, so the agent's version is reported as promtail's version.
        - expr: |
            agent_build_info
          record: promtail_build_info

    - name: promtail_rules
      rules:
        # Request duration aggregated by job.
        - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job))
          record: job:promtail_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job))
          record: job:promtail_request_duration_seconds:50quantile
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
            by (job)
          record: job:promtail_request_duration_seconds:avg
        - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
          record: job:promtail_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
          record: job:promtail_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
          record: job:promtail_request_duration_seconds_count:sum_rate

        # Request duration aggregated by job and namespace.
        - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, namespace))
          record: job_namespace:promtail_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, namespace))
          record: job_namespace:promtail_request_duration_seconds:50quantile
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
            / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds:avg
        - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
          record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds_count:sum_rate

        # Request duration aggregated by job, status_code and namespace.
        - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, status_code, namespace))
          record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, status_code, namespace))
          record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
            namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
            status_code, namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds:avg
        - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
            namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
            namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
            namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate

    # In the alert messages below, the backtick-quoted opening braces make Helm
    # emit a literal opening delimiter, so the label and value placeholders are
    # left in the rendered rule instead of being evaluated by Helm.
    - name: promtail_alerts
      rules:
        # Fires when more than 10% of requests for a namespace/job/route/instance
        # have a status_code matching 5xx or "failed", sustained for 15 minutes.
        - alert: PromtailRequestsErrors
          annotations:
            message: |
              {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
              /
            sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
              > 10
          for: 15m
          labels:
            severity: critical

        # Fires when the recorded 99th percentile request duration stays above
        # 1 for 15 minutes. The recorded series is grouped by job, status_code
        # and namespace only, so the message uses those labels; there is no
        # route label on this series.
        - alert: PromtailRequestLatency
          annotations:
            message: |
              {{ `{{` }} $labels.job }} (namespace {{ `{{` }} $labels.namespace }}, status code {{ `{{` }} $labels.status_code }}) is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
          expr: |
            job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
          for: 15m
          labels:
            severity: critical

        # Fires when a promtail_file_bytes_total series has had no matching
        # promtail_read_bytes_total series for 15 minutes.
        - alert: PromtailFileMissing
          annotations:
            message: |
              {{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
          expr: |
            promtail_file_bytes_total unless promtail_read_bytes_total
          for: 15m
          labels:
            severity: warning