112
templates/PrometheusRule-loki.yaml
Normal file
112
templates/PrometheusRule-loki.yaml
Normal file
@ -0,0 +1,112 @@
|
||||
---
{{- if .Values.nfc_monitoring.loki.enabled | default false }}
# PrometheusRule for Loki, rendered only when nfc_monitoring.loki.enabled is
# truthy. It carries two groups:
#   * loki_rules  - recording rules that pre-aggregate
#                   loki_request_duration_seconds over a 1m rate window.
#   * loki_alerts - alerts on request errors, panics, latency and the number
#                   of concurrently running compactors.
#
# The "if" action trims whitespace on its left only, so the document marker
# above and "apiVersion" below stay on separate lines in the rendered output.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: logging
    app.kubernetes.io/name: loki
    # Chart/release metadata is quoted so that values which look like numbers
    # or booleans (for example a chart version of 1.0) stay YAML strings.
    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
    app.kubernetes.io/version: {{ $.Chart.Version | quote }}
    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
    prometheus: k8s
    role: alert-rules
  name: loki
  # NOTE(review): the namespace is hard-coded rather than taken from the
  # release or values - confirm this is intended.
  namespace: monitoring
spec:
  groups:
    # Recording rules. The same six series (99th percentile, 50th percentile,
    # average, and the bucket/sum/count rates) are recorded at three label
    # granularities; the record-name prefix lists the grouping labels.
    - name: loki_rules
      rules:
        # Grouped by cluster, job.
        - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, cluster, job))
          record: cluster_job:loki_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, cluster, job))
          record: cluster_job:loki_request_duration_seconds:50quantile
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
            by (cluster, job)
          record: cluster_job:loki_request_duration_seconds:avg
        - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
          record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
          record: cluster_job:loki_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
          record: cluster_job:loki_request_duration_seconds_count:sum_rate
        # Grouped by cluster, job, route.
        - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, cluster, job, route))
          record: cluster_job_route:loki_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, cluster, job, route))
          record: cluster_job_route:loki_request_duration_seconds:50quantile
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
            / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
          record: cluster_job_route:loki_request_duration_seconds:avg
        - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
            route)
          record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
          record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
          record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
        # Grouped by cluster, namespace, job, route.
        - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, cluster, namespace, job, route))
          record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
            by (le, cluster, namespace, job, route))
          record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
            job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
            namespace, job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds:avg
        - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
            job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
            job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
            job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate

    # Alerting rules. The backtick-quoted opening braces in the messages make
    # Helm emit literal template delimiters, leaving the $labels / $value
    # expressions to be expanded when the alert is evaluated, not by Helm.
    - name: loki_alerts
      rules:
        # More than 10% of requests per (namespace, job, route) returned a
        # 5xx status code over a 2m rate window, sustained for 15m.
        - alert: LokiRequestErrors
          annotations:
            message: |
              {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical
        # Any increase of loki_panic_total within 10m; no "for" clause is set.
        # NOTE(review): the expression yields an absolute increase, while the
        # message prints the value followed by a percent sign - confirm the
        # wording is intended.
        - alert: LokiRequestPanics
          annotations:
            message: |
              {{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics.
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
          labels:
            severity: critical
        # Recorded 99th percentile latency above 1 for 15m. Routes matching
        # "tail" (case-insensitive) and the scheduler QuerierLoop route are
        # excluded by the route matcher.
        - alert: LokiRequestLatency
          annotations:
            message: |
              {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
          expr: |
            cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
          for: 15m
          labels:
            severity: critical
        # Sum of loki_boltdb_shipper_compactor_running per (namespace, cluster)
        # above 1 for 5m.
        - alert: LokiTooManyCompactorsRunning
          annotations:
            message: |
              {{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
          expr: |
            sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
          for: 5m
          labels:
            severity: warning
{{- end }}
|
Reference in New Issue
Block a user