---
{{- /*
  PrometheusRule carrying the Loki recording rules and alerts.

  The resource is rendered only when .Values.nfc_monitoring.loki.enabled is
  truthy. The guard and its closing action trim whitespace on the left only,
  so the document marker above stays on its own line and is never fused with
  the first key of the manifest.

  Alert messages contain Prometheus template expressions. Their opening
  delimiter is emitted through a raw-string action so that Helm passes the
  expression through to the rendered manifest instead of evaluating it.
*/}}
{{- if .Values.nfc_monitoring.loki.enabled | default false }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: logging
    app.kubernetes.io/name: loki
    # Templated values are quoted so they always reach the API as strings.
    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
    app.kubernetes.io/version: {{ $.Chart.Version | quote }}
    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
    # NOTE(review): presumably matched by the Prometheus ruleSelector; confirm
    # against the Prometheus resource before changing these two labels.
    prometheus: k8s
    role: alert-rules
  name: loki
  namespace: monitoring
spec:
  groups:
    # Pre-aggregated request-duration series (1m rate window).
    - name: loki_rules
      rules:
        # Aggregated by cluster and job.
        - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job))
          record: cluster_job:loki_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job))
          record: cluster_job:loki_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
            /
            sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
          record: cluster_job:loki_request_duration_seconds:avg
        - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
          record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
          record: cluster_job:loki_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
          record: cluster_job:loki_request_duration_seconds_count:sum_rate

        # Aggregated by cluster, job and route.
        - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
          record: cluster_job_route:loki_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route))
          record: cluster_job_route:loki_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
            /
            sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
          record: cluster_job_route:loki_request_duration_seconds:avg
        - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)
          record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
          record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
          record: cluster_job_route:loki_request_duration_seconds_count:sum_rate

        # Aggregated by cluster, namespace, job and route. The 99quantile
        # series of this set is the input of the LokiRequestLatency alert.
        - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
          record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))
          record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
            /
            sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds:avg
        - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)
          record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate

    - name: loki_alerts
      rules:
        # Share of requests answered with a 5xx status code, in percent,
        # per namespace, job and route; fires above 10 for 15m.
        - alert: LokiRequestErrors
          annotations:
            message: |
              {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical

        # Any growth of loki_panic_total over 10m; there is no "for" clause,
        # so the alert fires on the first evaluation that matches.
        # NOTE(review): the expression yields an absolute count, while the
        # message formats the value with a percent sign; confirm the wording.
        - alert: LokiRequestPanics
          annotations:
            message: |
              {{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics.
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
          labels:
            severity: critical

        # Recorded 99th percentile latency above 1 for 15m. Routes matching
        # the regex below (tail routes and the querier loop) are excluded.
        - alert: LokiRequestLatency
          annotations:
            message: |
              {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
          expr: |
            cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
          for: 15m
          labels:
            severity: critical

        # More than one compactor reported as running in the same namespace
        # and cluster for 5m.
        - alert: LokiTooManyCompactorsRunning
          annotations:
            message: |
              {{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
          expr: |
            sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
          for: 5m
          labels:
            severity: warning
{{- end }}