116 lines
		
	
	
		
			5.7 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
			
		
		
	
	
			116 lines
		
	
	
		
			5.7 KiB
		
	
	
	
		
			YAML
		
	
	
	
	
	
---
{{- /*
  PrometheusRule "loki": recording rules and alerts for Loki request metrics.

  Rendered only when BOTH of the following hold:
    - .Values.nfc_monitoring.loki.enabled is truthy (falls back to false)
    - .Values.nfc_monitoring.loki.config is set

  Templated metadata scalars are piped through `quote` so that values that
  look like numbers or booleans (for example a release named "123") are still
  emitted as YAML strings, which is what Kubernetes requires for labels.
*/ -}}
{{- if .Values.nfc_monitoring.loki.enabled | default false -}}
{{ if .Values.nfc_monitoring.loki.config }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: logging
    app.kubernetes.io/instance: {{ $.Release.Name | quote }}
    app.kubernetes.io/name: loki
    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
    app.kubernetes.io/version: {{ $.Chart.Version | quote }}
    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
    # NOTE(review): `prometheus` and `role` are presumably the labels the
    # Prometheus rule selector matches on - confirm against the Prometheus CR.
    prometheus: {{ $.Release.Name | quote }}
    role: alert-rules
  name: loki
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace | quote }}
spec:
  groups:
  # Recording rules over loki_request_duration_seconds, all using a 1m rate
  # window. For each label set there are six series: 99th and 50th percentile,
  # average (sum / count), and the raw sum-rates of bucket, sum and count.
  - name: loki_rules
    rules:
    # --- aggregated by (cluster, job) ---
    - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, cluster, job))
      record: cluster_job:loki_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, cluster, job))
      record: cluster_job:loki_request_duration_seconds:50quantile
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m]))
        by (cluster, job)
      record: cluster_job:loki_request_duration_seconds:avg
    - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)
      record: cluster_job:loki_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)
      record: cluster_job:loki_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)
      record: cluster_job:loki_request_duration_seconds_count:sum_rate
    # --- aggregated by (cluster, job, route) ---
    - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, cluster, job, route))
      record: cluster_job_route:loki_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, cluster, job, route))
      record: cluster_job_route:loki_request_duration_seconds:50quantile
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
        / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
      record: cluster_job_route:loki_request_duration_seconds:avg
    - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job,
        route)
      record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)
      record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)
      record: cluster_job_route:loki_request_duration_seconds_count:sum_rate
    # --- aggregated by (cluster, namespace, job, route) ---
    - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, cluster, namespace, job, route))
      record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
        by (le, cluster, namespace, job, route))
      record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
        job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster,
        namespace, job, route)
      record: cluster_namespace_job_route:loki_request_duration_seconds:avg
    - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace,
        job, route)
      record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace,
        job, route)
      record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace,
        job, route)
      record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate

  # Alerting rules. In every `message`, the opening delimiter of each alert
  # template action is written as a backtick-quoted string inside a Helm
  # action, so Helm prints it literally and the $labels / $value actions are
  # left for Prometheus to expand when the alert fires.
  - name: loki_alerts
    rules:
    # Fires when requests with a 5xx status_code exceed 10% of all requests
    # (2m rate, per namespace/job/route) continuously for 15m.
    - alert: LokiRequestErrors
      annotations:
        message: |
          {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
      expr: |
        100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
          /
        sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
          > 10
      for: 15m
      labels:
        severity: critical
    # Fires as soon as loki_panic_total increased within the last 10m for a
    # namespace/job; there is no `for` clause on this rule.
    # NOTE(review): the message formats $value with a "%" suffix, but the
    # expression yields an increase count rather than a percentage - confirm
    # the intended wording.
    - alert: LokiRequestPanics
      annotations:
        message: |
          {{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics.
      expr: |
        sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
      labels:
        severity: critical
    # Fires when the recorded 99th percentile request duration stays above 1
    # for 15m. Routes matching "tail" (case-insensitive) and the scheduler
    # QuerierLoop route are excluded by the route matcher.
    - alert: LokiRequestLatency
      annotations:
        message: |
          {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
      expr: |
        cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
      for: 15m
      labels:
        severity: critical
    # Fires when more than one compactor reports as running in the same
    # namespace/cluster for 5m.
    - alert: LokiTooManyCompactorsRunning
      annotations:
        message: |
          {{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
      expr: |
        sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
      for: 5m
      labels:
        severity: warning

{{ end }}
{{- end -}}