112
									
								
								templates/PrometheusRule-loki.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										112
									
								
								templates/PrometheusRule-loki.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,112 @@ | |||||||
|  | --- | ||||||
|  | {{- if .Values.nfc_monitoring.loki.enabled | default false -}} | ||||||
|  | apiVersion: monitoring.coreos.com/v1 | ||||||
|  | kind: PrometheusRule | ||||||
|  | metadata: | ||||||
|  |   labels: | ||||||
|  |     app.kubernetes.io/component: logging | ||||||
|  |     app.kubernetes.io/name: loki | ||||||
|  |     app.kubernetes.io/part-of: {{ $.Chart.Name }} | ||||||
|  |     app.kubernetes.io/version: {{ $.Chart.Version }} | ||||||
|  |     app.kubernetes.io/managed-by: {{ $.Release.Service }} | ||||||
|  |     prometheus: k8s | ||||||
|  |     role: alert-rules | ||||||
|  |   name: loki | ||||||
|  |   namespace: monitoring | ||||||
|  | spec: | ||||||
|  |   groups: | ||||||
|  |   - name: loki_rules | ||||||
|  |     rules: | ||||||
|  |     - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, cluster, job)) | ||||||
|  |       record: cluster_job:loki_request_duration_seconds:99quantile | ||||||
|  |     - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, cluster, job)) | ||||||
|  |       record: cluster_job:loki_request_duration_seconds:50quantile | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) | ||||||
|  |         by (cluster, job) | ||||||
|  |       record: cluster_job:loki_request_duration_seconds:avg | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) | ||||||
|  |       record: cluster_job:loki_request_duration_seconds_bucket:sum_rate | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) | ||||||
|  |       record: cluster_job:loki_request_duration_seconds_sum:sum_rate | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) | ||||||
|  |       record: cluster_job:loki_request_duration_seconds_count:sum_rate | ||||||
|  |     - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, cluster, job, route)) | ||||||
|  |       record: cluster_job_route:loki_request_duration_seconds:99quantile | ||||||
|  |     - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, cluster, job, route)) | ||||||
|  |       record: cluster_job_route:loki_request_duration_seconds:50quantile | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) | ||||||
|  |         / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) | ||||||
|  |       record: cluster_job_route:loki_request_duration_seconds:avg | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, | ||||||
|  |         route) | ||||||
|  |       record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) | ||||||
|  |       record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) | ||||||
|  |       record: cluster_job_route:loki_request_duration_seconds_count:sum_rate | ||||||
|  |     - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, cluster, namespace, job, route)) | ||||||
|  |       record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile | ||||||
|  |     - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, cluster, namespace, job, route)) | ||||||
|  |       record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, | ||||||
|  |         job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, | ||||||
|  |         namespace, job, route) | ||||||
|  |       record: cluster_namespace_job_route:loki_request_duration_seconds:avg | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, | ||||||
|  |         job, route) | ||||||
|  |       record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, | ||||||
|  |         job, route) | ||||||
|  |       record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate | ||||||
|  |     - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, | ||||||
|  |         job, route) | ||||||
|  |       record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate | ||||||
|  |  | ||||||
|  |   - name: loki_alerts | ||||||
|  |     rules: | ||||||
|  |     - alert: LokiRequestErrors | ||||||
|  |       annotations: | ||||||
|  |         message: | | ||||||
|  |           {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors. | ||||||
|  |       expr: | | ||||||
|  |         100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) | ||||||
|  |           / | ||||||
|  |         sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) | ||||||
|  |           > 10 | ||||||
|  |       for: 15m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     - alert: LokiRequestPanics | ||||||
|  |       annotations: | ||||||
|  |         message: | | ||||||
|  |           {{ `{{` }} $labels.job }} is experiencing {{ `{{` }} printf "%.2f" $value }}% increase of panics. | ||||||
|  |       expr: | | ||||||
|  |         sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     - alert: LokiRequestLatency | ||||||
|  |       annotations: | ||||||
|  |         message: | | ||||||
|  |           {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency. | ||||||
|  |       expr: | | ||||||
|  |         cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 | ||||||
|  |       for: 15m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     - alert: LokiTooManyCompactorsRunning | ||||||
|  |       annotations: | ||||||
|  |         message: | | ||||||
|  |           {{ `{{` }} $labels.cluster }} {{ `{{` }} $labels.namespace }} has had {{ `{{` }} printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. | ||||||
|  |       expr: | | ||||||
|  |         sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 | ||||||
|  |       for: 5m | ||||||
|  |       labels: | ||||||
|  |         severity: warning | ||||||
|  |  | ||||||
|  | {{- end -}} | ||||||
		Reference in New Issue
	
	Block a user