---
# PrometheusRule for Grafana Agent when it is used in place of Promtail.
#
# Groups defined below:
#   grafana_agent   - re-exports agent_build_info under the Promtail metric name.
#   promtail_rules  - recording rules for promtail_request_duration_seconds
#                     (quantiles, averages and rates) at three aggregation
#                     levels: job / job+namespace / job+status_code+namespace.
#   promtail_alerts - alerts on request error ratio, request latency and
#                     files that match a glob but are not being read.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    # User-supplied labels; the guard skips rendering when none are configured
    # so an unset value cannot emit a stray "null" into the mapping.
    {{- with $.Values.nfc_monitoring.grafana_agent.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
    # Templated values are quoted so YAML cannot re-type them (label values
    # must be strings).
    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
    app.kubernetes.io/version: {{ $.Chart.Version | quote }}
    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
    # NOTE(review): presumably matched by the Prometheus ruleSelector - confirm.
    prometheus: k8s
    role: alert-rules
  name: grafana-agent
  namespace: {{ $.Values.nfc_monitoring.prometheus.namespace | quote }}
spec:
  groups:
  - name: grafana_agent
    rules:
    # As Grafana Agent is being used, its version is recorded under Promtail's
    # build-info metric name. Kept as a comment rather than an annotation
    # because recording rules do not take annotations.
    - expr: |
        agent_build_info
      record: promtail_build_info
  - name: promtail_rules
    rules:
    # --- aggregated by job ---
    - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
        by (le, job))
      record: job:promtail_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
        by (le, job))
      record: job:promtail_request_duration_seconds:50quantile
    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) / sum(rate(promtail_request_duration_seconds_count[1m]))
        by (job)
      record: job:promtail_request_duration_seconds:avg
    - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
      record: job:promtail_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
      record: job:promtail_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
      record: job:promtail_request_duration_seconds_count:sum_rate
    # --- aggregated by job and namespace ---
    - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
        by (le, job, namespace))
      record: job_namespace:promtail_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
        by (le, job, namespace))
      record: job_namespace:promtail_request_duration_seconds:50quantile
    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
        / sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
      record: job_namespace:promtail_request_duration_seconds:avg
    - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
      record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
      record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
      record: job_namespace:promtail_request_duration_seconds_count:sum_rate
    # --- aggregated by job, status_code and namespace ---
    - expr: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[1m]))
        by (le, job, status_code, namespace))
      record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
    - expr: histogram_quantile(0.50, sum(rate(promtail_request_duration_seconds_bucket[1m]))
        by (le, job, status_code, namespace))
      record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
        namespace) / sum(rate(promtail_request_duration_seconds_count[1m])) by (job,
        status_code, namespace)
      record: job_status_code_namespace:promtail_request_duration_seconds:avg
    - expr: sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, status_code,
        namespace)
      record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
    - expr: sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code,
        namespace)
      record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
    - expr: sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code,
        namespace)
      record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate

  - name: promtail_alerts
    rules:
    # Fires when more than 10% of requests (per namespace/job/route/instance)
    # have a 5xx or "failed" status_code, sustained for 15 minutes.
    # The message escapes the opening braces so Helm passes the Prometheus
    # template through to the rendered manifest untouched.
    - alert: PromtailRequestsErrors
      annotations:
        message: |
          {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
      expr: |
        100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
          /
        sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
          > 10
      for: 15m
      labels:
        severity: critical
    # Fires when the recorded 99th percentile request duration stays above 1
    # for 15 minutes.
    # NOTE(review): the recording rule used here aggregates by
    # (le, job, status_code, namespace), so the series carries no "route"
    # label and $labels.route in the message will presumably render empty -
    # confirm and adjust the message if so.
    - alert: PromtailRequestLatency
      annotations:
        message: |
          {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
      expr: |
        job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
      for: 15m
      labels:
        severity: critical
    # Fires for series present in promtail_file_bytes_total that have no
    # matching promtail_read_bytes_total series for 15 minutes.
    - alert: PromtailFileMissing
      annotations:
        message: |
          {{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
      expr: |
        promtail_file_bytes_total unless promtail_read_bytes_total
      for: 15m
      labels:
        severity: warning