| @ -9,7 +9,7 @@ metadata: | |||||||
|     app.kubernetes.io/version: {{ $.Chart.Version }} |     app.kubernetes.io/version: {{ $.Chart.Version }} | ||||||
|     app.kubernetes.io/managed-by: {{ $.Release.Service }} |     app.kubernetes.io/managed-by: {{ $.Release.Service }} | ||||||
|     prometheus: k8s |     prometheus: k8s | ||||||
|     role: grafana-agent-promtail |     role: alert-rules | ||||||
|   name: grafana-agent |   name: grafana-agent | ||||||
|   namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} |   namespace: {{ .Values.nfc_monitoring.prometheus.namespace }} | ||||||
| spec: | spec: | ||||||
| @ -21,3 +21,87 @@ spec: | |||||||
|     - expr: | |     - expr: | | ||||||
|         agent_build_info |         agent_build_info | ||||||
|       record: promtail_build_info |       record: promtail_build_info | ||||||
|  |   # Recording rules that pre-aggregate the promtail request-duration histogram | ||||||
|  |   # (1m rate windows) at three label granularities: by job, by job+namespace, | ||||||
|  |   # and by job+status_code+namespace. Each granularity records the 99th and | ||||||
|  |   # 50th percentile, the average, and the summed bucket/sum/count rates. | ||||||
|  |   - name: promtail_rules | ||||||
|  |     rules: | ||||||
|  |     # Granularity: job | ||||||
|  |     - record: job:promtail_request_duration_seconds:99quantile | ||||||
|  |       expr: >- | ||||||
|  |         histogram_quantile(0.99, | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)) | ||||||
|  |     - record: job:promtail_request_duration_seconds:50quantile | ||||||
|  |       expr: >- | ||||||
|  |         histogram_quantile(0.50, | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)) | ||||||
|  |     - record: job:promtail_request_duration_seconds:avg | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) | ||||||
|  |         / | ||||||
|  |         sum(rate(promtail_request_duration_seconds_count[1m])) by (job) | ||||||
|  |     - record: job:promtail_request_duration_seconds_bucket:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job) | ||||||
|  |     - record: job:promtail_request_duration_seconds_sum:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_sum[1m])) by (job) | ||||||
|  |     - record: job:promtail_request_duration_seconds_count:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_count[1m])) by (job) | ||||||
|  |     # Granularity: job, namespace | ||||||
|  |     - record: job_namespace:promtail_request_duration_seconds:99quantile | ||||||
|  |       expr: >- | ||||||
|  |         histogram_quantile(0.99, | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, job, namespace)) | ||||||
|  |     - record: job_namespace:promtail_request_duration_seconds:50quantile | ||||||
|  |       expr: >- | ||||||
|  |         histogram_quantile(0.50, | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, job, namespace)) | ||||||
|  |     - record: job_namespace:promtail_request_duration_seconds:avg | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace) | ||||||
|  |         / | ||||||
|  |         sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace) | ||||||
|  |     - record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, job, namespace) | ||||||
|  |     - record: job_namespace:promtail_request_duration_seconds_sum:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_sum[1m])) | ||||||
|  |         by (job, namespace) | ||||||
|  |     - record: job_namespace:promtail_request_duration_seconds_count:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_count[1m])) | ||||||
|  |         by (job, namespace) | ||||||
|  |     # Granularity: job, status_code, namespace | ||||||
|  |     - record: job_status_code_namespace:promtail_request_duration_seconds:99quantile | ||||||
|  |       expr: >- | ||||||
|  |         histogram_quantile(0.99, | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, job, status_code, namespace)) | ||||||
|  |     - record: job_status_code_namespace:promtail_request_duration_seconds:50quantile | ||||||
|  |       expr: >- | ||||||
|  |         histogram_quantile(0.50, | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, job, status_code, namespace)) | ||||||
|  |     - record: job_status_code_namespace:promtail_request_duration_seconds:avg | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_sum[1m])) | ||||||
|  |         by (job, status_code, namespace) | ||||||
|  |         / | ||||||
|  |         sum(rate(promtail_request_duration_seconds_count[1m])) | ||||||
|  |         by (job, status_code, namespace) | ||||||
|  |     - record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_bucket[1m])) | ||||||
|  |         by (le, job, status_code, namespace) | ||||||
|  |     - record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_sum[1m])) | ||||||
|  |         by (job, status_code, namespace) | ||||||
|  |     - record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate | ||||||
|  |       expr: >- | ||||||
|  |         sum(rate(promtail_request_duration_seconds_count[1m])) | ||||||
|  |         by (job, status_code, namespace) | ||||||
|  |  | ||||||
|  |   # Alerting rules for promtail. Annotation messages are Prometheus templates; | ||||||
|  |   # the opening delimiter is emitted through a Helm raw-string action so that | ||||||
|  |   # Helm passes the Prometheus template through instead of evaluating it. | ||||||
|  |   - name: promtail_alerts | ||||||
|  |     rules: | ||||||
|  |     # Fires when more than 10% of requests (per namespace, job, route, instance) | ||||||
|  |     # ended with a 5xx or "failed" status_code for 15 minutes. | ||||||
|  |     - alert: PromtailRequestsErrors | ||||||
|  |       annotations: | ||||||
|  |         message: | | ||||||
|  |           {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors. | ||||||
|  |       expr: | | ||||||
|  |         100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) | ||||||
|  |           / | ||||||
|  |         sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) | ||||||
|  |           > 10 | ||||||
|  |       for: 15m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     # Fires when the recorded 99th percentile request latency stays above 1s | ||||||
|  |     # for 15 minutes. The recorded series is aggregated by job, status_code and | ||||||
|  |     # namespace only, so the message uses those labels; a route label is never | ||||||
|  |     # present on it and would always render empty. | ||||||
|  |     - alert: PromtailRequestLatency | ||||||
|  |       annotations: | ||||||
|  |         message: | | ||||||
|  |           {{ `{{` }} $labels.namespace }}/{{ `{{` }} $labels.job }} (status_code {{ `{{` }} $labels.status_code }}) is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency. | ||||||
|  |       expr: | | ||||||
|  |         job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1 | ||||||
|  |       for: 15m | ||||||
|  |       labels: | ||||||
|  |         severity: critical | ||||||
|  |     # Fires when a promtail_file_bytes_total series has had no matching | ||||||
|  |     # promtail_read_bytes_total series for 15 minutes. | ||||||
|  |     - alert: PromtailFileMissing | ||||||
|  |       annotations: | ||||||
|  |         message: | | ||||||
|  |           {{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed. | ||||||
|  |       expr: | | ||||||
|  |         promtail_file_bytes_total unless promtail_read_bytes_total | ||||||
|  |       for: 15m | ||||||
|  |       labels: | ||||||
|  |         severity: warning | ||||||
		Reference in New Issue
	
	Block a user