---
# PrometheusRule for Grafana Agent running in place of Promtail.
# Defines three rule groups:
#   * grafana_agent   - re-exports the agent build info under the promtail name
#   * promtail_rules  - latency quantile / average / rate recording rules at
#                       three aggregation levels (job; job+namespace;
#                       job+status_code+namespace)
#   * promtail_alerts - error-rate, latency and missing-file alerts
#
# In the alert messages, the back-tick quoted opening braces are a Helm escape:
# Helm emits them literally so that the Prometheus template expression survives
# chart rendering and is evaluated by Prometheus at alert time.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: grafana-agent
    # Templated values are quoted so they are always rendered as YAML strings.
    app.kubernetes.io/part-of: {{ $.Chart.Name | quote }}
    # "+" (SemVer build metadata) is not a valid label-value character.
    app.kubernetes.io/version: {{ $.Chart.Version | replace "+" "_" | quote }}
    app.kubernetes.io/managed-by: {{ $.Release.Service | quote }}
    prometheus: k8s
    role: alert-rules
  name: grafana-agent
  namespace: {{ .Values.nfc_monitoring.prometheus.namespace | quote }}
spec:
  groups:

    - name: grafana_agent
      rules:
        # - annotations:
        #     description: "As Grafana Agent is being used, its version is set as promtail's"
        # Records agent_build_info under the name promtail_build_info.
        - expr: |
            agent_build_info
          record: promtail_build_info

    - name: promtail_rules
      rules:
        # --- Aggregated by job ---
        - expr: >-
            histogram_quantile(0.99,
            sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job))
          record: job:promtail_request_duration_seconds:99quantile
        - expr: >-
            histogram_quantile(0.50,
            sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job))
          record: job:promtail_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
            /
            sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
          record: job:promtail_request_duration_seconds:avg
        - expr: >-
            sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job)
          record: job:promtail_request_duration_seconds_bucket:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job)
          record: job:promtail_request_duration_seconds_sum:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_count[1m])) by (job)
          record: job:promtail_request_duration_seconds_count:sum_rate

        # --- Aggregated by job and namespace ---
        - expr: >-
            histogram_quantile(0.99,
            sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace))
          record: job_namespace:promtail_request_duration_seconds:99quantile
        - expr: >-
            histogram_quantile(0.50,
            sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace))
          record: job_namespace:promtail_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
            /
            sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds:avg
        - expr: >-
            sum(rate(promtail_request_duration_seconds_bucket[1m])) by (le, job, namespace)
          record: job_namespace:promtail_request_duration_seconds_bucket:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds_sum:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_count[1m])) by (job, namespace)
          record: job_namespace:promtail_request_duration_seconds_count:sum_rate

        # --- Aggregated by job, status_code and namespace ---
        - expr: >-
            histogram_quantile(0.99,
            sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, status_code, namespace))
          record: job_status_code_namespace:promtail_request_duration_seconds:99quantile
        - expr: >-
            histogram_quantile(0.50,
            sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, status_code, namespace))
          record: job_status_code_namespace:promtail_request_duration_seconds:50quantile
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, namespace)
            /
            sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code, namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds:avg
        - expr: >-
            sum(rate(promtail_request_duration_seconds_bucket[1m]))
            by (le, job, status_code, namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_bucket:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_sum[1m])) by (job, status_code, namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_sum:sum_rate
        - expr: >-
            sum(rate(promtail_request_duration_seconds_count[1m])) by (job, status_code, namespace)
          record: job_status_code_namespace:promtail_request_duration_seconds_count:sum_rate

    - name: promtail_alerts
      rules:
        # Fires when more than 10% of requests (per namespace/job/route/instance)
        # ended with a 5xx or "failed" status for 15 minutes.
        - alert: PromtailRequestsErrors
          annotations:
            message: |
              {{ `{{` }} $labels.job }} {{ `{{` }} $labels.route }} is experiencing {{ `{{` }} printf "%.2f" $value }}% errors.
          expr: |
            100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance)
              /
            sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance)
              > 10
          for: 15m
          labels:
            severity: critical

        # Fires when the recorded 99th percentile request latency stays above
        # 1 second for 15 minutes. The recorded series only carries the job,
        # status_code and namespace labels, so the message uses exactly those.
        - alert: PromtailRequestLatency
          annotations:
            message: |
              {{ `{{` }} $labels.job }} in {{ `{{` }} $labels.namespace }} (status code {{ `{{` }} $labels.status_code }}) is experiencing {{ `{{` }} printf "%.2f" $value }}s 99th percentile latency.
          expr: |
            job_status_code_namespace:promtail_request_duration_seconds:99quantile > 1
          for: 15m
          labels:
            severity: critical

        # Fires when a promtail_file_bytes_total series has had no matching
        # promtail_read_bytes_total series for 15 minutes.
        - alert: PromtailFileMissing
          annotations:
            message: |
              {{ `{{` }} $labels.instance }} {{ `{{` }} $labels.job }} {{ `{{` }} $labels.path }} matches the glob but is not being tailed.
          expr: |
            promtail_file_bytes_total unless promtail_read_bytes_total
          for: 15m
          labels:
            severity: warning