From 944d6153312703f41f5d579b3326684c0f7312a7 Mon Sep 17 00:00:00 2001
From: Jon
Date: Sat, 23 Sep 2023 23:11:38 +0930
Subject: [PATCH] fix(ceph): PromRule CephPGImbalance adjusted to group by node

balancing is done by hostname not osd.

!1
---
NOTE(review): this patch was reconstructed from a copy whose line breaks and
indentation had been collapsed. The YAML indentation of the hunk below is a
best guess - confirm it against templates/prometheusRule-ceph.yaml before
applying. The post-image hash on the "index" line is carried over from the
original patch and no longer matches the content below; regenerate the patch
with git format-patch once the change is committed.

Review changes relative to the original patch:
  * The annotations now describe a per-host alert. The expression aggregates
    with "sum by (hostname)", so the firing series no longer carry a
    ceph_daemon label and "$labels.ceph_daemon" would render empty.
  * The commented-out PromQL that was being added to the rule was dropped.
  * The six copies of the per-host sum and the "< avg*0.7 or > avg*1.3" pair
    were collapsed into abs(X / scalar(avg(X)) - 1) > 0.30, which selects the
    same hosts because every per-host sum is positive. The alert's sample
    value is now the relative deviation instead of the PG count.
  * "avg by (job)" became plain "avg": the job label is already removed by
    "sum by (hostname)", so the grouping had no effect.

 templates/prometheusRule-ceph.yaml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/templates/prometheusRule-ceph.yaml b/templates/prometheusRule-ceph.yaml
index 61c279b..e6a9a28 100644
--- a/templates/prometheusRule-ceph.yaml
+++ b/templates/prometheusRule-ceph.yaml
@@ -257,10 +257,14 @@ spec:
-            description: "OSD {{ `{{` }} $labels.ceph_daemon }} on {{ `{{` }} $labels.hostname }} deviates by more than 30% from average PG count."
-            summary: "PGs are not balanced across OSDs"
+            description: "Host {{ `{{` }} $labels.hostname }} deviates by more than 30% from the average per-host PG count."
+            summary: "PGs are not balanced across hosts"
           expr: |
-            abs(
-              ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job)) /
-              on (job) group_left avg(ceph_osd_numpg > 0) by (job)
-            ) * on (ceph_daemon) group_left(hostname) ceph_osd_metadata > 0.30
+            # Sum the PG count of every OSD that holds at least one PG per host
+            # (hostname is joined in from ceph_osd_metadata), then fire for any
+            # host whose total is more than 30% above or below the mean of those totals.
+            abs(
+              sum by (hostname) ((ceph_osd_numpg > 0) * on (ceph_daemon) group_left (hostname) ceph_osd_metadata)
+              / scalar(avg(sum by (hostname) ((ceph_osd_numpg > 0) * on (ceph_daemon) group_left (hostname) ceph_osd_metadata)))
+              - 1
+            ) > 0.30
           for: "5m"
           labels:
             oid: "1.3.6.1.4.1.50495.1.2.1.4.5"