Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

blk-iocost: fix busy_level reset when no IOs complete

When a disk is saturated, it is common for no IOs to complete within a
timer period. Currently, in this case, rq_wait_pct and missed_ppm are
calculated as 0, so iocost incorrectly interprets this as meeting QoS
targets and resets busy_level to 0.

This reset prevents busy_level from reaching the threshold (4) needed
to reduce vrate. On certain cloud storage, such as Azure Premium SSD,
we observed that iocost may fail to reduce vrate for tens of seconds
during saturation, failing to mitigate noisy neighbor issues.

Fix this by tracking the number of IO completions (nr_done) in a period.
If nr_done is 0 and there are lagging IOs, the saturation status is
unknown, so we keep busy_level unchanged.

The issue is consistently reproducible on Azure Standard_D8as_v5 (Dasv5)
VMs with 512GB Premium SSD (P20) using the script below. It was not
observed on GCP n2d VMs (with 100G pd-ssd and 1.5T local-ssd), and no
regressions were found with this patch. In this script, cgA performs
large IOs with iodepth=128, while cgB performs small IOs with iodepth=1
rate_iops=100 rw=randrw. With iocost enabled, we expect it to throttle
cgA, the submission latency (slat) of cgA should be significantly higher,
cgB can reach 200 IOPS and the completion latency (clat) should be low.

# Target block device as major:minor (8:0 is typically /dev/sda).
BLK_DEVID="8:0"
# Linear iocost cost-model coefficients measured for the device under test.
MODEL="rbps=173471131 rseqiops=3566 rrandiops=3566 wbps=173333269 wseqiops=3566 wrandiops=3566"
# QoS targets: 90th percentile read/write latency of 3.5ms; vrate is
# allowed to scale between 80% and 10000%.
QOS="rpct=90 rlat=3500 wpct=90 wlat=3500 min=80 max=10000"

# Program the iocost model and QoS knobs via cgroup2 (requires root and
# a cgroup2-mounted /sys/fs/cgroup).
echo "$BLK_DEVID ctrl=user model=linear $MODEL" > /sys/fs/cgroup/io.cost.model
echo "$BLK_DEVID enable=1 ctrl=user $QOS" > /sys/fs/cgroup/io.cost.qos

# cgA: heavy background workload; cgB: latency-sensitive workload.
CG_A="/sys/fs/cgroup/cgA"
CG_B="/sys/fs/cgroup/cgB"

# Pre-created test files on the device identified by BLK_DEVID above.
FILE_A="/path/to/sda/A.fio.testfile"
FILE_B="/path/to/sda/B.fio.testfile"
# Timestamped directory for the per-run fio JSON outputs.
RESULT_DIR="./iocost_results_$(date +%Y%m%d_%H%M%S)"

mkdir -p "$CG_A" "$CG_B" "$RESULT_DIR"

#######################################
# Parse a fio JSON output file and print one formatted summary row.
# Arguments:
#   $1 - path to the fio JSON output file
#   $2 - row label (e.g. "cgA-1m")
# Outputs:
#   One aligned row (label, IOPS, MB/s, slat, clat stats) to stdout.
# Returns:
#   0 on success; jq's exit status if parsing fails.
#######################################
get_result() {
  local file=$1
  local label=$2
  local results

  # Declare and assign separately so a jq failure is not masked by
  # 'local' (SC2155); bail out early on parse errors. All latencies are
  # converted ns -> ms, bandwidth bytes -> MiB/s, in a single jq pass.
  results=$(jq -r '
  .jobs[0].mixed |
  ( .iops | tonumber | round ) as $iops |
  ( .bw_bytes / 1024 / 1024 ) as $bps |
  ( .slat_ns.mean / 1000000 ) as $slat |
  ( .clat_ns.mean / 1000000 ) as $avg |
  ( .clat_ns.max / 1000000 ) as $max |
  ( .clat_ns.percentile["90.000000"] / 1000000 ) as $p90 |
  ( .clat_ns.percentile["99.000000"] / 1000000 ) as $p99 |
  ( .clat_ns.percentile["99.900000"] / 1000000 ) as $p999 |
  ( .clat_ns.percentile["99.990000"] / 1000000 ) as $p9999 |
  "\($iops)|\($bps)|\($slat)|\($avg)|\($max)|\($p90)|\($p99)|\($p999)|\($p9999)"
  ' "$file") || return

  # Keep the parsed fields function-local instead of leaking globals.
  local iops bps slat avg max p90 p99 p999 p9999
  IFS='|' read -r iops bps slat avg max p90 p99 p999 p9999 <<<"$results"
  printf "%-8s %-6s %-7.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f\n" \
  "$label" "$iops" "$bps" "$slat" "$avg" "$max" "$p90" "$p99" "$p999" "$p9999"
}

#######################################
# Launch a fio job in the background inside the given cgroup.
# Arguments:
#   $1 - cgroup directory (its cgroup.procs receives the worker PID)
#   $2 - target file for fio
#   $3 - fio job name
#   $4 - block size (e.g. "4k", "1m")
#   $5 - iodepth (queue depth)
#   $6 - JSON output path
#   $7+ - optional extra fio options (e.g. "--rate_iops=100")
# Outputs:
#   fio writes JSON results to $6; fio's own stdout/stderr are discarded.
#######################################
run_fio() {
  local cg_path=$1
  local filename=$2
  local name=$3
  local bs=$4
  local qd=$5
  local out=$6
  shift 6
  # Keep extra options as an array so each caller-supplied word stays a
  # single argument (SC2124 — "local extra=$@" would flatten and re-split).
  local -a extra=("$@")

  (
  # Move this subshell into the target cgroup before exec'ing fio.
  # $BASHPID is the subshell's own PID ($$ would be the parent script's),
  # replacing the old pid=$(sh -c 'echo $PPID') fork dance.
  echo "$BASHPID" >"${cg_path}/cgroup.procs"
  fio --name="$name" --filename="$filename" --direct=1 --rw=randrw --rwmixread=50 \
  --ioengine=libaio --bs="$bs" --iodepth="$qd" --size=4G --runtime=10 \
  --time_based --group_reporting --unified_rw_reporting=mixed \
  --output-format=json --output="$out" "${extra[@]}" >/dev/null 2>&1
  ) &
}

echo "Starting Test ..."

# Initialize explicitly so an inherited environment variable of the same
# name cannot pollute the summary (+= would silently append to it).
SUMMARY_DATA=""

# One iteration per cgB block size; cgA always runs 1MB blocks.
for bs_b in "4k" "32k" "256k"; do
echo "Running iteration: BS=$bs_b"
out_a="${RESULT_DIR}/cgA_1m.json"
out_b="${RESULT_DIR}/cgB_${bs_b}.json"

# cgA: Heavy background (BS 1MB, QD 128)
run_fio "$CG_A" "$FILE_A" "cgA" "1m" 128 "$out_a"
# cgB: Latency sensitive (Variable BS, QD 1, Read/Write IOPS limit 100)
run_fio "$CG_B" "$FILE_B" "cgB" "$bs_b" 1 "$out_b" "--rate_iops=100"

# Barrier: both fio jobs are backgrounded; wait for them before parsing.
wait
SUMMARY_DATA+="$(get_result "$out_a" "cgA-1m")"$'\n'
SUMMARY_DATA+="$(get_result "$out_b" "cgB-$bs_b")"$'\n\n'
done

# printf instead of non-portable 'echo -e' for the escape sequences.
printf '\nFinal Results Summary:\n\n'

printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n" \
"" "" "" "slat" "clat" "clat" "clat" "clat" "clat" "clat"
printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n\n" \
"CGROUP" "IOPS" "MB/s" "avg(ms)" "avg(ms)" "max(ms)" "P90(ms)" "P99" "P99.9" "P99.99"
echo "$SUMMARY_DATA"

echo "Results saved in $RESULT_DIR"

Before:
slat clat clat clat clat clat clat
CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99

cgA-1m 166 166.37 3.44 748.95 1298.29 977.27 1233.13 1300.23 1300.23
cgB-4k 5 0.02 0.02 181.74 761.32 742.39 759.17 759.17 759.17

cgA-1m 167 166.51 1.98 748.68 1549.41 809.50 1451.23 1551.89 1551.89
cgB-32k 6 0.18 0.02 169.98 761.76 742.39 759.17 759.17 759.17

cgA-1m 166 165.55 2.89 750.89 1540.37 851.44 1451.23 1535.12 1535.12
cgB-256k 5 1.30 0.02 191.35 759.51 750.78 759.17 759.17 759.17

After:
slat clat clat clat clat clat clat
CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99

cgA-1m 162 162.48 6.14 749.69 850.02 826.28 834.67 843.06 851.44
cgB-4k 199 0.78 0.01 1.95 42.12 2.57 7.50 34.87 42.21

cgA-1m 146 146.20 6.83 833.04 908.68 893.39 901.78 910.16 910.16
cgB-32k 200 6.25 0.01 2.32 31.40 3.06 7.50 16.58 31.33

cgA-1m 110 110.46 9.04 1082.67 1197.91 1182.79 1199.57 1199.57 1199.57
cgB-256k 200 49.98 0.02 3.69 22.20 4.88 9.11 20.05 22.15

Signed-off-by: Jialin Wang <wjl.linux@gmail.com>
Acked-by: Tejun Heo <tj@kernel.org>
Link: https://patch.msgid.link/20260331100509.182882-1-wjl.linux@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Jialin Wang and committed by
Jens Axboe
f91ffe89 23308af7

+17 -6
+17 -6
block/blk-iocost.c
··· 1596 1596 return HRTIMER_NORESTART; 1597 1597 } 1598 1598 1599 - static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) 1599 + static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p, 1600 + u32 *nr_done) 1600 1601 { 1601 1602 u32 nr_met[2] = { }; 1602 1603 u32 nr_missed[2] = { }; ··· 1634 1633 1635 1634 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, 1636 1635 ioc->period_us * NSEC_PER_USEC); 1636 + 1637 + *nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE]; 1637 1638 } 1638 1639 1639 1640 /* was iocg idle this period? */ ··· 2253 2250 u64 usage_us_sum = 0; 2254 2251 u32 ppm_rthr; 2255 2252 u32 ppm_wthr; 2256 - u32 missed_ppm[2], rq_wait_pct; 2253 + u32 missed_ppm[2], rq_wait_pct, nr_done; 2257 2254 u64 period_vtime; 2258 2255 int prev_busy_level; 2259 2256 2260 2257 /* how were the latencies during the period? */ 2261 - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); 2258 + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done); 2262 2259 2263 2260 /* take care of active iocgs */ 2264 2261 spin_lock_irq(&ioc->lock); ··· 2400 2397 * and should increase vtime rate. 2401 2398 */ 2402 2399 prev_busy_level = ioc->busy_level; 2403 - if (rq_wait_pct > RQ_WAIT_BUSY_PCT || 2404 - missed_ppm[READ] > ppm_rthr || 2405 - missed_ppm[WRITE] > ppm_wthr) { 2400 + if (!nr_done && nr_lagging) { 2401 + /* 2402 + * When there are lagging IOs but no completions, we don't 2403 + * know if the IO latency will meet the QoS targets. The 2404 + * disk might be saturated or not. We should not reset 2405 + * busy_level to 0 (which would prevent vrate from scaling 2406 + * up or down), but rather to keep it unchanged. 2407 + */ 2408 + } else if (rq_wait_pct > RQ_WAIT_BUSY_PCT || 2409 + missed_ppm[READ] > ppm_rthr || 2410 + missed_ppm[WRITE] > ppm_wthr) { 2406 2411 /* clearly missing QoS targets, slow down vrate */ 2407 2412 ioc->busy_level = max(ioc->busy_level, 0); 2408 2413 ioc->busy_level++;