Skip to content

Commit f91ffe8

Browse files
hyuuko authored and axboe committed
blk-iocost: fix busy_level reset when no IOs complete
When a disk is saturated, it is common for no IOs to complete within a timer period. Currently, in this case, rq_wait_pct and missed_ppm are calculated as 0, the iocost incorrectly interprets this as meeting QoS targets and resets busy_level to 0. This reset prevents busy_level from reaching the threshold (4) needed to reduce vrate. On certain cloud storage, such as Azure Premium SSD, we observed that iocost may fail to reduce vrate for tens of seconds during saturation, failing to mitigate noisy neighbor issues. Fix this by tracking the number of IO completions (nr_done) in a period. If nr_done is 0 and there are lagging IOs, the saturation status is unknown, so we keep busy_level unchanged. The issue is consistently reproducible on Azure Standard_D8as_v5 (Dasv5) VMs with 512GB Premium SSD (P20) using the script below. It was not observed on GCP n2d VMs (with 100G pd-ssd and 1.5T local-ssd), and no regressions were found with this patch. In this script, cgA performs large IOs with iodepth=128, while cgB performs small IOs with iodepth=1 rate_iops=100 rw=randrw. With iocost enabled, we expect it to throttle cgA, the submission latency (slat) of cgA should be significantly higher, cgB can reach 200 IOPS and the completion latency (clat) should below. 
BLK_DEVID="8:0" MODEL="rbps=173471131 rseqiops=3566 rrandiops=3566 wbps=173333269 wseqiops=3566 wrandiops=3566" QOS="rpct=90 rlat=3500 wpct=90 wlat=3500 min=80 max=10000" echo "$BLK_DEVID ctrl=user model=linear $MODEL" > /sys/fs/cgroup/io.cost.model echo "$BLK_DEVID enable=1 ctrl=user $QOS" > /sys/fs/cgroup/io.cost.qos CG_A="/sys/fs/cgroup/cgA" CG_B="/sys/fs/cgroup/cgB" FILE_A="/path/to/sda/A.fio.testfile" FILE_B="/path/to/sda/B.fio.testfile" RESULT_DIR="./iocost_results_$(date +%Y%m%d_%H%M%S)" mkdir -p "$CG_A" "$CG_B" "$RESULT_DIR" get_result() { local file=$1 local label=$2 local results=$(jq -r ' .jobs[0].mixed | ( .iops | tonumber | round ) as $iops | ( .bw_bytes / 1024 / 1024 ) as $bps | ( .slat_ns.mean / 1000000 ) as $slat | ( .clat_ns.mean / 1000000 ) as $avg | ( .clat_ns.max / 1000000 ) as $max | ( .clat_ns.percentile["90.000000"] / 1000000 ) as $p90 | ( .clat_ns.percentile["99.000000"] / 1000000 ) as $p99 | ( .clat_ns.percentile["99.900000"] / 1000000 ) as $p999 | ( .clat_ns.percentile["99.990000"] / 1000000 ) as $p9999 | "\($iops)|\($bps)|\($slat)|\($avg)|\($max)|\($p90)|\($p99)|\($p999)|\($p9999)" ' "$file") IFS='|' read -r iops bps slat avg max p90 p99 p999 p9999 <<<"$results" printf "%-8s %-6s %-7.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f\n" \ "$label" "$iops" "$bps" "$slat" "$avg" "$max" "$p90" "$p99" "$p999" "$p9999" } run_fio() { local cg_path=$1 local filename=$2 local name=$3 local bs=$4 local qd=$5 local out=$6 shift 6 local extra=$@ ( pid=$(sh -c 'echo $PPID') echo $pid >"${cg_path}/cgroup.procs" fio --name="$name" --filename="$filename" --direct=1 --rw=randrw --rwmixread=50 \ --ioengine=libaio --bs="$bs" --iodepth="$qd" --size=4G --runtime=10 \ --time_based --group_reporting --unified_rw_reporting=mixed \ --output-format=json --output="$out" $extra >/dev/null 2>&1 ) & } echo "Starting Test ..." 
for bs_b in "4k" "32k" "256k"; do echo "Running iteration: BS=$bs_b" out_a="${RESULT_DIR}/cgA_1m.json" out_b="${RESULT_DIR}/cgB_${bs_b}.json" # cgA: Heavy background (BS 1MB, QD 128) run_fio "$CG_A" "$FILE_A" "cgA" "1m" 128 "$out_a" # cgB: Latency sensitive (Variable BS, QD 1, Read/Write IOPS limit 100) run_fio "$CG_B" "$FILE_B" "cgB" "$bs_b" 1 "$out_b" "--rate_iops=100" wait SUMMARY_DATA+="$(get_result "$out_a" "cgA-1m")"$'\n' SUMMARY_DATA+="$(get_result "$out_b" "cgB-$bs_b")"$'\n\n' done echo -e "\nFinal Results Summary:\n" printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n" \ "" "" "" "slat" "clat" "clat" "clat" "clat" "clat" "clat" printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n\n" \ "CGROUP" "IOPS" "MB/s" "avg(ms)" "avg(ms)" "max(ms)" "P90(ms)" "P99" "P99.9" "P99.99" echo "$SUMMARY_DATA" echo "Results saved in $RESULT_DIR" Before: slat clat clat clat clat clat clat CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99 cgA-1m 166 166.37 3.44 748.95 1298.29 977.27 1233.13 1300.23 1300.23 cgB-4k 5 0.02 0.02 181.74 761.32 742.39 759.17 759.17 759.17 cgA-1m 167 166.51 1.98 748.68 1549.41 809.50 1451.23 1551.89 1551.89 cgB-32k 6 0.18 0.02 169.98 761.76 742.39 759.17 759.17 759.17 cgA-1m 166 165.55 2.89 750.89 1540.37 851.44 1451.23 1535.12 1535.12 cgB-256k 5 1.30 0.02 191.35 759.51 750.78 759.17 759.17 759.17 After: slat clat clat clat clat clat clat CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99 cgA-1m 162 162.48 6.14 749.69 850.02 826.28 834.67 843.06 851.44 cgB-4k 199 0.78 0.01 1.95 42.12 2.57 7.50 34.87 42.21 cgA-1m 146 146.20 6.83 833.04 908.68 893.39 901.78 910.16 910.16 cgB-32k 200 6.25 0.01 2.32 31.40 3.06 7.50 16.58 31.33 cgA-1m 110 110.46 9.04 1082.67 1197.91 1182.79 1199.57 1199.57 1199.57 cgB-256k 200 49.98 0.02 3.69 22.20 4.88 9.11 20.05 22.15 Signed-off-by: Jialin Wang <wjl.linux@gmail.com> Acked-by: Tejun Heo <tj@kernel.org> Link: https://patch.msgid.link/20260331100509.182882-1-wjl.linux@gmail.com 
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 23308af commit f91ffe8

1 file changed

Lines changed: 17 additions & 6 deletions

File tree

block/blk-iocost.c

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
15961596
return HRTIMER_NORESTART;
15971597
}
15981598

1599-
static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1599+
static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p,
1600+
u32 *nr_done)
16001601
{
16011602
u32 nr_met[2] = { };
16021603
u32 nr_missed[2] = { };
@@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
16331634

16341635
*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
16351636
ioc->period_us * NSEC_PER_USEC);
1637+
1638+
*nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE];
16361639
}
16371640

16381641
/* was iocg idle this period? */
@@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer)
22502253
u64 usage_us_sum = 0;
22512254
u32 ppm_rthr;
22522255
u32 ppm_wthr;
2253-
u32 missed_ppm[2], rq_wait_pct;
2256+
u32 missed_ppm[2], rq_wait_pct, nr_done;
22542257
u64 period_vtime;
22552258
int prev_busy_level;
22562259

22572260
/* how were the latencies during the period? */
2258-
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2261+
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done);
22592262

22602263
/* take care of active iocgs */
22612264
spin_lock_irq(&ioc->lock);
@@ -2397,9 +2400,17 @@ static void ioc_timer_fn(struct timer_list *timer)
23972400
* and should increase vtime rate.
23982401
*/
23992402
prev_busy_level = ioc->busy_level;
2400-
if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2401-
missed_ppm[READ] > ppm_rthr ||
2402-
missed_ppm[WRITE] > ppm_wthr) {
2403+
if (!nr_done && nr_lagging) {
2404+
/*
2405+
* When there are lagging IOs but no completions, we don't
2406+
* know if the IO latency will meet the QoS targets. The
2407+
* disk might be saturated or not. We should not reset
2408+
* busy_level to 0 (which would prevent vrate from scaling
2409+
* up or down), but rather to keep it unchanged.
2410+
*/
2411+
} else if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2412+
missed_ppm[READ] > ppm_rthr ||
2413+
missed_ppm[WRITE] > ppm_wthr) {
24032414
/* clearly missing QoS targets, slow down vrate */
24042415
ioc->busy_level = max(ioc->busy_level, 0);
24052416
ioc->busy_level++;

0 commit comments

Comments
 (0)