Skip to content

Commit 320f9b1

Browse files
ps-ushankar authored and axboe committed
selftests: ublk: test that teardown after incomplete recovery completes
Before the fix, teardown of a ublk server that was attempting to recover a device, but died after it had submitted a nonempty proper subset of the fetch commands to any queue, would loop forever. Add a test to verify that, after the fix, teardown completes. This is done by:

- Adding a new argument to the fault_inject target that causes it to die after fetching a nonempty proper subset of the IOs to a queue
- Using that argument in a new test while trying to recover an already-created device
- Attempting to delete the ublk device at the end of the test; this hangs forever if teardown from the fault-injected ublk server never completed.

It was manually verified that the test passes with the fix and hangs without it.

Signed-off-by: Uday Shankar <ushankar@purestorage.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Link: https://patch.msgid.link/20260405-cancel-v2-2-02d711e643c2@purestorage.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 0842186 commit 320f9b1

5 files changed

Lines changed: 95 additions & 3 deletions

File tree

tools/testing/selftests/ublk/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ TEST_PROGS += test_generic_10.sh
1818
TEST_PROGS += test_generic_12.sh
1919
TEST_PROGS += test_generic_13.sh
2020
TEST_PROGS += test_generic_16.sh
21+
TEST_PROGS += test_generic_17.sh
2122

2223
TEST_PROGS += test_batch_01.sh
2324
TEST_PROGS += test_batch_02.sh

tools/testing/selftests/ublk/fault_inject.c

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,17 @@
1010

1111
#include "kublk.h"
1212

13+
/*
 * Per-device options for the fault_inject target. An instance is
 * allocated in ublk_fault_inject_tgt_init() and stored in
 * dev->private_data.
 */
struct fi_opts {
14+
/* delay applied to each I/O, in nanoseconds (--delay_us value * 1000) */
long long delay_ns;
15+
/* when set, the server SIGKILLs itself just before fetching tag 1 */
bool die_during_fetch;
16+
};
17+
1318
static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
1419
struct ublk_dev *dev)
1520
{
1621
const struct ublksrv_ctrl_dev_info *info = &dev->dev_info;
1722
unsigned long dev_size = 250UL << 30;
23+
struct fi_opts *opts = NULL;
1824

1925
if (ctx->auto_zc_fallback) {
2026
ublk_err("%s: not support auto_zc_fallback\n", __func__);
@@ -35,17 +41,52 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx,
3541
};
3642
ublk_set_integrity_params(ctx, &dev->tgt.params);
3743

38-
dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000);
44+
opts = calloc(1, sizeof(*opts));
45+
if (!opts) {
46+
ublk_err("%s: couldn't allocate memory for opts\n", __func__);
47+
return -ENOMEM;
48+
}
49+
50+
opts->delay_ns = ctx->fault_inject.delay_us * 1000;
51+
opts->die_during_fetch = ctx->fault_inject.die_during_fetch;
52+
dev->private_data = opts;
53+
3954
return 0;
4055
}
4156

57+
static void ublk_fault_inject_pre_fetch_io(struct ublk_thread *t,
58+
struct ublk_queue *q, int tag,
59+
bool batch)
60+
{
61+
struct fi_opts *opts = q->dev->private_data;
62+
63+
if (!opts->die_during_fetch)
64+
return;
65+
66+
/*
67+
* Each queue fetches its IOs in increasing order of tags, so
68+
* dying just before we're about to fetch tag 1 (regardless of
69+
* what queue we're on) guarantees that we've fetched a nonempty
70+
* proper subset of the tags on that queue.
71+
*/
72+
if (tag == 1) {
73+
/*
74+
* Ensure our commands are actually live in the kernel
75+
* before we die.
76+
*/
77+
io_uring_submit(&t->ring);
78+
raise(SIGKILL);
79+
}
80+
}
81+
4282
static int ublk_fault_inject_queue_io(struct ublk_thread *t,
4383
struct ublk_queue *q, int tag)
4484
{
4585
const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag);
4686
struct io_uring_sqe *sqe;
87+
struct fi_opts *opts = q->dev->private_data;
4788
struct __kernel_timespec ts = {
48-
.tv_nsec = (long long)q->dev->private_data,
89+
.tv_nsec = opts->delay_ns,
4990
};
5091

5192
ublk_io_alloc_sqes(t, &sqe, 1);
@@ -77,29 +118,34 @@ static void ublk_fault_inject_cmd_line(struct dev_ctx *ctx, int argc, char *argv
77118
{
78119
static const struct option longopts[] = {
79120
{ "delay_us", 1, NULL, 0 },
121+
{ "die_during_fetch", 1, NULL, 0 },
80122
{ 0, 0, 0, 0 }
81123
};
82124
int option_idx, opt;
83125

84126
ctx->fault_inject.delay_us = 0;
127+
ctx->fault_inject.die_during_fetch = false;
85128
while ((opt = getopt_long(argc, argv, "",
86129
longopts, &option_idx)) != -1) {
87130
switch (opt) {
88131
case 0:
89132
if (!strcmp(longopts[option_idx].name, "delay_us"))
90133
ctx->fault_inject.delay_us = strtoll(optarg, NULL, 10);
134+
if (!strcmp(longopts[option_idx].name, "die_during_fetch"))
135+
ctx->fault_inject.die_during_fetch = strtoll(optarg, NULL, 10);
91136
}
92137
}
93138
}
94139

95140
/*
 * Print the one-line summary of the fault_inject target's command-line
 * options to stdout. The ops argument is not used.
 */
static void ublk_fault_inject_usage(const struct ublk_tgt_ops *ops)
96141
{
97-
printf("\tfault_inject: [--delay_us us (default 0)]\n");
142+
printf("\tfault_inject: [--delay_us us (default 0)] [--die_during_fetch 1]\n");
98143
}
99144

100145
const struct ublk_tgt_ops fault_inject_tgt_ops = {
101146
.name = "fault_inject",
102147
.init_tgt = ublk_fault_inject_tgt_init,
148+
.pre_fetch_io = ublk_fault_inject_pre_fetch_io,
103149
.queue_io = ublk_fault_inject_queue_io,
104150
.tgt_io_done = ublk_fault_inject_tgt_io_done,
105151
.parse_cmd_line = ublk_fault_inject_cmd_line,

tools/testing/selftests/ublk/kublk.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t)
796796
q = &t->dev->q[q_id];
797797
io = &q->ios[tag];
798798
io->buf_index = j++;
799+
if (q->tgt_ops->pre_fetch_io)
800+
q->tgt_ops->pre_fetch_io(t, q, tag, false);
799801
ublk_queue_io_cmd(t, io);
800802
}
801803
} else {
@@ -807,6 +809,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t)
807809
for (i = 0; i < q->q_depth; i++) {
808810
io = &q->ios[i];
809811
io->buf_index = i;
812+
if (q->tgt_ops->pre_fetch_io)
813+
q->tgt_ops->pre_fetch_io(t, q, i, false);
810814
ublk_queue_io_cmd(t, io);
811815
}
812816
}
@@ -983,6 +987,9 @@ static void ublk_batch_setup_queues(struct ublk_thread *t)
983987
if (t->q_map[i] == 0)
984988
continue;
985989

990+
if (q->tgt_ops->pre_fetch_io)
991+
q->tgt_ops->pre_fetch_io(t, q, 0, true);
992+
986993
ret = ublk_batch_queue_prep_io_cmds(t, q);
987994
ublk_assert(ret >= 0);
988995
}

tools/testing/selftests/ublk/kublk.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ struct stripe_ctx {
6060
/* Command-line settings for the fault_inject target. */
struct fault_inject_ctx {
6161
/* fault_inject */
6262
/* value of --delay_us, in microseconds; reset to 0 before parsing */
unsigned long delay_us;
63+
/* value of --die_during_fetch; reset to false before parsing */
bool die_during_fetch;
6364
};
6465

6566
struct dev_ctx {
@@ -138,6 +139,8 @@ struct ublk_tgt_ops {
138139
int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *);
139140
void (*deinit_tgt)(struct ublk_dev *);
140141

142+
void (*pre_fetch_io)(struct ublk_thread *t, struct ublk_queue *q,
143+
int tag, bool batch);
141144
int (*queue_io)(struct ublk_thread *, struct ublk_queue *, int tag);
142145
void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *,
143146
const struct io_uring_cqe *);
tools/testing/selftests/ublk/test_generic_17.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Check that teardown completes when a ublk server dies in the middle of
# recovering a device, after fetching only part of a queue's I/Os.

. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh

ERR_CODE=0

_prep_test "fault_inject" "teardown after incomplete recovery"

# First start and stop a ublk server with device configured for recovery
dev_id=$(_add_ublk_dev -t fault_inject -r 1)
_check_add_dev $TID $?
state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED")
if [ "$state" != "QUIESCED" ]; then
	# No $action variable exists in this script, so name the step directly
	echo "device isn't quiesced($state) after killing daemon"
	ERR_CODE=255
fi

# Then recover the device, but use --die_during_fetch to have the ublk
# server die while a queue has some (but not all) I/Os fetched
${UBLK_PROG} recover -n "${dev_id}" --foreground -t fault_inject --die_during_fetch 1
RECOVER_RES=$?
# 137 is the result when dying of SIGKILL
if (( RECOVER_RES != 137 )); then
	echo "recover command exited with unexpected code ${RECOVER_RES}!"
	ERR_CODE=255
fi

# Clean up the device. This can only succeed once teardown of the above
# exited ublk server completes. So if teardown never completes, we will
# time out here
_ublk_del_dev "${dev_id}"

_cleanup_test "fault_inject"
_show_result $TID $ERR_CODE

0 commit comments

Comments
 (0)