Commit 61a11cf

io_uring: protect remaining lockless ctx->rings accesses with RCU
Commit 9618908 addressed one case of ctx->rings being potentially accessed while a resize is happening on the ring, but there are still a few others that need handling. Add a helper for retrieving the rings associated with an io_uring context, and add some sanity checking to it to catch bad uses. ->rings_rcu is always valid as long as it's used within the RCU read lock, and any use of ->rings_rcu or ->rings inside either ->uring_lock or ->completion_lock is sane as well.

Do the minimal fix for the current kernel, but set it up such that this basic infrastructure can be extended in later kernels, making this harder to mess up in the future.

Thanks to Junxi Qian for finding and debugging this issue.

Cc: stable@vger.kernel.org
Fixes: 79cfe9e ("io_uring/register: add IORING_REGISTER_RESIZE_RINGS")
Reviewed-by: Junxi Qian <qjx1298677004@gmail.com>
Tested-by: Junxi Qian <qjx1298677004@gmail.com>
Link: https://lore.kernel.org/io-uring/20260330172348.89416-1-qjx1298677004@gmail.com/
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 111a12b commit 61a11cf

4 files changed: 70 additions & 28 deletions
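
Note on the approach: the change follows the classic RCU publish/read pattern. Readers fetch the rings pointer inside an RCU read-side critical section (or while holding a lock the resize path also takes, which is what the lockdep checks below express), and a resize publishes a replacement pointer and waits for a grace period before freeing the old rings. As rough orientation, here is a minimal userspace sketch of that pattern using liburcu in place of the kernel RCU API; the struct layout and function names are illustrative only and are not the kernel's.

/*
 * Userspace sketch of the ctx->rings_rcu pattern (liburcu stand-in for
 * the kernel API). Build with: cc sketch.c -lurcu
 */
#include <urcu.h>               /* rcu_read_lock(), rcu_dereference(), ... */
#include <stdio.h>
#include <stdlib.h>

struct rings {
        unsigned int sq_tail;
        unsigned int cq_tail;
};

/* RCU-managed pointer; a resize swaps it out */
static struct rings *rings_rcu;

/* Reader side: the pointer is only valid inside the read section */
static unsigned int read_cq_tail(void)
{
        unsigned int tail;
        struct rings *r;

        rcu_read_lock();
        r = rcu_dereference(rings_rcu);
        tail = r->cq_tail;      /* must not use r past the unlock */
        rcu_read_unlock();
        return tail;
}

/* Writer side: publish the new rings, free the old after a grace period */
static void resize_rings(void)
{
        struct rings *old = rings_rcu;
        struct rings *new = calloc(1, sizeof(*new));

        rcu_assign_pointer(rings_rcu, new);
        synchronize_rcu();      /* every reader still using 'old' is done */
        free(old);
}

int main(void)
{
        rcu_register_thread();
        rings_rcu = calloc(1, sizeof(*rings_rcu));
        resize_rings();
        printf("cq tail: %u\n", read_cq_tail());
        rcu_unregister_thread();
        return 0;
}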

io_uring/io_uring.c

Lines changed: 5 additions & 2 deletions

@@ -2015,7 +2015,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
         if (ctx->flags & IORING_SETUP_SQ_REWIND)
                 entries = ctx->sq_entries;
         else
-                entries = io_sqring_entries(ctx);
+                entries = __io_sqring_entries(ctx);
 
         entries = min(nr, entries);
         if (unlikely(!entries))
@@ -2250,7 +2250,9 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
          */
         poll_wait(file, &ctx->poll_wq, wait);
 
-        if (!io_sqring_full(ctx))
+        rcu_read_lock();
+
+        if (!__io_sqring_full(ctx))
                 mask |= EPOLLOUT | EPOLLWRNORM;
 
         /*
@@ -2270,6 +2272,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
         if (__io_cqring_events_user(ctx) || io_has_work(ctx))
                 mask |= EPOLLIN | EPOLLRDNORM;
 
+        rcu_read_unlock();
         return mask;
 }

io_uring/io_uring.h

Lines changed: 29 additions & 5 deletions

@@ -142,16 +142,28 @@ struct io_wait_queue {
 #endif
 };
 
+static inline struct io_rings *io_get_rings(struct io_ring_ctx *ctx)
+{
+        return rcu_dereference_check(ctx->rings_rcu,
+                                     lockdep_is_held(&ctx->uring_lock) ||
+                                     lockdep_is_held(&ctx->completion_lock));
+}
+
 static inline bool io_should_wake(struct io_wait_queue *iowq)
 {
         struct io_ring_ctx *ctx = iowq->ctx;
-        int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
+        struct io_rings *rings;
+        int dist;
+
+        guard(rcu)();
+        rings = io_get_rings(ctx);
 
         /*
          * Wake up if we have enough events, or if a timeout occurred since we
          * started waiting. For timeouts, we always want to return to userspace,
          * regardless of event count.
          */
+        dist = READ_ONCE(rings->cq.tail) - (int) iowq->cq_tail;
         return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 }
 
@@ -431,9 +443,9 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx)
         __io_wq_wake(&ctx->cq_wait);
 }
 
-static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+static inline bool __io_sqring_full(struct io_ring_ctx *ctx)
 {
-        struct io_rings *r = ctx->rings;
+        struct io_rings *r = io_get_rings(ctx);
 
         /*
          * SQPOLL must use the actual sqring head, as using the cached_sq_head
@@ -445,16 +457,28 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
         return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries;
 }
 
-static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 {
-        struct io_rings *rings = ctx->rings;
+        guard(rcu)();
+        return __io_sqring_full(ctx);
+}
+
+static inline unsigned int __io_sqring_entries(struct io_ring_ctx *ctx)
+{
+        struct io_rings *rings = io_get_rings(ctx);
         unsigned int entries;
 
         /* make sure SQ entry isn't read before tail */
         entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
         return min(entries, ctx->sq_entries);
 }
 
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+        guard(rcu)();
+        return __io_sqring_entries(ctx);
+}
+
 /*
  * Don't complete immediately but use deferred completion infrastructure.
  * Protected by ->uring_lock and can only be used either with
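
The header establishes a convention worth noting: the double-underscore helpers (__io_sqring_full(), __io_sqring_entries()) assume the caller already satisfies io_get_rings()'s checks, while the plain-named wrappers take the RCU read lock themselves via guard(rcu)(). That guard is the kernel's scope-based lock guard, built on the cleanup helpers in <linux/cleanup.h>: it calls rcu_read_lock() at the declaration and runs rcu_read_unlock() automatically on every exit from the scope, including early returns. Below is a rough userspace model of the same mechanism using the compiler's cleanup attribute and liburcu; RCU_GUARD() and the function names are invented for illustration and are not the kernel's.

#include <urcu.h>

struct rings { unsigned int sq_head, sq_tail; };
static struct rings *rings_rcu;
static unsigned int sq_entries;

/* destructor runs when the guard variable goes out of scope */
static void rcu_guard_exit(int *unused) { (void)unused; rcu_read_unlock(); }

/* take the read lock now, drop it on any exit from the scope */
#define RCU_GUARD() \
        __attribute__((cleanup(rcu_guard_exit))) \
        int rcu_guard_ = (rcu_read_lock(), 0)

/* "__" variant: caller must already be inside a read-side section */
static int __sqring_full(void)
{
        struct rings *r = rcu_dereference(rings_rcu);

        return r->sq_tail - r->sq_head == sq_entries;
}

/* plain variant: holds the read lock across the "__" helper */
static int sqring_full(void)
{
        RCU_GUARD();
        return __sqring_full();         /* unlock runs after this evaluates */
}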

io_uring/wait.c

Lines changed: 31 additions & 19 deletions

@@ -79,12 +79,15 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
         if (io_has_work(ctx))
                 goto out_wake;
         /* got events since we started waiting, min timeout is done */
-        if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
-                goto out_wake;
-        /* if we have any events and min timeout expired, we're done */
-        if (io_cqring_events(ctx))
-                goto out_wake;
+        scoped_guard(rcu) {
+                struct io_rings *rings = io_get_rings(ctx);
 
+                if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail))
+                        goto out_wake;
+                /* if we have any events and min timeout expired, we're done */
+                if (io_cqring_events(ctx))
+                        goto out_wake;
+        }
         /*
          * If using deferred task_work running and application is waiting on
          * more than one request, ensure we reset it now where we are switching
@@ -186,9 +189,9 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
                   struct ext_arg *ext_arg)
 {
         struct io_wait_queue iowq;
-        struct io_rings *rings = ctx->rings;
+        struct io_rings *rings;
         ktime_t start_time;
-        int ret;
+        int ret, nr_wait;
 
         min_events = min_t(int, min_events, ctx->cq_entries);
 
@@ -201,15 +204,23 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 
         if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
                 io_cqring_do_overflow_flush(ctx);
-        if (__io_cqring_events_user(ctx) >= min_events)
+
+        rcu_read_lock();
+        rings = io_get_rings(ctx);
+        if (__io_cqring_events_user(ctx) >= min_events) {
+                rcu_read_unlock();
                 return 0;
+        }
 
         init_waitqueue_func_entry(&iowq.wq, io_wake_function);
         iowq.wq.private = current;
         INIT_LIST_HEAD(&iowq.wq.entry);
         iowq.ctx = ctx;
-        iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-        iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+        iowq.cq_tail = READ_ONCE(rings->cq.head) + min_events;
+        iowq.cq_min_tail = READ_ONCE(rings->cq.tail);
+        nr_wait = (int) iowq.cq_tail - READ_ONCE(rings->cq.tail);
+        rcu_read_unlock();
+        rings = NULL;
         iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
         iowq.hit_timeout = 0;
         iowq.min_timeout = ext_arg->min_time;
@@ -240,14 +251,6 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
         trace_io_uring_cqring_wait(ctx, min_events);
         do {
                 unsigned long check_cq;
-                int nr_wait;
-
-                /* if min timeout has been hit, don't reset wait count */
-                if (!iowq.hit_timeout)
-                        nr_wait = (int) iowq.cq_tail -
-                                        READ_ONCE(ctx->rings->cq.tail);
-                else
-                        nr_wait = 1;
 
                 if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
                         atomic_set(&ctx->cq_wait_nr, nr_wait);
@@ -298,11 +301,20 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
                         break;
                 }
                 cond_resched();
+
+                /* if min timeout has been hit, don't reset wait count */
+                if (!iowq.hit_timeout)
+                        scoped_guard(rcu)
+                                nr_wait = (int) iowq.cq_tail -
+                                        READ_ONCE(io_get_rings(ctx)->cq.tail);
+                else
+                        nr_wait = 1;
         } while (1);
 
         if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
                 finish_wait(&ctx->cq_wait, &iowq.wq);
         restore_saved_sigmask_unless(ret == -EINTR);
 
-        return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+        guard(rcu)();
+        return READ_ONCE(io_get_rings(ctx)->cq.head) == READ_ONCE(io_get_rings(ctx)->cq.tail) ? ret : 0;
 }
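
Two details in the io_cqring_wait() rework are worth calling out. First, the local rings pointer is set to NULL immediately after rcu_read_unlock(): once the read section ends, a concurrent resize may free the old rings after its grace period, so poisoning the local pointer turns any later misuse into an obvious crash rather than a silent use-after-free; the per-iteration recomputation and the final head/tail comparison each re-fetch the pointer under a fresh guard instead. Second, the nr_wait math relies on the usual free-running ring counter trick: tails are unsigned counters that are allowed to wrap, and interpreting the difference as a signed int still yields the correct distance across a wrap (for distances under 2^31). A standalone check of that arithmetic, with made-up values:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        /* free-running unsigned tail, just about to wrap */
        unsigned int cq_tail = 0xfffffffeu;
        /* waiting until 3 more completions arrive; wraps around to 1 */
        unsigned int cq_tail_target = cq_tail + 3;

        /* signed distance survives the wrap */
        int nr_wait = (int)(cq_tail_target - cq_tail);

        assert(nr_wait == 3);
        printf("nr_wait = %d\n", nr_wait);
        return 0;
}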

io_uring/wait.h

Lines changed: 5 additions & 2 deletions

@@ -28,12 +28,15 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx);
 
 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 {
-        return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+        struct io_rings *rings = io_get_rings(ctx);
+        return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
 }
 
 static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
 {
-        return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
+        struct io_rings *rings = io_get_rings(ctx);
+
+        return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
 }
 
 /*
