Skip to content

Commit ccc89b9

Browse files
committed
svcrdma: Add fair queuing for Send Queue access
When the Send Queue fills, multiple threads may wait for SQ slots. The
previous implementation had no ordering guarantee, allowing starvation
when one thread repeatedly acquires slots while others wait
indefinitely.

Introduce a ticket-based fair queuing system. Each waiter takes a
ticket number and is served in FIFO order. This ensures forward
progress for all waiters when SQ capacity is constrained.

The implementation has two phases:

1. Fast path: attempt to reserve SQ slots without waiting
2. Slow path: take a ticket, wait for turn, then wait for slots

The ticket system adds two atomic counters to the transport:

- sc_sq_ticket_head: next ticket to issue
- sc_sq_ticket_tail: ticket currently being served

A dedicated wait queue (sc_sq_ticket_wait) handles ticket ordering,
separate from sc_send_wait which handles SQ capacity. This separation
ensures that send completions (the high-frequency wake source) wake
only the current ticket holder rather than all queued waiters. Ticket
handoff wakes only the ticket wait queue, and each ticket holder that
exits via connection close propagates the wake to the next waiter in
line.

When a waiter successfully reserves slots, it advances the tail
counter and wakes the next waiter. This creates an orderly handoff
that prevents starvation while maintaining good throughput on the
fast path when contention is low.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
1 parent d7f3efd commit ccc89b9

4 files changed

Lines changed: 146 additions & 69 deletions

File tree

include/linux/sunrpc/svc_rdma.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,9 @@ struct svcxprt_rdma {
8484

8585
atomic_t sc_sq_avail; /* SQEs ready to be consumed */
8686
unsigned int sc_sq_depth; /* Depth of SQ */
87+
atomic_t sc_sq_ticket_head; /* Next ticket to issue */
88+
atomic_t sc_sq_ticket_tail; /* Ticket currently serving */
89+
wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */
8790
__be32 sc_fc_credits; /* Forward credits */
8891
u32 sc_max_requests; /* Max requests */
8992
u32 sc_max_bc_requests;/* Backward credits */
@@ -306,6 +309,13 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
306309
struct svc_rdma_recv_ctxt *rctxt,
307310
int status);
308311
extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail);
312+
extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
313+
const struct rpc_rdma_cid *cid, int sqecount);
314+
extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma,
315+
const struct rpc_rdma_cid *cid,
316+
const struct ib_send_wr *bad_wr,
317+
const struct ib_send_wr *first_wr,
318+
int sqecount, int ret);
309319
extern int svc_rdma_sendto(struct svc_rqst *);
310320
extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
311321
unsigned int length);

net/sunrpc/xprtrdma/svc_rdma_rw.c

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -405,34 +405,17 @@ static int svc_rdma_post_chunk_ctxt(struct svcxprt_rdma *rdma,
405405
cqe = NULL;
406406
}
407407

408-
do {
409-
if (atomic_sub_return(cc->cc_sqecount,
410-
&rdma->sc_sq_avail) > 0) {
411-
cc->cc_posttime = ktime_get();
412-
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
413-
if (ret)
414-
break;
415-
return 0;
416-
}
417-
418-
percpu_counter_inc(&svcrdma_stat_sq_starve);
419-
trace_svcrdma_sq_full(rdma, &cc->cc_cid);
420-
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
421-
wait_event(rdma->sc_send_wait,
422-
atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
423-
trace_svcrdma_sq_retry(rdma, &cc->cc_cid);
424-
} while (1);
425-
426-
trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret);
427-
svc_xprt_deferred_close(&rdma->sc_xprt);
428-
429-
/* If even one was posted, there will be a completion. */
430-
if (bad_wr != first_wr)
431-
return 0;
408+
ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount);
409+
if (ret < 0)
410+
return ret;
432411

433-
atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
434-
wake_up(&rdma->sc_send_wait);
435-
return -ENOTCONN;
412+
cc->cc_posttime = ktime_get();
413+
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
414+
if (ret)
415+
return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr,
416+
first_wr, cc->cc_sqecount,
417+
ret);
418+
return 0;
436419
}
437420

438421
/* Build a bvec that covers one kvec in an xdr_buf.

net/sunrpc/xprtrdma/svc_rdma_sendto.c

Lines changed: 121 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,117 @@ void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail)
294294
wake_up(&rdma->sc_send_wait);
295295
}
296296

297+
/**
 * svc_rdma_sq_wait - Wait for SQ slots using fair queuing
 * @rdma: controlling transport
 * @cid: completion ID for tracing
 * @sqecount: number of SQ entries needed
 *
 * A ticket-based system ensures fair ordering when multiple threads
 * wait for Send Queue capacity. Each waiter takes a ticket and is
 * served in order, preventing starvation.
 *
 * Protocol invariant: every ticket holder must increment
 * sc_sq_ticket_tail exactly once, whether the reservation
 * succeeds or the connection closes. Failing to advance the
 * tail stalls all subsequent waiters.
 *
 * The ticket counters are signed 32-bit atomics. After
 * wrapping through INT_MAX, the equality check
 * (tail == ticket) remains correct because both counters
 * advance monotonically and the comparison uses exact
 * equality rather than relational operators.
 *
 * Return values:
 *   %0: SQ slots were reserved successfully
 *   %-ENOTCONN: The connection was lost
 */
int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
		     const struct rpc_rdma_cid *cid, int sqecount)
{
	int ticket;

	/* Fast path: try to reserve SQ slots without waiting.
	 *
	 * A failed reservation temporarily understates sc_sq_avail
	 * until the compensating atomic_add restores it. A Send
	 * completion arriving in that window sees a lower count
	 * than reality, but the value self-corrects once the add
	 * completes. No ordering guarantee is needed here because
	 * the slow path serializes all contended waiters.
	 */
	if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0))
		return 0;
	atomic_add(sqecount, &rdma->sc_sq_avail);

	/* Slow path: take a ticket and wait in line */
	ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head);

	percpu_counter_inc(&svcrdma_stat_sq_starve);
	trace_svcrdma_sq_full(rdma, cid);

	/* Wait until all earlier tickets have been served.
	 * On teardown, ib_drain_sq()-driven completions and the
	 * XPT_CLOSE test below unblock every queued waiter.
	 */
	wait_event(rdma->sc_sq_ticket_wait,
		   test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
		   atomic_read(&rdma->sc_sq_ticket_tail) == ticket);
	if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
		goto out_close;

	/* It's our turn. Wait for enough SQ slots to be available. */
	while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
		atomic_add(sqecount, &rdma->sc_sq_avail);

		wait_event(rdma->sc_send_wait,
			   test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
			   atomic_read(&rdma->sc_sq_avail) >= sqecount);
		if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
			goto out_close;
	}

	/* Slots reserved successfully. Let the next waiter proceed. */
	atomic_inc(&rdma->sc_sq_ticket_tail);
	wake_up(&rdma->sc_sq_ticket_wait);
	trace_svcrdma_sq_retry(rdma, cid);
	return 0;

out_close:
	/* Advance the tail even on failure so later tickets
	 * are not stalled (see the protocol invariant above).
	 */
	atomic_inc(&rdma->sc_sq_ticket_tail);
	wake_up(&rdma->sc_sq_ticket_wait);
	return -ENOTCONN;
}
375+
376+
/**
377+
* svc_rdma_post_send_err - Handle ib_post_send failure
378+
* @rdma: controlling transport
379+
* @cid: completion ID for tracing
380+
* @bad_wr: first WR that was not posted
381+
* @first_wr: first WR in the chain
382+
* @sqecount: number of SQ entries that were reserved
383+
* @ret: error code from ib_post_send
384+
*
385+
* Return values:
386+
* %0: At least one WR was posted; a completion handles cleanup
387+
* %-ENOTCONN: No WRs were posted; SQ slots are released
388+
*/
389+
int svc_rdma_post_send_err(struct svcxprt_rdma *rdma,
390+
const struct rpc_rdma_cid *cid,
391+
const struct ib_send_wr *bad_wr,
392+
const struct ib_send_wr *first_wr,
393+
int sqecount, int ret)
394+
{
395+
trace_svcrdma_sq_post_err(rdma, cid, ret);
396+
svc_xprt_deferred_close(&rdma->sc_xprt);
397+
398+
/* If even one WR was posted, a Send completion will
399+
* return the reserved SQ slots.
400+
*/
401+
if (bad_wr != first_wr)
402+
return 0;
403+
404+
svc_rdma_wake_send_waiters(rdma, sqecount);
405+
return -ENOTCONN;
406+
}
407+
297408
/**
298409
* svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
299410
* @cq: Completion Queue context
@@ -336,11 +447,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
336447
* that these values remain available after the ib_post_send() call.
337448
* In some error flow cases, svc_rdma_wc_send() releases @ctxt.
338449
*
339-
* Note there is potential for starvation when the Send Queue is
340-
* full because there is no order to when waiting threads are
341-
* awoken. The transport is typically provisioned with a deep
342-
* enough Send Queue that SQ exhaustion should be a rare event.
343-
*
344450
* Return values:
345451
* %0: @ctxt's WR chain was posted successfully
346452
* %-ENOTCONN: The connection was lost
@@ -362,42 +468,16 @@ int svc_rdma_post_send(struct svcxprt_rdma *rdma,
362468
send_wr->sg_list[0].length,
363469
DMA_TO_DEVICE);
364470

365-
/* If the SQ is full, wait until an SQ entry is available */
366-
while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) {
367-
if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
368-
svc_rdma_wake_send_waiters(rdma, sqecount);
369-
370-
/* When the transport is torn down, assume
371-
* ib_drain_sq() will trigger enough Send
372-
* completions to wake us. The XPT_CLOSE test
373-
* above should then cause the while loop to
374-
* exit.
375-
*/
376-
percpu_counter_inc(&svcrdma_stat_sq_starve);
377-
trace_svcrdma_sq_full(rdma, &cid);
378-
wait_event(rdma->sc_send_wait,
379-
atomic_read(&rdma->sc_sq_avail) > 0);
380-
trace_svcrdma_sq_retry(rdma, &cid);
381-
continue;
382-
}
383-
384-
trace_svcrdma_post_send(ctxt);
385-
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
386-
if (ret) {
387-
trace_svcrdma_sq_post_err(rdma, &cid, ret);
388-
svc_xprt_deferred_close(&rdma->sc_xprt);
389-
390-
/* If even one WR was posted, there will be a
391-
* Send completion that bumps sc_sq_avail.
392-
*/
393-
if (bad_wr == first_wr) {
394-
svc_rdma_wake_send_waiters(rdma, sqecount);
395-
break;
396-
}
397-
}
398-
return 0;
399-
}
400-
return -ENOTCONN;
471+
ret = svc_rdma_sq_wait(rdma, &cid, sqecount);
472+
if (ret < 0)
473+
return ret;
474+
475+
trace_svcrdma_post_send(ctxt);
476+
ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
477+
if (ret)
478+
return svc_rdma_post_send_err(rdma, &cid, bad_wr,
479+
first_wr, sqecount, ret);
480+
return 0;
401481
}
402482

403483
/**

net/sunrpc/xprtrdma/svc_rdma_transport.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
179179
init_llist_head(&cma_xprt->sc_recv_ctxts);
180180
init_llist_head(&cma_xprt->sc_rw_ctxts);
181181
init_waitqueue_head(&cma_xprt->sc_send_wait);
182+
init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
182183

183184
spin_lock_init(&cma_xprt->sc_lock);
184185
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
@@ -477,6 +478,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
477478
if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
478479
newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
479480
atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
481+
atomic_set(&newxprt->sc_sq_ticket_head, 0);
482+
atomic_set(&newxprt->sc_sq_ticket_tail, 0);
480483

481484
newxprt->sc_pd = ib_alloc_pd(dev, 0);
482485
if (IS_ERR(newxprt->sc_pd)) {
@@ -649,7 +652,8 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt)
649652
* If there are already waiters on the SQ,
650653
* return false.
651654
*/
652-
if (waitqueue_active(&rdma->sc_send_wait))
655+
if (waitqueue_active(&rdma->sc_send_wait) ||
656+
waitqueue_active(&rdma->sc_sq_ticket_wait))
653657
return 0;
654658

655659
/* Otherwise return true. */

0 commit comments

Comments
 (0)