Skip to content

Commit d16f060

Browse files
committed
svcrdma: Add Write chunk WRs to the RPC's Send WR chain
Previously, Write chunk RDMA Writes were posted via a separate ib_post_send() call with their own completion handler. Each Write chunk incurred a doorbell and generated a completion event.

Link Write chunk WRs onto the RPC Reply's Send WR chain so that a single ib_post_send() call posts both the RDMA Writes and the Send WR. A single completion event signals that all operations have finished. This reduces both doorbell rate and completion rate, as well as eliminating the latency of a round-trip between the Write chunk completion and the subsequent Send WR posting.

The lifecycle of Write chunk resources changes: previously, the svc_rdma_write_done() completion handler released Write chunk resources when RDMA Writes completed. With WR chaining, resources remain live until the Send completion. A new sc_write_info_list tracks Write chunk metadata attached to each Send context, and svc_rdma_write_chunk_release() frees these resources when the Send context is released.

The svc_rdma_write_done() handler now handles only error cases. On success it returns immediately since the Send completion handles resource release. On failure (WR flush), it closes the connection to signal to the client that the RPC Reply is incomplete.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
1 parent c553983 commit d16f060

3 files changed

Lines changed: 91 additions & 26 deletions

File tree

include/linux/sunrpc/svc_rdma.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,7 @@ struct svc_rdma_recv_ctxt {
216216
*/
217217
struct svc_rdma_write_info {
218218
struct svcxprt_rdma *wi_rdma;
219+
struct list_head wi_list;
219220

220221
const struct svc_rdma_chunk *wi_chunk;
221222

@@ -244,7 +245,10 @@ struct svc_rdma_send_ctxt {
244245
struct ib_cqe sc_cqe;
245246
struct xdr_buf sc_hdrbuf;
246247
struct xdr_stream sc_stream;
248+
249+
struct list_head sc_write_info_list;
247250
struct svc_rdma_write_info sc_reply_info;
251+
248252
void *sc_xprt_buf;
249253
int sc_page_count;
250254
int sc_cur_sge_no;
@@ -277,11 +281,14 @@ extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma,
277281
extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma,
278282
struct svc_rdma_chunk_ctxt *cc,
279283
enum dma_data_direction dir);
284+
extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma,
285+
struct svc_rdma_send_ctxt *ctxt);
280286
extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma,
281287
struct svc_rdma_send_ctxt *ctxt);
282-
extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
283-
const struct svc_rdma_recv_ctxt *rctxt,
284-
const struct xdr_buf *xdr);
288+
extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma,
289+
const struct svc_rdma_recv_ctxt *rctxt,
290+
struct svc_rdma_send_ctxt *sctxt,
291+
const struct xdr_buf *xdr);
285292
extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
286293
const struct svc_rdma_pcl *write_pcl,
287294
const struct svc_rdma_pcl *reply_pcl,

net/sunrpc/xprtrdma/svc_rdma_rw.c

Lines changed: 72 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,28 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info)
251251
queue_work(svcrdma_wq, &info->wi_work);
252252
}
253253

254+
/**
255+
* svc_rdma_write_chunk_release - Release Write chunk I/O resources
256+
* @rdma: controlling transport
257+
* @ctxt: Send context that is being released
258+
*
259+
* Write chunk resources remain live until Send completion because
260+
* Write WRs are chained to the Send WR. This function releases all
261+
* write_info structures accumulated on @ctxt->sc_write_info_list.
262+
*/
263+
void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma,
264+
struct svc_rdma_send_ctxt *ctxt)
265+
{
266+
struct svc_rdma_write_info *info;
267+
268+
while (!list_empty(&ctxt->sc_write_info_list)) {
269+
info = list_first_entry(&ctxt->sc_write_info_list,
270+
struct svc_rdma_write_info, wi_list);
271+
list_del(&info->wi_list);
272+
svc_rdma_write_info_free(info);
273+
}
274+
}
275+
254276
/**
255277
* svc_rdma_reply_chunk_release - Release Reply chunk I/O resources
256278
* @rdma: controlling transport
@@ -307,26 +329,23 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
307329
struct ib_cqe *cqe = wc->wr_cqe;
308330
struct svc_rdma_chunk_ctxt *cc =
309331
container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
310-
struct svc_rdma_write_info *info =
311-
container_of(cc, struct svc_rdma_write_info, wi_cc);
312332

313333
switch (wc->status) {
314334
case IB_WC_SUCCESS:
315335
trace_svcrdma_wc_write(&cc->cc_cid);
316-
break;
336+
return;
317337
case IB_WC_WR_FLUSH_ERR:
318338
trace_svcrdma_wc_write_flush(wc, &cc->cc_cid);
319339
break;
320340
default:
321341
trace_svcrdma_wc_write_err(wc, &cc->cc_cid);
322342
}
323343

324-
svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount);
325-
326-
if (unlikely(wc->status != IB_WC_SUCCESS))
327-
svc_xprt_deferred_close(&rdma->sc_xprt);
328-
329-
svc_rdma_write_info_free(info);
344+
/* The RDMA Write has flushed, so the client won't get
345+
* some of the outgoing RPC message. Signal the loss
346+
* to the client by closing the connection.
347+
*/
348+
svc_xprt_deferred_close(&rdma->sc_xprt);
330349
}
331350

332351
/**
@@ -600,13 +619,27 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data)
600619
return xdr->len;
601620
}
602621

603-
static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
604-
const struct svc_rdma_chunk *chunk,
605-
const struct xdr_buf *xdr)
622+
/*
623+
* svc_rdma_prepare_write_chunk - Link Write WRs for @chunk onto @sctxt's chain
624+
*
625+
* Write WRs are prepended to the Send WR chain so that a single
626+
* ib_post_send() posts both RDMA Writes and the final Send. Only
627+
* the first WR in each chunk gets a CQE for error detection;
628+
* subsequent WRs complete without individual completion events.
629+
* The Send WR's signaled completion indicates all chained
630+
* operations have finished.
631+
*/
632+
static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma,
633+
struct svc_rdma_send_ctxt *sctxt,
634+
const struct svc_rdma_chunk *chunk,
635+
const struct xdr_buf *xdr)
606636
{
607637
struct svc_rdma_write_info *info;
608638
struct svc_rdma_chunk_ctxt *cc;
639+
struct ib_send_wr *first_wr;
609640
struct xdr_buf payload;
641+
struct list_head *pos;
642+
struct ib_cqe *cqe;
610643
int ret;
611644

612645
if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position,
@@ -622,10 +655,25 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
622655
if (ret != payload.len)
623656
goto out_err;
624657

625-
trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
626-
ret = svc_rdma_post_chunk_ctxt(rdma, cc);
627-
if (ret < 0)
658+
ret = -EINVAL;
659+
if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth))
628660
goto out_err;
661+
662+
first_wr = sctxt->sc_wr_chain;
663+
cqe = &cc->cc_cqe;
664+
list_for_each(pos, &cc->cc_rwctxts) {
665+
struct svc_rdma_rw_ctxt *rwc;
666+
667+
rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list);
668+
first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp,
669+
rdma->sc_port_num, cqe, first_wr);
670+
cqe = NULL;
671+
}
672+
sctxt->sc_wr_chain = first_wr;
673+
sctxt->sc_sqecount += cc->cc_sqecount;
674+
list_add(&info->wi_list, &sctxt->sc_write_info_list);
675+
676+
trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount);
629677
return 0;
630678

631679
out_err:
@@ -634,25 +682,27 @@ static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
634682
}
635683

636684
/**
637-
* svc_rdma_send_write_list - Send all chunks on the Write list
685+
* svc_rdma_prepare_write_list - Construct WR chain for sending Write list
638686
* @rdma: controlling RDMA transport
639687
* @rctxt: Write list provisioned by the client
688+
* @sctxt: Send WR resources
640689
* @xdr: xdr_buf containing an RPC Reply message
641690
*
642-
* Returns zero on success, or a negative errno if one or more
643-
* Write chunks could not be sent.
691+
* Returns zero on success, or a negative errno if WR chain
692+
* construction fails for one or more Write chunks.
644693
*/
645-
int svc_rdma_send_write_list(struct svcxprt_rdma *rdma,
646-
const struct svc_rdma_recv_ctxt *rctxt,
647-
const struct xdr_buf *xdr)
694+
int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma,
695+
const struct svc_rdma_recv_ctxt *rctxt,
696+
struct svc_rdma_send_ctxt *sctxt,
697+
const struct xdr_buf *xdr)
648698
{
649699
struct svc_rdma_chunk *chunk;
650700
int ret;
651701

652702
pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) {
653703
if (!chunk->ch_payload_length)
654704
break;
655-
ret = svc_rdma_send_write_chunk(rdma, chunk, xdr);
705+
ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr);
656706
if (ret < 0)
657707
return ret;
658708
}

net/sunrpc/xprtrdma/svc_rdma_sendto.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
150150
ctxt->sc_send_wr.sg_list = ctxt->sc_sges;
151151
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
152152
ctxt->sc_cqe.done = svc_rdma_wc_send;
153+
INIT_LIST_HEAD(&ctxt->sc_write_info_list);
153154
ctxt->sc_xprt_buf = buffer;
154155
xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
155156
rdma->sc_max_req_size);
@@ -237,6 +238,7 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma,
237238
struct ib_device *device = rdma->sc_cm_id->device;
238239
unsigned int i;
239240

241+
svc_rdma_write_chunk_release(rdma, ctxt);
240242
svc_rdma_reply_chunk_release(rdma, ctxt);
241243

242244
if (ctxt->sc_page_count)
@@ -1054,6 +1056,12 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
10541056
sctxt->sc_send_wr.num_sge = 1;
10551057
sctxt->sc_send_wr.opcode = IB_WR_SEND;
10561058
sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
1059+
1060+
/* Ensure only the error message is posted, not any previously
1061+
* prepared Write chunk WRs.
1062+
*/
1063+
sctxt->sc_wr_chain = &sctxt->sc_send_wr;
1064+
sctxt->sc_sqecount = 1;
10571065
if (svc_rdma_post_send(rdma, sctxt))
10581066
goto put_ctxt;
10591067
return;
@@ -1101,7 +1109,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
11011109
if (!p)
11021110
goto put_ctxt;
11031111

1104-
ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res);
1112+
ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res);
11051113
if (ret < 0)
11061114
goto put_ctxt;
11071115

0 commit comments

Comments
 (0)