Commit 18755b8

svcrdma: Use contiguous pages for RDMA Read sink buffers
svc_rdma_build_read_segment() constructs RDMA Read sink buffers by consuming pages one at a time from rq_pages[] and building one bvec per page. A 64KB NFS WRITE payload therefore produces 16 separate bvecs, 16 DMA mappings, and potentially multiple RDMA Read WRs (on platforms with 4KB pages).

Instead, perform a single higher-order allocation followed by split_page(), which yields physically contiguous memory while preserving per-page refcounts. A single bvec spanning the contiguous range causes rdma_rw_ctx_init_bvec() to take the rdma_rw_init_single_wr_bvec() fast path: one DMA mapping, one SGE, one WR. The split sub-pages replace the original rq_pages[] entries, so all downstream page tracking, completion handling, and xdr_buf assembly remain unchanged.

Allocation uses __GFP_NORETRY | __GFP_NOWARN and falls back through decreasing orders. If even an order-1 allocation fails, the existing per-page path handles the segment.

When nr_pages is not a power of two, get_order() rounds up and the allocation yields more pages than needed. The extra split pages replace existing rq_pages[] entries (which are freed via put_page() first), so there is no net increase in per-request page consumption. Successive segments reuse the same padding slots, preventing accumulation. The rq_maxpages guard rejects any allocation that would overrun the array, falling back to the per-page path. Under memory pressure, __GFP_NORETRY causes the higher-order allocation to fail quickly without stalling.

The contiguous path is attempted only when the segment starts page-aligned (rc_pageoff == 0) and spans at least two pages. NFS WRITE segments carry application-modified byte ranges of arbitrary length, so the optimization is not restricted to power-of-two page counts.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
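As a rough illustration of the rounding behavior described above, here is a stand-alone user-space sketch (not part of the patch; it assumes 4KB pages, and order_for() is a hypothetical stand-in for the kernel's get_order()). It prints the first-attempt allocation order, capped at order-4, and how many padding sub-pages are left over:

/*
 * User-space sketch, not kernel code: shows how a non-power-of-two
 * page count is rounded up to an allocation order, capped at order-4,
 * and how many padding sub-pages result. Assumes 4KB pages.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1u << PAGE_SHIFT)
#define CONTIG_MAX_ORDER 4      /* mirrors SVC_RDMA_CONTIG_MAX_ORDER */

/* Smallest order such that (1 << order) pages cover nr_pages. */
static unsigned int order_for(unsigned int nr_pages)
{
        unsigned int order = 0;

        while ((1u << order) < nr_pages)
                order++;
        return order;
}

int main(void)
{
        unsigned int lengths[] = { 65536, 5 * PAGE_SIZE - 200, 37 * PAGE_SIZE };

        for (int i = 0; i < 3; i++) {
                unsigned int nr_pages = (lengths[i] + PAGE_SIZE - 1) / PAGE_SIZE;
                unsigned int order = order_for(nr_pages);

                if (order > CONTIG_MAX_ORDER)
                        order = CONTIG_MAX_ORDER;       /* first-attempt cap */

                unsigned int allocated = 1u << order;
                unsigned int padding = allocated > nr_pages ?
                                       allocated - nr_pages : 0;

                printf("len=%-7u pages=%-3u order=%u allocated=%-3u padding=%u\n",
                       lengths[i], nr_pages, order, allocated, padding);
        }
        return 0;
}

For a 5-page segment the order rounds up to 3 and leaves 3 padding sub-pages; for a 37-page segment the order-4 cap means the first allocation covers only 16 pages and the loop in the patch continues with the remainder.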
1 parent 4e2866b commit 18755b8

1 file changed: net/sunrpc/xprtrdma/svc_rdma_rw.c (223 additions, 0 deletions)
@@ -754,6 +754,216 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
 	return xdr->len;
 }
 
+/*
+ * Cap contiguous RDMA Read sink allocations at order-4.
+ * Higher orders risk allocation failure under
+ * __GFP_NORETRY, which would negate the benefit of the
+ * contiguous fast path.
+ */
+#define SVC_RDMA_CONTIG_MAX_ORDER 4
+
+/**
+ * svc_rdma_alloc_read_pages - Allocate physically contiguous pages
+ * @nr_pages: number of pages needed
+ * @order: on success, set to the allocation order
+ *
+ * Attempts a higher-order allocation, falling back to smaller orders.
+ * The returned pages are split immediately so each sub-page has its
+ * own refcount and can be freed independently.
+ *
+ * Returns a pointer to the first page on success, or NULL if even
+ * order-1 allocation fails.
+ */
+static struct page *
+svc_rdma_alloc_read_pages(unsigned int nr_pages, unsigned int *order)
+{
+        unsigned int o;
+        struct page *page;
+
+        o = min(get_order(nr_pages << PAGE_SHIFT),
+                SVC_RDMA_CONTIG_MAX_ORDER);
+
+        while (o >= 1) {
+                page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN,
+                                   o);
+                if (page) {
+                        split_page(page, o);
+                        *order = o;
+                        return page;
+                }
+                o--;
+        }
+        return NULL;
+}
+
+/*
+ * svc_rdma_fill_contig_bvec - Replace rq_pages with a contiguous allocation
+ * @rqstp: RPC transaction context
+ * @head: context for ongoing I/O
+ * @bv: bvec entry to fill
+ * @pages_left: number of data pages remaining in the segment
+ * @len_left: bytes remaining in the segment
+ *
+ * On success, fills @bv with a bvec spanning the contiguous range and
+ * advances rc_curpage/rc_page_count. Returns the byte length covered,
+ * or zero if the allocation failed or would overrun rq_maxpages.
+ */
+static unsigned int
+svc_rdma_fill_contig_bvec(struct svc_rqst *rqstp,
+                          struct svc_rdma_recv_ctxt *head,
+                          struct bio_vec *bv, unsigned int pages_left,
+                          unsigned int len_left)
+{
+        unsigned int order, npages, chunk_pages, chunk_len, i;
+        struct page *page;
+
+        page = svc_rdma_alloc_read_pages(pages_left, &order);
+        if (!page)
+                return 0;
+        npages = 1 << order;
+
+        if (head->rc_curpage + npages > rqstp->rq_maxpages) {
+                for (i = 0; i < npages; i++)
+                        __free_page(page + i);
+                return 0;
+        }
+
+        /*
+         * Replace rq_pages[] entries with pages from the contiguous
+         * allocation. If npages exceeds chunk_pages, the extra pages
+         * stay in rq_pages[] for later reuse or normal rqst teardown.
+         */
+        for (i = 0; i < npages; i++) {
+                svc_rqst_page_release(rqstp,
+                                      rqstp->rq_pages[head->rc_curpage + i]);
+                rqstp->rq_pages[head->rc_curpage + i] = page + i;
+        }
+
+        chunk_pages = min(npages, pages_left);
+        chunk_len = min_t(unsigned int, chunk_pages << PAGE_SHIFT, len_left);
+        bvec_set_page(bv, page, chunk_len, 0);
+        head->rc_page_count += chunk_pages;
+        head->rc_curpage += chunk_pages;
+        return chunk_len;
+}
+
+/*
+ * svc_rdma_fill_page_bvec - Add a single rq_page to the bvec array
+ * @head: context for ongoing I/O
+ * @ctxt: R/W context whose bvec array is being filled
+ * @cur: page to add
+ * @bvec_idx: pointer to current bvec index, not advanced on merge
+ * @len_left: bytes remaining in the segment
+ *
+ * If @cur is physically contiguous with the preceding bvec, it is
+ * merged by extending that bvec's length. Otherwise a new bvec
+ * entry is created. Returns the byte length covered.
+ */
+static unsigned int
+svc_rdma_fill_page_bvec(struct svc_rdma_recv_ctxt *head,
+                        struct svc_rdma_rw_ctxt *ctxt, struct page *cur,
+                        unsigned int *bvec_idx, unsigned int len_left)
+{
+        unsigned int chunk_len = min_t(unsigned int, PAGE_SIZE, len_left);
+
+        head->rc_page_count++;
+        head->rc_curpage++;
+
+        if (*bvec_idx > 0) {
+                struct bio_vec *prev = &ctxt->rw_bvec[*bvec_idx - 1];
+
+                if (page_to_phys(prev->bv_page) + prev->bv_offset +
+                    prev->bv_len == page_to_phys(cur)) {
+                        prev->bv_len += chunk_len;
+                        return chunk_len;
+                }
+        }
+
+        bvec_set_page(&ctxt->rw_bvec[*bvec_idx], cur, chunk_len, 0);
+        (*bvec_idx)++;
+        return chunk_len;
+}
+
+/**
+ * svc_rdma_build_read_segment_contig - Build RDMA Read WR with contiguous pages
+ * @rqstp: RPC transaction context
+ * @head: context for ongoing I/O
+ * @segment: co-ordinates of remote memory to be read
+ *
+ * Greedily allocates higher-order pages to cover the segment,
+ * building one bvec per contiguous chunk. Each allocation is
+ * split so sub-pages have independent refcounts. When a
+ * higher-order allocation fails, remaining pages are covered
+ * individually, merging adjacent pages into the preceding bvec
+ * when they are physically contiguous. The split sub-pages
+ * replace entries in rq_pages[] so downstream cleanup is
+ * unchanged.
+ *
+ * Returns:
+ *   %0: the Read WR was constructed successfully
+ *   %-ENOMEM: allocation failed
+ *   %-EIO: a DMA mapping error occurred
+ */
+static int svc_rdma_build_read_segment_contig(struct svc_rqst *rqstp,
+                                              struct svc_rdma_recv_ctxt *head,
+                                              const struct svc_rdma_segment *segment)
+{
+        struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
+        struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
+        unsigned int nr_data_pages, bvec_idx;
+        struct svc_rdma_rw_ctxt *ctxt;
+        unsigned int len_left;
+        int ret;
+
+        nr_data_pages = PAGE_ALIGN(segment->rs_length) >> PAGE_SHIFT;
+        if (head->rc_curpage + nr_data_pages > rqstp->rq_maxpages)
+                return -ENOMEM;
+
+        ctxt = svc_rdma_get_rw_ctxt(rdma, nr_data_pages);
+        if (!ctxt)
+                return -ENOMEM;
+
+        bvec_idx = 0;
+        len_left = segment->rs_length;
+        while (len_left) {
+                unsigned int pages_left = PAGE_ALIGN(len_left) >> PAGE_SHIFT;
+                unsigned int chunk_len = 0;
+
+                if (pages_left >= 2)
+                        chunk_len = svc_rdma_fill_contig_bvec(rqstp, head,
+                                                              &ctxt->rw_bvec[bvec_idx],
+                                                              pages_left, len_left);
+                if (chunk_len) {
+                        bvec_idx++;
+                } else {
+                        struct page *cur = rqstp->rq_pages[head->rc_curpage];
+
+                        chunk_len = svc_rdma_fill_page_bvec(head, ctxt, cur,
+                                                            &bvec_idx, len_left);
+                }
+
+                len_left -= chunk_len;
+        }
+
+        ctxt->rw_nents = bvec_idx;
+
+        head->rc_pageoff = offset_in_page(segment->rs_length);
+        if (head->rc_pageoff)
+                head->rc_curpage--;
+
+        ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
+                                   segment->rs_handle, segment->rs_length,
+                                   DMA_FROM_DEVICE);
+        if (ret < 0)
+                return -EIO;
+        percpu_counter_inc(&svcrdma_stat_read);
+
+        list_add(&ctxt->rw_list, &cc->cc_rwctxts);
+        cc->cc_sqecount += ret;
+        return 0;
+}
+
 /**
  * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
  * @rqstp: RPC transaction context
@@ -780,6 +990,14 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
 	if (check_add_overflow(head->rc_pageoff, len, &total))
 		return -EINVAL;
 	nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;
+
+        if (head->rc_pageoff == 0 && nr_bvec >= 2) {
+                ret = svc_rdma_build_read_segment_contig(rqstp, head,
+                                                         segment);
+                if (ret != -ENOMEM)
+                        return ret;
+        }
+
 	ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
 	if (!ctxt)
 		return -ENOMEM;
@@ -1125,6 +1343,11 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp,
 {
 	unsigned int i;
 
+        /*
+         * Move only pages containing RPC data into rc_pages[]. Pages
+         * from a contiguous allocation that were not used for the
+         * payload remain in rq_pages[] for subsequent reuse.
+         */
 	for (i = 0; i < head->rc_page_count; i++) {
 		head->rc_pages[i] = rqstp->rq_pages[i];
 		rqstp->rq_pages[i] = NULL;
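
For intuition about how the greedy loop in svc_rdma_build_read_segment_contig() carves a segment into bvecs, here is a minimal user-space simulation (not from the patch). It assumes 4KB pages and that every capped higher-order allocation succeeds, and it ignores the physical-adjacency merging done by svc_rdma_fill_page_bvec():

/*
 * User-space simulation of the greedy bvec layout, under the assumptions
 * stated above. order_for() is a stand-in for the kernel's get_order().
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1u << PAGE_SHIFT)
#define CONTIG_MAX_ORDER 4

static unsigned int order_for(unsigned int nr_pages)
{
        unsigned int order = 0;

        while ((1u << order) < nr_pages)
                order++;
        return order;
}

static void show_layout(unsigned int seg_len)
{
        unsigned int len_left = seg_len, bvec_idx = 0;

        printf("segment of %u bytes:\n", seg_len);
        while (len_left) {
                unsigned int pages_left = (len_left + PAGE_SIZE - 1) / PAGE_SIZE;
                unsigned int chunk_pages, chunk_len;

                if (pages_left >= 2) {
                        unsigned int order = order_for(pages_left);

                        if (order > CONTIG_MAX_ORDER)
                                order = CONTIG_MAX_ORDER;
                        chunk_pages = 1u << order;
                        if (chunk_pages > pages_left)
                                chunk_pages = pages_left;
                } else {
                        chunk_pages = 1;        /* single-page path */
                }
                chunk_len = chunk_pages * PAGE_SIZE;
                if (chunk_len > len_left)
                        chunk_len = len_left;
                printf("  bvec[%u]: %u pages, %u bytes\n",
                       bvec_idx++, chunk_pages, chunk_len);
                len_left -= chunk_len;
        }
}

int main(void)
{
        show_layout(65536);             /* 16 pages -> one bvec      */
        show_layout(37 * PAGE_SIZE);    /* 37 pages -> 16 + 16 + 5   */
        show_layout(PAGE_SIZE + 100);   /* 2 pages  -> one bvec      */
        return 0;
}

Under these assumptions, a 64KB segment becomes a single 16-page bvec (the single-WR fast path), while a 37-page segment needs three bvecs because of the order-4 cap.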
