Skip to content

Commit ff16884

Browse files
joannekoong authored and axboe committed
io_uring/kbuf: add support for kernel-managed buffer rings
Add support for kernel-managed buffer rings (kmbuf rings), which allow the kernel to allocate and manage the backing buffers for a buffer ring, rather than requiring the application to provide and manage them. Internally, the IOBL_KERNEL_MANAGED flag marks buffer lists as kernel-managed for appropriate handling in the I/O path. At the uapi level, kernel-managed buffer rings are created through the pbuf interface with the IOU_PBUF_RING_KERNEL_MANAGED flag set. The io_uring_buf_reg struct is modified to allow taking in a buf_size instead of a ring_addr. To create a kernel-managed buffer ring, the caller must set the IOU_PBUF_RING_MMAP flag as well to indicate that the kernel will allocate the memory for the ring. When the caller mmaps the ring, they will get back a virtual mapping to the buffer memory. Signed-off-by: Joanne Koong <joannelkoong@gmail.com> Link: https://patch.msgid.link/20260306003224.3620942-2-joannelkoong@gmail.com Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent f41b075 commit ff16884

5 files changed

Lines changed: 215 additions & 19 deletions

File tree

include/uapi/linux/io_uring.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -893,15 +893,29 @@ struct io_uring_buf_ring {
893893
* use of it will consume only as much as it needs. This
894894
* requires that both the kernel and application keep
895895
* track of where the current read/recv index is at.
896+
* IOU_PBUF_RING_KERNEL_MANAGED: If set, kernel allocates and manages the memory
897+
* for the ring and its buffers. The application must set
898+
* the buffer size through reg->buf_size and the size must
899+
* be page-aligned. When the application subsequently calls
900+
* mmap(2) with
901+
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT),
902+
* the virtual mapping returned is a contiguous mapping of
903+
* the buffers. If set, IOU_PBUF_RING_MMAP must be set as
904+
* well.
896905
*/
897906
/* flag values for io_uring_buf_reg->flags; see the comment block above */
enum io_uring_register_pbuf_ring_flags {
	/* kernel allocates the ring memory; the app mmap(2)s it */
	IOU_PBUF_RING_MMAP		= 1,
	/* buffers are consumed incrementally rather than always fully */
	IOU_PBUF_RING_INC		= 2,
	/* kernel allocates and manages the ring and its buffers;
	 * requires IOU_PBUF_RING_MMAP as well */
	IOU_PBUF_RING_KERNEL_MANAGED	= 4,
};
901911

902912
/* argument for IORING_(UN)REGISTER_PBUF_RING */
903913
struct io_uring_buf_reg {
904-
__u64 ring_addr;
914+
union {
915+
__u64 ring_addr;
916+
/* used if reg->flags & IOU_PBUF_RING_KERNEL_MANAGED */
917+
__u32 buf_size;
918+
};
905919
__u32 ring_entries;
906920
__u16 bgid;
907921
__u16 flags;

io_uring/kbuf.c

Lines changed: 81 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -436,10 +436,13 @@ static int io_remove_buffers_legacy(struct io_ring_ctx *ctx,
436436

437437
static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
438438
{
439-
if (bl->flags & IOBL_BUF_RING)
439+
if (bl->flags & IOBL_BUF_RING) {
440440
io_free_region(ctx->user, &bl->region);
441-
else
441+
if (bl->flags & IOBL_KERNEL_MANAGED)
442+
kfree(bl->buf_ring);
443+
} else {
442444
io_remove_buffers_legacy(ctx, bl, -1U);
445+
}
443446

444447
kfree(bl);
445448
}
@@ -605,14 +608,53 @@ int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags)
605608
return IOU_COMPLETE;
606609
}
607610

611+
/*
 * Set up a kernel-managed buffer ring: the kernel allocates both the
 * io_uring_buf_ring structure and the backing buffer memory, and
 * pre-populates every ring entry so all buffers start out available.
 * Returns 0 on success or a negative errno.
 */
static int io_setup_kmbuf_ring(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl,
			       const struct io_uring_buf_reg *reg)
{
	struct io_uring_buf_ring *ring;
	unsigned long ring_size;
	void *buf_region;
	unsigned int i;
	int ret;

	/* allocate pages for the ring structure */
	ring_size = flex_array_size(ring, bufs, reg->ring_entries);
	ring = kzalloc(ring_size, GFP_KERNEL_ACCOUNT);
	if (!ring)
		return -ENOMEM;

	/* allocate ring_entries discrete buffers of buf_size bytes each */
	ret = io_create_region_multi_buf(ctx, &bl->region, reg->ring_entries,
					 reg->buf_size);
	if (ret) {
		kfree(ring);
		return ret;
	}

	/* initialize ring buf entries to point to the buffers */
	buf_region = bl->region.ptr;
	for (i = 0; i < reg->ring_entries; i++) {
		struct io_uring_buf *buf = &ring->bufs[i];

		buf->addr = (u64)(uintptr_t)buf_region;
		buf->len = reg->buf_size;
		buf->bid = i;	/* buffer id == ring slot index */

		buf_region += reg->buf_size;
	}
	/* tail == ring_entries: every buffer is initially available */
	ring->tail = reg->ring_entries;

	bl->buf_ring = ring;
	/* mark kernel-managed so teardown also frees bl->buf_ring */
	bl->flags |= IOBL_KERNEL_MANAGED;

	return 0;
}
652+
608653
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
609654
{
610655
struct io_uring_buf_reg reg;
611656
struct io_buffer_list *bl;
612-
struct io_uring_region_desc rd;
613657
struct io_uring_buf_ring *br;
614-
unsigned long mmap_offset;
615-
unsigned long ring_size;
616658
int ret;
617659

618660
lockdep_assert_held(&ctx->uring_lock);
@@ -621,14 +663,25 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
621663
return -EFAULT;
622664
if (!mem_is_zero(reg.resv, sizeof(reg.resv)))
623665
return -EINVAL;
624-
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
666+
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC |
667+
IOU_PBUF_RING_KERNEL_MANAGED))
625668
return -EINVAL;
626669
if (!is_power_of_2(reg.ring_entries))
627670
return -EINVAL;
628671
/* cannot disambiguate full vs empty due to head/tail size */
629672
if (reg.ring_entries >= 65536)
630673
return -EINVAL;
631674

675+
if (reg.flags & IOU_PBUF_RING_KERNEL_MANAGED) {
676+
if (!(reg.flags & IOU_PBUF_RING_MMAP))
677+
return -EINVAL;
678+
/* not yet supported */
679+
if (reg.flags & IOU_PBUF_RING_INC)
680+
return -EINVAL;
681+
if (!reg.buf_size || !PAGE_ALIGNED(reg.buf_size))
682+
return -EINVAL;
683+
}
684+
632685
bl = io_buffer_get_list(ctx, reg.bgid);
633686
if (bl) {
634687
/* if mapped buffer ring OR classic exists, don't allow */
@@ -641,19 +694,30 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
641694
if (!bl)
642695
return -ENOMEM;
643696

644-
mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
645-
ring_size = flex_array_size(br, bufs, reg.ring_entries);
697+
if (!(reg.flags & IOU_PBUF_RING_KERNEL_MANAGED)) {
698+
struct io_uring_region_desc rd;
699+
unsigned long mmap_offset;
700+
unsigned long ring_size;
701+
702+
mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
703+
ring_size = flex_array_size(br, bufs, reg.ring_entries);
646704

647-
memset(&rd, 0, sizeof(rd));
648-
rd.size = PAGE_ALIGN(ring_size);
649-
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
650-
rd.user_addr = reg.ring_addr;
651-
rd.flags |= IORING_MEM_REGION_TYPE_USER;
705+
memset(&rd, 0, sizeof(rd));
706+
rd.size = PAGE_ALIGN(ring_size);
707+
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
708+
rd.user_addr = reg.ring_addr;
709+
rd.flags |= IORING_MEM_REGION_TYPE_USER;
710+
}
711+
ret = io_create_region(ctx, &bl->region, &rd, mmap_offset);
712+
if (!ret)
713+
bl->buf_ring = io_region_get_ptr(&bl->region);
714+
} else {
715+
ret = io_setup_kmbuf_ring(ctx, bl, &reg);
652716
}
653-
ret = io_create_region(ctx, &bl->region, &rd, mmap_offset);
654717
if (ret)
655718
goto fail;
656-
br = io_region_get_ptr(&bl->region);
719+
720+
br = bl->buf_ring;
657721

658722
#ifdef SHM_COLOUR
659723
/*
@@ -675,14 +739,15 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
675739
bl->nr_entries = reg.ring_entries;
676740
bl->mask = reg.ring_entries - 1;
677741
bl->flags |= IOBL_BUF_RING;
678-
bl->buf_ring = br;
679742
if (reg.flags & IOU_PBUF_RING_INC)
680743
bl->flags |= IOBL_INC;
681744
ret = io_buffer_add_list(ctx, bl, reg.bgid);
682745
if (!ret)
683746
return 0;
684747
fail:
685748
io_free_region(ctx->user, &bl->region);
749+
if (bl->flags & IOBL_KERNEL_MANAGED)
750+
kfree(bl->buf_ring);
686751
kfree(bl);
687752
return ret;
688753
}

io_uring/kbuf.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@
77

88
/* internal flag bits for io_buffer_list->flags */
enum {
	/* ring mapped provided buffers */
	IOBL_BUF_RING		= 1,
	/* buffers are consumed incrementally rather than always fully */
	IOBL_INC		= 2,
	/* buffers are kernel managed */
	IOBL_KERNEL_MANAGED	= 4,
};
1416

1517
struct io_buffer_list {

io_uring/memmap.c

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,28 @@
1515
#include "rsrc.h"
1616
#include "zcrx.h"
1717

18+
/*
 * Drop the references held on a page array that came from multiple
 * discrete allocations (IO_REGION_F_MULTI_BUF).  The walk advances by
 * compound_nr(page) per step, so a compound allocation is released once
 * through its head page rather than per sub-page.
 *
 * The array may be only partially filled (allocation failed midway); a
 * NULL slot terminates the walk.  A non-head page, or an allocation that
 * claims more pages than remain, indicates corruption and warns.
 */
static void release_multi_buf_pages(struct page **pages, unsigned long nr_pages)
{
	struct page *page;
	unsigned int nr, i = 0;

	while (nr_pages) {
		page = pages[i];

		/* stop at unfilled slots; each stride must start at a head page */
		if (!page || WARN_ON_ONCE(page != compound_head(page)))
			return;

		nr = compound_nr(page);	/* pages covered by this allocation */
		put_page(page);

		/* allocation size exceeds what is left in the array */
		if (WARN_ON_ONCE(nr > nr_pages))
			return;

		i += nr;
		nr_pages -= nr;
	}
}
39+
1840
static bool io_mem_alloc_compound(struct page **pages, int nr_pages,
1941
size_t size, gfp_t gfp)
2042
{
@@ -86,6 +108,8 @@ enum {
86108
IO_REGION_F_USER_PROVIDED = 2,
87109
/* only the first page in the array is ref'ed */
88110
IO_REGION_F_SINGLE_REF = 4,
111+
/* pages in the array belong to multiple discrete allocations */
112+
IO_REGION_F_MULTI_BUF = 8,
89113
};
90114

91115
void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
@@ -98,6 +122,8 @@ void io_free_region(struct user_struct *user, struct io_mapped_region *mr)
98122

99123
if (mr->flags & IO_REGION_F_USER_PROVIDED)
100124
unpin_user_pages(mr->pages, nr_refs);
125+
else if (mr->flags & IO_REGION_F_MULTI_BUF)
126+
release_multi_buf_pages(mr->pages, nr_refs);
101127
else
102128
release_pages(mr->pages, nr_refs);
103129

@@ -149,6 +175,54 @@ static int io_region_pin_pages(struct io_mapped_region *mr,
149175
return 0;
150176
}
151177

178+
/*
 * Allocate backing pages for a multi-buffer region: nr_bufs discrete
 * buffers of buf_size bytes each (buf_size must be page-aligned).  For
 * each buffer, a single compound allocation is attempted first, falling
 * back to a bulk allocation of order-0 pages.  On success the page array
 * is installed in mr->pages and the region is flagged
 * IO_REGION_F_MULTI_BUF so teardown uses release_multi_buf_pages().
 * Returns 0 on success or a negative errno.
 */
static int io_region_allocate_pages_multi_buf(struct io_mapped_region *mr,
					      unsigned int nr_bufs,
					      unsigned int buf_size)
{
	gfp_t gfp = GFP_USER | __GFP_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
	struct page **pages, **cur_pages;
	unsigned int nr_allocated;
	unsigned int buf_pages;
	unsigned int i;

	if (!PAGE_ALIGNED(buf_size))
		return -EINVAL;

	buf_pages = buf_size >> PAGE_SHIFT;

	/*
	 * __GFP_ZERO zeroes the pointer array too, so on a partial failure
	 * the unfilled tail is NULL-terminated for release_multi_buf_pages().
	 */
	pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp);
	if (!pages)
		return -ENOMEM;

	cur_pages = pages;

	for (i = 0; i < nr_bufs; i++) {
		/* prefer one compound allocation per buffer */
		if (io_mem_alloc_compound(cur_pages, buf_pages, buf_size,
					  gfp)) {
			cur_pages += buf_pages;
			continue;
		}

		/* fall back to individual pages for this buffer */
		nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE,
						     buf_pages, cur_pages);
		if (nr_allocated != buf_pages) {
			unsigned int total =
				(cur_pages - pages) + nr_allocated;

			/* release everything allocated so far */
			release_multi_buf_pages(pages, total);
			kvfree(pages);
			return -ENOMEM;
		}

		cur_pages += buf_pages;
	}

	mr->flags |= IO_REGION_F_MULTI_BUF;
	mr->pages = pages;

	return 0;
}
225+
152226
static int io_region_allocate_pages(struct io_mapped_region *mr,
153227
struct io_uring_region_desc *reg,
154228
unsigned long mmap_offset)
@@ -181,6 +255,43 @@ static int io_region_allocate_pages(struct io_mapped_region *mr,
181255
return 0;
182256
}
183257

258+
/*
 * Create a region backed by nr_bufs discrete, kernel-allocated buffers
 * of buf_size bytes each: account the memory against ctx->user (if any),
 * allocate the pages, then establish the region's kernel pointer
 * (mr->ptr) via io_region_init_ptr().  On failure everything is unwound
 * through io_free_region().  Returns 0 on success or a negative errno.
 */
int io_create_region_multi_buf(struct io_ring_ctx *ctx,
			       struct io_mapped_region *mr,
			       unsigned int nr_bufs, unsigned int buf_size)
{
	unsigned int nr_pages;
	int ret;

	/* region must not already be set up */
	if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages))
		return -EFAULT;

	if (WARN_ON_ONCE(!nr_bufs || !buf_size || !PAGE_ALIGNED(buf_size)))
		return -EINVAL;

	/* total page count; reject if it overflows unsigned int */
	if (check_mul_overflow(buf_size >> PAGE_SHIFT, nr_bufs, &nr_pages))
		return -EINVAL;

	/* charge memory accounting before allocating anything */
	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
	}
	/* set before allocation so the error path can unaccount/release */
	mr->nr_pages = nr_pages;

	ret = io_region_allocate_pages_multi_buf(mr, nr_bufs, buf_size);
	if (ret)
		goto out_free;

	ret = io_region_init_ptr(mr);
	if (ret)
		goto out_free;

	return 0;
out_free:
	io_free_region(ctx->user, mr);
	return ret;
}
294+
184295
int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
185296
struct io_uring_region_desc *reg,
186297
unsigned long mmap_offset)

io_uring/memmap.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
2222
struct io_uring_region_desc *reg,
2323
unsigned long mmap_offset);
2424

25+
int io_create_region_multi_buf(struct io_ring_ctx *ctx,
26+
struct io_mapped_region *mr,
27+
unsigned int nr_bufs, unsigned int buf_size);
28+
2529
static inline void *io_region_get_ptr(struct io_mapped_region *mr)
2630
{
2731
return mr->ptr;

0 commit comments

Comments
 (0)