
Commit 2fb0ded

Ming Lei authored and axboe committed
ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands
Add control commands for registering and unregistering shared memory buffers for zero-copy I/O:

- UBLK_U_CMD_REG_BUF (0x18): pins pages from userspace, inserts PFN ranges into a per-device maple tree for O(log n) lookup during I/O. Buffer pointers are tracked in a per-device xarray. Returns the assigned buffer index.
- UBLK_U_CMD_UNREG_BUF (0x19): removes PFN entries and unpins pages. Queue freeze/unfreeze is handled internally so userspace need not quiesce the device during registration.

Also adds:

- UBLK_IO_F_SHMEM_ZC flag and addr encoding helpers in UAPI header (16-bit buffer index supporting up to 65536 buffers)
- Data structures (ublk_buf, ublk_buf_range) and xarray/maple tree
- __ublk_ctrl_reg_buf() helper for PFN insertion with error unwinding
- __ublk_ctrl_unreg_buf() helper for cleanup reuse
- ublk_support_shmem_zc() / ublk_dev_support_shmem_zc() stubs (returning false; feature not enabled yet)

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://patch.msgid.link/20260331153207.3635125-2-ming.lei@redhat.com
[axboe: fixup ublk_buf_reg -> ublk_shmem_buf_reg errors, comments]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent fa0cac9 commit 2fb0ded

2 files changed

Lines changed: 367 additions & 0 deletions
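
The new commands ride the existing ublk control-device io_uring command path. For orientation, here is a hedged userspace sketch of a register/unregister round trip. It is not from this commit: it assumes liburing with an SQE128 ring, a ctrl_fd open on /dev/ublk-control, and the new UAPI definitions (UBLK_U_CMD_REG_BUF, UBLK_U_CMD_UNREG_BUF, struct ublk_shmem_buf_reg with addr/len/flags fields) whose header hunk is not shown on this page; the helper names are illustrative and most error handling is elided.

/*
 * Hedged sketch, not from this commit: drive the new control commands
 * from userspace. Field usage mirrors the driver code below:
 * header->addr/len carry the ublk_shmem_buf_reg for REG, header->data[0]
 * carries the index for UNREG, and REG returns the index in cqe->res.
 */
#include <liburing.h>
#include <linux/ublk_cmd.h>
#include <stdint.h>
#include <string.h>

/* submit one ublk control command and wait for its completion */
static int ublk_ctrl_cmd(struct io_uring *ring, int ctrl_fd, __u32 cmd_op,
                         const struct ublksrv_ctrl_cmd *cmd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int ret;

        if (!sqe)
                return -1;
        memset(sqe, 0, 2 * sizeof(*sqe));       /* full SQE128 slot */
        sqe->opcode = IORING_OP_URING_CMD;
        sqe->fd = ctrl_fd;                      /* /dev/ublk-control */
        sqe->cmd_op = cmd_op;
        memcpy(sqe->cmd, cmd, sizeof(*cmd));    /* payload lives in the SQE */

        io_uring_submit(ring);
        if (io_uring_wait_cqe(ring, &cqe))
                return -1;
        ret = cqe->res;         /* REG returns the assigned buffer index */
        io_uring_cqe_seen(ring, cqe);
        return ret;
}

/* register buf (page-aligned, len a page multiple), then drop it again */
static int reg_unreg_roundtrip(struct io_uring *ring, int ctrl_fd,
                               __u32 dev_id, void *buf, __u64 len)
{
        struct ublk_shmem_buf_reg reg = {
                .addr  = (__u64)(uintptr_t)buf,
                .len   = len,
                .flags = 0,             /* or UBLK_SHMEM_BUF_READ_ONLY */
        };
        struct ublksrv_ctrl_cmd cmd = {
                .dev_id = dev_id,
                .addr   = (__u64)(uintptr_t)&reg,
                .len    = sizeof(reg),
        };
        int index = ublk_ctrl_cmd(ring, ctrl_fd, UBLK_U_CMD_REG_BUF, &cmd);

        if (index < 0)
                return index;

        cmd = (struct ublksrv_ctrl_cmd) {
                .dev_id = dev_id,
                .data   = { index },    /* UNREG takes the index here */
        };
        return ublk_ctrl_cmd(ring, ctrl_fd, UBLK_U_CMD_UNREG_BUF, &cmd);
}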

File tree

drivers/block/ublk_drv.c

Lines changed: 295 additions & 0 deletions
@@ -46,6 +46,8 @@
 #include <linux/kref.h>
 #include <linux/kfifo.h>
 #include <linux/blk-integrity.h>
+#include <linux/maple_tree.h>
+#include <linux/xarray.h>
 #include <uapi/linux/fs.h>
 #include <uapi/linux/ublk_cmd.h>

@@ -58,6 +60,8 @@
 #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
 #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
 #define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
+#define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF)
+#define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF)

 #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
 #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -289,6 +293,20 @@ struct ublk_queue {
        struct ublk_io ios[] __counted_by(q_depth);
 };

+/* Per-registered shared memory buffer */
+struct ublk_buf {
+        struct page **pages;
+        unsigned int nr_pages;
+};
+
+/* Maple tree value: maps a PFN range to buffer location */
+struct ublk_buf_range {
+        unsigned long base_pfn;
+        unsigned short buf_index;
+        unsigned short flags;
+        unsigned int base_offset;       /* byte offset within buffer */
+};
+
 struct ublk_device {
        struct gendisk *ub_disk;

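Nothing in this patch consumes the maple tree yet (the shmem-zc predicates added further down still return false), but ublk_buf_range already fixes the intended I/O-path translation: any PFN inside a registered range resolves to a buffer index plus byte offset in O(log n). A hypothetical lookup helper, sketched here only to illustrate the data structure and not part of this commit, could look like:

/*
 * Hypothetical consumer, not in this patch: translate a pinned page
 * back to its registered buffer via the per-device maple tree
 * (ub->buf_tree, added to struct ublk_device in the next hunk) that
 * __ublk_ctrl_reg_buf() populates further down.
 */
static int ublk_buf_lookup(struct ublk_device *ub, struct page *page,
                           unsigned short *buf_index, unsigned int *offset)
{
        unsigned long pfn = page_to_pfn(page);
        struct ublk_buf_range *range;

        /* one tree entry covers a whole run of consecutive PFNs */
        range = mtree_load(&ub->buf_tree, pfn);
        if (!range)
                return -ENOENT;

        *buf_index = range->buf_index;
        *offset = range->base_offset +
                  ((pfn - range->base_pfn) << PAGE_SHIFT);
        return 0;
}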
@@ -323,6 +341,10 @@ struct ublk_device {

        bool block_open;                /* protected by open_mutex */

+        /* shared memory zero copy */
+        struct maple_tree buf_tree;
+        struct xarray bufs_xa;
+
        struct ublk_queue *queues[];
 };

@@ -334,6 +356,7 @@ struct ublk_params_header {

 static void ublk_io_release(void *priv);
 static void ublk_stop_dev_unlocked(struct ublk_device *ub);
+static void ublk_buf_cleanup(struct ublk_device *ub);
 static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
 static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
                u16 q_id, u16 tag, struct ublk_io *io);
@@ -398,6 +421,16 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
        return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
 }

+static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
+{
+        return false;
+}
+
+static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
+{
+        return false;
+}
+
 static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
 {
        return ubq->flags & UBLK_F_AUTO_BUF_REG;
@@ -1460,6 +1493,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
        iod->op_flags = ublk_op | ublk_req_build_flags(req);
        iod->nr_sectors = blk_rq_sectors(req);
        iod->start_sector = blk_rq_pos(req);
+
        iod->addr = io->buf.addr;

        return BLK_STS_OK;
@@ -1665,6 +1699,7 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
 {
        unsigned mapped_bytes = ublk_map_io(ubq, req, io);

+
        /* partially mapped, update io descriptor */
        if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
                /*
@@ -4211,6 +4246,7 @@ static void ublk_cdev_rel(struct device *dev)
 {
        struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);

+        ublk_buf_cleanup(ub);
        blk_mq_free_tag_set(&ub->tag_set);
        ublk_deinit_queues(ub);
        ublk_free_dev_number(ub);
@@ -4630,6 +4666,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
        mutex_init(&ub->mutex);
        spin_lock_init(&ub->lock);
        mutex_init(&ub->cancel_mutex);
+        mt_init(&ub->buf_tree);
+        xa_init_flags(&ub->bufs_xa, XA_FLAGS_ALLOC);
        INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);

        ret = ublk_alloc_dev_number(ub, header->dev_id);
@@ -5173,6 +5211,255 @@ static int ublk_char_dev_permission(struct ublk_device *ub,
        return err;
 }

+/*
+ * Drain inflight I/O and quiesce the queue. Freeze drains all inflight
+ * requests, quiesce_nowait marks the queue so no new requests dispatch,
+ * then unfreeze allows new submissions (which won't dispatch due to
+ * quiesce). This keeps freeze and ub->mutex non-nested.
+ */
+static void ublk_quiesce_and_release(struct gendisk *disk)
+{
+        unsigned int memflags;
+
+        memflags = blk_mq_freeze_queue(disk->queue);
+        blk_mq_quiesce_queue_nowait(disk->queue);
+        blk_mq_unfreeze_queue(disk->queue, memflags);
+}
+
+static void ublk_unquiesce_and_resume(struct gendisk *disk)
+{
+        blk_mq_unquiesce_queue(disk->queue);
+}
+
+/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
+static void ublk_buf_erase_ranges(struct ublk_device *ub,
+                                  struct ublk_buf *ubuf,
+                                  unsigned long nr_pages)
+{
+        unsigned long i;
+
+        for (i = 0; i < nr_pages; ) {
+                unsigned long pfn = page_to_pfn(ubuf->pages[i]);
+                unsigned long start = i;
+
+                while (i + 1 < nr_pages &&
+                       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
+                        i++;
+                i++;
+                kfree(mtree_erase(&ub->buf_tree, pfn));
+        }
+}
+
+static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
+                               struct ublk_buf *ubuf, int index,
+                               unsigned short flags)
+{
+        unsigned long nr_pages = ubuf->nr_pages;
+        unsigned long i;
+        int ret;
+
+        for (i = 0; i < nr_pages; ) {
+                unsigned long pfn = page_to_pfn(ubuf->pages[i]);
+                unsigned long start = i;
+                struct ublk_buf_range *range;
+
+                /* Find run of consecutive PFNs */
+                while (i + 1 < nr_pages &&
+                       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
+                        i++;
+                i++;    /* past the last page in this run */
+
+                range = kzalloc(sizeof(*range), GFP_KERNEL);
+                if (!range) {
+                        ret = -ENOMEM;
+                        goto unwind;
+                }
+                range->buf_index = index;
+                range->flags = flags;
+                range->base_pfn = pfn;
+                range->base_offset = start << PAGE_SHIFT;
+
+                ret = mtree_insert_range(&ub->buf_tree, pfn,
+                                         pfn + (i - start) - 1,
+                                         range, GFP_KERNEL);
+                if (ret) {
+                        kfree(range);
+                        goto unwind;
+                }
+        }
+        return 0;
+
+unwind:
+        ublk_buf_erase_ranges(ub, ubuf, i);
+        return ret;
+}
+
+/*
+ * Register a shared memory buffer for zero-copy I/O.
+ * Pins pages, builds PFN maple tree, freezes/unfreezes the queue
+ * internally. Returns buffer index (>= 0) on success.
+ */
+static int ublk_ctrl_reg_buf(struct ublk_device *ub,
+                             struct ublksrv_ctrl_cmd *header)
+{
+        void __user *argp = (void __user *)(unsigned long)header->addr;
+        struct ublk_shmem_buf_reg buf_reg;
+        unsigned long addr, size, nr_pages;
+        unsigned int gup_flags;
+        struct gendisk *disk;
+        struct ublk_buf *ubuf;
+        long pinned;
+        u32 index;
+        int ret;
+
+        if (!ublk_dev_support_shmem_zc(ub))
+                return -EOPNOTSUPP;
+
+        memset(&buf_reg, 0, sizeof(buf_reg));
+        if (copy_from_user(&buf_reg, argp,
+                           min_t(size_t, header->len, sizeof(buf_reg))))
+                return -EFAULT;
+
+        if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
+                return -EINVAL;
+
+        addr = buf_reg.addr;
+        size = buf_reg.len;
+        nr_pages = size >> PAGE_SHIFT;
+
+        if (!size || !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr))
+                return -EINVAL;
+
+        disk = ublk_get_disk(ub);
+        if (!disk)
+                return -ENODEV;
+
+        /* Pin pages before quiescing (may sleep) */
+        ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
+        if (!ubuf) {
+                ret = -ENOMEM;
+                goto put_disk;
+        }
+
+        ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
+                                     GFP_KERNEL);
+        if (!ubuf->pages) {
+                ret = -ENOMEM;
+                goto err_free;
+        }
+
+        gup_flags = FOLL_LONGTERM;
+        if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
+                gup_flags |= FOLL_WRITE;
+
+        pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
+        if (pinned < 0) {
+                ret = pinned;
+                goto err_free_pages;
+        }
+        if (pinned != nr_pages) {
+                ret = -EFAULT;
+                goto err_unpin;
+        }
+        ubuf->nr_pages = nr_pages;
+
+        /*
+         * Drain inflight I/O and quiesce the queue so no new requests
+         * are dispatched while we modify the maple tree. Keep freeze
+         * and mutex non-nested to avoid lock dependency.
+         */
+        ublk_quiesce_and_release(disk);
+
+        mutex_lock(&ub->mutex);
+
+        ret = xa_alloc(&ub->bufs_xa, &index, ubuf, xa_limit_16b, GFP_KERNEL);
+        if (ret)
+                goto err_unlock;
+
+        ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
+        if (ret) {
+                xa_erase(&ub->bufs_xa, index);
+                goto err_unlock;
+        }
+
+        mutex_unlock(&ub->mutex);
+
+        ublk_unquiesce_and_resume(disk);
+        ublk_put_disk(disk);
+        return index;
+
+err_unlock:
+        mutex_unlock(&ub->mutex);
+        ublk_unquiesce_and_resume(disk);
+err_unpin:
+        unpin_user_pages(ubuf->pages, pinned);
+err_free_pages:
+        kvfree(ubuf->pages);
+err_free:
+        kfree(ubuf);
+put_disk:
+        ublk_put_disk(disk);
+        return ret;
+}
+
+static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
+                                  struct ublk_buf *ubuf)
+{
+        ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages);
+        unpin_user_pages(ubuf->pages, ubuf->nr_pages);
+        kvfree(ubuf->pages);
+        kfree(ubuf);
+}
+
+static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
+                               struct ublksrv_ctrl_cmd *header)
+{
+        int index = (int)header->data[0];
+        struct gendisk *disk;
+        struct ublk_buf *ubuf;
+
+        if (!ublk_dev_support_shmem_zc(ub))
+                return -EOPNOTSUPP;
+
+        disk = ublk_get_disk(ub);
+        if (!disk)
+                return -ENODEV;
+
+        /* Drain inflight I/O before modifying the maple tree */
+        ublk_quiesce_and_release(disk);
+
+        mutex_lock(&ub->mutex);
+
+        ubuf = xa_erase(&ub->bufs_xa, index);
+        if (!ubuf) {
+                mutex_unlock(&ub->mutex);
+                ublk_unquiesce_and_resume(disk);
+                ublk_put_disk(disk);
+                return -ENOENT;
+        }
+
+        __ublk_ctrl_unreg_buf(ub, ubuf);
+
+        mutex_unlock(&ub->mutex);
+
+        ublk_unquiesce_and_resume(disk);
+        ublk_put_disk(disk);
+        return 0;
+}
+
+static void ublk_buf_cleanup(struct ublk_device *ub)
+{
+        struct ublk_buf *ubuf;
+        unsigned long index;
+
+        xa_for_each(&ub->bufs_xa, index, ubuf)
+                __ublk_ctrl_unreg_buf(ub, ubuf);
+        xa_destroy(&ub->bufs_xa);
+        mtree_destroy(&ub->buf_tree);
+}
+
+
 static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
                                           u32 cmd_op, struct ublksrv_ctrl_cmd *header)
 {
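
The run-coalescing loop shared by __ublk_ctrl_reg_buf() and ublk_buf_erase_ranges() is what keeps the tree small: physically contiguous pages collapse into a single entry keyed by the first PFN of the run. A standalone userspace rendition of the same loop, with hypothetical sample PFNs and PAGE_SHIFT assumed to be 12, makes the grouping visible:

/*
 * Standalone demo of the PFN run-coalescing loop above (hypothetical
 * sample data; same control flow as __ublk_ctrl_reg_buf()).
 */
#include <stdio.h>

int main(void)
{
        /* pretend page_to_pfn() returned these for an 8-page buffer */
        unsigned long pfns[] = { 100, 101, 102, 200, 201, 300, 301, 302 };
        unsigned long nr_pages = sizeof(pfns) / sizeof(pfns[0]);
        unsigned long i;

        for (i = 0; i < nr_pages; ) {
                unsigned long pfn = pfns[i];
                unsigned long start = i;

                /* extend the run while the next PFN is consecutive */
                while (i + 1 < nr_pages && pfns[i + 1] == pfn + (i - start) + 1)
                        i++;
                i++;    /* past the last page in this run */

                /* one maple-tree entry per run: [pfn, pfn + run_len - 1] */
                printf("range [%lu, %lu], base_offset %lu bytes\n",
                       pfn, pfn + (i - start) - 1, start << 12);
        }
        return 0;       /* prints runs 100-102, 200-201, 300-302 */
}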
@@ -5230,6 +5517,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
        case UBLK_CMD_UPDATE_SIZE:
        case UBLK_CMD_QUIESCE_DEV:
        case UBLK_CMD_TRY_STOP_DEV:
+        case UBLK_CMD_REG_BUF:
+        case UBLK_CMD_UNREG_BUF:
                mask = MAY_READ | MAY_WRITE;
                break;
        default:
@@ -5355,6 +5644,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
        case UBLK_CMD_TRY_STOP_DEV:
                ret = ublk_ctrl_try_stop_dev(ub);
                break;
+        case UBLK_CMD_REG_BUF:
+                ret = ublk_ctrl_reg_buf(ub, &header);
+                break;
+        case UBLK_CMD_UNREG_BUF:
+                ret = ublk_ctrl_unreg_buf(ub, &header);
+                break;
        default:
                ret = -EOPNOTSUPP;
                break;
