4646#include <linux/kref.h>
4747#include <linux/kfifo.h>
4848#include <linux/blk-integrity.h>
49+ #include <linux/maple_tree.h>
50+ #include <linux/xarray.h>
4951#include <uapi/linux/fs.h>
5052#include <uapi/linux/ublk_cmd.h>
5153
5860#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
5961#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
6062#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
63+ #define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF)
64+ #define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF)
6165
6266#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
6367#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -289,6 +293,20 @@ struct ublk_queue {
289293 struct ublk_io ios [] __counted_by (q_depth );
290294};
291295
/*
 * Per-registered shared memory buffer: the long-term pinned user pages
 * backing one registration. Tracked in ub->bufs_xa, keyed by the index
 * returned to userspace from UBLK_CMD_REG_BUF.
 */
struct ublk_buf {
	struct page **pages;	/* pages pinned via pin_user_pages_fast() */
	unsigned int nr_pages;	/* number of valid entries in @pages */
};
301+
/*
 * Maple tree value: maps one physically-contiguous PFN run of a
 * registered buffer back to its position within that buffer. One entry
 * is inserted per run by __ublk_ctrl_reg_buf().
 */
struct ublk_buf_range {
	unsigned long base_pfn;		/* first PFN of this contiguous run */
	unsigned short buf_index;	/* index of owning buffer in ub->bufs_xa */
	unsigned short flags;		/* registration flags (UBLK_SHMEM_BUF_READ_ONLY) */
	unsigned int base_offset;	/* byte offset within buffer */
};
309+
292310struct ublk_device {
293311 struct gendisk * ub_disk ;
294312
@@ -323,6 +341,10 @@ struct ublk_device {
323341
324342 bool block_open ; /* protected by open_mutex */
325343
344+ /* shared memory zero copy */
345+ struct maple_tree buf_tree ;
346+ struct xarray bufs_xa ;
347+
326348 struct ublk_queue * queues [];
327349};
328350
@@ -334,6 +356,7 @@ struct ublk_params_header {
334356
335357static void ublk_io_release (void * priv );
336358static void ublk_stop_dev_unlocked (struct ublk_device * ub );
359+ static void ublk_buf_cleanup (struct ublk_device * ub );
337360static void ublk_abort_queue (struct ublk_device * ub , struct ublk_queue * ubq );
338361static inline struct request * __ublk_check_and_get_req (struct ublk_device * ub ,
339362 u16 q_id , u16 tag , struct ublk_io * io );
@@ -398,6 +421,16 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
398421 return ub -> dev_info .flags & UBLK_F_SUPPORT_ZERO_COPY ;
399422}
400423
/*
 * Per-queue gate for shared-memory zero-copy. Currently a stub that
 * always reports "unsupported": the registration plumbing exists but
 * the feature is not yet enabled for any queue.
 */
static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
{
	return false;
}
428+
/*
 * Per-device gate for shared-memory zero-copy. Stub returning false,
 * so UBLK_CMD_REG_BUF/UBLK_CMD_UNREG_BUF currently fail -EOPNOTSUPP.
 */
static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
{
	return false;
}
433+
401434static inline bool ublk_support_auto_buf_reg (const struct ublk_queue * ubq )
402435{
403436 return ubq -> flags & UBLK_F_AUTO_BUF_REG ;
@@ -1460,6 +1493,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
14601493 iod -> op_flags = ublk_op | ublk_req_build_flags (req );
14611494 iod -> nr_sectors = blk_rq_sectors (req );
14621495 iod -> start_sector = blk_rq_pos (req );
1496+
14631497 iod -> addr = io -> buf .addr ;
14641498
14651499 return BLK_STS_OK ;
@@ -1665,6 +1699,7 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
16651699{
16661700 unsigned mapped_bytes = ublk_map_io (ubq , req , io );
16671701
1702+
16681703 /* partially mapped, update io descriptor */
16691704 if (unlikely (mapped_bytes != blk_rq_bytes (req ))) {
16701705 /*
@@ -4211,6 +4246,7 @@ static void ublk_cdev_rel(struct device *dev)
42114246{
42124247 struct ublk_device * ub = container_of (dev , struct ublk_device , cdev_dev );
42134248
4249+ ublk_buf_cleanup (ub );
42144250 blk_mq_free_tag_set (& ub -> tag_set );
42154251 ublk_deinit_queues (ub );
42164252 ublk_free_dev_number (ub );
@@ -4630,6 +4666,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
46304666 mutex_init (& ub -> mutex );
46314667 spin_lock_init (& ub -> lock );
46324668 mutex_init (& ub -> cancel_mutex );
4669+ mt_init (& ub -> buf_tree );
4670+ xa_init_flags (& ub -> bufs_xa , XA_FLAGS_ALLOC );
46334671 INIT_WORK (& ub -> partition_scan_work , ublk_partition_scan_work );
46344672
46354673 ret = ublk_alloc_dev_number (ub , header -> dev_id );
@@ -5173,6 +5211,255 @@ static int ublk_char_dev_permission(struct ublk_device *ub,
51735211 return err ;
51745212}
51755213
/*
 * Drain inflight I/O and quiesce the queue. Freeze drains all inflight
 * requests, quiesce_nowait marks the queue so no new requests dispatch,
 * then unfreeze allows new submissions (which won't dispatch due to
 * quiesce). This keeps freeze and ub->mutex non-nested.
 */
static void ublk_quiesce_and_release(struct gendisk *disk)
{
	unsigned int memflags;

	/* waits until every inflight request has completed */
	memflags = blk_mq_freeze_queue(disk->queue);
	blk_mq_quiesce_queue_nowait(disk->queue);
	/* let submitters proceed; dispatch stays blocked by the quiesce */
	blk_mq_unfreeze_queue(disk->queue, memflags);
}
5228+
/* Undo ublk_quiesce_and_release(): re-enable request dispatch. */
static void ublk_unquiesce_and_resume(struct gendisk *disk)
{
	blk_mq_unquiesce_queue(disk->queue);
}
5233+
5234+ /* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
5235+ static void ublk_buf_erase_ranges (struct ublk_device * ub ,
5236+ struct ublk_buf * ubuf ,
5237+ unsigned long nr_pages )
5238+ {
5239+ unsigned long i ;
5240+
5241+ for (i = 0 ; i < nr_pages ; ) {
5242+ unsigned long pfn = page_to_pfn (ubuf -> pages [i ]);
5243+ unsigned long start = i ;
5244+
5245+ while (i + 1 < nr_pages &&
5246+ page_to_pfn (ubuf -> pages [i + 1 ]) == pfn + (i - start ) + 1 )
5247+ i ++ ;
5248+ i ++ ;
5249+ kfree (mtree_erase (& ub -> buf_tree , pfn ));
5250+ }
5251+ }
5252+
/*
 * Insert one ublk_buf_range into ub->buf_tree per physically-contiguous
 * run of @ubuf's pinned pages, keyed by PFN so that a PFN lookup can be
 * mapped back to (buffer index, byte offset).
 *
 * Caller holds ub->mutex with the queue quiesced. On failure every
 * range inserted so far is erased again; returns 0 or negative errno.
 */
static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
			       struct ublk_buf *ubuf, int index,
			       unsigned short flags)
{
	unsigned long nr_pages = ubuf->nr_pages;
	unsigned long i;
	int ret;

	for (i = 0; i < nr_pages; ) {
		unsigned long pfn = page_to_pfn(ubuf->pages[i]);
		unsigned long start = i;
		struct ublk_buf_range *range;

		/* Find run of consecutive PFNs */
		while (i + 1 < nr_pages &&
		       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
			i++;
		i++;	/* past the last page in this run */

		range = kzalloc(sizeof(*range), GFP_KERNEL);
		if (!range) {
			ret = -ENOMEM;
			goto unwind;
		}
		range->buf_index = index;
		range->flags = flags;
		range->base_pfn = pfn;
		/* the run starts `start` pages into the buffer */
		range->base_offset = start << PAGE_SHIFT;

		/* run length is (i - start), so the key range is inclusive */
		ret = mtree_insert_range(&ub->buf_tree, pfn,
					 pfn + (i - start) - 1,
					 range, GFP_KERNEL);
		if (ret) {
			kfree(range);
			goto unwind;
		}
	}
	return 0;

unwind:
	/*
	 * `i` also covers the run that failed; its entry was never
	 * inserted, and erasing an absent key is a harmless no-op.
	 */
	ublk_buf_erase_ranges(ub, ubuf, i);
	return ret;
}
5296+
/*
 * UBLK_CMD_REG_BUF: register a shared memory buffer for zero-copy I/O.
 *
 * Copies the registration descriptor from userspace, long-term pins the
 * page-aligned user range (writable unless UBLK_SHMEM_BUF_READ_ONLY),
 * allocates an index in ub->bufs_xa and inserts the buffer's PFN ranges
 * into ub->buf_tree. The queue is drained and quiesced around the tree
 * update and resumed afterwards; page pinning happens before quiescing
 * since it may sleep for a long time.
 *
 * Returns the allocated buffer index (>= 0) on success, negative errno
 * on failure.
 */
static int ublk_ctrl_reg_buf(struct ublk_device *ub,
			     struct ublksrv_ctrl_cmd *header)
{
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_shmem_buf_reg buf_reg;
	unsigned long addr, size, nr_pages;
	unsigned int gup_flags;
	struct gendisk *disk;
	struct ublk_buf *ubuf;
	long pinned;
	u32 index;
	int ret;

	if (!ublk_dev_support_shmem_zc(ub))
		return -EOPNOTSUPP;

	/* zero-fill first so a short user copy leaves trailing fields zeroed */
	memset(&buf_reg, 0, sizeof(buf_reg));
	if (copy_from_user(&buf_reg, argp,
			   min_t(size_t, header->len, sizeof(buf_reg))))
		return -EFAULT;

	/* only the read-only flag is defined for registration */
	if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
		return -EINVAL;

	addr = buf_reg.addr;
	size = buf_reg.len;
	nr_pages = size >> PAGE_SHIFT;

	/* buffer must be non-empty and page-aligned in address and length */
	if (!size || !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr))
		return -EINVAL;

	disk = ublk_get_disk(ub);
	if (!disk)
		return -ENODEV;

	/* Pin pages before quiescing (may sleep) */
	ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
	if (!ubuf) {
		ret = -ENOMEM;
		goto put_disk;
	}

	/* kvmalloc_array checks the nr_pages * ptr-size multiply for overflow */
	ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
				     GFP_KERNEL);
	if (!ubuf->pages) {
		ret = -ENOMEM;
		goto err_free;
	}

	/* long-term pin; writable unless userspace asked for read-only */
	gup_flags = FOLL_LONGTERM;
	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
		gup_flags |= FOLL_WRITE;

	pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
	if (pinned < 0) {
		ret = pinned;
		goto err_free_pages;
	}
	/* partial pin: release what we got and fail the registration */
	if (pinned != nr_pages) {
		ret = -EFAULT;
		goto err_unpin;
	}
	ubuf->nr_pages = nr_pages;

	/*
	 * Drain inflight I/O and quiesce the queue so no new requests
	 * are dispatched while we modify the maple tree. Keep freeze
	 * and mutex non-nested to avoid lock dependency.
	 */
	ublk_quiesce_and_release(disk);

	mutex_lock(&ub->mutex);

	/* xa_limit_16b keeps the index within ublk_buf_range.buf_index */
	ret = xa_alloc(&ub->bufs_xa, &index, ubuf, xa_limit_16b, GFP_KERNEL);
	if (ret)
		goto err_unlock;

	ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
	if (ret) {
		xa_erase(&ub->bufs_xa, index);
		goto err_unlock;
	}

	mutex_unlock(&ub->mutex);

	ublk_unquiesce_and_resume(disk);
	ublk_put_disk(disk);
	return index;

err_unlock:
	/* falls through err_unpin -> err_free_pages -> err_free -> put_disk */
	mutex_unlock(&ub->mutex);
	ublk_unquiesce_and_resume(disk);
err_unpin:
	/* pinned == nr_pages when arriving from err_unlock */
	unpin_user_pages(ubuf->pages, pinned);
err_free_pages:
	kvfree(ubuf->pages);
err_free:
	kfree(ubuf);
put_disk:
	ublk_put_disk(disk);
	return ret;
}
5404+
/*
 * Tear down one registered buffer: erase its PFN ranges from the maple
 * tree, unpin the user pages and free the tracking structures. Caller
 * must already have removed @ubuf from ub->bufs_xa and hold ub->mutex
 * (or be the final device release path).
 *
 * NOTE(review): pages pinned with FOLL_WRITE that the device wrote to
 * are not marked dirty here — unpin_user_pages_dirty_lock() may be
 * required for shmem/file-backed mappings; confirm intended semantics.
 */
static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
				  struct ublk_buf *ubuf)
{
	ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages);
	unpin_user_pages(ubuf->pages, ubuf->nr_pages);
	kvfree(ubuf->pages);
	kfree(ubuf);
}
5413+
5414+ static int ublk_ctrl_unreg_buf (struct ublk_device * ub ,
5415+ struct ublksrv_ctrl_cmd * header )
5416+ {
5417+ int index = (int )header -> data [0 ];
5418+ struct gendisk * disk ;
5419+ struct ublk_buf * ubuf ;
5420+
5421+ if (!ublk_dev_support_shmem_zc (ub ))
5422+ return - EOPNOTSUPP ;
5423+
5424+ disk = ublk_get_disk (ub );
5425+ if (!disk )
5426+ return - ENODEV ;
5427+
5428+ /* Drain inflight I/O before modifying the maple tree */
5429+ ublk_quiesce_and_release (disk );
5430+
5431+ mutex_lock (& ub -> mutex );
5432+
5433+ ubuf = xa_erase (& ub -> bufs_xa , index );
5434+ if (!ubuf ) {
5435+ mutex_unlock (& ub -> mutex );
5436+ ublk_unquiesce_and_resume (disk );
5437+ ublk_put_disk (disk );
5438+ return - ENOENT ;
5439+ }
5440+
5441+ __ublk_ctrl_unreg_buf (ub , ubuf );
5442+
5443+ mutex_unlock (& ub -> mutex );
5444+
5445+ ublk_unquiesce_and_resume (disk );
5446+ ublk_put_disk (disk );
5447+ return 0 ;
5448+ }
5449+
5450+ static void ublk_buf_cleanup (struct ublk_device * ub )
5451+ {
5452+ struct ublk_buf * ubuf ;
5453+ unsigned long index ;
5454+
5455+ xa_for_each (& ub -> bufs_xa , index , ubuf )
5456+ __ublk_ctrl_unreg_buf (ub , ubuf );
5457+ xa_destroy (& ub -> bufs_xa );
5458+ mtree_destroy (& ub -> buf_tree );
5459+ }
5460+
5461+
5462+
51765463static int ublk_ctrl_uring_cmd_permission (struct ublk_device * ub ,
51775464 u32 cmd_op , struct ublksrv_ctrl_cmd * header )
51785465{
@@ -5230,6 +5517,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
52305517 case UBLK_CMD_UPDATE_SIZE :
52315518 case UBLK_CMD_QUIESCE_DEV :
52325519 case UBLK_CMD_TRY_STOP_DEV :
5520+ case UBLK_CMD_REG_BUF :
5521+ case UBLK_CMD_UNREG_BUF :
52335522 mask = MAY_READ | MAY_WRITE ;
52345523 break ;
52355524 default :
@@ -5355,6 +5644,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
53555644 case UBLK_CMD_TRY_STOP_DEV :
53565645 ret = ublk_ctrl_try_stop_dev (ub );
53575646 break ;
5647+ case UBLK_CMD_REG_BUF :
5648+ ret = ublk_ctrl_reg_buf (ub , & header );
5649+ break ;
5650+ case UBLK_CMD_UNREG_BUF :
5651+ ret = ublk_ctrl_unreg_buf (ub , & header );
5652+ break ;
53585653 default :
53595654 ret = - EOPNOTSUPP ;
53605655 break ;
0 commit comments