Skip to content

Commit b595ad7

Browse files
committed
Merge branch 'for-7.1/io_uring' into for-next
* for-7.1/io_uring: (54 commits)
  io_uring/timeout: use 'ctx' consistently
  io_uring/rw: clean up __io_read() obsolete comment and early returns
  io_uring/zcrx: use correct mmap off constants
  io_uring/zcrx: use dma_len for chunk size calculation
  io_uring/zcrx: don't clear not allocated niovs
  io_uring/zcrx: don't use mark0 for allocating xarray
  io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
  io_uring/zcrx: reject REG_NODEV with large rx_buf_size
  io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
  io_uring/rsrc: use io_cache_free() to free node
  io_uring/zcrx: rename zcrx [un]register functions
  io_uring/zcrx: check ctrl op payload struct sizes
  io_uring/zcrx: cache fallback availability in zcrx ctx
  io_uring/zcrx: warn on a repeated area append
  io_uring/zcrx: consolidate dma syncing
  io_uring/zcrx: netmem array as refiling format
  io_uring/zcrx: warn on alloc with non-empty pp cache
  io_uring/zcrx: move count check into zcrx_get_free_niov
  io_uring/zcrx: use guards for locking
  io_uring/zcrx: add a struct for refill queue
  ...
2 parents 83b7880 + f847bf6 commit b595ad7

34 files changed

Lines changed: 1085 additions & 526 deletions

drivers/nvme/host/ioctl.c

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -786,10 +786,6 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
786786
struct nvme_ctrl *ctrl = ioucmd->file->private_data;
787787
int ret;
788788

789-
/* IOPOLL not supported yet */
790-
if (issue_flags & IO_URING_F_IOPOLL)
791-
return -EOPNOTSUPP;
792-
793789
ret = nvme_uring_cmd_checks(issue_flags);
794790
if (ret)
795791
return ret;

include/linux/io_uring_types.h

Lines changed: 33 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
#include <linux/llist.h>
99
#include <uapi/linux/io_uring.h>
1010

11+
struct iou_loop_params;
12+
struct io_uring_bpf_ops;
13+
1114
enum {
1215
/*
1316
* A hint to not wake right away but delay until there are enough of
@@ -41,6 +44,8 @@ enum io_uring_cmd_flags {
4144
IO_URING_F_COMPAT = (1 << 12),
4245
};
4346

47+
struct iou_loop_params;
48+
4449
struct io_wq_work_node {
4550
struct io_wq_work_node *next;
4651
};
@@ -268,24 +273,30 @@ struct io_alloc_cache {
268273
unsigned int init_clear;
269274
};
270275

276+
enum {
277+
IO_RING_F_DRAIN_NEXT = BIT(0),
278+
IO_RING_F_OP_RESTRICTED = BIT(1),
279+
IO_RING_F_REG_RESTRICTED = BIT(2),
280+
IO_RING_F_OFF_TIMEOUT_USED = BIT(3),
281+
IO_RING_F_DRAIN_ACTIVE = BIT(4),
282+
IO_RING_F_HAS_EVFD = BIT(5),
283+
/* all CQEs should be posted only by the submitter task */
284+
IO_RING_F_TASK_COMPLETE = BIT(6),
285+
IO_RING_F_LOCKLESS_CQ = BIT(7),
286+
IO_RING_F_SYSCALL_IOPOLL = BIT(8),
287+
IO_RING_F_POLL_ACTIVATED = BIT(9),
288+
IO_RING_F_DRAIN_DISABLED = BIT(10),
289+
IO_RING_F_COMPAT = BIT(11),
290+
IO_RING_F_IOWQ_LIMITS_SET = BIT(12),
291+
};
292+
271293
struct io_ring_ctx {
272294
/* const or read-mostly hot data */
273295
struct {
296+
/* ring setup flags */
274297
unsigned int flags;
275-
unsigned int drain_next: 1;
276-
unsigned int op_restricted: 1;
277-
unsigned int reg_restricted: 1;
278-
unsigned int off_timeout_used: 1;
279-
unsigned int drain_active: 1;
280-
unsigned int has_evfd: 1;
281-
/* all CQEs should be posted only by the submitter task */
282-
unsigned int task_complete: 1;
283-
unsigned int lockless_cq: 1;
284-
unsigned int syscall_iopoll: 1;
285-
unsigned int poll_activated: 1;
286-
unsigned int drain_disabled: 1;
287-
unsigned int compat: 1;
288-
unsigned int iowq_limits_set : 1;
298+
/* internal state flags (IO_RING_F_*), mostly read-only */
299+
unsigned int int_flags;
289300

290301
struct task_struct *submitter_task;
291302
struct io_rings *rings;
@@ -355,6 +366,9 @@ struct io_ring_ctx {
355366
struct io_alloc_cache rw_cache;
356367
struct io_alloc_cache cmd_cache;
357368

369+
int (*loop_step)(struct io_ring_ctx *ctx,
370+
struct iou_loop_params *);
371+
358372
/*
359373
* Any cancelable uring_cmd is added to this list in
360374
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
@@ -477,6 +491,8 @@ struct io_ring_ctx {
477491
DECLARE_HASHTABLE(napi_ht, 4);
478492
#endif
479493

494+
struct io_uring_bpf_ops *bpf_ops;
495+
480496
/*
481497
* Protection for resize vs mmap races - both the mmap and resize
482498
* side will need to grab this lock, to prevent either side from
@@ -545,6 +561,7 @@ enum {
545561
REQ_F_HAS_METADATA_BIT,
546562
REQ_F_IMPORT_BUFFER_BIT,
547563
REQ_F_SQE_COPIED_BIT,
564+
REQ_F_IOPOLL_BIT,
548565

549566
/* not a real bit, just to check we're not overflowing the space */
550567
__REQ_F_LAST_BIT,
@@ -638,6 +655,8 @@ enum {
638655
REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
639656
/* ->sqe_copy() has been called, if necessary */
640657
REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
658+
/* request must be iopolled to completion (set in ->issue()) */
659+
REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT),
641660
};
642661

643662
struct io_tw_req {

include/uapi/linux/io_uring.h

Lines changed: 7 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
#include <linux/fs.h>
1212
#include <linux/types.h>
13+
#include <linux/io_uring/zcrx.h>
14+
1315
/*
1416
* this file is shared with liburing and that has to autodetect
1517
* if linux/time_types.h is available or not, it can
@@ -341,6 +343,10 @@ enum io_uring_op {
341343

342344
/*
343345
* sqe->timeout_flags
346+
*
347+
* IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout
348+
* value in nanoseconds instead of
349+
* pointing to a timespec.
344350
*/
345351
#define IORING_TIMEOUT_ABS (1U << 0)
346352
#define IORING_TIMEOUT_UPDATE (1U << 1)
@@ -349,6 +355,7 @@ enum io_uring_op {
349355
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
350356
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
351357
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
358+
#define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7)
352359
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
353360
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
354361
/*
@@ -1050,100 +1057,6 @@ struct io_timespec {
10501057
__u64 tv_nsec;
10511058
};
10521059

1053-
/* Zero copy receive refill queue entry */
1054-
struct io_uring_zcrx_rqe {
1055-
__u64 off;
1056-
__u32 len;
1057-
__u32 __pad;
1058-
};
1059-
1060-
struct io_uring_zcrx_cqe {
1061-
__u64 off;
1062-
__u64 __pad;
1063-
};
1064-
1065-
/* The bit from which area id is encoded into offsets */
1066-
#define IORING_ZCRX_AREA_SHIFT 48
1067-
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
1068-
1069-
struct io_uring_zcrx_offsets {
1070-
__u32 head;
1071-
__u32 tail;
1072-
__u32 rqes;
1073-
__u32 __resv2;
1074-
__u64 __resv[2];
1075-
};
1076-
1077-
enum io_uring_zcrx_area_flags {
1078-
IORING_ZCRX_AREA_DMABUF = 1,
1079-
};
1080-
1081-
struct io_uring_zcrx_area_reg {
1082-
__u64 addr;
1083-
__u64 len;
1084-
__u64 rq_area_token;
1085-
__u32 flags;
1086-
__u32 dmabuf_fd;
1087-
__u64 __resv2[2];
1088-
};
1089-
1090-
enum zcrx_reg_flags {
1091-
ZCRX_REG_IMPORT = 1,
1092-
};
1093-
1094-
enum zcrx_features {
1095-
/*
1096-
* The user can ask for the desired rx page size by passing the
1097-
* value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
1098-
*/
1099-
ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
1100-
};
1101-
1102-
/*
1103-
* Argument for IORING_REGISTER_ZCRX_IFQ
1104-
*/
1105-
struct io_uring_zcrx_ifq_reg {
1106-
__u32 if_idx;
1107-
__u32 if_rxq;
1108-
__u32 rq_entries;
1109-
__u32 flags;
1110-
1111-
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
1112-
__u64 region_ptr; /* struct io_uring_region_desc * */
1113-
1114-
struct io_uring_zcrx_offsets offsets;
1115-
__u32 zcrx_id;
1116-
__u32 rx_buf_len;
1117-
__u64 __resv[3];
1118-
};
1119-
1120-
enum zcrx_ctrl_op {
1121-
ZCRX_CTRL_FLUSH_RQ,
1122-
ZCRX_CTRL_EXPORT,
1123-
1124-
__ZCRX_CTRL_LAST,
1125-
};
1126-
1127-
struct zcrx_ctrl_flush_rq {
1128-
__u64 __resv[6];
1129-
};
1130-
1131-
struct zcrx_ctrl_export {
1132-
__u32 zcrx_fd;
1133-
__u32 __resv1[11];
1134-
};
1135-
1136-
struct zcrx_ctrl {
1137-
__u32 zcrx_id;
1138-
__u32 op; /* see enum zcrx_ctrl_op */
1139-
__u64 __resv[2];
1140-
1141-
union {
1142-
struct zcrx_ctrl_export zc_export;
1143-
struct zcrx_ctrl_flush_rq zc_flush;
1144-
};
1145-
};
1146-
11471060
#ifdef __cplusplus
11481061
}
11491062
#endif

include/uapi/linux/io_uring/zcrx.h

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
2+
/*
3+
* Header file for the io_uring zerocopy receive (zcrx) interface.
4+
*
5+
* Copyright (C) 2026 Pavel Begunkov
6+
* Copyright (C) 2026 David Wei
7+
* Copyright (C) Meta Platforms, Inc.
8+
*/
9+
#ifndef LINUX_IO_ZCRX_H
10+
#define LINUX_IO_ZCRX_H
11+
12+
#include <linux/types.h>
13+
14+
/* Zero copy receive refill queue entry */
15+
struct io_uring_zcrx_rqe {
16+
__u64 off;
17+
__u32 len;
18+
__u32 __pad;
19+
};
20+
21+
struct io_uring_zcrx_cqe {
22+
__u64 off;
23+
__u64 __pad;
24+
};
25+
26+
/* The bit from which area id is encoded into offsets */
27+
#define IORING_ZCRX_AREA_SHIFT 48
28+
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1))
29+
30+
struct io_uring_zcrx_offsets {
31+
__u32 head;
32+
__u32 tail;
33+
__u32 rqes;
34+
__u32 __resv2;
35+
__u64 __resv[2];
36+
};
37+
38+
enum io_uring_zcrx_area_flags {
39+
IORING_ZCRX_AREA_DMABUF = 1,
40+
};
41+
42+
struct io_uring_zcrx_area_reg {
43+
__u64 addr;
44+
__u64 len;
45+
__u64 rq_area_token;
46+
__u32 flags;
47+
__u32 dmabuf_fd;
48+
__u64 __resv2[2];
49+
};
50+
51+
enum zcrx_reg_flags {
52+
ZCRX_REG_IMPORT = 1,
53+
54+
/*
55+
* Register a zcrx instance without a net device. All data will be
56+
* copied. The refill queue entries might not be automatically
57+
* consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ.
58+
*/
59+
ZCRX_REG_NODEV = 2,
60+
};
61+
62+
enum zcrx_features {
63+
/*
64+
* The user can ask for the desired rx page size by passing the
65+
* value in struct io_uring_zcrx_ifq_reg::rx_buf_len.
66+
*/
67+
ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0,
68+
};
69+
70+
/*
71+
* Argument for IORING_REGISTER_ZCRX_IFQ
72+
*/
73+
struct io_uring_zcrx_ifq_reg {
74+
__u32 if_idx;
75+
__u32 if_rxq;
76+
__u32 rq_entries;
77+
__u32 flags;
78+
79+
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */
80+
__u64 region_ptr; /* struct io_uring_region_desc * */
81+
82+
struct io_uring_zcrx_offsets offsets;
83+
__u32 zcrx_id;
84+
__u32 rx_buf_len;
85+
__u64 __resv[3];
86+
};
87+
88+
enum zcrx_ctrl_op {
89+
ZCRX_CTRL_FLUSH_RQ,
90+
ZCRX_CTRL_EXPORT,
91+
92+
__ZCRX_CTRL_LAST,
93+
};
94+
95+
struct zcrx_ctrl_flush_rq {
96+
__u64 __resv[6];
97+
};
98+
99+
struct zcrx_ctrl_export {
100+
__u32 zcrx_fd;
101+
__u32 __resv1[11];
102+
};
103+
104+
struct zcrx_ctrl {
105+
__u32 zcrx_id;
106+
__u32 op; /* see enum zcrx_ctrl_op */
107+
__u64 __resv[2];
108+
109+
union {
110+
struct zcrx_ctrl_export zc_export;
111+
struct zcrx_ctrl_flush_rq zc_flush;
112+
};
113+
};
114+
115+
#endif /* LINUX_IO_ZCRX_H */

io_uring/Kconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,8 @@ config IO_URING_BPF
1414
def_bool y
1515
depends on BPF
1616
depends on NET
17+
18+
config IO_URING_BPF_OPS
19+
def_bool y
20+
depends on IO_URING
21+
depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF

io_uring/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
1414
advise.o openclose.o statx.o timeout.o \
1515
cancel.o waitid.o register.o \
1616
truncate.o memmap.o alloc_cache.o \
17-
query.o
17+
query.o loop.o
1818

1919
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
2020
obj-$(CONFIG_IO_WQ) += io-wq.o
@@ -25,3 +25,4 @@ obj-$(CONFIG_NET) += net.o cmd_net.o
2525
obj-$(CONFIG_PROC_FS) += fdinfo.o
2626
obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
2727
obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o
28+
obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o

0 commit comments

Comments
 (0)