Skip to content

Commit 166b476

Browse files
Ming Lei authored and axboe committed
selftests/ublk: add shared memory zero-copy support in kublk
Add infrastructure for UBLK_F_SHMEM_ZC shared memory zero-copy:

- kublk.h: struct ublk_shmem_entry and table for tracking registered shared memory buffers
- kublk.c: per-device unix socket listener that accepts memfd registrations from clients via SCM_RIGHTS fd passing. The listener mmaps the memfd and registers the VA range with the kernel for PFN matching. Also adds --shmem_zc command line option.
- kublk.c: --htlb <path> option to open a pre-allocated hugetlbfs file, mmap it with MAP_SHARED|MAP_POPULATE, and register it with the kernel via ublk_ctrl_reg_buf(). Any process that mmaps the same hugetlbfs file shares the same physical pages, enabling zero-copy without socket-based fd passing.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://patch.msgid.link/20260331153207.3635125-6-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent 8a34e88 commit 166b476

2 files changed

Lines changed: 352 additions & 2 deletions

File tree

tools/testing/selftests/ublk/kublk.c

Lines changed: 338 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*/
55

66
#include <linux/fs.h>
7+
#include <sys/un.h>
78
#include "kublk.h"
89

910
#define MAX_NR_TGT_ARG 64
@@ -1092,13 +1093,312 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev,
10921093
}
10931094

10941095

1096+
/*
1097+
* Shared memory registration socket listener.
1098+
*
1099+
* The parent daemon context listens on a per-device unix socket at
1100+
* /run/ublk/ublkb<dev_id>.sock for shared memory registration requests
1101+
* from clients. Clients send a memfd via SCM_RIGHTS; the server
1102+
 * mmaps it, registers the mapped range with the kernel, and returns the assigned index.
1103+
*/
1104+
#define UBLK_SHMEM_SOCK_DIR "/run/ublk"
1105+
1106+
/*
 * Table of registered shared memory buffers and the number of slots
 * currently in use (entries [0, shmem_count) are valid).
 * defined in kublk.h, shared with file_backed.c (loop target)
 */
struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
int shmem_count;
1109+
1110+
static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len)
1111+
{
1112+
snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id);
1113+
}
1114+
1115+
static int ublk_shmem_sock_create(int dev_id)
1116+
{
1117+
struct sockaddr_un addr = { .sun_family = AF_UNIX };
1118+
char path[108];
1119+
int fd;
1120+
1121+
mkdir(UBLK_SHMEM_SOCK_DIR, 0755);
1122+
ublk_shmem_sock_path(dev_id, path, sizeof(path));
1123+
unlink(path);
1124+
1125+
fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0);
1126+
if (fd < 0)
1127+
return -1;
1128+
1129+
snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path);
1130+
if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
1131+
close(fd);
1132+
return -1;
1133+
}
1134+
1135+
listen(fd, 4);
1136+
ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path);
1137+
return fd;
1138+
}
1139+
1140+
/*
 * Tear down the shmem registration socket of device @dev_id: close
 * @sock_fd when it is a valid descriptor and remove the socket file.
 */
static void ublk_shmem_sock_destroy(int dev_id, int sock_fd)
{
	char sock_path[108];

	ublk_shmem_sock_path(dev_id, sock_path, sizeof(sock_path));

	if (!(sock_fd < 0))
		close(sock_fd);

	unlink(sock_path);
}
1149+
1150+
/*
 * Receive a memfd from a client via SCM_RIGHTS.
 *
 * Reads one data byte plus ancillary data from @client_fd. Returns the
 * received file descriptor, or -1 if nothing was received or the first
 * control message is not a SCM_RIGHTS message carrying at least one fd.
 */
static int ublk_shmem_recv_fd(int client_fd)
{
	char buf[1];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	union {
		char cmsg_buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;	/* forces cmsghdr alignment of cmsg_buf */
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.cmsg_buf,
		.msg_controllen = sizeof(u.cmsg_buf),
	};
	struct cmsghdr *cmsg;
	int fd;

	if (recvmsg(client_fd, &msg, 0) <= 0)
		return -1;

	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS)
		return -1;

	/* The payload must really hold an fd before it is read */
	if (cmsg->cmsg_len < CMSG_LEN(sizeof(fd)))
		return -1;

	/* memcpy instead of a casted dereference: no aliasing/alignment games */
	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}
1177+
1178+
/* Register a shared memory buffer: store fd, mmap it, return index */
1179+
static int ublk_shmem_register(int shmem_fd)
1180+
{
1181+
off_t size;
1182+
void *base;
1183+
int idx;
1184+
1185+
if (shmem_count >= UBLK_BUF_MAX)
1186+
return -1;
1187+
1188+
size = lseek(shmem_fd, 0, SEEK_END);
1189+
if (size <= 0)
1190+
return -1;
1191+
1192+
base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
1193+
shmem_fd, 0);
1194+
if (base == MAP_FAILED)
1195+
return -1;
1196+
1197+
idx = shmem_count++;
1198+
shmem_table[idx].fd = shmem_fd;
1199+
shmem_table[idx].mmap_base = base;
1200+
shmem_table[idx].size = size;
1201+
1202+
ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n",
1203+
idx, shmem_fd, (size_t)size);
1204+
return idx;
1205+
}
1206+
1207+
static void ublk_shmem_unregister_all(void)
1208+
{
1209+
int i;
1210+
1211+
for (i = 0; i < shmem_count; i++) {
1212+
if (shmem_table[i].mmap_base) {
1213+
munmap(shmem_table[i].mmap_base,
1214+
shmem_table[i].size);
1215+
close(shmem_table[i].fd);
1216+
shmem_table[i].mmap_base = NULL;
1217+
}
1218+
}
1219+
shmem_count = 0;
1220+
}
1221+
1222+
static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size)
1223+
{
1224+
struct ublk_shmem_buf_reg buf_reg = {
1225+
.addr = (unsigned long)addr,
1226+
.len = size,
1227+
};
1228+
struct ublk_ctrl_cmd_data data = {
1229+
.cmd_op = UBLK_U_CMD_REG_BUF,
1230+
.flags = CTRL_CMD_HAS_BUF,
1231+
.addr = (unsigned long)&buf_reg,
1232+
.len = sizeof(buf_reg),
1233+
};
1234+
1235+
return __ublk_ctrl_cmd(dev, &data);
1236+
}
1237+
1238+
/*
1239+
* Handle one client connection: receive memfd, mmap it, register
1240+
* the VA range with kernel, send back the assigned index.
1241+
*/
1242+
static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev)
1243+
{
1244+
int client_fd, memfd, idx, ret;
1245+
int32_t reply;
1246+
off_t size;
1247+
void *base;
1248+
1249+
client_fd = accept(sock_fd, NULL, NULL);
1250+
if (client_fd < 0)
1251+
return;
1252+
1253+
memfd = ublk_shmem_recv_fd(client_fd);
1254+
if (memfd < 0) {
1255+
reply = -1;
1256+
goto out;
1257+
}
1258+
1259+
/* mmap the memfd in server address space */
1260+
size = lseek(memfd, 0, SEEK_END);
1261+
if (size <= 0) {
1262+
reply = -1;
1263+
close(memfd);
1264+
goto out;
1265+
}
1266+
base = mmap(NULL, size, PROT_READ | PROT_WRITE,
1267+
MAP_SHARED | MAP_POPULATE, memfd, 0);
1268+
if (base == MAP_FAILED) {
1269+
reply = -1;
1270+
close(memfd);
1271+
goto out;
1272+
}
1273+
1274+
/* Register server's VA range with kernel for PFN matching */
1275+
ret = ublk_ctrl_reg_buf(dev, base, size);
1276+
if (ret < 0) {
1277+
ublk_dbg(UBLK_DBG_DEV,
1278+
"shmem_zc: kernel reg failed %d\n", ret);
1279+
munmap(base, size);
1280+
close(memfd);
1281+
reply = ret;
1282+
goto out;
1283+
}
1284+
1285+
/* Store in table for I/O handling */
1286+
idx = ublk_shmem_register(memfd);
1287+
if (idx >= 0) {
1288+
shmem_table[idx].mmap_base = base;
1289+
shmem_table[idx].size = size;
1290+
}
1291+
reply = idx;
1292+
out:
1293+
send(client_fd, &reply, sizeof(reply), 0);
1294+
close(client_fd);
1295+
}
1296+
1297+
/*
 * Arguments and results exchanged with the shmem listener thread.
 *
 * dev_id:   device id used to build the socket path
 * stop_efd: eventfd to signal listener to stop
 * sock_fd:  listener socket fd (output, written by the listener thread)
 * dev:      device handed to the client handler for buffer registration
 */
struct shmem_listener_info {
	int dev_id;
	int stop_efd;
	int sock_fd;
	struct ublk_dev *dev;
};
1303+
1304+
/*
1305+
* Socket listener thread: runs in the parent daemon context alongside
1306+
* the I/O threads. Accepts shared memory registration requests from
1307+
* clients via SCM_RIGHTS. Exits when stop_efd is signaled.
1308+
*/
1309+
static void *ublk_shmem_listener_fn(void *data)
1310+
{
1311+
struct shmem_listener_info *info = data;
1312+
struct pollfd pfds[2];
1313+
1314+
info->sock_fd = ublk_shmem_sock_create(info->dev_id);
1315+
if (info->sock_fd < 0)
1316+
return NULL;
1317+
1318+
pfds[0].fd = info->sock_fd;
1319+
pfds[0].events = POLLIN;
1320+
pfds[1].fd = info->stop_efd;
1321+
pfds[1].events = POLLIN;
1322+
1323+
while (1) {
1324+
int ret = poll(pfds, 2, -1);
1325+
1326+
if (ret < 0)
1327+
break;
1328+
1329+
/* Stop signal from parent */
1330+
if (pfds[1].revents & POLLIN)
1331+
break;
1332+
1333+
/* Client connection */
1334+
if (pfds[0].revents & POLLIN)
1335+
ublk_shmem_handle_client(info->sock_fd, info->dev);
1336+
}
1337+
1338+
return NULL;
1339+
}
1340+
1341+
static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx,
1342+
struct ublk_dev *dev)
1343+
{
1344+
int fd, idx, ret;
1345+
struct stat st;
1346+
void *base;
1347+
1348+
fd = open(ctx->htlb_path, O_RDWR);
1349+
if (fd < 0) {
1350+
ublk_err("htlb: can't open %s\n", ctx->htlb_path);
1351+
return -errno;
1352+
}
1353+
1354+
if (fstat(fd, &st) < 0 || st.st_size <= 0) {
1355+
ublk_err("htlb: invalid file size\n");
1356+
close(fd);
1357+
return -EINVAL;
1358+
}
1359+
1360+
base = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE,
1361+
MAP_SHARED | MAP_POPULATE, fd, 0);
1362+
if (base == MAP_FAILED) {
1363+
ublk_err("htlb: mmap failed\n");
1364+
close(fd);
1365+
return -ENOMEM;
1366+
}
1367+
1368+
ret = ublk_ctrl_reg_buf(dev, base, st.st_size);
1369+
if (ret < 0) {
1370+
ublk_err("htlb: reg_buf failed: %d\n", ret);
1371+
munmap(base, st.st_size);
1372+
close(fd);
1373+
return ret;
1374+
}
1375+
1376+
if (shmem_count >= UBLK_BUF_MAX) {
1377+
munmap(base, st.st_size);
1378+
close(fd);
1379+
return -ENOMEM;
1380+
}
1381+
1382+
idx = shmem_count++;
1383+
shmem_table[idx].fd = fd;
1384+
shmem_table[idx].mmap_base = base;
1385+
shmem_table[idx].size = st.st_size;
1386+
1387+
ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n",
1388+
idx, (size_t)st.st_size);
1389+
return 0;
1390+
}
1391+
10951392
static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
10961393
{
10971394
const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info;
1395+
struct shmem_listener_info linfo = {};
10981396
struct ublk_thread_info *tinfo;
10991397
unsigned long long extra_flags = 0;
11001398
cpu_set_t *affinity_buf;
11011399
unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL;
1400+
uint64_t stop_val = 1;
1401+
pthread_t listener;
11021402
void *thread_ret;
11031403
sem_t ready;
11041404
int ret, i;
@@ -1187,15 +1487,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev)
11871487
goto fail_start;
11881488
}
11891489

1490+
if (ctx->htlb_path) {
1491+
ret = ublk_shmem_htlb_setup(ctx, dev);
1492+
if (ret < 0) {
1493+
ublk_err("htlb setup failed: %d\n", ret);
1494+
ublk_ctrl_stop_dev(dev);
1495+
goto fail_start;
1496+
}
1497+
}
1498+
11901499
ublk_ctrl_get_info(dev);
11911500
if (ctx->fg)
11921501
ublk_ctrl_dump(dev);
11931502
else
11941503
ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id);
11951504
fail_start:
1196-
/* wait until we are terminated */
1197-
for (i = 0; i < dev->nthreads; i++)
1505+
/*
1506+
* Wait for I/O threads to exit. While waiting, a listener
1507+
* thread accepts shared memory registration requests from
1508+
* clients via a per-device unix socket (SCM_RIGHTS fd passing).
1509+
*/
1510+
linfo.dev_id = dinfo->dev_id;
1511+
linfo.dev = dev;
1512+
linfo.stop_efd = eventfd(0, 0);
1513+
if (linfo.stop_efd >= 0)
1514+
pthread_create(&listener, NULL,
1515+
ublk_shmem_listener_fn, &linfo);
1516+
1517+
for (i = 0; i < (int)dev->nthreads; i++)
11981518
pthread_join(tinfo[i].thread, &thread_ret);
1519+
1520+
/* Signal listener thread to stop and wait for it */
1521+
if (linfo.stop_efd >= 0) {
1522+
write(linfo.stop_efd, &stop_val, sizeof(stop_val));
1523+
pthread_join(listener, NULL);
1524+
close(linfo.stop_efd);
1525+
ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd);
1526+
}
1527+
ublk_shmem_unregister_all();
11991528
free(tinfo);
12001529
fail:
12011530
for (i = 0; i < dinfo->nr_hw_queues; i++)
@@ -1625,6 +1954,7 @@ static int cmd_dev_get_features(void)
16251954
FEAT_NAME(UBLK_F_SAFE_STOP_DEV),
16261955
FEAT_NAME(UBLK_F_BATCH_IO),
16271956
FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN),
1957+
FEAT_NAME(UBLK_F_SHMEM_ZC),
16281958
};
16291959
struct ublk_dev *dev;
16301960
__u64 features = 0;
@@ -1797,6 +2127,8 @@ int main(int argc, char *argv[])
17972127
{ "safe", 0, NULL, 0 },
17982128
{ "batch", 0, NULL, 'b'},
17992129
{ "no_auto_part_scan", 0, NULL, 0 },
2130+
{ "shmem_zc", 0, NULL, 0 },
2131+
{ "htlb", 1, NULL, 0 },
18002132
{ 0, 0, 0, 0 }
18012133
};
18022134
const struct ublk_tgt_ops *ops = NULL;
@@ -1912,6 +2244,10 @@ int main(int argc, char *argv[])
19122244
ctx.safe_stop = 1;
19132245
if (!strcmp(longopts[option_idx].name, "no_auto_part_scan"))
19142246
ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN;
2247+
if (!strcmp(longopts[option_idx].name, "shmem_zc"))
2248+
ctx.flags |= UBLK_F_SHMEM_ZC;
2249+
if (!strcmp(longopts[option_idx].name, "htlb"))
2250+
ctx.htlb_path = strdup(optarg);
19152251
break;
19162252
case '?':
19172253
/*

0 commit comments

Comments
 (0)