|
4 | 4 | */ |
5 | 5 |
|
6 | 6 | #include <linux/fs.h> |
| 7 | +#include <sys/un.h> |
7 | 8 | #include "kublk.h" |
8 | 9 |
|
9 | 10 | #define MAX_NR_TGT_ARG 64 |
@@ -1092,13 +1093,312 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, |
1092 | 1093 | } |
1093 | 1094 |
|
1094 | 1095 |
|
/*
 * Shared memory registration socket listener.
 *
 * The parent daemon context listens on a per-device unix socket at
 * /run/ublk/ublkb<dev_id>.sock for shared memory registration requests
 * from clients. Clients send a memfd via SCM_RIGHTS; the server
 * registers it with the kernel, mmaps it, and returns the assigned index.
 */
#define UBLK_SHMEM_SOCK_DIR "/run/ublk"

/*
 * Global registry of client shared-memory buffers, indexed by the value
 * returned to clients over the socket.  The entry type is defined in
 * kublk.h and shared with file_backed.c (loop target).
 *
 * NOTE(review): shmem_count/shmem_table are written by the listener
 * thread while I/O threads may read them — no locking is visible here;
 * confirm the publication ordering is safe.
 */
struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX];
int shmem_count;
| 1109 | + |
/* Format the registration socket path (/run/ublk/ublkb<id>.sock) into buf. */
static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len)
{
	snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id);
}
| 1114 | + |
| 1115 | +static int ublk_shmem_sock_create(int dev_id) |
| 1116 | +{ |
| 1117 | + struct sockaddr_un addr = { .sun_family = AF_UNIX }; |
| 1118 | + char path[108]; |
| 1119 | + int fd; |
| 1120 | + |
| 1121 | + mkdir(UBLK_SHMEM_SOCK_DIR, 0755); |
| 1122 | + ublk_shmem_sock_path(dev_id, path, sizeof(path)); |
| 1123 | + unlink(path); |
| 1124 | + |
| 1125 | + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); |
| 1126 | + if (fd < 0) |
| 1127 | + return -1; |
| 1128 | + |
| 1129 | + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path); |
| 1130 | + if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { |
| 1131 | + close(fd); |
| 1132 | + return -1; |
| 1133 | + } |
| 1134 | + |
| 1135 | + listen(fd, 4); |
| 1136 | + ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path); |
| 1137 | + return fd; |
| 1138 | +} |
| 1139 | + |
/* Remove the registration socket path and close the listener fd, if any. */
static void ublk_shmem_sock_destroy(int dev_id, int sock_fd)
{
	char path[108];

	ublk_shmem_sock_path(dev_id, path, sizeof(path));
	unlink(path);
	if (sock_fd >= 0)
		close(sock_fd);
}
| 1149 | + |
| 1150 | +/* Receive a memfd from a client via SCM_RIGHTS */ |
/*
 * Receive one file descriptor from a connected client via SCM_RIGHTS.
 * Returns the fd on success, -1 on error or malformed message.
 *
 * Fixes vs. original: reject messages whose ancillary data was
 * truncated (MSG_CTRUNC), validate cmsg_len carries exactly one fd,
 * and read the fd out of CMSG_DATA with memcpy as cmsg(3) documents
 * (the buffer is only guaranteed aligned via the union trick).
 */
static int ublk_shmem_recv_fd(int client_fd)
{
	char buf[1];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	union {
		char cmsg_buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.cmsg_buf,
		.msg_controllen = sizeof(u.cmsg_buf),
	};
	struct cmsghdr *cmsg;
	int fd;

	if (recvmsg(client_fd, &msg, 0) <= 0)
		return -1;

	/* Truncated control data means we did not get the whole fd. */
	if (msg.msg_flags & MSG_CTRUNC)
		return -1;

	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
		return -1;

	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}
| 1177 | + |
| 1178 | +/* Register a shared memory buffer: store fd, mmap it, return index */ |
| 1179 | +static int ublk_shmem_register(int shmem_fd) |
| 1180 | +{ |
| 1181 | + off_t size; |
| 1182 | + void *base; |
| 1183 | + int idx; |
| 1184 | + |
| 1185 | + if (shmem_count >= UBLK_BUF_MAX) |
| 1186 | + return -1; |
| 1187 | + |
| 1188 | + size = lseek(shmem_fd, 0, SEEK_END); |
| 1189 | + if (size <= 0) |
| 1190 | + return -1; |
| 1191 | + |
| 1192 | + base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, |
| 1193 | + shmem_fd, 0); |
| 1194 | + if (base == MAP_FAILED) |
| 1195 | + return -1; |
| 1196 | + |
| 1197 | + idx = shmem_count++; |
| 1198 | + shmem_table[idx].fd = shmem_fd; |
| 1199 | + shmem_table[idx].mmap_base = base; |
| 1200 | + shmem_table[idx].size = size; |
| 1201 | + |
| 1202 | + ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n", |
| 1203 | + idx, shmem_fd, (size_t)size); |
| 1204 | + return idx; |
| 1205 | +} |
| 1206 | + |
| 1207 | +static void ublk_shmem_unregister_all(void) |
| 1208 | +{ |
| 1209 | + int i; |
| 1210 | + |
| 1211 | + for (i = 0; i < shmem_count; i++) { |
| 1212 | + if (shmem_table[i].mmap_base) { |
| 1213 | + munmap(shmem_table[i].mmap_base, |
| 1214 | + shmem_table[i].size); |
| 1215 | + close(shmem_table[i].fd); |
| 1216 | + shmem_table[i].mmap_base = NULL; |
| 1217 | + } |
| 1218 | + } |
| 1219 | + shmem_count = 0; |
| 1220 | +} |
| 1221 | + |
| 1222 | +static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size) |
| 1223 | +{ |
| 1224 | + struct ublk_shmem_buf_reg buf_reg = { |
| 1225 | + .addr = (unsigned long)addr, |
| 1226 | + .len = size, |
| 1227 | + }; |
| 1228 | + struct ublk_ctrl_cmd_data data = { |
| 1229 | + .cmd_op = UBLK_U_CMD_REG_BUF, |
| 1230 | + .flags = CTRL_CMD_HAS_BUF, |
| 1231 | + .addr = (unsigned long)&buf_reg, |
| 1232 | + .len = sizeof(buf_reg), |
| 1233 | + }; |
| 1234 | + |
| 1235 | + return __ublk_ctrl_cmd(dev, &data); |
| 1236 | +} |
| 1237 | + |
| 1238 | +/* |
| 1239 | + * Handle one client connection: receive memfd, mmap it, register |
| 1240 | + * the VA range with kernel, send back the assigned index. |
| 1241 | + */ |
| 1242 | +static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev) |
| 1243 | +{ |
| 1244 | + int client_fd, memfd, idx, ret; |
| 1245 | + int32_t reply; |
| 1246 | + off_t size; |
| 1247 | + void *base; |
| 1248 | + |
| 1249 | + client_fd = accept(sock_fd, NULL, NULL); |
| 1250 | + if (client_fd < 0) |
| 1251 | + return; |
| 1252 | + |
| 1253 | + memfd = ublk_shmem_recv_fd(client_fd); |
| 1254 | + if (memfd < 0) { |
| 1255 | + reply = -1; |
| 1256 | + goto out; |
| 1257 | + } |
| 1258 | + |
| 1259 | + /* mmap the memfd in server address space */ |
| 1260 | + size = lseek(memfd, 0, SEEK_END); |
| 1261 | + if (size <= 0) { |
| 1262 | + reply = -1; |
| 1263 | + close(memfd); |
| 1264 | + goto out; |
| 1265 | + } |
| 1266 | + base = mmap(NULL, size, PROT_READ | PROT_WRITE, |
| 1267 | + MAP_SHARED | MAP_POPULATE, memfd, 0); |
| 1268 | + if (base == MAP_FAILED) { |
| 1269 | + reply = -1; |
| 1270 | + close(memfd); |
| 1271 | + goto out; |
| 1272 | + } |
| 1273 | + |
| 1274 | + /* Register server's VA range with kernel for PFN matching */ |
| 1275 | + ret = ublk_ctrl_reg_buf(dev, base, size); |
| 1276 | + if (ret < 0) { |
| 1277 | + ublk_dbg(UBLK_DBG_DEV, |
| 1278 | + "shmem_zc: kernel reg failed %d\n", ret); |
| 1279 | + munmap(base, size); |
| 1280 | + close(memfd); |
| 1281 | + reply = ret; |
| 1282 | + goto out; |
| 1283 | + } |
| 1284 | + |
| 1285 | + /* Store in table for I/O handling */ |
| 1286 | + idx = ublk_shmem_register(memfd); |
| 1287 | + if (idx >= 0) { |
| 1288 | + shmem_table[idx].mmap_base = base; |
| 1289 | + shmem_table[idx].size = size; |
| 1290 | + } |
| 1291 | + reply = idx; |
| 1292 | +out: |
| 1293 | + send(client_fd, &reply, sizeof(reply), 0); |
| 1294 | + close(client_fd); |
| 1295 | +} |
| 1296 | + |
/* Arguments handed to the shmem listener thread (ublk_shmem_listener_fn). */
struct shmem_listener_info {
	int dev_id;		/* ublk device id; names the socket path */
	int stop_efd;		/* eventfd to signal listener to stop */
	int sock_fd;		/* listener socket fd (output) */
	struct ublk_dev *dev;	/* device used for kernel buffer registration */
};
| 1303 | + |
| 1304 | +/* |
| 1305 | + * Socket listener thread: runs in the parent daemon context alongside |
| 1306 | + * the I/O threads. Accepts shared memory registration requests from |
| 1307 | + * clients via SCM_RIGHTS. Exits when stop_efd is signaled. |
| 1308 | + */ |
| 1309 | +static void *ublk_shmem_listener_fn(void *data) |
| 1310 | +{ |
| 1311 | + struct shmem_listener_info *info = data; |
| 1312 | + struct pollfd pfds[2]; |
| 1313 | + |
| 1314 | + info->sock_fd = ublk_shmem_sock_create(info->dev_id); |
| 1315 | + if (info->sock_fd < 0) |
| 1316 | + return NULL; |
| 1317 | + |
| 1318 | + pfds[0].fd = info->sock_fd; |
| 1319 | + pfds[0].events = POLLIN; |
| 1320 | + pfds[1].fd = info->stop_efd; |
| 1321 | + pfds[1].events = POLLIN; |
| 1322 | + |
| 1323 | + while (1) { |
| 1324 | + int ret = poll(pfds, 2, -1); |
| 1325 | + |
| 1326 | + if (ret < 0) |
| 1327 | + break; |
| 1328 | + |
| 1329 | + /* Stop signal from parent */ |
| 1330 | + if (pfds[1].revents & POLLIN) |
| 1331 | + break; |
| 1332 | + |
| 1333 | + /* Client connection */ |
| 1334 | + if (pfds[0].revents & POLLIN) |
| 1335 | + ublk_shmem_handle_client(info->sock_fd, info->dev); |
| 1336 | + } |
| 1337 | + |
| 1338 | + return NULL; |
| 1339 | +} |
| 1340 | + |
| 1341 | +static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx, |
| 1342 | + struct ublk_dev *dev) |
| 1343 | +{ |
| 1344 | + int fd, idx, ret; |
| 1345 | + struct stat st; |
| 1346 | + void *base; |
| 1347 | + |
| 1348 | + fd = open(ctx->htlb_path, O_RDWR); |
| 1349 | + if (fd < 0) { |
| 1350 | + ublk_err("htlb: can't open %s\n", ctx->htlb_path); |
| 1351 | + return -errno; |
| 1352 | + } |
| 1353 | + |
| 1354 | + if (fstat(fd, &st) < 0 || st.st_size <= 0) { |
| 1355 | + ublk_err("htlb: invalid file size\n"); |
| 1356 | + close(fd); |
| 1357 | + return -EINVAL; |
| 1358 | + } |
| 1359 | + |
| 1360 | + base = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, |
| 1361 | + MAP_SHARED | MAP_POPULATE, fd, 0); |
| 1362 | + if (base == MAP_FAILED) { |
| 1363 | + ublk_err("htlb: mmap failed\n"); |
| 1364 | + close(fd); |
| 1365 | + return -ENOMEM; |
| 1366 | + } |
| 1367 | + |
| 1368 | + ret = ublk_ctrl_reg_buf(dev, base, st.st_size); |
| 1369 | + if (ret < 0) { |
| 1370 | + ublk_err("htlb: reg_buf failed: %d\n", ret); |
| 1371 | + munmap(base, st.st_size); |
| 1372 | + close(fd); |
| 1373 | + return ret; |
| 1374 | + } |
| 1375 | + |
| 1376 | + if (shmem_count >= UBLK_BUF_MAX) { |
| 1377 | + munmap(base, st.st_size); |
| 1378 | + close(fd); |
| 1379 | + return -ENOMEM; |
| 1380 | + } |
| 1381 | + |
| 1382 | + idx = shmem_count++; |
| 1383 | + shmem_table[idx].fd = fd; |
| 1384 | + shmem_table[idx].mmap_base = base; |
| 1385 | + shmem_table[idx].size = st.st_size; |
| 1386 | + |
| 1387 | + ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n", |
| 1388 | + idx, (size_t)st.st_size); |
| 1389 | + return 0; |
| 1390 | +} |
| 1391 | + |
1095 | 1392 | static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) |
1096 | 1393 | { |
1097 | 1394 | const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; |
| 1395 | + struct shmem_listener_info linfo = {}; |
1098 | 1396 | struct ublk_thread_info *tinfo; |
1099 | 1397 | unsigned long long extra_flags = 0; |
1100 | 1398 | cpu_set_t *affinity_buf; |
1101 | 1399 | unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; |
| 1400 | + uint64_t stop_val = 1; |
| 1401 | + pthread_t listener; |
1102 | 1402 | void *thread_ret; |
1103 | 1403 | sem_t ready; |
1104 | 1404 | int ret, i; |
@@ -1187,15 +1487,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) |
1187 | 1487 | goto fail_start; |
1188 | 1488 | } |
1189 | 1489 |
|
| 1490 | + if (ctx->htlb_path) { |
| 1491 | + ret = ublk_shmem_htlb_setup(ctx, dev); |
| 1492 | + if (ret < 0) { |
| 1493 | + ublk_err("htlb setup failed: %d\n", ret); |
| 1494 | + ublk_ctrl_stop_dev(dev); |
| 1495 | + goto fail_start; |
| 1496 | + } |
| 1497 | + } |
| 1498 | + |
1190 | 1499 | ublk_ctrl_get_info(dev); |
1191 | 1500 | if (ctx->fg) |
1192 | 1501 | ublk_ctrl_dump(dev); |
1193 | 1502 | else |
1194 | 1503 | ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); |
1195 | 1504 | fail_start: |
1196 | | - /* wait until we are terminated */ |
1197 | | - for (i = 0; i < dev->nthreads; i++) |
| 1505 | + /* |
| 1506 | + * Wait for I/O threads to exit. While waiting, a listener |
| 1507 | + * thread accepts shared memory registration requests from |
| 1508 | + * clients via a per-device unix socket (SCM_RIGHTS fd passing). |
| 1509 | + */ |
| 1510 | + linfo.dev_id = dinfo->dev_id; |
| 1511 | + linfo.dev = dev; |
| 1512 | + linfo.stop_efd = eventfd(0, 0); |
| 1513 | + if (linfo.stop_efd >= 0) |
| 1514 | + pthread_create(&listener, NULL, |
| 1515 | + ublk_shmem_listener_fn, &linfo); |
| 1516 | + |
| 1517 | + for (i = 0; i < (int)dev->nthreads; i++) |
1198 | 1518 | pthread_join(tinfo[i].thread, &thread_ret); |
| 1519 | + |
| 1520 | + /* Signal listener thread to stop and wait for it */ |
| 1521 | + if (linfo.stop_efd >= 0) { |
| 1522 | + write(linfo.stop_efd, &stop_val, sizeof(stop_val)); |
| 1523 | + pthread_join(listener, NULL); |
| 1524 | + close(linfo.stop_efd); |
| 1525 | + ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd); |
| 1526 | + } |
| 1527 | + ublk_shmem_unregister_all(); |
1199 | 1528 | free(tinfo); |
1200 | 1529 | fail: |
1201 | 1530 | for (i = 0; i < dinfo->nr_hw_queues; i++) |
@@ -1625,6 +1954,7 @@ static int cmd_dev_get_features(void) |
1625 | 1954 | FEAT_NAME(UBLK_F_SAFE_STOP_DEV), |
1626 | 1955 | FEAT_NAME(UBLK_F_BATCH_IO), |
1627 | 1956 | FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), |
| 1957 | + FEAT_NAME(UBLK_F_SHMEM_ZC), |
1628 | 1958 | }; |
1629 | 1959 | struct ublk_dev *dev; |
1630 | 1960 | __u64 features = 0; |
@@ -1797,6 +2127,8 @@ int main(int argc, char *argv[]) |
1797 | 2127 | { "safe", 0, NULL, 0 }, |
1798 | 2128 | { "batch", 0, NULL, 'b'}, |
1799 | 2129 | { "no_auto_part_scan", 0, NULL, 0 }, |
| 2130 | + { "shmem_zc", 0, NULL, 0 }, |
| 2131 | + { "htlb", 1, NULL, 0 }, |
1800 | 2132 | { 0, 0, 0, 0 } |
1801 | 2133 | }; |
1802 | 2134 | const struct ublk_tgt_ops *ops = NULL; |
@@ -1912,6 +2244,10 @@ int main(int argc, char *argv[]) |
1912 | 2244 | ctx.safe_stop = 1; |
1913 | 2245 | if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) |
1914 | 2246 | ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; |
| 2247 | + if (!strcmp(longopts[option_idx].name, "shmem_zc")) |
| 2248 | + ctx.flags |= UBLK_F_SHMEM_ZC; |
| 2249 | + if (!strcmp(longopts[option_idx].name, "htlb")) |
| 2250 | + ctx.htlb_path = strdup(optarg); |
1915 | 2251 | break; |
1916 | 2252 | case '?': |
1917 | 2253 | /* |
|
0 commit comments