Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

- Unify how task_work cancelations are detected, placing it in the
task_work running state rather than needing to check the task state

- Series cleaning up and moving the cancelation code to where it
belongs, in cancel.c

- Cleanup of waitid and futex argument handling

- Add support for mixed-size SQEs. 6.18 added support for mixed-size
CQEs, improving flexibility and efficiency for workloads that need big
CQEs. This adds similar support for SQEs, where the occasional need
for a 128b SQE doesn't necessitate making all SQEs 128b in size

- Introduce zcrx and SQ/CQ layout queries. The former returns which zcrx
features are available, and both return ring size information to help
with allocation size calculation for user-provided rings, as used with
IORING_SETUP_NO_MMAP and IORING_MEM_REGION_TYPE_USER

- Zcrx updates for 6.19, including a bunch of small patches,
IORING_REGISTER_ZCRX_CTRL, RQ flushing, and David's work on sharing
zcrx between multiple io_uring instances

- Series cleaning up ring initializations, notably deduplicating ring
size and offset calculations. It also moves most of the checking
before doing any allocations, making the code simpler

- Add support for getsockname and getpeername, which is mostly a
trivial hookup after a bit of refactoring on the networking side

- Various fixes and cleanups

* tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
io_uring: Introduce getsockname io_uring cmd
socket: Split out a getsockname helper for io_uring
socket: Unify getsockname and getpeername implementation
io_uring/query: drop unused io_handle_query_entry() ctx arg
io_uring/kbuf: remove obsolete buf_nr_pages and update comments
io_uring/register: use correct location for io_rings_layout
io_uring/zcrx: share an ifq between rings
io_uring/zcrx: add io_fill_zcrx_offsets()
io_uring/zcrx: export zcrx via a file
io_uring/zcrx: move io_zcrx_scrub() and dependencies up
io_uring/zcrx: count zcrx users
io_uring/zcrx: add sync refill queue flushing
io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
io_uring/zcrx: elide passing msg flags
io_uring/zcrx: use folio_nr_pages() instead of shift operation
io_uring/zcrx: convert to use netmem_desc
io_uring/query: introduce rings info query
io_uring/query: introduce zcrx query
io_uring: move cq/sq user offset init around
io_uring: pre-calculate scq layout
...

+1308 -865
+4 -2
block/ioctl.c
··· 769 769 bool nowait; 770 770 }; 771 771 772 - static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) 772 + static void blk_cmd_complete(struct io_tw_req tw_req, io_tw_token_t tw) 773 773 { 774 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 774 775 struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); 775 776 776 777 if (bic->res == -EAGAIN && bic->nowait) 777 778 io_uring_cmd_issue_blocking(cmd); 778 779 else 779 - io_uring_cmd_done(cmd, bic->res, issue_flags); 780 + io_uring_cmd_done(cmd, bic->res, 781 + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); 780 782 } 781 783 782 784 static void bio_cmd_bio_end_io(struct bio *bio)
+11 -11
drivers/block/ublk_drv.c
··· 1302 1302 return true; 1303 1303 } 1304 1304 1305 - static void ublk_dispatch_req(struct ublk_queue *ubq, 1306 - struct request *req, 1307 - unsigned int issue_flags) 1305 + static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) 1308 1306 { 1307 + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 1309 1308 int tag = req->tag; 1310 1309 struct ublk_io *io = &ubq->ios[tag]; 1311 1310 ··· 1347 1348 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); 1348 1349 } 1349 1350 1350 - static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, 1351 - unsigned int issue_flags) 1351 + static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) 1352 1352 { 1353 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 1353 1354 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1354 1355 struct ublk_queue *ubq = pdu->ubq; 1355 1356 1356 - ublk_dispatch_req(ubq, pdu->req, issue_flags); 1357 + ublk_dispatch_req(ubq, pdu->req); 1357 1358 } 1358 1359 1359 1360 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) ··· 1365 1366 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); 1366 1367 } 1367 1368 1368 - static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, 1369 - unsigned int issue_flags) 1369 + static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) 1370 1370 { 1371 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 1371 1372 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1372 1373 struct request *rq = pdu->req_list; 1373 1374 struct request *next; ··· 1375 1376 do { 1376 1377 next = rq->rq_next; 1377 1378 rq->rq_next = NULL; 1378 - ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags); 1379 + ublk_dispatch_req(rq->mq_hctx->driver_data, rq); 1379 1380 rq = next; 1380 1381 } while (rq); 1381 1382 } ··· 2522 2523 return NULL; 2523 2524 } 2524 2525 2525 - static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, 2526 - unsigned int issue_flags) 2526 + static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw) 2527 2527 { 2528 + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 2529 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 2528 2530 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); 2529 2531 2530 2532 if (ret != -EIOCBQUEUED)
+4 -3
drivers/nvme/host/ioctl.c
··· 398 398 return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu); 399 399 } 400 400 401 - static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, 402 - unsigned issue_flags) 401 + static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) 403 402 { 403 + struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req); 404 404 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 405 405 406 406 if (pdu->bio) 407 407 blk_rq_unmap_user(pdu->bio); 408 - io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); 408 + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 409 + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); 409 410 } 410 411 411 412 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
+3 -2
fs/btrfs/ioctl.c
··· 4632 4632 struct btrfs_uring_priv *priv; 4633 4633 }; 4634 4634 4635 - static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) 4635 + static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw) 4636 4636 { 4637 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 4637 4638 struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); 4638 4639 struct btrfs_uring_priv *priv = bc->priv; 4639 4640 struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); ··· 4679 4678 btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); 4680 4679 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 4681 4680 4682 - io_uring_cmd_done(cmd, ret, issue_flags); 4681 + io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); 4683 4682 add_rchar(current, ret); 4684 4683 4685 4684 for (index = 0; index < priv->nr_pages; index++)
+4 -3
fs/fuse/dev_uring.c
··· 1209 1209 * User buffers are not mapped yet - the application does not have permission 1210 1210 * to write to it - this has to be executed in ring task context. 1211 1211 */ 1212 - static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, 1213 - unsigned int issue_flags) 1212 + static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) 1214 1213 { 1214 + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 1215 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 1215 1216 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 1216 1217 struct fuse_ring_queue *queue = ent->queue; 1217 1218 int err; 1218 1219 1219 - if (!(issue_flags & IO_URING_F_TASK_DEAD)) { 1220 + if (!tw.cancel) { 1220 1221 err = fuse_uring_prepare_send(ent, ent->fuse_req); 1221 1222 if (err) { 1222 1223 fuse_uring_next_fuse_req(ent, queue, issue_flags);
+13 -9
include/linux/io_uring/cmd.h
··· 11 11 /* io_uring_cmd is being issued again */ 12 12 #define IORING_URING_CMD_REISSUE (1U << 31) 13 13 14 - typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, 15 - unsigned issue_flags); 16 - 17 14 struct io_uring_cmd { 18 15 struct file *file; 19 16 const struct io_uring_sqe *sqe; 20 - /* callback to defer completions to task context */ 21 - io_uring_cmd_tw_t task_work_cb; 22 17 u32 cmd_op; 23 18 u32 flags; 24 19 u8 pdu[32]; /* available inline for free use */ 20 + u8 unused[8]; 25 21 }; 26 22 27 23 static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) ··· 56 60 unsigned issue_flags, bool is_cqe32); 57 61 58 62 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 59 - io_uring_cmd_tw_t task_work_cb, 63 + io_req_tw_func_t task_work_cb, 60 64 unsigned flags); 61 65 62 66 /* ··· 105 109 { 106 110 } 107 111 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 108 - io_uring_cmd_tw_t task_work_cb, unsigned flags) 112 + io_req_tw_func_t task_work_cb, unsigned flags) 109 113 { 110 114 } 111 115 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, ··· 128 132 } 129 133 #endif 130 134 135 + static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) 136 + { 137 + return io_kiocb_to_cmd(tw_req.req, struct io_uring_cmd); 138 + } 139 + 140 + /* task_work executor checks the deferred list completion */ 141 + #define IO_URING_CMD_TASK_WORK_ISSUE_FLAGS IO_URING_F_COMPLETE_DEFER 142 + 131 143 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ 132 144 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 133 - io_uring_cmd_tw_t task_work_cb) 145 + io_req_tw_func_t task_work_cb) 134 146 { 135 147 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); 136 148 } 137 149 138 150 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, 139 - io_uring_cmd_tw_t task_work_cb) 151 + io_req_tw_func_t task_work_cb) 140 152 { 141 153 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); 142 154 }
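
A rough sketch of what the new callback shape looks like for a uring_cmd driver, following the pattern in the block/ublk/nvme hunks above. The my_cmd_* names and pdu layout are made up for illustration, and completing with -ECANCELED on tw.cancel is just one plausible policy, not mandated by the API:

#include <linux/io_uring/cmd.h>
#include <linux/errno.h>

struct my_cmd_pdu {
	int res;	/* assumed to be filled in by the completion path */
};

static void my_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	/* recover the command from the generic task_work request */
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	struct my_cmd_pdu *pdu = io_uring_cmd_to_pdu(cmd, struct my_cmd_pdu);

	if (tw.cancel) {
		/* ring/task is going away; replaces the old
		 * IO_URING_F_TASK_DEAD issue_flags check */
		io_uring_cmd_done(cmd, -ECANCELED,
				  IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
		return;
	}
	/* issue_flags are no longer passed in: task_work completions use
	 * the fixed IO_URING_CMD_TASK_WORK_ISSUE_FLAGS value instead */
	io_uring_cmd_done(cmd, pdu->res, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
}

/* from completion/IRQ context, defer the completion to task context */
static void my_complete(struct io_uring_cmd *cmd)
{
	io_uring_cmd_complete_in_task(cmd, my_cmd_tw_cb);
}
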
+8 -4
include/linux/io_uring_types.h
··· 39 39 /* set when uring wants to cancel a previously issued command */ 40 40 IO_URING_F_CANCEL = (1 << 11), 41 41 IO_URING_F_COMPAT = (1 << 12), 42 - IO_URING_F_TASK_DEAD = (1 << 13), 43 42 }; 44 43 45 44 struct io_wq_work_node { ··· 327 328 328 329 /* 329 330 * Modifications are protected by ->uring_lock and ->mmap_lock. 330 - * The flags, buf_pages and buf_nr_pages fields should be stable 331 - * once published. 331 + * The buffer list's io mapped region should be stable once 332 + * published. 332 333 */ 333 334 struct xarray io_bl_xa; 334 335 ··· 473 474 * ONLY core io_uring.c should instantiate this struct. 474 475 */ 475 476 struct io_tw_state { 477 + bool cancel; 476 478 }; 477 479 /* Alias to use in code that doesn't instantiate struct io_tw_state */ 478 480 typedef struct io_tw_state io_tw_token_t; ··· 614 614 REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), 615 615 }; 616 616 617 - typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); 617 + struct io_tw_req { 618 + struct io_kiocb *req; 619 + }; 620 + 621 + typedef void (*io_req_tw_func_t)(struct io_tw_req tw_req, io_tw_token_t tw); 618 622 619 623 struct io_task_work { 620 624 struct llist_node node;
+1
include/linux/netdevice.h
··· 3417 3417 struct net_device *__dev_get_by_index(struct net *net, int ifindex); 3418 3418 struct net_device *netdev_get_by_index(struct net *net, int ifindex, 3419 3419 netdevice_tracker *tracker, gfp_t gfp); 3420 + struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); 3420 3421 struct net_device *netdev_get_by_name(struct net *net, const char *name, 3421 3422 netdevice_tracker *tracker, gfp_t gfp); 3422 3423 struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+3 -3
include/linux/socket.h
··· 468 468 int addrlen); 469 469 extern int __sys_listen(int fd, int backlog); 470 470 extern int __sys_listen_socket(struct socket *sock, int backlog); 471 + extern int do_getsockname(struct socket *sock, int peer, 472 + struct sockaddr __user *usockaddr, int __user *usockaddr_len); 471 473 extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, 472 - int __user *usockaddr_len); 473 - extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, 474 - int __user *usockaddr_len); 474 + int __user *usockaddr_len, int peer); 475 475 extern int __sys_socketpair(int family, int type, int protocol, 476 476 int __user *usockvec); 477 477 extern int __sys_shutdown_sock(struct socket *sock, int how);
+43
include/uapi/linux/io_uring.h
··· 231 231 */ 232 232 #define IORING_SETUP_CQE_MIXED (1U << 18) 233 233 234 + /* 235 + * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have 236 + * a 128b opcode. 237 + */ 238 + #define IORING_SETUP_SQE_MIXED (1U << 19) 239 + 234 240 enum io_uring_op { 235 241 IORING_OP_NOP, 236 242 IORING_OP_READV, ··· 301 295 IORING_OP_READV_FIXED, 302 296 IORING_OP_WRITEV_FIXED, 303 297 IORING_OP_PIPE, 298 + IORING_OP_NOP128, 299 + IORING_OP_URING_CMD128, 304 300 305 301 /* this goes last, obviously */ 306 302 IORING_OP_LAST, ··· 697 689 /* query various aspects of io_uring, see linux/io_uring/query.h */ 698 690 IORING_REGISTER_QUERY = 35, 699 691 692 + /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */ 693 + IORING_REGISTER_ZCRX_CTRL = 36, 694 + 700 695 /* this goes last */ 701 696 IORING_REGISTER_LAST, 702 697 ··· 1009 998 SOCKET_URING_OP_GETSOCKOPT, 1010 999 SOCKET_URING_OP_SETSOCKOPT, 1011 1000 SOCKET_URING_OP_TX_TIMESTAMP, 1001 + SOCKET_URING_OP_GETSOCKNAME, 1012 1002 }; 1013 1003 1014 1004 /* ··· 1064 1052 __u64 __resv2[2]; 1065 1053 }; 1066 1054 1055 + enum zcrx_reg_flags { 1056 + ZCRX_REG_IMPORT = 1, 1057 + }; 1058 + 1067 1059 /* 1068 1060 * Argument for IORING_REGISTER_ZCRX_IFQ 1069 1061 */ ··· 1084 1068 __u32 zcrx_id; 1085 1069 __u32 __resv2; 1086 1070 __u64 __resv[3]; 1071 + }; 1072 + 1073 + enum zcrx_ctrl_op { 1074 + ZCRX_CTRL_FLUSH_RQ, 1075 + ZCRX_CTRL_EXPORT, 1076 + 1077 + __ZCRX_CTRL_LAST, 1078 + }; 1079 + 1080 + struct zcrx_ctrl_flush_rq { 1081 + __u64 __resv[6]; 1082 + }; 1083 + 1084 + struct zcrx_ctrl_export { 1085 + __u32 zcrx_fd; 1086 + __u32 __resv1[11]; 1087 + }; 1088 + 1089 + struct zcrx_ctrl { 1090 + __u32 zcrx_id; 1091 + __u32 op; /* see enum zcrx_ctrl_op */ 1092 + __u64 __resv[2]; 1093 + 1094 + union { 1095 + struct zcrx_ctrl_export zc_export; 1096 + struct zcrx_ctrl_flush_rq zc_flush; 1097 + }; 1087 1098 }; 1088 1099 1089 1100 #ifdef __cplusplus
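
As a hedged illustration of the new setup flag (not taken from liburing, which may grow its own helper), a ring that accepts both 64B and 128B SQEs can be created with the plain io_uring_setup() syscall:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Returns a ring fd, or -1 with errno set. On a mixed SQ, 128B opcodes
 * such as IORING_OP_NOP128 consume two consecutive 64B SQE slots; all
 * other opcodes keep using a single 64B slot. */
static int setup_mixed_sqe_ring(unsigned int entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQE_MIXED;
	return syscall(__NR_io_uring_setup, entries, &p);
}
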
+24
include/uapi/linux/io_uring/query.h
··· 18 18 19 19 enum { 20 20 IO_URING_QUERY_OPCODES = 0, 21 + IO_URING_QUERY_ZCRX = 1, 22 + IO_URING_QUERY_SCQ = 2, 21 23 22 24 __IO_URING_QUERY_MAX, 23 25 }; ··· 41 39 /* The number of available query opcodes */ 42 40 __u32 nr_query_opcodes; 43 41 __u32 __pad; 42 + }; 43 + 44 + struct io_uring_query_zcrx { 45 + /* Bitmask of supported ZCRX_REG_* flags, */ 46 + __u64 register_flags; 47 + /* Bitmask of all supported IORING_ZCRX_AREA_* flags */ 48 + __u64 area_flags; 49 + /* The number of supported ZCRX_CTRL_* opcodes */ 50 + __u32 nr_ctrl_opcodes; 51 + __u32 __resv1; 52 + /* The refill ring header size */ 53 + __u32 rq_hdr_size; 54 + /* The alignment for the header */ 55 + __u32 rq_hdr_alignment; 56 + __u64 __resv2; 57 + }; 58 + 59 + struct io_uring_query_scq { 60 + /* The SQ/CQ rings header size */ 61 + __u64 hdr_size; 62 + /* The alignment for the header */ 63 + __u64 hdr_alignment; 44 64 }; 45 65 46 66 #endif
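
A hedged sketch of how an application might use the new SCQ query when sizing a user-provided ring region. The query transport itself (IORING_REGISTER_QUERY) is not reproduced here; the only assumption made is that hdr_size bytes, aligned to hdr_alignment (taken to be a power of two), must be reserved for the ring header:

#include <linux/io_uring/query.h>
#include <stddef.h>

static size_t scq_hdr_region_bytes(const struct io_uring_query_scq *q)
{
	size_t align = q->hdr_alignment ? (size_t)q->hdr_alignment : 1;

	/* round the reported header size up to the reported alignment */
	return ((size_t)q->hdr_size + align - 1) & ~(align - 1);
}
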
+270
io_uring/cancel.c
··· 14 14 #include "filetable.h" 15 15 #include "io_uring.h" 16 16 #include "tctx.h" 17 + #include "sqpoll.h" 18 + #include "uring_cmd.h" 17 19 #include "poll.h" 18 20 #include "timeout.h" 19 21 #include "waitid.h" ··· 385 383 } 386 384 io_ring_submit_unlock(ctx, issue_flags); 387 385 return nr ?: -ENOENT; 386 + } 387 + 388 + static bool io_match_linked(struct io_kiocb *head) 389 + { 390 + struct io_kiocb *req; 391 + 392 + io_for_each_link(req, head) { 393 + if (req->flags & REQ_F_INFLIGHT) 394 + return true; 395 + } 396 + return false; 397 + } 398 + 399 + /* 400 + * As io_match_task() but protected against racing with linked timeouts. 401 + * User must not hold timeout_lock. 402 + */ 403 + bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 404 + bool cancel_all) 405 + { 406 + bool matched; 407 + 408 + if (tctx && head->tctx != tctx) 409 + return false; 410 + if (cancel_all) 411 + return true; 412 + 413 + if (head->flags & REQ_F_LINK_TIMEOUT) { 414 + struct io_ring_ctx *ctx = head->ctx; 415 + 416 + /* protect against races with linked timeouts */ 417 + raw_spin_lock_irq(&ctx->timeout_lock); 418 + matched = io_match_linked(head); 419 + raw_spin_unlock_irq(&ctx->timeout_lock); 420 + } else { 421 + matched = io_match_linked(head); 422 + } 423 + return matched; 424 + } 425 + 426 + void __io_uring_cancel(bool cancel_all) 427 + { 428 + io_uring_unreg_ringfd(); 429 + io_uring_cancel_generic(cancel_all, NULL); 430 + } 431 + 432 + struct io_task_cancel { 433 + struct io_uring_task *tctx; 434 + bool all; 435 + }; 436 + 437 + static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 438 + { 439 + struct io_kiocb *req = container_of(work, struct io_kiocb, work); 440 + struct io_task_cancel *cancel = data; 441 + 442 + return io_match_task_safe(req, cancel->tctx, cancel->all); 443 + } 444 + 445 + static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, 446 + struct io_uring_task *tctx, 447 + bool cancel_all) 448 + { 449 + struct io_defer_entry *de; 450 + LIST_HEAD(list); 451 + 452 + list_for_each_entry_reverse(de, &ctx->defer_list, list) { 453 + if (io_match_task_safe(de->req, tctx, cancel_all)) { 454 + list_cut_position(&list, &ctx->defer_list, &de->list); 455 + break; 456 + } 457 + } 458 + if (list_empty(&list)) 459 + return false; 460 + 461 + while (!list_empty(&list)) { 462 + de = list_first_entry(&list, struct io_defer_entry, list); 463 + list_del_init(&de->list); 464 + ctx->nr_drained -= io_linked_nr(de->req); 465 + io_req_task_queue_fail(de->req, -ECANCELED); 466 + kfree(de); 467 + } 468 + return true; 469 + } 470 + 471 + __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 472 + { 473 + struct io_kiocb *req = container_of(work, struct io_kiocb, work); 474 + 475 + return req->ctx == data; 476 + } 477 + 478 + static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 479 + { 480 + struct io_tctx_node *node; 481 + enum io_wq_cancel cret; 482 + bool ret = false; 483 + 484 + mutex_lock(&ctx->uring_lock); 485 + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 486 + struct io_uring_task *tctx = node->task->io_uring; 487 + 488 + /* 489 + * io_wq will stay alive while we hold uring_lock, because it's 490 + * killed after ctx nodes, which requires to take the lock. 
491 + */ 492 + if (!tctx || !tctx->io_wq) 493 + continue; 494 + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 495 + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 496 + } 497 + mutex_unlock(&ctx->uring_lock); 498 + 499 + return ret; 500 + } 501 + 502 + __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 503 + struct io_uring_task *tctx, 504 + bool cancel_all, bool is_sqpoll_thread) 505 + { 506 + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; 507 + enum io_wq_cancel cret; 508 + bool ret = false; 509 + 510 + /* set it so io_req_local_work_add() would wake us up */ 511 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 512 + atomic_set(&ctx->cq_wait_nr, 1); 513 + smp_mb(); 514 + } 515 + 516 + /* failed during ring init, it couldn't have issued any requests */ 517 + if (!ctx->rings) 518 + return false; 519 + 520 + if (!tctx) { 521 + ret |= io_uring_try_cancel_iowq(ctx); 522 + } else if (tctx->io_wq) { 523 + /* 524 + * Cancels requests of all rings, not only @ctx, but 525 + * it's fine as the task is in exit/exec. 526 + */ 527 + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 528 + &cancel, true); 529 + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 530 + } 531 + 532 + /* SQPOLL thread does its own polling */ 533 + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 534 + is_sqpoll_thread) { 535 + while (!wq_list_empty(&ctx->iopoll_list)) { 536 + io_iopoll_try_reap_events(ctx); 537 + ret = true; 538 + cond_resched(); 539 + } 540 + } 541 + 542 + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 543 + io_allowed_defer_tw_run(ctx)) 544 + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; 545 + mutex_lock(&ctx->uring_lock); 546 + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); 547 + ret |= io_poll_remove_all(ctx, tctx, cancel_all); 548 + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); 549 + ret |= io_futex_remove_all(ctx, tctx, cancel_all); 550 + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); 551 + mutex_unlock(&ctx->uring_lock); 552 + ret |= io_kill_timeouts(ctx, tctx, cancel_all); 553 + if (tctx) 554 + ret |= io_run_task_work() > 0; 555 + else 556 + ret |= flush_delayed_work(&ctx->fallback_work); 557 + return ret; 558 + } 559 + 560 + static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 561 + { 562 + if (tracked) 563 + return atomic_read(&tctx->inflight_tracked); 564 + return percpu_counter_sum(&tctx->inflight); 565 + } 566 + 567 + /* 568 + * Find any io_uring ctx that this task has registered or done IO on, and cancel 569 + * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
570 + */ 571 + __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 572 + { 573 + struct io_uring_task *tctx = current->io_uring; 574 + struct io_ring_ctx *ctx; 575 + struct io_tctx_node *node; 576 + unsigned long index; 577 + s64 inflight; 578 + DEFINE_WAIT(wait); 579 + 580 + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); 581 + 582 + if (!current->io_uring) 583 + return; 584 + if (tctx->io_wq) 585 + io_wq_exit_start(tctx->io_wq); 586 + 587 + atomic_inc(&tctx->in_cancel); 588 + do { 589 + bool loop = false; 590 + 591 + io_uring_drop_tctx_refs(current); 592 + if (!tctx_inflight(tctx, !cancel_all)) 593 + break; 594 + 595 + /* read completions before cancelations */ 596 + inflight = tctx_inflight(tctx, false); 597 + if (!inflight) 598 + break; 599 + 600 + if (!sqd) { 601 + xa_for_each(&tctx->xa, index, node) { 602 + /* sqpoll task will cancel all its requests */ 603 + if (node->ctx->sq_data) 604 + continue; 605 + loop |= io_uring_try_cancel_requests(node->ctx, 606 + current->io_uring, 607 + cancel_all, 608 + false); 609 + } 610 + } else { 611 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 612 + loop |= io_uring_try_cancel_requests(ctx, 613 + current->io_uring, 614 + cancel_all, 615 + true); 616 + } 617 + 618 + if (loop) { 619 + cond_resched(); 620 + continue; 621 + } 622 + 623 + prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); 624 + io_run_task_work(); 625 + io_uring_drop_tctx_refs(current); 626 + xa_for_each(&tctx->xa, index, node) { 627 + if (io_local_work_pending(node->ctx)) { 628 + WARN_ON_ONCE(node->ctx->submitter_task && 629 + node->ctx->submitter_task != current); 630 + goto end_wait; 631 + } 632 + } 633 + /* 634 + * If we've seen completions, retry without waiting. This 635 + * avoids a race where a completion comes in before we did 636 + * prepare_to_wait(). 637 + */ 638 + if (inflight == tctx_inflight(tctx, !cancel_all)) 639 + schedule(); 640 + end_wait: 641 + finish_wait(&tctx->wait, &wait); 642 + } while (1); 643 + 644 + io_uring_clean_tctx(tctx); 645 + if (cancel_all) { 646 + /* 647 + * We shouldn't run task_works after cancel, so just leave 648 + * ->in_cancel set for normal exit. 649 + */ 650 + atomic_dec(&tctx->in_cancel); 651 + /* for exec all current's requests should be gone, kill tctx */ 652 + __io_uring_free(current); 653 + } 388 654 }
+7 -1
io_uring/cancel.h
··· 23 23 24 24 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); 25 25 bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); 26 + bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 27 + bool cancel_all); 26 28 27 29 bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 28 30 struct hlist_head *list, bool cancel_all, 29 31 bool (*cancel)(struct io_kiocb *)); 30 - 31 32 int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 32 33 unsigned int issue_flags, struct hlist_head *list, 33 34 bool (*cancel)(struct io_kiocb *)); 35 + __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 36 + struct io_uring_task *tctx, 37 + bool cancel_all, bool is_sqpoll_thread); 38 + __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 39 + __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data); 34 40 35 41 static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) 36 42 {
+22
io_uring/cmd_net.c
··· 132 132 return -EAGAIN; 133 133 } 134 134 135 + static int io_uring_cmd_getsockname(struct socket *sock, 136 + struct io_uring_cmd *cmd, 137 + unsigned int issue_flags) 138 + { 139 + const struct io_uring_sqe *sqe = cmd->sqe; 140 + struct sockaddr __user *uaddr; 141 + unsigned int peer; 142 + int __user *ulen; 143 + 144 + if (sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags) 145 + return -EINVAL; 146 + 147 + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 148 + ulen = u64_to_user_ptr(sqe->addr3); 149 + peer = READ_ONCE(sqe->optlen); 150 + if (peer > 1) 151 + return -EINVAL; 152 + return do_getsockname(sock, peer, uaddr, ulen); 153 + } 154 + 135 155 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) 136 156 { 137 157 struct socket *sock = cmd->file->private_data; ··· 179 159 return io_uring_cmd_setsockopt(sock, cmd, issue_flags); 180 160 case SOCKET_URING_OP_TX_TIMESTAMP: 181 161 return io_uring_cmd_timestamp(sock, cmd, issue_flags); 162 + case SOCKET_URING_OP_GETSOCKNAME: 163 + return io_uring_cmd_getsockname(sock, cmd, issue_flags); 182 164 default: 183 165 return -EOPNOTSUPP; 184 166 }
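
Mirroring the sqe fields the kernel parses above, a userspace prep helper for the new socket command could look roughly like this; prep_getsockname() is an illustrative helper, not a liburing API, and the exact sqe union members are as in the current uapi header:

#include <linux/io_uring.h>
#include <sys/socket.h>
#include <string.h>

static void prep_getsockname(struct io_uring_sqe *sqe, int sockfd,
			     struct sockaddr_storage *addr, int *addrlen,
			     int peer)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = SOCKET_URING_OP_GETSOCKNAME;
	sqe->addr = (unsigned long)addr;	/* sockaddr buffer */
	sqe->addr3 = (unsigned long)addrlen;	/* in/out length pointer */
	sqe->optlen = peer;			/* 0 = local name, 1 = peer */
	/* user_data, flags etc. set by the caller as usual */
}
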
+32 -5
io_uring/fdinfo.c
··· 5 5 #include <linux/file.h> 6 6 #include <linux/proc_fs.h> 7 7 #include <linux/seq_file.h> 8 + #include <linux/nospec.h> 8 9 #include <linux/io_uring.h> 9 10 10 11 #include <uapi/linux/io_uring.h> ··· 15 14 #include "fdinfo.h" 16 15 #include "cancel.h" 17 16 #include "rsrc.h" 17 + #include "opdef.h" 18 18 19 19 #ifdef CONFIG_NET_RX_BUSY_POLL 20 20 static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, ··· 95 93 unsigned int entry = i + sq_head; 96 94 struct io_uring_sqe *sqe; 97 95 unsigned int sq_idx; 96 + bool sqe128 = false; 97 + u8 opcode; 98 98 99 99 if (ctx->flags & IORING_SETUP_NO_SQARRAY) 100 - break; 101 - sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 100 + sq_idx = entry & sq_mask; 101 + else 102 + sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 102 103 if (sq_idx > sq_mask) 103 104 continue; 105 + 104 106 sqe = &ctx->sq_sqes[sq_idx << sq_shift]; 107 + opcode = READ_ONCE(sqe->opcode); 108 + if (opcode >= IORING_OP_LAST) 109 + continue; 110 + opcode = array_index_nospec(opcode, IORING_OP_LAST); 111 + if (sq_shift) { 112 + sqe128 = true; 113 + } else if (io_issue_defs[opcode].is_128) { 114 + if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) { 115 + seq_printf(m, 116 + "%5u: invalid sqe, 128B entry on non-mixed sq\n", 117 + sq_idx); 118 + break; 119 + } 120 + if ((++sq_head & sq_mask) == 0) { 121 + seq_printf(m, 122 + "%5u: corrupted sqe, wrapping 128B entry\n", 123 + sq_idx); 124 + break; 125 + } 126 + sqe128 = true; 127 + } 105 128 seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " 106 129 "addr:0x%llx, rw_flags:0x%x, buf_index:%d " 107 130 "user_data:%llu", 108 - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, 131 + sq_idx, io_uring_get_opcode(opcode), sqe->fd, 109 132 sqe->flags, (unsigned long long) sqe->off, 110 133 (unsigned long long) sqe->addr, sqe->rw_flags, 111 134 sqe->buf_index, sqe->user_data); 112 - if (sq_shift) { 135 + if (sqe128) { 113 136 u64 *sqeb = (void *) (sqe + 1); 114 137 int size = sizeof(struct io_uring_sqe) / sizeof(u64); 115 138 int j; ··· 155 128 cqe = &r->cqes[(cq_head & cq_mask)]; 156 129 if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) 157 130 cqe32 = true; 158 - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", 131 + seq_printf(m, "%5u: user_data:%llu, res:%d, flags:%x", 159 132 cq_head & cq_mask, cqe->user_data, cqe->res, 160 133 cqe->flags); 161 134 if (cqe32)
+31 -26
io_uring/futex.c
··· 17 17 void __user *uaddr; 18 18 unsigned long futex_val; 19 19 unsigned long futex_mask; 20 - unsigned long futexv_owned; 21 20 u32 futex_flags; 22 21 unsigned int futex_nr; 23 22 bool futexv_unqueued; ··· 25 26 struct io_futex_data { 26 27 struct futex_q q; 27 28 struct io_kiocb *req; 29 + }; 30 + 31 + struct io_futexv_data { 32 + unsigned long owned; 33 + struct futex_vector futexv[]; 28 34 }; 29 35 30 36 #define IO_FUTEX_ALLOC_CACHE_MAX 32 ··· 45 41 io_alloc_cache_free(&ctx->futex_cache, kfree); 46 42 } 47 43 48 - static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 44 + static void __io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) 49 45 { 50 - hlist_del_init(&req->hash_node); 51 - io_req_task_complete(req, tw); 46 + hlist_del_init(&tw_req.req->hash_node); 47 + io_req_task_complete(tw_req, tw); 52 48 } 53 49 54 - static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 50 + static void io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) 55 51 { 52 + struct io_kiocb *req = tw_req.req; 56 53 struct io_ring_ctx *ctx = req->ctx; 57 54 58 55 io_tw_lock(ctx, tw); 59 56 io_cache_free(&ctx->futex_cache, req->async_data); 60 57 io_req_async_data_clear(req, 0); 61 - __io_futex_complete(req, tw); 58 + __io_futex_complete(tw_req, tw); 62 59 } 63 60 64 - static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) 61 + static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw) 65 62 { 63 + struct io_kiocb *req = tw_req.req; 66 64 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 67 - struct futex_vector *futexv = req->async_data; 65 + struct io_futexv_data *ifd = req->async_data; 68 66 69 67 io_tw_lock(req->ctx, tw); 70 68 71 69 if (!iof->futexv_unqueued) { 72 70 int res; 73 71 74 - res = futex_unqueue_multiple(futexv, iof->futex_nr); 72 + res = futex_unqueue_multiple(ifd->futexv, iof->futex_nr); 75 73 if (res != -1) 76 74 io_req_set_res(req, res, 0); 77 75 } 78 76 79 77 io_req_async_data_free(req); 80 - __io_futex_complete(req, tw); 78 + __io_futex_complete(tw_req, tw); 81 79 } 82 80 83 - static bool io_futexv_claim(struct io_futex *iof) 81 + static bool io_futexv_claim(struct io_futexv_data *ifd) 84 82 { 85 - if (test_bit(0, &iof->futexv_owned) || 86 - test_and_set_bit_lock(0, &iof->futexv_owned)) 83 + if (test_bit(0, &ifd->owned) || test_and_set_bit_lock(0, &ifd->owned)) 87 84 return false; 88 85 return true; 89 86 } ··· 99 94 return false; 100 95 req->io_task_work.func = io_futex_complete; 101 96 } else { 102 - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 97 + struct io_futexv_data *ifd = req->async_data; 103 98 104 - if (!io_futexv_claim(iof)) 99 + if (!io_futexv_claim(ifd)) 105 100 return false; 106 101 req->io_task_work.func = io_futexv_complete; 107 102 } ··· 157 152 static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) 158 153 { 159 154 struct io_kiocb *req = q->wake_data; 160 - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 155 + struct io_futexv_data *ifd = req->async_data; 161 156 162 - if (!io_futexv_claim(iof)) 157 + if (!io_futexv_claim(ifd)) 163 158 return; 164 159 if (unlikely(!__futex_wake_mark(q))) 165 160 return; ··· 172 167 int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 173 168 { 174 169 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 175 - struct futex_vector *futexv; 170 + struct io_futexv_data *ifd; 176 171 int ret; 177 172 178 173 /* No flags or mask supported for waitv */ ··· 185 180 if 
(!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX) 186 181 return -EINVAL; 187 182 188 - futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL); 189 - if (!futexv) 183 + ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv, iof->futex_nr), 184 + GFP_KERNEL); 185 + if (!ifd) 190 186 return -ENOMEM; 191 187 192 - ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr, 188 + ret = futex_parse_waitv(ifd->futexv, iof->uaddr, iof->futex_nr, 193 189 io_futex_wakev_fn, req); 194 190 if (ret) { 195 - kfree(futexv); 191 + kfree(ifd); 196 192 return ret; 197 193 } 198 194 199 195 /* Mark as inflight, so file exit cancelation will find it */ 200 196 io_req_track_inflight(req); 201 - iof->futexv_owned = 0; 202 197 iof->futexv_unqueued = 0; 203 198 req->flags |= REQ_F_ASYNC_DATA; 204 - req->async_data = futexv; 199 + req->async_data = ifd; 205 200 return 0; 206 201 } 207 202 ··· 221 216 int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) 222 217 { 223 218 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 224 - struct futex_vector *futexv = req->async_data; 219 + struct io_futexv_data *ifd = req->async_data; 225 220 struct io_ring_ctx *ctx = req->ctx; 226 221 int ret, woken = -1; 227 222 228 223 io_ring_submit_lock(ctx, issue_flags); 229 224 230 - ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken); 225 + ret = futex_wait_multiple_setup(ifd->futexv, iof->futex_nr, &woken); 231 226 232 227 /* 233 228 * Error case, ret is < 0. Mark the request as failed.
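
The futexv conversion above folds the claim bit and the vector array into a single async_data allocation. The allocation pattern, shown here with made-up names as a generic illustration, is roughly:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct vec_data {
	unsigned long owned;	/* claim bit, like ifd->owned above */
	u64 vec[];		/* flexible array sized at allocation time */
};

static struct vec_data *alloc_vec_data(unsigned int nr)
{
	/* struct_size_t() computes sizeof(struct vec_data) +
	 * nr * sizeof(u64) with overflow checking */
	return kzalloc(struct_size_t(struct vec_data, vec, nr), GFP_KERNEL);
}
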
+163 -386
io_uring/io_uring.c
··· 124 124 #define IO_REQ_ALLOC_BATCH 8 125 125 #define IO_LOCAL_TW_DEFAULT_MAX 20 126 126 127 - struct io_defer_entry { 128 - struct list_head list; 129 - struct io_kiocb *req; 130 - }; 131 - 132 127 /* requests with any of those set should undergo io_disarm_next() */ 133 128 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 134 129 ··· 134 139 #define IO_CQ_WAKE_INIT (-1U) 135 140 /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 136 141 #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 137 - 138 - static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 139 - struct io_uring_task *tctx, 140 - bool cancel_all, 141 - bool is_sqpoll_thread); 142 142 143 143 static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); 144 144 static void __io_req_caches_free(struct io_ring_ctx *ctx); ··· 197 207 return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); 198 208 } 199 209 200 - static bool io_match_linked(struct io_kiocb *head) 201 - { 202 - struct io_kiocb *req; 203 - 204 - io_for_each_link(req, head) { 205 - if (req->flags & REQ_F_INFLIGHT) 206 - return true; 207 - } 208 - return false; 209 - } 210 - 211 - /* 212 - * As io_match_task() but protected against racing with linked timeouts. 213 - * User must not hold timeout_lock. 214 - */ 215 - bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 216 - bool cancel_all) 217 - { 218 - bool matched; 219 - 220 - if (tctx && head->tctx != tctx) 221 - return false; 222 - if (cancel_all) 223 - return true; 224 - 225 - if (head->flags & REQ_F_LINK_TIMEOUT) { 226 - struct io_ring_ctx *ctx = head->ctx; 227 - 228 - /* protect against races with linked timeouts */ 229 - raw_spin_lock_irq(&ctx->timeout_lock); 230 - matched = io_match_linked(head); 231 - raw_spin_unlock_irq(&ctx->timeout_lock); 232 - } else { 233 - matched = io_match_linked(head); 234 - } 235 - return matched; 236 - } 237 - 238 210 static inline void req_fail_link_node(struct io_kiocb *req, int res) 239 211 { 240 212 req_set_fail(req); ··· 217 265 complete(&ctx->ref_comp); 218 266 } 219 267 268 + /* 269 + * Terminate the request if either of these conditions are true: 270 + * 271 + * 1) It's being executed by the original task, but that task is marked 272 + * with PF_EXITING as it's exiting. 273 + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 274 + * our fallback task_work. 275 + * 3) The ring has been closed and is going away. 
276 + */ 277 + static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 278 + { 279 + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); 280 + } 281 + 220 282 static __cold void io_fallback_req_func(struct work_struct *work) 221 283 { 222 284 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, ··· 241 275 242 276 percpu_ref_get(&ctx->refs); 243 277 mutex_lock(&ctx->uring_lock); 278 + ts.cancel = io_should_terminate_tw(ctx); 244 279 llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 245 - req->io_task_work.func(req, ts); 280 + req->io_task_work.func((struct io_tw_req){req}, ts); 246 281 io_submit_flush_completions(ctx); 247 282 mutex_unlock(&ctx->uring_lock); 248 283 percpu_ref_put(&ctx->refs); ··· 491 524 io_wq_enqueue(tctx->io_wq, &req->work); 492 525 } 493 526 494 - static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) 527 + static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) 495 528 { 496 - io_queue_iowq(req); 529 + io_queue_iowq(tw_req.req); 497 530 } 498 531 499 532 void io_req_queue_iowq(struct io_kiocb *req) ··· 502 535 io_req_task_work_add(req); 503 536 } 504 537 505 - static unsigned io_linked_nr(struct io_kiocb *req) 538 + unsigned io_linked_nr(struct io_kiocb *req) 506 539 { 507 540 struct io_kiocb *tmp; 508 541 unsigned nr = 0; ··· 673 706 tctx->cached_refs += refill; 674 707 } 675 708 676 - static __cold void io_uring_drop_tctx_refs(struct task_struct *task) 709 + __cold void io_uring_drop_tctx_refs(struct task_struct *task) 677 710 { 678 711 struct io_uring_task *tctx = task->io_uring; 679 712 unsigned int refs = tctx->cached_refs; ··· 884 917 } 885 918 886 919 /* 887 - * Must be called from inline task_work so we now a flush will happen later, 920 + * Must be called from inline task_work so we know a flush will happen later, 888 921 * and obviously with ctx->uring_lock held (tw always has that). 889 922 */ 890 923 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) ··· 1116 1149 ctx = req->ctx; 1117 1150 mutex_lock(&ctx->uring_lock); 1118 1151 percpu_ref_get(&ctx->refs); 1152 + ts.cancel = io_should_terminate_tw(ctx); 1119 1153 } 1120 1154 INDIRECT_CALL_2(req->io_task_work.func, 1121 1155 io_poll_task_func, io_req_rw_complete, 1122 - req, ts); 1156 + (struct io_tw_req){req}, ts); 1123 1157 node = next; 1124 1158 (*count)++; 1125 1159 if (unlikely(need_resched())) { ··· 1175 1207 { 1176 1208 struct llist_node *node; 1177 1209 1178 - if (unlikely(current->flags & PF_EXITING)) { 1179 - io_fallback_tw(tctx, true); 1180 - return NULL; 1181 - } 1182 - 1183 1210 node = llist_del_all(&tctx->task_list); 1184 1211 if (node) { 1185 1212 node = llist_reverse_order(node); ··· 1211 1248 BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); 1212 1249 1213 1250 /* 1214 - * We don't know how many reuqests is there in the link and whether 1251 + * We don't know how many requests there are in the link and whether 1215 1252 * they can even be queued lazily, fall back to non-lazy. 
1216 1253 */ 1217 1254 if (req->flags & IO_REQ_LINK_FLAGS) ··· 1343 1380 io_task_work.node); 1344 1381 INDIRECT_CALL_2(req->io_task_work.func, 1345 1382 io_poll_task_func, io_req_rw_complete, 1346 - req, tw); 1383 + (struct io_tw_req){req}, tw); 1347 1384 *node = next; 1348 1385 if (++ret >= events) 1349 1386 break; ··· 1364 1401 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1365 1402 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1366 1403 again: 1404 + tw.cancel = io_should_terminate_tw(ctx); 1367 1405 min_events -= ret; 1368 1406 ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 1369 1407 if (ctx->retry_llist.first) ··· 1401 1437 max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 1402 1438 } 1403 1439 1404 - static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, 1405 - int max_events) 1440 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) 1406 1441 { 1407 1442 struct io_tw_state ts = {}; 1408 1443 int ret; ··· 1412 1449 return ret; 1413 1450 } 1414 1451 1415 - static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) 1452 + static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) 1416 1453 { 1454 + struct io_kiocb *req = tw_req.req; 1455 + 1417 1456 io_tw_lock(req->ctx, tw); 1418 1457 io_req_defer_failed(req, req->cqe.res); 1419 1458 } 1420 1459 1421 - void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) 1460 + void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) 1422 1461 { 1462 + struct io_kiocb *req = tw_req.req; 1423 1463 struct io_ring_ctx *ctx = req->ctx; 1424 1464 1425 1465 io_tw_lock(ctx, tw); 1426 - if (unlikely(io_should_terminate_tw(ctx))) 1466 + if (unlikely(tw.cancel)) 1427 1467 io_req_defer_failed(req, -EFAULT); 1428 1468 else if (req->flags & REQ_F_FORCE_ASYNC) 1429 1469 io_queue_iowq(req); ··· 1555 1589 * We can't just wait for polled events to come to us, we have to actively 1556 1590 * find and complete them. 1557 1591 */ 1558 - static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 1592 + __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 1559 1593 { 1560 1594 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 1561 1595 return; ··· 1658 1692 return 0; 1659 1693 } 1660 1694 1661 - void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) 1695 + void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) 1662 1696 { 1663 - io_req_complete_defer(req); 1697 + io_req_complete_defer(tw_req.req); 1664 1698 } 1665 1699 1666 1700 /* ··· 2119 2153 } 2120 2154 2121 2155 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 2122 - const struct io_uring_sqe *sqe) 2156 + const struct io_uring_sqe *sqe, unsigned int *left) 2123 2157 __must_hold(&ctx->uring_lock) 2124 2158 { 2125 2159 const struct io_issue_def *def; ··· 2145 2179 opcode = array_index_nospec(opcode, IORING_OP_LAST); 2146 2180 2147 2181 def = &io_issue_defs[opcode]; 2182 + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { 2183 + /* 2184 + * A 128b op on a non-128b SQ requires mixed SQE support as 2185 + * well as 2 contiguous entries. 2186 + */ 2187 + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || 2188 + !(ctx->cached_sq_head & (ctx->sq_entries - 1))) 2189 + return io_init_fail_req(req, -EINVAL); 2190 + /* 2191 + * A 128b operation on a mixed SQ uses two entries, so we have 2192 + * to increment the head and cached refs, and decrement what's 2193 + * left. 
2194 + */ 2195 + current->io_uring->cached_refs++; 2196 + ctx->cached_sq_head++; 2197 + (*left)--; 2198 + } 2199 + 2148 2200 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { 2149 2201 /* enforce forwards compatibility on users */ 2150 2202 if (sqe_flags & ~SQE_VALID_FLAGS) ··· 2272 2288 } 2273 2289 2274 2290 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 2275 - const struct io_uring_sqe *sqe) 2291 + const struct io_uring_sqe *sqe, unsigned int *left) 2276 2292 __must_hold(&ctx->uring_lock) 2277 2293 { 2278 2294 struct io_submit_link *link = &ctx->submit_state.link; 2279 2295 int ret; 2280 2296 2281 - ret = io_init_req(ctx, req, sqe); 2297 + ret = io_init_req(ctx, req, sqe, left); 2282 2298 if (unlikely(ret)) 2283 2299 return io_submit_fail_init(sqe, req, ret); 2284 2300 ··· 2408 2424 unsigned int left; 2409 2425 int ret; 2410 2426 2427 + entries = min(nr, entries); 2411 2428 if (unlikely(!entries)) 2412 2429 return 0; 2413 - /* make sure SQ entry isn't read before tail */ 2414 - ret = left = min(nr, entries); 2430 + 2431 + ret = left = entries; 2415 2432 io_get_task_refs(left); 2416 2433 io_submit_state_start(&ctx->submit_state, left); 2417 2434 ··· 2431 2446 * Continue submitting even for sqe failure if the 2432 2447 * ring was setup with IORING_SETUP_SUBMIT_ALL 2433 2448 */ 2434 - if (unlikely(io_submit_sqe(ctx, req, sqe)) && 2449 + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && 2435 2450 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { 2436 2451 left--; 2437 2452 break; ··· 2753 2768 2754 2769 static void io_rings_free(struct io_ring_ctx *ctx) 2755 2770 { 2756 - io_free_region(ctx, &ctx->sq_region); 2757 - io_free_region(ctx, &ctx->ring_region); 2771 + io_free_region(ctx->user, &ctx->sq_region); 2772 + io_free_region(ctx->user, &ctx->ring_region); 2758 2773 ctx->rings = NULL; 2759 2774 ctx->sq_sqes = NULL; 2760 2775 } 2761 2776 2762 - unsigned long rings_size(unsigned int flags, unsigned int sq_entries, 2763 - unsigned int cq_entries, size_t *sq_offset) 2777 + static int rings_size(unsigned int flags, unsigned int sq_entries, 2778 + unsigned int cq_entries, struct io_rings_layout *rl) 2764 2779 { 2765 2780 struct io_rings *rings; 2766 - size_t off, sq_array_size; 2781 + size_t sqe_size; 2782 + size_t off; 2767 2783 2768 - off = struct_size(rings, cqes, cq_entries); 2769 - if (off == SIZE_MAX) 2770 - return SIZE_MAX; 2771 - if (flags & IORING_SETUP_CQE32) { 2772 - if (check_shl_overflow(off, 1, &off)) 2773 - return SIZE_MAX; 2774 - } 2775 2784 if (flags & IORING_SETUP_CQE_MIXED) { 2776 2785 if (cq_entries < 2) 2777 - return SIZE_MAX; 2786 + return -EOVERFLOW; 2778 2787 } 2788 + if (flags & IORING_SETUP_SQE_MIXED) { 2789 + if (sq_entries < 2) 2790 + return -EOVERFLOW; 2791 + } 2792 + 2793 + rl->sq_array_offset = SIZE_MAX; 2794 + 2795 + sqe_size = sizeof(struct io_uring_sqe); 2796 + if (flags & IORING_SETUP_SQE128) 2797 + sqe_size *= 2; 2798 + 2799 + rl->sq_size = array_size(sqe_size, sq_entries); 2800 + if (rl->sq_size == SIZE_MAX) 2801 + return -EOVERFLOW; 2802 + 2803 + off = struct_size(rings, cqes, cq_entries); 2804 + if (flags & IORING_SETUP_CQE32) 2805 + off = size_mul(off, 2); 2806 + if (off == SIZE_MAX) 2807 + return -EOVERFLOW; 2779 2808 2780 2809 #ifdef CONFIG_SMP 2781 2810 off = ALIGN(off, SMP_CACHE_BYTES); 2782 2811 if (off == 0) 2783 - return SIZE_MAX; 2812 + return -EOVERFLOW; 2784 2813 #endif 2785 2814 2786 - if (flags & IORING_SETUP_NO_SQARRAY) { 2787 - *sq_offset = SIZE_MAX; 2788 - return off; 2815 + if (!(flags & IORING_SETUP_NO_SQARRAY)) { 
2816 + size_t sq_array_size; 2817 + 2818 + rl->sq_array_offset = off; 2819 + 2820 + sq_array_size = array_size(sizeof(u32), sq_entries); 2821 + off = size_add(off, sq_array_size); 2822 + if (off == SIZE_MAX) 2823 + return -EOVERFLOW; 2789 2824 } 2790 2825 2791 - *sq_offset = off; 2792 - 2793 - sq_array_size = array_size(sizeof(u32), sq_entries); 2794 - if (sq_array_size == SIZE_MAX) 2795 - return SIZE_MAX; 2796 - 2797 - if (check_add_overflow(off, sq_array_size, &off)) 2798 - return SIZE_MAX; 2799 - 2800 - return off; 2826 + rl->rings_size = off; 2827 + return 0; 2801 2828 } 2802 2829 2803 2830 static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) ··· 2847 2850 io_eventfd_unregister(ctx); 2848 2851 io_free_alloc_caches(ctx); 2849 2852 io_destroy_buffers(ctx); 2850 - io_free_region(ctx, &ctx->param_region); 2853 + io_free_region(ctx->user, &ctx->param_region); 2851 2854 mutex_unlock(&ctx->uring_lock); 2852 2855 if (ctx->sq_creds) 2853 2856 put_cred(ctx->sq_creds); ··· 2977 2980 complete(&work->completion); 2978 2981 } 2979 2982 2980 - static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 2981 - { 2982 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 2983 - 2984 - return req->ctx == data; 2985 - } 2986 - 2987 2983 static __cold void io_ring_exit_work(struct work_struct *work) 2988 2984 { 2989 2985 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); ··· 2996 3006 if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { 2997 3007 mutex_lock(&ctx->uring_lock); 2998 3008 io_cqring_overflow_kill(ctx); 2999 - mutex_unlock(&ctx->uring_lock); 3000 - } 3001 - if (!xa_empty(&ctx->zcrx_ctxs)) { 3002 - mutex_lock(&ctx->uring_lock); 3003 - io_shutdown_zcrx_ifqs(ctx); 3004 3009 mutex_unlock(&ctx->uring_lock); 3005 3010 } 3006 3011 ··· 3103 3118 file->private_data = NULL; 3104 3119 io_ring_ctx_wait_and_kill(ctx); 3105 3120 return 0; 3106 - } 3107 - 3108 - struct io_task_cancel { 3109 - struct io_uring_task *tctx; 3110 - bool all; 3111 - }; 3112 - 3113 - static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 3114 - { 3115 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 3116 - struct io_task_cancel *cancel = data; 3117 - 3118 - return io_match_task_safe(req, cancel->tctx, cancel->all); 3119 - } 3120 - 3121 - static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, 3122 - struct io_uring_task *tctx, 3123 - bool cancel_all) 3124 - { 3125 - struct io_defer_entry *de; 3126 - LIST_HEAD(list); 3127 - 3128 - list_for_each_entry_reverse(de, &ctx->defer_list, list) { 3129 - if (io_match_task_safe(de->req, tctx, cancel_all)) { 3130 - list_cut_position(&list, &ctx->defer_list, &de->list); 3131 - break; 3132 - } 3133 - } 3134 - if (list_empty(&list)) 3135 - return false; 3136 - 3137 - while (!list_empty(&list)) { 3138 - de = list_first_entry(&list, struct io_defer_entry, list); 3139 - list_del_init(&de->list); 3140 - ctx->nr_drained -= io_linked_nr(de->req); 3141 - io_req_task_queue_fail(de->req, -ECANCELED); 3142 - kfree(de); 3143 - } 3144 - return true; 3145 - } 3146 - 3147 - static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 3148 - { 3149 - struct io_tctx_node *node; 3150 - enum io_wq_cancel cret; 3151 - bool ret = false; 3152 - 3153 - mutex_lock(&ctx->uring_lock); 3154 - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 3155 - struct io_uring_task *tctx = node->task->io_uring; 3156 - 3157 - /* 3158 - * io_wq will stay alive while we hold uring_lock, because it's 3159 - * killed 
after ctx nodes, which requires to take the lock. 3160 - */ 3161 - if (!tctx || !tctx->io_wq) 3162 - continue; 3163 - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 3164 - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 3165 - } 3166 - mutex_unlock(&ctx->uring_lock); 3167 - 3168 - return ret; 3169 - } 3170 - 3171 - static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 3172 - struct io_uring_task *tctx, 3173 - bool cancel_all, 3174 - bool is_sqpoll_thread) 3175 - { 3176 - struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; 3177 - enum io_wq_cancel cret; 3178 - bool ret = false; 3179 - 3180 - /* set it so io_req_local_work_add() would wake us up */ 3181 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 3182 - atomic_set(&ctx->cq_wait_nr, 1); 3183 - smp_mb(); 3184 - } 3185 - 3186 - /* failed during ring init, it couldn't have issued any requests */ 3187 - if (!ctx->rings) 3188 - return false; 3189 - 3190 - if (!tctx) { 3191 - ret |= io_uring_try_cancel_iowq(ctx); 3192 - } else if (tctx->io_wq) { 3193 - /* 3194 - * Cancels requests of all rings, not only @ctx, but 3195 - * it's fine as the task is in exit/exec. 3196 - */ 3197 - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 3198 - &cancel, true); 3199 - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 3200 - } 3201 - 3202 - /* SQPOLL thread does its own polling */ 3203 - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 3204 - is_sqpoll_thread) { 3205 - while (!wq_list_empty(&ctx->iopoll_list)) { 3206 - io_iopoll_try_reap_events(ctx); 3207 - ret = true; 3208 - cond_resched(); 3209 - } 3210 - } 3211 - 3212 - if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 3213 - io_allowed_defer_tw_run(ctx)) 3214 - ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; 3215 - mutex_lock(&ctx->uring_lock); 3216 - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); 3217 - ret |= io_poll_remove_all(ctx, tctx, cancel_all); 3218 - ret |= io_waitid_remove_all(ctx, tctx, cancel_all); 3219 - ret |= io_futex_remove_all(ctx, tctx, cancel_all); 3220 - ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); 3221 - mutex_unlock(&ctx->uring_lock); 3222 - ret |= io_kill_timeouts(ctx, tctx, cancel_all); 3223 - if (tctx) 3224 - ret |= io_run_task_work() > 0; 3225 - else 3226 - ret |= flush_delayed_work(&ctx->fallback_work); 3227 - return ret; 3228 - } 3229 - 3230 - static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 3231 - { 3232 - if (tracked) 3233 - return atomic_read(&tctx->inflight_tracked); 3234 - return percpu_counter_sum(&tctx->inflight); 3235 - } 3236 - 3237 - /* 3238 - * Find any io_uring ctx that this task has registered or done IO on, and cancel 3239 - * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
3240 - */ 3241 - __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 3242 - { 3243 - struct io_uring_task *tctx = current->io_uring; 3244 - struct io_ring_ctx *ctx; 3245 - struct io_tctx_node *node; 3246 - unsigned long index; 3247 - s64 inflight; 3248 - DEFINE_WAIT(wait); 3249 - 3250 - WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); 3251 - 3252 - if (!current->io_uring) 3253 - return; 3254 - if (tctx->io_wq) 3255 - io_wq_exit_start(tctx->io_wq); 3256 - 3257 - atomic_inc(&tctx->in_cancel); 3258 - do { 3259 - bool loop = false; 3260 - 3261 - io_uring_drop_tctx_refs(current); 3262 - if (!tctx_inflight(tctx, !cancel_all)) 3263 - break; 3264 - 3265 - /* read completions before cancelations */ 3266 - inflight = tctx_inflight(tctx, false); 3267 - if (!inflight) 3268 - break; 3269 - 3270 - if (!sqd) { 3271 - xa_for_each(&tctx->xa, index, node) { 3272 - /* sqpoll task will cancel all its requests */ 3273 - if (node->ctx->sq_data) 3274 - continue; 3275 - loop |= io_uring_try_cancel_requests(node->ctx, 3276 - current->io_uring, 3277 - cancel_all, 3278 - false); 3279 - } 3280 - } else { 3281 - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 3282 - loop |= io_uring_try_cancel_requests(ctx, 3283 - current->io_uring, 3284 - cancel_all, 3285 - true); 3286 - } 3287 - 3288 - if (loop) { 3289 - cond_resched(); 3290 - continue; 3291 - } 3292 - 3293 - prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); 3294 - io_run_task_work(); 3295 - io_uring_drop_tctx_refs(current); 3296 - xa_for_each(&tctx->xa, index, node) { 3297 - if (io_local_work_pending(node->ctx)) { 3298 - WARN_ON_ONCE(node->ctx->submitter_task && 3299 - node->ctx->submitter_task != current); 3300 - goto end_wait; 3301 - } 3302 - } 3303 - /* 3304 - * If we've seen completions, retry without waiting. This 3305 - * avoids a race where a completion comes in before we did 3306 - * prepare_to_wait(). 3307 - */ 3308 - if (inflight == tctx_inflight(tctx, !cancel_all)) 3309 - schedule(); 3310 - end_wait: 3311 - finish_wait(&tctx->wait, &wait); 3312 - } while (1); 3313 - 3314 - io_uring_clean_tctx(tctx); 3315 - if (cancel_all) { 3316 - /* 3317 - * We shouldn't run task_works after cancel, so just leave 3318 - * ->in_cancel set for normal exit. 
3319 - */ 3320 - atomic_dec(&tctx->in_cancel); 3321 - /* for exec all current's requests should be gone, kill tctx */ 3322 - __io_uring_free(current); 3323 - } 3324 - } 3325 - 3326 - void __io_uring_cancel(bool cancel_all) 3327 - { 3328 - io_uring_unreg_ringfd(); 3329 - io_uring_cancel_generic(cancel_all, NULL); 3330 3121 } 3331 3122 3332 3123 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, ··· 3355 3594 } 3356 3595 3357 3596 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, 3358 - struct io_uring_params *p) 3597 + struct io_ctx_config *config) 3359 3598 { 3599 + struct io_uring_params *p = &config->p; 3600 + struct io_rings_layout *rl = &config->layout; 3360 3601 struct io_uring_region_desc rd; 3361 3602 struct io_rings *rings; 3362 - size_t size, sq_array_offset; 3363 3603 int ret; 3364 3604 3365 3605 /* make sure these are sane, as we already accounted them */ 3366 3606 ctx->sq_entries = p->sq_entries; 3367 3607 ctx->cq_entries = p->cq_entries; 3368 3608 3369 - size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, 3370 - &sq_array_offset); 3371 - if (size == SIZE_MAX) 3372 - return -EOVERFLOW; 3373 - 3374 3609 memset(&rd, 0, sizeof(rd)); 3375 - rd.size = PAGE_ALIGN(size); 3610 + rd.size = PAGE_ALIGN(rl->rings_size); 3376 3611 if (ctx->flags & IORING_SETUP_NO_MMAP) { 3377 3612 rd.user_addr = p->cq_off.user_addr; 3378 3613 rd.flags |= IORING_MEM_REGION_TYPE_USER; ··· 3377 3620 if (ret) 3378 3621 return ret; 3379 3622 ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); 3380 - 3381 3623 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3382 - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 3383 - rings->sq_ring_mask = p->sq_entries - 1; 3384 - rings->cq_ring_mask = p->cq_entries - 1; 3385 - rings->sq_ring_entries = p->sq_entries; 3386 - rings->cq_ring_entries = p->cq_entries; 3387 - 3388 - if (p->flags & IORING_SETUP_SQE128) 3389 - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); 3390 - else 3391 - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 3392 - if (size == SIZE_MAX) { 3393 - io_rings_free(ctx); 3394 - return -EOVERFLOW; 3395 - } 3624 + ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); 3396 3625 3397 3626 memset(&rd, 0, sizeof(rd)); 3398 - rd.size = PAGE_ALIGN(size); 3627 + rd.size = PAGE_ALIGN(rl->sq_size); 3399 3628 if (ctx->flags & IORING_SETUP_NO_MMAP) { 3400 3629 rd.user_addr = p->sq_off.user_addr; 3401 3630 rd.flags |= IORING_MEM_REGION_TYPE_USER; ··· 3392 3649 return ret; 3393 3650 } 3394 3651 ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); 3652 + 3653 + memset(rings, 0, sizeof(*rings)); 3654 + WRITE_ONCE(rings->sq_ring_mask, ctx->sq_entries - 1); 3655 + WRITE_ONCE(rings->cq_ring_mask, ctx->cq_entries - 1); 3656 + WRITE_ONCE(rings->sq_ring_entries, ctx->sq_entries); 3657 + WRITE_ONCE(rings->cq_ring_entries, ctx->cq_entries); 3395 3658 return 0; 3396 3659 } 3397 3660 ··· 3427 3678 static int io_uring_sanitise_params(struct io_uring_params *p) 3428 3679 { 3429 3680 unsigned flags = p->flags; 3681 + 3682 + if (flags & ~IORING_SETUP_FLAGS) 3683 + return -EINVAL; 3430 3684 3431 3685 /* There is no way to mmap rings without a real fd */ 3432 3686 if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && ··· 3469 3717 if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == 3470 3718 (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) 3471 3719 return -EINVAL; 3720 + /* 3721 + * Nonsensical to ask for SQE128 and mixed SQE support, it's not 3722 + * supported to post 64b SQEs on a ring 
setup with SQE128. 3723 + */ 3724 + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == 3725 + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) 3726 + return -EINVAL; 3472 3727 3473 3728 return 0; 3474 3729 } 3475 3730 3476 - int io_uring_fill_params(unsigned entries, struct io_uring_params *p) 3731 + static int io_uring_fill_params(struct io_uring_params *p) 3477 3732 { 3733 + unsigned entries = p->sq_entries; 3734 + 3478 3735 if (!entries) 3479 3736 return -EINVAL; 3480 3737 if (entries > IORING_MAX_ENTRIES) { ··· 3521 3760 p->cq_entries = 2 * p->sq_entries; 3522 3761 } 3523 3762 3763 + return 0; 3764 + } 3765 + 3766 + int io_prepare_config(struct io_ctx_config *config) 3767 + { 3768 + struct io_uring_params *p = &config->p; 3769 + int ret; 3770 + 3771 + ret = io_uring_sanitise_params(p); 3772 + if (ret) 3773 + return ret; 3774 + 3775 + ret = io_uring_fill_params(p); 3776 + if (ret) 3777 + return ret; 3778 + 3779 + ret = rings_size(p->flags, p->sq_entries, p->cq_entries, 3780 + &config->layout); 3781 + if (ret) 3782 + return ret; 3783 + 3524 3784 p->sq_off.head = offsetof(struct io_rings, sq.head); 3525 3785 p->sq_off.tail = offsetof(struct io_rings, sq.tail); 3526 3786 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); ··· 3562 3780 p->cq_off.resv1 = 0; 3563 3781 if (!(p->flags & IORING_SETUP_NO_MMAP)) 3564 3782 p->cq_off.user_addr = 0; 3783 + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) 3784 + p->sq_off.array = config->layout.sq_array_offset; 3565 3785 3566 3786 return 0; 3567 3787 } 3568 3788 3569 - static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, 3570 - struct io_uring_params __user *params) 3789 + static __cold int io_uring_create(struct io_ctx_config *config) 3571 3790 { 3791 + struct io_uring_params *p = &config->p; 3572 3792 struct io_ring_ctx *ctx; 3573 3793 struct io_uring_task *tctx; 3574 3794 struct file *file; 3575 3795 int ret; 3576 3796 3577 - ret = io_uring_sanitise_params(p); 3797 + ret = io_prepare_config(config); 3578 3798 if (ret) 3579 - return ret; 3580 - 3581 - ret = io_uring_fill_params(entries, p); 3582 - if (unlikely(ret)) 3583 3799 return ret; 3584 3800 3585 3801 ctx = io_ring_ctx_alloc(p); ··· 3637 3857 mmgrab(current->mm); 3638 3858 ctx->mm_account = current->mm; 3639 3859 3640 - ret = io_allocate_scq_urings(ctx, p); 3860 + ret = io_allocate_scq_urings(ctx, config); 3641 3861 if (ret) 3642 3862 goto err; 3643 - 3644 - if (!(p->flags & IORING_SETUP_NO_SQARRAY)) 3645 - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 3646 3863 3647 3864 ret = io_sq_offload_create(ctx, p); 3648 3865 if (ret) ··· 3647 3870 3648 3871 p->features = IORING_FEAT_FLAGS; 3649 3872 3650 - if (copy_to_user(params, p, sizeof(*p))) { 3873 + if (copy_to_user(config->uptr, p, sizeof(*p))) { 3651 3874 ret = -EFAULT; 3652 3875 goto err; 3653 3876 } ··· 3700 3923 */ 3701 3924 static long io_uring_setup(u32 entries, struct io_uring_params __user *params) 3702 3925 { 3703 - struct io_uring_params p; 3704 - int i; 3926 + struct io_ctx_config config; 3705 3927 3706 - if (copy_from_user(&p, params, sizeof(p))) 3928 + memset(&config, 0, sizeof(config)); 3929 + 3930 + if (copy_from_user(&config.p, params, sizeof(config.p))) 3707 3931 return -EFAULT; 3708 - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { 3709 - if (p.resv[i]) 3710 - return -EINVAL; 3711 - } 3712 3932 3713 - if (p.flags & ~IORING_SETUP_FLAGS) 3933 + if (!mem_is_zero(&config.p.resv, sizeof(config.p.resv))) 3714 3934 return -EINVAL; 3715 - return io_uring_create(entries, &p, params); 
3935 + 3936 + config.p.sq_entries = entries; 3937 + config.uptr = params; 3938 + return io_uring_create(&config); 3716 3939 } 3717 3940 3718 3941 static inline int io_uring_allowed(void)
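The sanitise/fill split above also carries the new IORING_SETUP_SQE_MIXED checks. A minimal userspace sketch of the resulting behaviour (not part of the patch; it assumes a uapi io_uring.h that already defines IORING_SETUP_SQE_MIXED, and uses the raw setup syscall rather than liburing):

/*
 * Illustrative only: a mixed-SQE ring should set up on a kernel with this
 * series, while combining SQE128 with SQE_MIXED is rejected with -EINVAL
 * by io_uring_sanitise_params().
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

static int ring_setup(unsigned entries, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags;
	return syscall(__NR_io_uring_setup, entries, &p);
}

int main(void)
{
	/* mostly 64b SQEs, with the occasional 128b one */
	int fd = ring_setup(8, IORING_SETUP_SQE_MIXED);

	if (fd < 0)
		perror("io_uring_setup(SQE_MIXED)");
	else
		close(fd);

	/* SQE128 and SQE_MIXED are mutually exclusive */
	if (ring_setup(8, IORING_SETUP_SQE128 | IORING_SETUP_SQE_MIXED) < 0 &&
	    errno == EINVAL)
		printf("SQE128|SQE_MIXED rejected as expected\n");
	return 0;
}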
+29 -34
io_uring/io_uring.h
··· 17 17 #include <trace/events/io_uring.h> 18 18 #endif 19 19 20 + struct io_rings_layout { 21 + /* size of CQ + headers + SQ offset array */ 22 + size_t rings_size; 23 + size_t sq_size; 24 + 25 + size_t sq_array_offset; 26 + }; 27 + 28 + struct io_ctx_config { 29 + struct io_uring_params p; 30 + struct io_rings_layout layout; 31 + struct io_uring_params __user *uptr; 32 + }; 33 + 20 34 #define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ 21 35 IORING_FEAT_NODROP |\ 22 36 IORING_FEAT_SUBMIT_STABLE |\ ··· 68 54 IORING_SETUP_REGISTERED_FD_ONLY |\ 69 55 IORING_SETUP_NO_SQARRAY |\ 70 56 IORING_SETUP_HYBRID_IOPOLL |\ 71 - IORING_SETUP_CQE_MIXED) 57 + IORING_SETUP_CQE_MIXED |\ 58 + IORING_SETUP_SQE_MIXED) 72 59 73 60 #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ 74 61 IORING_ENTER_SQ_WAKEUP |\ ··· 110 95 IOU_REQUEUE = -3072, 111 96 }; 112 97 98 + struct io_defer_entry { 99 + struct list_head list; 100 + struct io_kiocb *req; 101 + }; 102 + 113 103 struct io_wait_queue { 114 104 struct wait_queue_entry wq; 115 105 struct io_ring_ctx *ctx; ··· 148 128 #define IORING_MAX_ENTRIES 32768 149 129 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 150 130 151 - unsigned long rings_size(unsigned int flags, unsigned int sq_entries, 152 - unsigned int cq_entries, size_t *sq_offset); 153 - int io_uring_fill_params(unsigned entries, struct io_uring_params *p); 131 + int io_prepare_config(struct io_ctx_config *config); 132 + 154 133 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); 155 134 int io_run_task_work_sig(struct io_ring_ctx *ctx); 135 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); 156 136 void io_req_defer_failed(struct io_kiocb *req, s32 res); 157 137 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 158 138 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); ··· 160 140 bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]); 161 141 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 162 142 143 + unsigned io_linked_nr(struct io_kiocb *req); 163 144 void io_req_track_inflight(struct io_kiocb *req); 164 145 struct file *io_file_get_normal(struct io_kiocb *req, int fd); 165 146 struct file *io_file_get_fixed(struct io_kiocb *req, int fd, ··· 169 148 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 170 149 void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); 171 150 void io_req_task_queue(struct io_kiocb *req); 172 - void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); 151 + void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); 173 152 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 174 - void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); 153 + void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); 175 154 struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 176 155 struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 177 156 void tctx_task_work(struct callback_head *cb); 178 - __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 157 + __cold void io_uring_drop_tctx_refs(struct task_struct *task); 179 158 180 159 int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, 181 160 int start, int end); ··· 184 163 int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); 185 
164 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 186 165 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 166 + __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx); 187 167 void __io_submit_flush_completions(struct io_ring_ctx *ctx); 188 168 189 169 struct io_wq_work *io_wq_free_work(struct io_wq_work *work); ··· 194 172 void io_queue_next(struct io_kiocb *req); 195 173 void io_task_refs_refill(struct io_uring_task *tctx); 196 174 bool __io_alloc_req_refill(struct io_ring_ctx *ctx); 197 - 198 - bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 199 - bool cancel_all); 200 175 201 176 void io_activate_pollwq(struct io_ring_ctx *ctx); 202 177 ··· 577 558 ctx->submitter_task == current); 578 559 } 579 560 580 - /* 581 - * Terminate the request if either of these conditions are true: 582 - * 583 - * 1) It's being executed by the original task, but that task is marked 584 - * with PF_EXITING as it's exiting. 585 - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 586 - * our fallback task_work. 587 - */ 588 - static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 589 - { 590 - return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); 591 - } 592 - 593 561 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) 594 562 { 595 563 io_req_set_res(req, res, 0); 596 564 req->io_task_work.func = io_req_task_complete; 597 565 io_req_task_work_add(req); 598 - } 599 - 600 - /* 601 - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each 602 - * slot. 603 - */ 604 - static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) 605 - { 606 - if (ctx->flags & IORING_SETUP_SQE128) 607 - return 2 * sizeof(struct io_uring_sqe); 608 - return sizeof(struct io_uring_sqe); 609 566 } 610 567 611 568 static inline bool io_file_can_poll(struct io_kiocb *req)
+3 -3
io_uring/kbuf.c
··· 428 428 static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) 429 429 { 430 430 if (bl->flags & IOBL_BUF_RING) 431 - io_free_region(ctx, &bl->region); 431 + io_free_region(ctx->user, &bl->region); 432 432 else 433 433 io_remove_buffers_legacy(ctx, bl, -1U); 434 434 ··· 641 641 rd.user_addr = reg.ring_addr; 642 642 rd.flags |= IORING_MEM_REGION_TYPE_USER; 643 643 } 644 - ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); 644 + ret = io_create_region(ctx, &bl->region, &rd, mmap_offset); 645 645 if (ret) 646 646 goto fail; 647 647 br = io_region_get_ptr(&bl->region); ··· 672 672 io_buffer_add_list(ctx, bl, reg.bgid); 673 673 return 0; 674 674 fail: 675 - io_free_region(ctx, &bl->region); 675 + io_free_region(ctx->user, &bl->region); 676 676 kfree(bl); 677 677 return ret; 678 678 }
+2 -3
io_uring/kbuf.h
··· 14 14 15 15 struct io_buffer_list { 16 16 /* 17 - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, 18 - * then these are classic provided buffers and ->buf_list is used. 17 + * If the IOBL_BUF_RING flag is set, then buf_ring is used. If not, then 18 + * these are classic provided buffers and ->buf_list is used. 19 19 */ 20 20 union { 21 21 struct list_head buf_list; ··· 27 27 __u16 bgid; 28 28 29 29 /* below is for ring provided buffers */ 30 - __u16 buf_nr_pages; 31 30 __u16 nr_entries; 32 31 __u16 head; 33 32 __u16 mask;
+17 -42
io_uring/memmap.c
··· 15 15 #include "rsrc.h" 16 16 #include "zcrx.h" 17 17 18 - static void *io_mem_alloc_compound(struct page **pages, int nr_pages, 19 - size_t size, gfp_t gfp) 18 + static bool io_mem_alloc_compound(struct page **pages, int nr_pages, 19 + size_t size, gfp_t gfp) 20 20 { 21 21 struct page *page; 22 22 int i, order; 23 23 24 24 order = get_order(size); 25 25 if (order > MAX_PAGE_ORDER) 26 - return ERR_PTR(-ENOMEM); 26 + return false; 27 27 else if (order) 28 28 gfp |= __GFP_COMP; 29 29 30 30 page = alloc_pages(gfp, order); 31 31 if (!page) 32 - return ERR_PTR(-ENOMEM); 32 + return false; 33 33 34 34 for (i = 0; i < nr_pages; i++) 35 35 pages[i] = page + i; 36 36 37 - return page_address(page); 37 + return true; 38 38 } 39 39 40 40 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) ··· 88 88 IO_REGION_F_SINGLE_REF = 4, 89 89 }; 90 90 91 - void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) 91 + void io_free_region(struct user_struct *user, struct io_mapped_region *mr) 92 92 { 93 93 if (mr->pages) { 94 94 long nr_refs = mr->nr_pages; ··· 105 105 } 106 106 if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) 107 107 vunmap(mr->ptr); 108 - if (mr->nr_pages && ctx->user) 109 - __io_unaccount_mem(ctx->user, mr->nr_pages); 108 + if (mr->nr_pages && user) 109 + __io_unaccount_mem(user, mr->nr_pages); 110 110 111 111 memset(mr, 0, sizeof(*mr)); 112 112 } ··· 131 131 return 0; 132 132 } 133 133 134 - static int io_region_pin_pages(struct io_ring_ctx *ctx, 135 - struct io_mapped_region *mr, 136 - struct io_uring_region_desc *reg) 134 + static int io_region_pin_pages(struct io_mapped_region *mr, 135 + struct io_uring_region_desc *reg) 137 136 { 138 - unsigned long size = (size_t) mr->nr_pages << PAGE_SHIFT; 137 + size_t size = io_region_size(mr); 139 138 struct page **pages; 140 139 int nr_pages; 141 140 ··· 149 150 return 0; 150 151 } 151 152 152 - static int io_region_allocate_pages(struct io_ring_ctx *ctx, 153 - struct io_mapped_region *mr, 153 + static int io_region_allocate_pages(struct io_mapped_region *mr, 154 154 struct io_uring_region_desc *reg, 155 155 unsigned long mmap_offset) 156 156 { 157 157 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; 158 - size_t size = (size_t) mr->nr_pages << PAGE_SHIFT; 158 + size_t size = io_region_size(mr); 159 159 unsigned long nr_allocated; 160 160 struct page **pages; 161 - void *p; 162 161 163 162 pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); 164 163 if (!pages) 165 164 return -ENOMEM; 166 165 167 - p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); 168 - if (!IS_ERR(p)) { 166 + if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) { 169 167 mr->flags |= IO_REGION_F_SINGLE_REF; 170 168 goto done; 171 169 } ··· 215 219 mr->nr_pages = nr_pages; 216 220 217 221 if (reg->flags & IORING_MEM_REGION_TYPE_USER) 218 - ret = io_region_pin_pages(ctx, mr, reg); 222 + ret = io_region_pin_pages(mr, reg); 219 223 else 220 - ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); 224 + ret = io_region_allocate_pages(mr, reg, mmap_offset); 221 225 if (ret) 222 226 goto out_free; 223 227 ··· 226 230 goto out_free; 227 231 return 0; 228 232 out_free: 229 - io_free_region(ctx, mr); 233 + io_free_region(ctx->user, mr); 230 234 return ret; 231 - } 232 - 233 - int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 234 - struct io_uring_region_desc *reg, 235 - unsigned long mmap_offset) 236 - { 237 - struct io_mapped_region tmp_mr; 238 - int ret; 239 - 240 - 
memcpy(&tmp_mr, mr, sizeof(tmp_mr)); 241 - ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); 242 - if (ret) 243 - return ret; 244 - 245 - /* 246 - * Once published mmap can find it without holding only the ->mmap_lock 247 - * and not ->uring_lock. 248 - */ 249 - guard(mutex)(&ctx->mmap_lock); 250 - memcpy(mr, &tmp_mr, sizeof(tmp_mr)); 251 - return 0; 252 235 } 253 236 254 237 static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
+18 -6
io_uring/memmap.h
··· 16 16 unsigned long flags); 17 17 int io_uring_mmap(struct file *file, struct vm_area_struct *vma); 18 18 19 - void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); 19 + void io_free_region(struct user_struct *user, struct io_mapped_region *mr); 20 20 int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 21 21 struct io_uring_region_desc *reg, 22 22 unsigned long mmap_offset); 23 - 24 - int io_create_region_mmap_safe(struct io_ring_ctx *ctx, 25 - struct io_mapped_region *mr, 26 - struct io_uring_region_desc *reg, 27 - unsigned long mmap_offset); 28 23 29 24 static inline void *io_region_get_ptr(struct io_mapped_region *mr) 30 25 { ··· 29 34 static inline bool io_region_is_set(struct io_mapped_region *mr) 30 35 { 31 36 return !!mr->nr_pages; 37 + } 38 + 39 + static inline void io_region_publish(struct io_ring_ctx *ctx, 40 + struct io_mapped_region *src_region, 41 + struct io_mapped_region *dst_region) 42 + { 43 + /* 44 + * Once published mmap can find it without holding only the ->mmap_lock 45 + * and not ->uring_lock. 46 + */ 47 + guard(mutex)(&ctx->mmap_lock); 48 + *dst_region = *src_region; 49 + } 50 + 51 + static inline size_t io_region_size(struct io_mapped_region *mr) 52 + { 53 + return (size_t) mr->nr_pages << PAGE_SHIFT; 32 54 } 33 55 34 56 #endif
+2 -1
io_uring/msg_ring.c
··· 70 70 return target_ctx->task_complete; 71 71 } 72 72 73 - static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) 73 + static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) 74 74 { 75 + struct io_kiocb *req = tw_req.req; 75 76 struct io_ring_ctx *ctx = req->ctx; 76 77 77 78 io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
+2 -5
io_uring/net.c
··· 110 110 111 111 struct io_recvzc { 112 112 struct file *file; 113 - unsigned msg_flags; 114 113 u16 flags; 115 114 u32 len; 116 115 struct io_zcrx_ifq *ifq; ··· 1252 1253 1253 1254 zc->len = READ_ONCE(sqe->len); 1254 1255 zc->flags = READ_ONCE(sqe->ioprio); 1255 - zc->msg_flags = READ_ONCE(sqe->msg_flags); 1256 - if (zc->msg_flags) 1256 + if (READ_ONCE(sqe->msg_flags)) 1257 1257 return -EINVAL; 1258 1258 if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) 1259 1259 return -EINVAL; ··· 1281 1283 return -ENOTSOCK; 1282 1284 1283 1285 len = zc->len; 1284 - ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, 1285 - issue_flags, &zc->len); 1286 + ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len); 1286 1287 if (len && zc->len == 0) { 1287 1288 io_req_set_res(req, 0, 0); 1288 1289
+4 -3
io_uring/notif.c
··· 11 11 12 12 static const struct ubuf_info_ops io_ubuf_ops; 13 13 14 - static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) 14 + static void io_notif_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) 15 15 { 16 + struct io_kiocb *notif = tw_req.req; 16 17 struct io_notif_data *nd = io_notif_to_data(notif); 17 18 struct io_ring_ctx *ctx = notif->ctx; 18 19 ··· 35 34 } 36 35 37 36 nd = nd->next; 38 - io_req_task_complete(notif, tw); 37 + io_req_task_complete((struct io_tw_req){notif}, tw); 39 38 } while (nd); 40 39 } 41 40 ··· 93 92 prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); 94 93 prev_notif = cmd_to_io_kiocb(prev_nd); 95 94 96 - /* make sure all noifications can be finished in the same task_work */ 95 + /* make sure all notifications can be finished in the same task_work */ 97 96 if (unlikely(notif->ctx != prev_notif->ctx || 98 97 notif->tctx != prev_notif->tctx)) 99 98 return -EEXIST;
+26
io_uring/opdef.c
··· 575 575 .prep = io_pipe_prep, 576 576 .issue = io_pipe, 577 577 }, 578 + [IORING_OP_NOP128] = { 579 + .audit_skip = 1, 580 + .iopoll = 1, 581 + .is_128 = 1, 582 + .prep = io_nop_prep, 583 + .issue = io_nop, 584 + }, 585 + [IORING_OP_URING_CMD128] = { 586 + .buffer_select = 1, 587 + .needs_file = 1, 588 + .plug = 1, 589 + .iopoll = 1, 590 + .iopoll_queue = 1, 591 + .is_128 = 1, 592 + .async_size = sizeof(struct io_async_cmd), 593 + .prep = io_uring_cmd_prep, 594 + .issue = io_uring_cmd, 595 + }, 578 596 }; 579 597 580 598 const struct io_cold_def io_cold_defs[] = { ··· 842 824 }, 843 825 [IORING_OP_PIPE] = { 844 826 .name = "PIPE", 827 + }, 828 + [IORING_OP_NOP128] = { 829 + .name = "NOP128", 830 + }, 831 + [IORING_OP_URING_CMD128] = { 832 + .name = "URING_CMD128", 833 + .sqe_copy = io_uring_cmd_sqe_copy, 834 + .cleanup = io_uring_cmd_cleanup, 845 835 }, 846 836 }; 847 837
+2
io_uring/opdef.h
··· 27 27 unsigned iopoll_queue : 1; 28 28 /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ 29 29 unsigned vectored : 1; 30 + /* set to 1 if this opcode uses 128b sqes in a mixed sq */ 31 + unsigned is_128 : 1; 30 32 31 33 /* size of async data needed, if any */ 32 34 unsigned short async_size;
+7 -6
io_uring/poll.c
··· 224 224 { 225 225 int v; 226 226 227 - if (unlikely(io_should_terminate_tw(req->ctx))) 227 + if (unlikely(tw.cancel)) 228 228 return -ECANCELED; 229 229 230 230 do { ··· 310 310 return IOU_POLL_NO_ACTION; 311 311 } 312 312 313 - void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) 313 + void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw) 314 314 { 315 + struct io_kiocb *req = tw_req.req; 315 316 int ret; 316 317 317 318 ret = io_poll_check_events(req, tw); ··· 333 332 poll = io_kiocb_to_cmd(req, struct io_poll); 334 333 req->cqe.res = mangle_poll(req->cqe.res & poll->events); 335 334 } else if (ret == IOU_POLL_REISSUE) { 336 - io_req_task_submit(req, tw); 335 + io_req_task_submit(tw_req, tw); 337 336 return; 338 337 } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { 339 338 req->cqe.res = ret; ··· 341 340 } 342 341 343 342 io_req_set_res(req, req->cqe.res, 0); 344 - io_req_task_complete(req, tw); 343 + io_req_task_complete(tw_req, tw); 345 344 } else { 346 345 io_tw_lock(req->ctx, tw); 347 346 348 347 if (ret == IOU_POLL_REMOVE_POLL_USE_RES) 349 - io_req_task_complete(req, tw); 348 + io_req_task_complete(tw_req, tw); 350 349 else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) 351 - io_req_task_submit(req, tw); 350 + io_req_task_submit(tw_req, tw); 352 351 else 353 352 io_req_defer_failed(req, ret); 354 353 }
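As in msg_ring.c above and the other converted files, the task_work handler now receives the request wrapped in struct io_tw_req and reads the cancelation state straight off the token instead of probing the task state. A schematic, kernel-internal sketch of the converted callback shape (illustrative only):

static void example_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct io_kiocb *req = tw_req.req;

	/* cancelation now lives in the task_work token */
	if (unlikely(tw.cancel)) {
		io_req_set_res(req, -ECANCELED, 0);
		io_req_task_complete(tw_req, tw);
		return;
	}

	/* ... normal handling of req ... */
	io_req_task_complete(tw_req, tw);
}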
+1 -1
io_uring/poll.h
··· 46 46 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 47 47 bool cancel_all); 48 48 49 - void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); 49 + void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw);
+44 -11
io_uring/query.c
··· 4 4 5 5 #include "query.h" 6 6 #include "io_uring.h" 7 + #include "zcrx.h" 7 8 8 - #define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) 9 + union io_query_data { 10 + struct io_uring_query_opcode opcodes; 11 + struct io_uring_query_zcrx zcrx; 12 + struct io_uring_query_scq scq; 13 + }; 14 + 15 + #define IO_MAX_QUERY_SIZE sizeof(union io_query_data) 9 16 #define IO_MAX_QUERY_ENTRIES 1000 10 17 11 - static ssize_t io_query_ops(void *data) 18 + static ssize_t io_query_ops(union io_query_data *data) 12 19 { 13 - struct io_uring_query_opcode *e = data; 14 - 15 - BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); 20 + struct io_uring_query_opcode *e = &data->opcodes; 16 21 17 22 e->nr_request_opcodes = IORING_OP_LAST; 18 23 e->nr_register_opcodes = IORING_REGISTER_LAST; ··· 30 25 return sizeof(*e); 31 26 } 32 27 33 - static int io_handle_query_entry(struct io_ring_ctx *ctx, 34 - void *data, void __user *uhdr, 28 + static ssize_t io_query_zcrx(union io_query_data *data) 29 + { 30 + struct io_uring_query_zcrx *e = &data->zcrx; 31 + 32 + e->register_flags = ZCRX_REG_IMPORT; 33 + e->area_flags = IORING_ZCRX_AREA_DMABUF; 34 + e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; 35 + e->rq_hdr_size = sizeof(struct io_uring); 36 + e->rq_hdr_alignment = L1_CACHE_BYTES; 37 + e->__resv1 = 0; 38 + e->__resv2 = 0; 39 + return sizeof(*e); 40 + } 41 + 42 + static ssize_t io_query_scq(union io_query_data *data) 43 + { 44 + struct io_uring_query_scq *e = &data->scq; 45 + 46 + e->hdr_size = sizeof(struct io_rings); 47 + e->hdr_alignment = SMP_CACHE_BYTES; 48 + return sizeof(*e); 49 + } 50 + 51 + static int io_handle_query_entry(union io_query_data *data, void __user *uhdr, 35 52 u64 *next_entry) 36 53 { 37 54 struct io_uring_query_hdr hdr; ··· 80 53 case IO_URING_QUERY_OPCODES: 81 54 ret = io_query_ops(data); 82 55 break; 56 + case IO_URING_QUERY_ZCRX: 57 + ret = io_query_zcrx(data); 58 + break; 59 + case IO_URING_QUERY_SCQ: 60 + ret = io_query_scq(data); 61 + break; 83 62 } 84 63 85 64 if (ret >= 0) { ··· 106 73 return 0; 107 74 } 108 75 109 - int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 76 + int io_query(void __user *arg, unsigned nr_args) 110 77 { 111 - char entry_buffer[IO_MAX_QUERY_SIZE]; 78 + union io_query_data entry_buffer; 112 79 void __user *uhdr = arg; 113 80 int ret, nr = 0; 114 81 115 - memset(entry_buffer, 0, sizeof(entry_buffer)); 82 + memset(&entry_buffer, 0, sizeof(entry_buffer)); 116 83 117 84 if (nr_args) 118 85 return -EINVAL; ··· 120 87 while (uhdr) { 121 88 u64 next_hdr; 122 89 123 - ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); 90 + ret = io_handle_query_entry(&entry_buffer, uhdr, &next_hdr); 124 91 if (ret) 125 92 return ret; 126 93 uhdr = u64_to_user_ptr(next_hdr);
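The two new query types can be probed from userspace through IORING_REGISTER_QUERY. A rough single-entry sketch for IO_URING_QUERY_SCQ, given an existing ring fd (not part of the patch; the io_uring_query_hdr field names used here are assumptions for illustration only, check the uapi header for the real layout):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int query_scq(int ring_fd)
{
	struct io_uring_query_scq scq;
	struct io_uring_query_hdr hdr;

	memset(&scq, 0, sizeof(scq));
	memset(&hdr, 0, sizeof(hdr));
	hdr.query_op = IO_URING_QUERY_SCQ;		/* assumed field name */
	hdr.query_data = (__u64)(uintptr_t)&scq;	/* assumed field name */
	hdr.size = sizeof(scq);				/* assumed field name */
	hdr.next_entry = 0;				/* end of the chain */

	/* nr_args must be 0 for IORING_REGISTER_QUERY */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_QUERY,
		    &hdr, 0) < 0)
		return -1;

	printf("struct io_rings header: %u bytes, %u byte aligned\n",
	       (unsigned)scq.hdr_size, (unsigned)scq.hdr_alignment);
	return 0;
}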
+1 -1
io_uring/query.h
··· 4 4 5 5 #include <linux/io_uring_types.h> 6 6 7 - int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); 7 + int io_query(void __user *arg, unsigned nr_args); 8 8 9 9 #endif
+48 -57
io_uring/register.c
··· 379 379 }; 380 380 381 381 static void io_register_free_rings(struct io_ring_ctx *ctx, 382 - struct io_uring_params *p, 383 382 struct io_ring_ctx_rings *r) 384 383 { 385 - io_free_region(ctx, &r->sq_region); 386 - io_free_region(ctx, &r->ring_region); 384 + io_free_region(ctx->user, &r->sq_region); 385 + io_free_region(ctx->user, &r->ring_region); 387 386 } 388 387 389 388 #define swap_old(ctx, o, n, field) \ ··· 394 395 #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) 395 396 #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ 396 397 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ 397 - IORING_SETUP_CQE_MIXED) 398 + IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED) 398 399 399 400 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 400 401 { 402 + struct io_ctx_config config; 401 403 struct io_uring_region_desc rd; 402 404 struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; 403 - size_t size, sq_array_offset; 404 405 unsigned i, tail, old_head; 405 - struct io_uring_params p; 406 + struct io_uring_params *p = &config.p; 407 + struct io_rings_layout *rl = &config.layout; 406 408 int ret; 409 + 410 + memset(&config, 0, sizeof(config)); 407 411 408 412 /* limited to DEFER_TASKRUN for now */ 409 413 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 410 414 return -EINVAL; 411 - if (copy_from_user(&p, arg, sizeof(p))) 415 + if (copy_from_user(p, arg, sizeof(*p))) 412 416 return -EFAULT; 413 - if (p.flags & ~RESIZE_FLAGS) 417 + if (p->flags & ~RESIZE_FLAGS) 414 418 return -EINVAL; 415 419 416 420 /* properties that are always inherited */ 417 - p.flags |= (ctx->flags & COPY_FLAGS); 421 + p->flags |= (ctx->flags & COPY_FLAGS); 418 422 419 - ret = io_uring_fill_params(p.sq_entries, &p); 423 + ret = io_prepare_config(&config); 420 424 if (unlikely(ret)) 421 425 return ret; 422 426 423 - size = rings_size(p.flags, p.sq_entries, p.cq_entries, 424 - &sq_array_offset); 425 - if (size == SIZE_MAX) 426 - return -EOVERFLOW; 427 - 428 427 memset(&rd, 0, sizeof(rd)); 429 - rd.size = PAGE_ALIGN(size); 430 - if (p.flags & IORING_SETUP_NO_MMAP) { 431 - rd.user_addr = p.cq_off.user_addr; 428 + rd.size = PAGE_ALIGN(rl->rings_size); 429 + if (p->flags & IORING_SETUP_NO_MMAP) { 430 + rd.user_addr = p->cq_off.user_addr; 432 431 rd.flags |= IORING_MEM_REGION_TYPE_USER; 433 432 } 434 - ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); 435 - if (ret) { 436 - io_register_free_rings(ctx, &p, &n); 433 + ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); 434 + if (ret) 437 435 return ret; 438 - } 436 + 439 437 n.rings = io_region_get_ptr(&n.ring_region); 440 438 441 439 /* ··· 443 447 * intent... Use read/write once helpers from here on to indicate the 444 448 * shared nature of it. 
445 449 */ 446 - WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); 447 - WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); 448 - WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); 449 - WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); 450 + WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1); 451 + WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1); 452 + WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries); 453 + WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries); 450 454 451 - if (copy_to_user(arg, &p, sizeof(p))) { 452 - io_register_free_rings(ctx, &p, &n); 455 + if (copy_to_user(arg, p, sizeof(*p))) { 456 + io_register_free_rings(ctx, &n); 453 457 return -EFAULT; 454 458 } 455 459 456 - if (p.flags & IORING_SETUP_SQE128) 457 - size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); 458 - else 459 - size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); 460 - if (size == SIZE_MAX) { 461 - io_register_free_rings(ctx, &p, &n); 462 - return -EOVERFLOW; 463 - } 464 - 465 460 memset(&rd, 0, sizeof(rd)); 466 - rd.size = PAGE_ALIGN(size); 467 - if (p.flags & IORING_SETUP_NO_MMAP) { 468 - rd.user_addr = p.sq_off.user_addr; 461 + rd.size = PAGE_ALIGN(rl->sq_size); 462 + if (p->flags & IORING_SETUP_NO_MMAP) { 463 + rd.user_addr = p->sq_off.user_addr; 469 464 rd.flags |= IORING_MEM_REGION_TYPE_USER; 470 465 } 471 - ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); 466 + ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES); 472 467 if (ret) { 473 - io_register_free_rings(ctx, &p, &n); 468 + io_register_free_rings(ctx, &n); 474 469 return ret; 475 470 } 476 471 n.sq_sqes = io_region_get_ptr(&n.sq_region); ··· 497 510 */ 498 511 tail = READ_ONCE(o.rings->sq.tail); 499 512 old_head = READ_ONCE(o.rings->sq.head); 500 - if (tail - old_head > p.sq_entries) 513 + if (tail - old_head > p->sq_entries) 501 514 goto overflow; 502 515 for (i = old_head; i < tail; i++) { 503 516 unsigned src_head = i & (ctx->sq_entries - 1); 504 - unsigned dst_head = i & (p.sq_entries - 1); 517 + unsigned dst_head = i & (p->sq_entries - 1); 505 518 506 519 n.sq_sqes[dst_head] = o.sq_sqes[src_head]; 507 520 } ··· 510 523 511 524 tail = READ_ONCE(o.rings->cq.tail); 512 525 old_head = READ_ONCE(o.rings->cq.head); 513 - if (tail - old_head > p.cq_entries) { 526 + if (tail - old_head > p->cq_entries) { 514 527 overflow: 515 528 /* restore old rings, and return -EOVERFLOW via cleanup path */ 516 529 ctx->rings = o.rings; ··· 521 534 } 522 535 for (i = old_head; i < tail; i++) { 523 536 unsigned src_head = i & (ctx->cq_entries - 1); 524 - unsigned dst_head = i & (p.cq_entries - 1); 537 + unsigned dst_head = i & (p->cq_entries - 1); 525 538 526 539 n.rings->cqes[dst_head] = o.rings->cqes[src_head]; 527 540 } ··· 537 550 538 551 /* all done, store old pointers and assign new ones */ 539 552 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 540 - ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); 553 + ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset); 541 554 542 - ctx->sq_entries = p.sq_entries; 543 - ctx->cq_entries = p.cq_entries; 555 + ctx->sq_entries = p->sq_entries; 556 + ctx->cq_entries = p->cq_entries; 544 557 545 558 ctx->rings = n.rings; 546 559 ctx->sq_sqes = n.sq_sqes; ··· 551 564 out: 552 565 spin_unlock(&ctx->completion_lock); 553 566 mutex_unlock(&ctx->mmap_lock); 554 - io_register_free_rings(ctx, &p, to_free); 567 + io_register_free_rings(ctx, to_free); 555 568 556 569 if (ctx->sq_data) 557 570 io_sq_thread_unpark(ctx->sq_data); ··· 565 
578 struct io_uring_mem_region_reg reg; 566 579 struct io_uring_region_desc __user *rd_uptr; 567 580 struct io_uring_region_desc rd; 581 + struct io_mapped_region region = {}; 568 582 int ret; 569 583 570 584 if (io_region_is_set(&ctx->param_region)) ··· 589 601 !(ctx->flags & IORING_SETUP_R_DISABLED)) 590 602 return -EINVAL; 591 603 592 - ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, 593 - IORING_MAP_OFF_PARAM_REGION); 604 + ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION); 594 605 if (ret) 595 606 return ret; 596 607 if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { 597 - guard(mutex)(&ctx->mmap_lock); 598 - io_free_region(ctx, &ctx->param_region); 608 + io_free_region(ctx->user, &region); 599 609 return -EFAULT; 600 610 } 601 611 602 612 if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { 603 - ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); 613 + ctx->cq_wait_arg = io_region_get_ptr(&region); 604 614 ctx->cq_wait_size = rd.size; 605 615 } 616 + 617 + io_region_publish(ctx, &region, &ctx->param_region); 606 618 return 0; 607 619 } 608 620 ··· 813 825 ret = io_register_mem_region(ctx, arg); 814 826 break; 815 827 case IORING_REGISTER_QUERY: 816 - ret = io_query(ctx, arg, nr_args); 828 + ret = io_query(arg, nr_args); 829 + break; 830 + case IORING_REGISTER_ZCRX_CTRL: 831 + ret = io_zcrx_ctrl(ctx, arg, nr_args); 817 832 break; 818 833 default: 819 834 ret = -EINVAL; ··· 888 897 case IORING_REGISTER_SEND_MSG_RING: 889 898 return io_uring_register_send_msg_ring(arg, nr_args); 890 899 case IORING_REGISTER_QUERY: 891 - return io_query(NULL, arg, nr_args); 900 + return io_query(arg, nr_args); 892 901 } 893 902 return -EINVAL; 894 903 }
+16 -14
io_uring/rsrc.c
··· 56 56 return 0; 57 57 } 58 58 59 - void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 59 + void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, 60 + unsigned long nr_pages) 60 61 { 61 - if (ctx->user) 62 - __io_unaccount_mem(ctx->user, nr_pages); 62 + if (user) 63 + __io_unaccount_mem(user, nr_pages); 63 64 64 - if (ctx->mm_account) 65 - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 65 + if (mm_account) 66 + atomic64_sub(nr_pages, &mm_account->pinned_vm); 66 67 } 67 68 68 - int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 69 + int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, 70 + unsigned long nr_pages) 69 71 { 70 72 int ret; 71 73 72 - if (ctx->user) { 73 - ret = __io_account_mem(ctx->user, nr_pages); 74 + if (user) { 75 + ret = __io_account_mem(user, nr_pages); 74 76 if (ret) 75 77 return ret; 76 78 } 77 79 78 - if (ctx->mm_account) 79 - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 80 + if (mm_account) 81 + atomic64_add(nr_pages, &mm_account->pinned_vm); 80 82 81 83 return 0; 82 84 } ··· 147 145 } 148 146 149 147 if (imu->acct_pages) 150 - io_unaccount_mem(ctx, imu->acct_pages); 148 + io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages); 151 149 imu->release(imu->priv); 152 150 io_free_imu(ctx, imu); 153 151 } ··· 456 454 return -ENXIO; 457 455 458 456 for (done = 0; done < up->nr_args; done++) { 459 - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { 457 + if (get_user(fd, &fds[done])) { 460 458 ret = -EFAULT; 461 459 break; 462 460 } ··· 470 468 IORING_FILE_INDEX_ALLOC); 471 469 if (ret < 0) 472 470 break; 473 - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { 471 + if (put_user(ret, &fds[done])) { 474 472 __io_close_fixed(req->ctx, issue_flags, ret); 475 473 ret = -EFAULT; 476 474 break; ··· 686 684 if (!imu->acct_pages) 687 685 return 0; 688 686 689 - ret = io_account_mem(ctx, imu->acct_pages); 687 + ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages); 690 688 if (ret) 691 689 imu->acct_pages = 0; 692 690 return ret;
+4 -2
io_uring/rsrc.h
··· 120 120 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 121 121 122 122 int __io_account_mem(struct user_struct *user, unsigned long nr_pages); 123 - int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); 124 - void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); 123 + int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, 124 + unsigned long nr_pages); 125 + void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, 126 + unsigned long nr_pages); 125 127 126 128 static inline void __io_unaccount_mem(struct user_struct *user, 127 129 unsigned long nr_pages)
+7 -5
io_uring/rw.c
··· 186 186 * This is really a bug in the core code that does this, any issue 187 187 * path should assume that a successful (or -EIOCBQUEUED) return can 188 188 * mean that the underlying data can be gone at any time. But that 189 - * should be fixed seperately, and then this check could be killed. 189 + * should be fixed separately, and then this check could be killed. 190 190 */ 191 191 if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { 192 192 req->flags &= ~REQ_F_NEED_CLEANUP; ··· 348 348 349 349 /* 350 350 * Have to do this validation here, as this is in io_read() rw->len 351 - * might have chanaged due to buffer selection 351 + * might have changed due to buffer selection 352 352 */ 353 353 return io_iov_buffer_select_prep(req); 354 354 } ··· 566 566 return res; 567 567 } 568 568 569 - void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) 569 + void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw) 570 570 { 571 + struct io_kiocb *req = tw_req.req; 572 + 571 573 io_req_io_end(req); 572 574 573 575 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) 574 576 req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); 575 577 576 578 io_req_rw_cleanup(req, 0); 577 - io_req_task_complete(req, tw); 579 + io_req_task_complete(tw_req, tw); 578 580 } 579 581 580 582 static void io_complete_rw(struct kiocb *kiocb, long res) ··· 1012 1010 iov_iter_restore(&io->iter, &io->iter_state); 1013 1011 } while (ret > 0); 1014 1012 done: 1015 - /* it's faster to check here then delegate to kfree */ 1013 + /* it's faster to check here than delegate to kfree */ 1016 1014 return ret; 1017 1015 } 1018 1016
+1 -1
io_uring/rw.h
··· 46 46 int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); 47 47 void io_readv_writev_cleanup(struct io_kiocb *req); 48 48 void io_rw_fail(struct io_kiocb *req); 49 - void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); 49 + void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw); 50 50 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 51 51 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); 52 52 void io_rw_cache_free(const void *entry);
-18
io_uring/slist.h
··· 67 67 last->next = NULL; 68 68 } 69 69 70 - static inline void __wq_list_splice(struct io_wq_work_list *list, 71 - struct io_wq_work_node *to) 72 - { 73 - list->last->next = to->next; 74 - to->next = list->first; 75 - INIT_WQ_LIST(list); 76 - } 77 - 78 - static inline bool wq_list_splice(struct io_wq_work_list *list, 79 - struct io_wq_work_node *to) 80 - { 81 - if (!wq_list_empty(list)) { 82 - __wq_list_splice(list, to); 83 - return true; 84 - } 85 - return false; 86 - } 87 - 88 70 static inline void wq_stack_add_head(struct io_wq_work_node *node, 89 71 struct io_wq_work_node *stack) 90 72 {
+1
io_uring/sqpoll.c
··· 19 19 #include "io_uring.h" 20 20 #include "tctx.h" 21 21 #include "napi.h" 22 + #include "cancel.h" 22 23 #include "sqpoll.h" 23 24 24 25 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
+12 -8
io_uring/timeout.c
··· 68 68 69 69 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); 70 70 71 - static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) 71 + static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw) 72 72 { 73 + struct io_kiocb *req = tw_req.req; 73 74 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 74 75 struct io_timeout_data *data = req->async_data; 75 76 struct io_ring_ctx *ctx = req->ctx; ··· 86 85 } 87 86 } 88 87 89 - io_req_task_complete(req, tw); 88 + io_req_task_complete(tw_req, tw); 90 89 } 91 90 92 91 static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) ··· 158 157 io_flush_killed_timeouts(&list, 0); 159 158 } 160 159 161 - static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) 160 + static void io_req_tw_fail_links(struct io_tw_req tw_req, io_tw_token_t tw) 162 161 { 162 + struct io_kiocb *link = tw_req.req; 163 + 163 164 io_tw_lock(link->ctx, tw); 164 165 while (link) { 165 166 struct io_kiocb *nxt = link->link; ··· 171 168 res = link->cqe.res; 172 169 link->link = NULL; 173 170 io_req_set_res(link, res, 0); 174 - io_req_task_complete(link, tw); 171 + io_req_task_complete((struct io_tw_req){link}, tw); 175 172 link = nxt; 176 173 } 177 174 } ··· 320 317 return 0; 321 318 } 322 319 323 - static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) 320 + static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) 324 321 { 322 + struct io_kiocb *req = tw_req.req; 325 323 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 326 324 struct io_kiocb *prev = timeout->prev; 327 325 int ret; 328 326 329 327 if (prev) { 330 - if (!io_should_terminate_tw(req->ctx)) { 328 + if (!tw.cancel) { 331 329 struct io_cancel_data cd = { 332 330 .ctx = req->ctx, 333 331 .data = prev->cqe.user_data, ··· 339 335 ret = -ECANCELED; 340 336 } 341 337 io_req_set_res(req, ret ?: -ETIME, 0); 342 - io_req_task_complete(req, tw); 338 + io_req_task_complete(tw_req, tw); 343 339 io_put_req(prev); 344 340 } else { 345 341 io_req_set_res(req, -ETIME, 0); 346 - io_req_task_complete(req, tw); 342 + io_req_task_complete(tw_req, tw); 347 343 } 348 344 } 349 345
+17 -17
io_uring/uring_cmd.c
··· 113 113 } 114 114 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); 115 115 116 - static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) 117 - { 118 - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 119 - unsigned int flags = IO_URING_F_COMPLETE_DEFER; 120 - 121 - if (io_should_terminate_tw(req->ctx)) 122 - flags |= IO_URING_F_TASK_DEAD; 123 - 124 - /* task_work executor checks the deffered list completion */ 125 - ioucmd->task_work_cb(ioucmd, flags); 126 - } 127 - 128 116 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 129 - io_uring_cmd_tw_t task_work_cb, 117 + io_req_tw_func_t task_work_cb, 130 118 unsigned flags) 131 119 { 132 120 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ··· 122 134 if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) 123 135 return; 124 136 125 - ioucmd->task_work_cb = task_work_cb; 126 - req->io_task_work.func = io_uring_cmd_work; 137 + req->io_task_work.func = task_work_cb; 127 138 __io_req_task_work_add(req, flags); 128 139 } 129 140 EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); ··· 203 216 return 0; 204 217 } 205 218 219 + /* 220 + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each 221 + * slot. 222 + */ 223 + static inline size_t uring_sqe_size(struct io_kiocb *req) 224 + { 225 + if (req->ctx->flags & IORING_SETUP_SQE128 || 226 + req->opcode == IORING_OP_URING_CMD128) 227 + return 2 * sizeof(struct io_uring_sqe); 228 + return sizeof(struct io_uring_sqe); 229 + } 230 + 206 231 void io_uring_cmd_sqe_copy(struct io_kiocb *req) 207 232 { 208 233 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); ··· 223 224 /* Should not happen, as REQ_F_SQE_COPIED covers this */ 224 225 if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes)) 225 226 return; 226 - memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); 227 + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req)); 227 228 ioucmd->sqe = ac->sqes; 228 229 } 229 230 ··· 241 242 if (ret) 242 243 return ret; 243 244 244 - if (ctx->flags & IORING_SETUP_SQE128) 245 + if (ctx->flags & IORING_SETUP_SQE128 || 246 + req->opcode == IORING_OP_URING_CMD128) 245 247 issue_flags |= IO_URING_F_SQE128; 246 248 if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)) 247 249 issue_flags |= IO_URING_F_CQE32;
+35 -13
io_uring/waitid.c
··· 16 16 #include "waitid.h" 17 17 #include "../kernel/exit.h" 18 18 19 - static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); 19 + static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw); 20 20 21 21 #define IO_WAITID_CANCEL_FLAG BIT(31) 22 22 #define IO_WAITID_REF_MASK GENMASK(30, 0) ··· 109 109 return ret; 110 110 } 111 111 112 + static void io_waitid_remove_wq(struct io_kiocb *req) 113 + { 114 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 115 + struct wait_queue_head *head; 116 + 117 + head = READ_ONCE(iw->head); 118 + if (head) { 119 + struct io_waitid_async *iwa = req->async_data; 120 + 121 + iw->head = NULL; 122 + spin_lock_irq(&head->lock); 123 + list_del_init(&iwa->wo.child_wait.entry); 124 + spin_unlock_irq(&head->lock); 125 + } 126 + } 127 + 112 128 static void io_waitid_complete(struct io_kiocb *req, int ret) 113 129 { 114 130 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); ··· 135 119 lockdep_assert_held(&req->ctx->uring_lock); 136 120 137 121 hlist_del_init(&req->hash_node); 122 + io_waitid_remove_wq(req); 138 123 139 124 ret = io_waitid_finish(req, ret); 140 125 if (ret < 0) ··· 146 129 static bool __io_waitid_cancel(struct io_kiocb *req) 147 130 { 148 131 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 149 - struct io_waitid_async *iwa = req->async_data; 132 + 133 + lockdep_assert_held(&req->ctx->uring_lock); 150 134 151 135 /* 152 136 * Mark us canceled regardless of ownership. This will prevent a ··· 159 141 if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) 160 142 return false; 161 143 162 - spin_lock_irq(&iw->head->lock); 163 - list_del_init(&iwa->wo.child_wait.entry); 164 - spin_unlock_irq(&iw->head->lock); 165 144 io_waitid_complete(req, -ECANCELED); 166 145 io_req_queue_tw_complete(req, -ECANCELED); 167 146 return true; ··· 179 164 static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) 180 165 { 181 166 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 182 - struct io_waitid_async *iwa = req->async_data; 183 167 184 168 if (!atomic_sub_return(1, &iw->refs)) 185 169 return false; 170 + 171 + io_waitid_remove_wq(req); 186 172 187 173 /* 188 174 * Wakeup triggered, racing with us. 
It was prevented from ··· 191 175 */ 192 176 req->io_task_work.func = io_waitid_cb; 193 177 io_req_task_work_add(req); 194 - remove_wait_queue(iw->head, &iwa->wo.child_wait); 195 178 return true; 196 179 } 197 180 198 - static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) 181 + static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw) 199 182 { 183 + struct io_kiocb *req = tw_req.req; 200 184 struct io_waitid_async *iwa = req->async_data; 201 185 struct io_ring_ctx *ctx = req->ctx; 202 186 int ret; ··· 225 209 io_waitid_drop_issue_ref(req); 226 210 return; 227 211 } 228 - 229 - remove_wait_queue(iw->head, &iwa->wo.child_wait); 212 + /* fall through to complete, will kill waitqueue */ 230 213 } 231 214 } 232 215 233 216 io_waitid_complete(req, ret); 234 - io_req_task_complete(req, tw); 217 + io_req_task_complete(tw_req, tw); 235 218 } 236 219 237 220 static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, ··· 246 231 return 0; 247 232 248 233 list_del_init(&wait->entry); 234 + iw->head = NULL; 249 235 250 236 /* cancel is in progress */ 251 237 if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) ··· 273 257 iw->which = READ_ONCE(sqe->len); 274 258 iw->upid = READ_ONCE(sqe->fd); 275 259 iw->options = READ_ONCE(sqe->file_index); 260 + iw->head = NULL; 276 261 iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 277 262 return 0; 278 263 } ··· 304 287 * callback. 305 288 */ 306 289 io_ring_submit_lock(ctx, issue_flags); 290 + 291 + /* 292 + * iw->head is valid under the ring lock, and as long as the request 293 + * is on the waitid_list where cancelations may find it. 294 + */ 295 + iw->head = &current->signal->wait_chldexit; 307 296 hlist_add_head(&req->hash_node, &ctx->waitid_list); 308 297 309 298 init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); 310 299 iwa->wo.child_wait.private = req->tctx->task; 311 - iw->head = &current->signal->wait_chldexit; 312 300 add_wait_queue(iw->head, &iwa->wo.child_wait); 313 301 314 302 ret = __do_wait(&iwa->wo); ··· 336 314 } 337 315 338 316 hlist_del_init(&req->hash_node); 339 - remove_wait_queue(iw->head, &iwa->wo.child_wait); 317 + io_waitid_remove_wq(req); 340 318 ret = io_waitid_finish(req, ret); 341 319 342 320 io_ring_submit_unlock(ctx, issue_flags);
+324 -97
io_uring/zcrx.c
··· 8 8 #include <linux/netdevice.h> 9 9 #include <linux/rtnetlink.h> 10 10 #include <linux/skbuff_ref.h> 11 + #include <linux/anon_inodes.h> 11 12 12 13 #include <net/page_pool/helpers.h> 13 14 #include <net/page_pool/memory_provider.h> ··· 171 170 if (folio == last_folio) 172 171 continue; 173 172 last_folio = folio; 174 - res += 1UL << folio_order(folio); 173 + res += folio_nr_pages(folio); 175 174 } 176 175 return res; 177 176 } ··· 201 200 } 202 201 203 202 mem->account_pages = io_count_account_pages(pages, nr_pages); 204 - ret = io_account_mem(ifq->ctx, mem->account_pages); 203 + ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); 205 204 if (ret < 0) 206 205 mem->account_pages = 0; 207 206 ··· 345 344 atomic_inc(io_get_user_counter(niov)); 346 345 } 347 346 348 - static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, 347 + static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets) 348 + { 349 + offsets->head = offsetof(struct io_uring, head); 350 + offsets->tail = offsetof(struct io_uring, tail); 351 + offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 352 + } 353 + 354 + static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, 355 + struct io_zcrx_ifq *ifq, 349 356 struct io_uring_zcrx_ifq_reg *reg, 350 357 struct io_uring_region_desc *rd, 351 358 u32 id) ··· 363 354 void *ptr; 364 355 int ret; 365 356 366 - off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 357 + io_fill_zcrx_offsets(&reg->offsets); 358 + off = reg->offsets.rqes; 367 359 size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 368 360 if (size > rd->size) 369 361 return -EINVAL; ··· 372 362 mmap_offset = IORING_MAP_OFF_ZCRX_REGION; 373 363 mmap_offset += id << IORING_OFF_PBUF_SHIFT; 374 364 375 - ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); 365 + ret = io_create_region(ctx, &ifq->region, rd, mmap_offset); 376 366 if (ret < 0) 377 367 return ret; 378 368 ··· 380 370 ifq->rq_ring = (struct io_uring *)ptr; 381 371 ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 382 372 383 - reg->offsets.head = offsetof(struct io_uring, head); 384 - reg->offsets.tail = offsetof(struct io_uring, tail); 385 - reg->offsets.rqes = off; 386 373 return 0; 387 374 } 388 375 389 376 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) 390 377 { 391 - io_free_region(ifq->ctx, &ifq->region); 378 + io_free_region(ifq->user, &ifq->region); 392 379 ifq->rq_ring = NULL; 393 380 ifq->rqes = NULL; 394 381 } 395 382 396 - static void io_zcrx_free_area(struct io_zcrx_area *area) 383 + static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, 384 + struct io_zcrx_area *area) 397 385 { 398 - io_zcrx_unmap_area(area->ifq, area); 386 + io_zcrx_unmap_area(ifq, area); 399 387 io_release_area_mem(&area->mem); 400 388 401 389 if (area->mem.account_pages) 402 - io_unaccount_mem(area->ifq->ctx, area->mem.account_pages); 390 + io_unaccount_mem(ifq->user, ifq->mm_account, 391 + area->mem.account_pages); 403 392 404 393 kvfree(area->freelist); 405 394 kvfree(area->nia.niovs); ··· 472 463 return 0; 473 464 err: 474 465 if (area) 475 - io_zcrx_free_area(area); 466 + io_zcrx_free_area(ifq, area); 476 467 return ret; 477 468 } 478 469 ··· 485 476 return NULL; 486 477 487 478 ifq->if_rxq = -1; 488 - ifq->ctx = ctx; 489 479 spin_lock_init(&ifq->rq_lock); 490 480 mutex_init(&ifq->pp_lock); 481 + refcount_set(&ifq->refs, 1); 482 + refcount_set(&ifq->user_refs, 1); 491 483 return ifq; 492 484 } 493 485 ··· 532 522 io_close_queue(ifq); 533 523 534 524 if (ifq->area) 535 - 
io_zcrx_free_area(ifq->area); 525 + io_zcrx_free_area(ifq, ifq->area); 526 + free_uid(ifq->user); 527 + if (ifq->mm_account) 528 + mmdrop(ifq->mm_account); 536 529 if (ifq->dev) 537 530 put_device(ifq->dev); 538 531 539 532 io_free_rbuf_ring(ifq); 540 533 mutex_destroy(&ifq->pp_lock); 541 534 kfree(ifq); 535 + } 536 + 537 + static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq) 538 + { 539 + if (refcount_dec_and_test(&ifq->refs)) 540 + io_zcrx_ifq_free(ifq); 541 + } 542 + 543 + static void io_zcrx_return_niov_freelist(struct net_iov *niov) 544 + { 545 + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 546 + 547 + spin_lock_bh(&area->freelist_lock); 548 + area->freelist[area->free_count++] = net_iov_idx(niov); 549 + spin_unlock_bh(&area->freelist_lock); 550 + } 551 + 552 + static void io_zcrx_return_niov(struct net_iov *niov) 553 + { 554 + netmem_ref netmem = net_iov_to_netmem(niov); 555 + 556 + if (!niov->desc.pp) { 557 + /* copy fallback allocated niovs */ 558 + io_zcrx_return_niov_freelist(niov); 559 + return; 560 + } 561 + page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); 562 + } 563 + 564 + static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) 565 + { 566 + struct io_zcrx_area *area = ifq->area; 567 + int i; 568 + 569 + if (!area) 570 + return; 571 + 572 + /* Reclaim back all buffers given to the user space. */ 573 + for (i = 0; i < area->nia.num_niovs; i++) { 574 + struct net_iov *niov = &area->nia.niovs[i]; 575 + int nr; 576 + 577 + if (!atomic_read(io_get_user_counter(niov))) 578 + continue; 579 + nr = atomic_xchg(io_get_user_counter(niov), 0); 580 + if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) 581 + io_zcrx_return_niov(niov); 582 + } 583 + } 584 + 585 + static void zcrx_unregister(struct io_zcrx_ifq *ifq) 586 + { 587 + if (refcount_dec_and_test(&ifq->user_refs)) { 588 + io_close_queue(ifq); 589 + io_zcrx_scrub(ifq); 590 + } 591 + io_put_zcrx_ifq(ifq); 542 592 } 543 593 544 594 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, ··· 609 539 lockdep_assert_held(&ctx->mmap_lock); 610 540 611 541 return ifq ? 
&ifq->region : NULL; 542 + } 543 + 544 + static int zcrx_box_release(struct inode *inode, struct file *file) 545 + { 546 + struct io_zcrx_ifq *ifq = file->private_data; 547 + 548 + if (WARN_ON_ONCE(!ifq)) 549 + return -EFAULT; 550 + zcrx_unregister(ifq); 551 + return 0; 552 + } 553 + 554 + static const struct file_operations zcrx_box_fops = { 555 + .owner = THIS_MODULE, 556 + .release = zcrx_box_release, 557 + }; 558 + 559 + static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, 560 + struct zcrx_ctrl *ctrl, void __user *arg) 561 + { 562 + struct zcrx_ctrl_export *ce = &ctrl->zc_export; 563 + struct file *file; 564 + int fd = -1; 565 + 566 + if (!mem_is_zero(ce, sizeof(*ce))) 567 + return -EINVAL; 568 + fd = get_unused_fd_flags(O_CLOEXEC); 569 + if (fd < 0) 570 + return fd; 571 + 572 + ce->zcrx_fd = fd; 573 + if (copy_to_user(arg, ctrl, sizeof(*ctrl))) { 574 + put_unused_fd(fd); 575 + return -EFAULT; 576 + } 577 + 578 + refcount_inc(&ifq->refs); 579 + refcount_inc(&ifq->user_refs); 580 + 581 + file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, 582 + ifq, O_CLOEXEC, NULL); 583 + if (IS_ERR(file)) { 584 + put_unused_fd(fd); 585 + zcrx_unregister(ifq); 586 + return PTR_ERR(file); 587 + } 588 + 589 + fd_install(fd, file); 590 + return 0; 591 + } 592 + 593 + static int import_zcrx(struct io_ring_ctx *ctx, 594 + struct io_uring_zcrx_ifq_reg __user *arg, 595 + struct io_uring_zcrx_ifq_reg *reg) 596 + { 597 + struct io_zcrx_ifq *ifq; 598 + struct file *file; 599 + int fd, ret; 600 + u32 id; 601 + 602 + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 603 + return -EINVAL; 604 + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 605 + return -EINVAL; 606 + if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) 607 + return -EINVAL; 608 + 609 + fd = reg->if_idx; 610 + CLASS(fd, f)(fd); 611 + if (fd_empty(f)) 612 + return -EBADF; 613 + 614 + file = fd_file(f); 615 + if (file->f_op != &zcrx_box_fops || !file->private_data) 616 + return -EBADF; 617 + 618 + ifq = file->private_data; 619 + refcount_inc(&ifq->refs); 620 + refcount_inc(&ifq->user_refs); 621 + 622 + scoped_guard(mutex, &ctx->mmap_lock) { 623 + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); 624 + if (ret) 625 + goto err; 626 + } 627 + 628 + reg->zcrx_id = id; 629 + io_fill_zcrx_offsets(&reg->offsets); 630 + if (copy_to_user(arg, reg, sizeof(*reg))) { 631 + ret = -EFAULT; 632 + goto err_xa_erase; 633 + } 634 + 635 + scoped_guard(mutex, &ctx->mmap_lock) { 636 + ret = -ENOMEM; 637 + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) 638 + goto err_xa_erase; 639 + } 640 + 641 + return 0; 642 + err_xa_erase: 643 + scoped_guard(mutex, &ctx->mmap_lock) 644 + xa_erase(&ctx->zcrx_ctxs, id); 645 + err: 646 + zcrx_unregister(ifq); 647 + return ret; 612 648 } 613 649 614 650 int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ··· 742 566 return -EINVAL; 743 567 if (copy_from_user(&reg, arg, sizeof(reg))) 744 568 return -EFAULT; 745 - if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 746 - return -EFAULT; 747 569 if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || 748 570 reg.__resv2 || reg.zcrx_id) 749 571 return -EINVAL; 572 + if (reg.flags & ZCRX_REG_IMPORT) 573 + return import_zcrx(ctx, arg, &reg); 574 + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 575 + return -EFAULT; 750 576 if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) 751 577 return -EINVAL; 752 578 if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { ··· 764 586 ifq = 
io_zcrx_ifq_alloc(ctx); 765 587 if (!ifq) 766 588 return -ENOMEM; 589 + 590 + if (ctx->user) { 591 + get_uid(ctx->user); 592 + ifq->user = ctx->user; 593 + } 594 + if (ctx->mm_account) { 595 + mmgrab(ctx->mm_account); 596 + ifq->mm_account = ctx->mm_account; 597 + } 767 598 ifq->rq_entries = reg.rq_entries; 768 599 769 600 scoped_guard(mutex, &ctx->mmap_lock) { ··· 782 595 goto ifq_free; 783 596 } 784 597 785 - ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id); 598 + ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id); 786 599 if (ret) 787 600 goto err; 788 601 789 - ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, 790 - &ifq->netdev_tracker, GFP_KERNEL); 602 + ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx); 791 603 if (!ifq->netdev) { 792 604 ret = -ENODEV; 793 605 goto err; 794 606 } 607 + netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); 795 608 796 609 ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); 797 610 if (!ifq->dev) { 798 611 ret = -EOPNOTSUPP; 799 - goto err; 612 + goto netdev_put_unlock; 800 613 } 801 614 get_device(ifq->dev); 802 615 803 616 ret = io_zcrx_create_area(ifq, &area); 804 617 if (ret) 805 - goto err; 618 + goto netdev_put_unlock; 806 619 807 620 mp_param.mp_ops = &io_uring_pp_zc_ops; 808 621 mp_param.mp_priv = ifq; 809 - ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); 622 + ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); 810 623 if (ret) 811 - goto err; 624 + goto netdev_put_unlock; 625 + netdev_unlock(ifq->netdev); 812 626 ifq->if_rxq = reg.if_rxq; 813 627 814 628 reg.zcrx_id = id; ··· 828 640 goto err; 829 641 } 830 642 return 0; 643 + netdev_put_unlock: 644 + netdev_put(ifq->netdev, &ifq->netdev_tracker); 645 + netdev_unlock(ifq->netdev); 831 646 err: 832 647 scoped_guard(mutex, &ctx->mmap_lock) 833 648 xa_erase(&ctx->zcrx_ctxs, id); 834 649 ifq_free: 835 650 io_zcrx_ifq_free(ifq); 836 651 return ret; 652 + } 653 + 654 + static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) 655 + { 656 + unsigned niov_idx; 657 + 658 + lockdep_assert_held(&area->freelist_lock); 659 + 660 + niov_idx = area->freelist[--area->free_count]; 661 + return &area->nia.niovs[niov_idx]; 837 662 } 838 663 839 664 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) ··· 865 664 } 866 665 if (!ifq) 867 666 break; 868 - io_zcrx_ifq_free(ifq); 667 + zcrx_unregister(ifq); 869 668 } 870 669 871 670 xa_destroy(&ctx->zcrx_ctxs); 872 - } 873 - 874 - static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) 875 - { 876 - unsigned niov_idx; 877 - 878 - lockdep_assert_held(&area->freelist_lock); 879 - 880 - niov_idx = area->freelist[--area->free_count]; 881 - return &area->nia.niovs[niov_idx]; 882 - } 883 - 884 - static void io_zcrx_return_niov_freelist(struct net_iov *niov) 885 - { 886 - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 887 - 888 - spin_lock_bh(&area->freelist_lock); 889 - area->freelist[area->free_count++] = net_iov_idx(niov); 890 - spin_unlock_bh(&area->freelist_lock); 891 - } 892 - 893 - static void io_zcrx_return_niov(struct net_iov *niov) 894 - { 895 - netmem_ref netmem = net_iov_to_netmem(niov); 896 - 897 - if (!niov->pp) { 898 - /* copy fallback allocated niovs */ 899 - io_zcrx_return_niov_freelist(niov); 900 - return; 901 - } 902 - page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false); 903 - } 904 - 905 - static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) 906 - { 907 - struct io_zcrx_area *area = ifq->area; 908 - int i; 909 - 
910 - if (!area) 911 - return; 912 - 913 - /* Reclaim back all buffers given to the user space. */ 914 - for (i = 0; i < area->nia.num_niovs; i++) { 915 - struct net_iov *niov = &area->nia.niovs[i]; 916 - int nr; 917 - 918 - if (!atomic_read(io_get_user_counter(niov))) 919 - continue; 920 - nr = atomic_xchg(io_get_user_counter(niov), 0); 921 - if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) 922 - io_zcrx_return_niov(niov); 923 - } 924 - } 925 - 926 - void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) 927 - { 928 - struct io_zcrx_ifq *ifq; 929 - unsigned long index; 930 - 931 - lockdep_assert_held(&ctx->uring_lock); 932 - 933 - xa_for_each(&ctx->zcrx_ctxs, index, ifq) { 934 - io_zcrx_scrub(ifq); 935 - io_close_queue(ifq); 936 - } 937 671 } 938 672 939 673 static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) ··· 936 800 if (!page_pool_unref_and_test(netmem)) 937 801 continue; 938 802 939 - if (unlikely(niov->pp != pp)) { 803 + if (unlikely(niov->desc.pp != pp)) { 940 804 io_zcrx_return_niov(niov); 941 805 continue; 942 806 } ··· 1016 880 if (ret) 1017 881 return ret; 1018 882 1019 - percpu_ref_get(&ifq->ctx->refs); 883 + refcount_inc(&ifq->refs); 1020 884 return 0; 1021 885 } 1022 886 1023 887 static void io_pp_zc_destroy(struct page_pool *pp) 1024 888 { 1025 - struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1026 - 1027 - percpu_ref_put(&ifq->ctx->refs); 889 + io_put_zcrx_ifq(io_pp_to_ifq(pp)); 1028 890 } 1029 891 1030 892 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, ··· 1061 927 .nl_fill = io_pp_nl_fill, 1062 928 .uninstall = io_pp_uninstall, 1063 929 }; 930 + 931 + static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, 932 + struct io_zcrx_ifq *zcrx) 933 + { 934 + unsigned int mask = zcrx->rq_entries - 1; 935 + unsigned int i; 936 + 937 + guard(spinlock_bh)(&zcrx->rq_lock); 938 + 939 + nr = min(nr, io_zcrx_rqring_entries(zcrx)); 940 + for (i = 0; i < nr; i++) { 941 + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask); 942 + struct net_iov *niov; 943 + 944 + if (!io_parse_rqe(rqe, zcrx, &niov)) 945 + break; 946 + netmem_array[i] = net_iov_to_netmem(niov); 947 + } 948 + 949 + smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head); 950 + return i; 951 + } 952 + 953 + #define ZCRX_FLUSH_BATCH 32 954 + 955 + static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr) 956 + { 957 + unsigned i; 958 + 959 + for (i = 0; i < nr; i++) { 960 + netmem_ref netmem = netmems[i]; 961 + struct net_iov *niov = netmem_to_net_iov(netmem); 962 + 963 + if (!io_zcrx_put_niov_uref(niov)) 964 + continue; 965 + if (!page_pool_unref_and_test(netmem)) 966 + continue; 967 + io_zcrx_return_niov(niov); 968 + } 969 + } 970 + 971 + static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, 972 + struct zcrx_ctrl *ctrl) 973 + { 974 + struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush; 975 + netmem_ref netmems[ZCRX_FLUSH_BATCH]; 976 + unsigned total = 0; 977 + unsigned nr; 978 + 979 + if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv))) 980 + return -EINVAL; 981 + 982 + do { 983 + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx); 984 + 985 + zcrx_return_buffers(netmems, nr); 986 + total += nr; 987 + 988 + if (fatal_signal_pending(current)) 989 + break; 990 + cond_resched(); 991 + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries); 992 + 993 + return 0; 994 + } 995 + 996 + int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 997 + { 998 + struct zcrx_ctrl ctrl; 999 + struct io_zcrx_ifq *zcrx; 1000 
+ 1001 + if (nr_args) 1002 + return -EINVAL; 1003 + if (copy_from_user(&ctrl, arg, sizeof(ctrl))) 1004 + return -EFAULT; 1005 + if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv))) 1006 + return -EFAULT; 1007 + 1008 + zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id); 1009 + if (!zcrx) 1010 + return -ENXIO; 1011 + 1012 + switch (ctrl.op) { 1013 + case ZCRX_CTRL_FLUSH_RQ: 1014 + return zcrx_flush_rq(ctx, zcrx, &ctrl); 1015 + case ZCRX_CTRL_EXPORT: 1016 + return zcrx_export(ctx, zcrx, &ctrl, arg); 1017 + } 1018 + 1019 + return -EOPNOTSUPP; 1020 + } 1064 1021 1065 1022 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, 1066 1023 struct io_zcrx_ifq *ifq, int off, int len) ··· 1294 1069 const skb_frag_t *frag, int off, int len) 1295 1070 { 1296 1071 struct net_iov *niov; 1072 + struct page_pool *pp; 1297 1073 1298 1074 if (unlikely(!skb_frag_is_net_iov(frag))) 1299 1075 return io_zcrx_copy_frag(req, ifq, frag, off, len); 1300 1076 1301 1077 niov = netmem_to_net_iov(frag->netmem); 1302 - if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || 1303 - io_pp_to_ifq(niov->pp) != ifq) 1078 + pp = niov->desc.pp; 1079 + 1080 + if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq) 1304 1081 return -EFAULT; 1305 1082 1306 1083 if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
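The new io_zcrx_ctrl() entry point above is driven from userspace with a struct zcrx_ctrl payload and nr_args == 0, presumably through an io_uring_register(2) opcode added elsewhere in this series, and zcrx_export()/import_zcrx() let a second ring attach to an already-registered interface queue. The sketch below only illustrates that call sequence; the opcode value, the ZCRX_CTRL_EXPORT value and the struct layout are placeholders reusing names from this diff, not the final uapi definitions.

/*
 * Hypothetical userspace sketch of the export flow above.  Constants and
 * struct layout are placeholders; the real definitions belong in the
 * updated include/uapi/linux/io_uring.h.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IORING_REGISTER_ZCRX_CTRL	0xffff	/* placeholder opcode value */
#define ZCRX_CTRL_EXPORT		1	/* placeholder op value */

struct zcrx_ctrl {				/* assumed layout, names from the diff */
	uint32_t zcrx_id;			/* id returned when the ifq was registered */
	uint32_t op;				/* ZCRX_CTRL_* operation */
	union {
		struct { uint32_t zcrx_fd; } zc_export;
	};
	uint64_t __resv[2];			/* must remain zero */
};

/* Ask ring_fd to export the ifq registered as zcrx_id; returns a zcrx fd. */
static int zcrx_export_fd(int ring_fd, uint32_t zcrx_id)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_EXPORT;
	/* io_zcrx_ctrl() above rejects any non-zero nr_args */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZCRX_CTRL, &ctrl, 0) < 0)
		return -1;
	/* zcrx_export() writes the new fd back before installing it */
	return ctrl.zc_export.zcrx_fd;
}

A second ring would then import the same ifq by setting the import flag (ZCRX_REG_IMPORT) in struct io_uring_zcrx_ifq_reg, passing the exported fd in if_idx and leaving if_rxq, rq_entries, area_ptr and region_ptr zero, which is exactly what import_zcrx() validates; per the checks above, the importing ring must also be set up with IORING_SETUP_DEFER_TASKRUN and CQE32 or mixed CQEs.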
+11 -5
io_uring/zcrx.h
··· 39 39 }; 40 40 41 41 struct io_zcrx_ifq { 42 - struct io_ring_ctx *ctx; 43 42 struct io_zcrx_area *area; 44 43 unsigned niov_shift; 44 + struct user_struct *user; 45 + struct mm_struct *mm_account; 45 46 46 47 spinlock_t rq_lock ____cacheline_aligned_in_smp; 47 48 struct io_uring *rq_ring; ··· 54 53 struct device *dev; 55 54 struct net_device *netdev; 56 55 netdevice_tracker netdev_tracker; 56 + refcount_t refs; 57 + /* counts userspace facing users like io_uring */ 58 + refcount_t user_refs; 57 59 58 60 /* 59 61 * Page pool and net configuration lock, can be taken deeper in the ··· 67 63 }; 68 64 69 65 #if defined(CONFIG_IO_URING_ZCRX) 66 + int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); 70 67 int io_register_zcrx_ifq(struct io_ring_ctx *ctx, 71 68 struct io_uring_zcrx_ifq_reg __user *arg); 72 69 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); 73 - void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); 74 70 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 75 71 struct socket *sock, unsigned int flags, 76 72 unsigned issue_flags, unsigned int *len); ··· 85 81 static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) 86 82 { 87 83 } 88 - static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) 89 - { 90 - } 91 84 static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 92 85 struct socket *sock, unsigned int flags, 93 86 unsigned issue_flags, unsigned int *len) ··· 95 94 unsigned int id) 96 95 { 97 96 return NULL; 97 + } 98 + static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx, 99 + void __user *arg, unsigned nr_arg) 100 + { 101 + return -EOPNOTSUPP; 98 102 } 99 103 #endif 100 104
+2 -2
net/compat.c
··· 460 460		ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
461 461		break;
462 462	case SYS_GETSOCKNAME:
463 -		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
463 +		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
464 464		break;
465 465	case SYS_GETPEERNAME:
466 -		ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
466 +		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1);
467 467		break;
468 468	case SYS_SOCKETPAIR:
469 469		ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
-1
net/core/dev.h
··· 29 29	netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
30 30	struct net_device *dev_get_by_napi_id(unsigned int napi_id);
31 31
32 -	struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
33 32	struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
34 33	struct net_device *
35 34	netdev_xa_find_lock(struct net *net, struct net_device *dev,
+29 -54
net/socket.c
··· 2105 2105 return __sys_connect(fd, uservaddr, addrlen); 2106 2106 } 2107 2107 2108 - /* 2109 - * Get the local address ('name') of a socket object. Move the obtained 2110 - * name to user space. 2111 - */ 2108 + int do_getsockname(struct socket *sock, int peer, 2109 + struct sockaddr __user *usockaddr, int __user *usockaddr_len) 2110 + { 2111 + struct sockaddr_storage address; 2112 + int err; 2112 2113 2114 + if (peer) 2115 + err = security_socket_getpeername(sock); 2116 + else 2117 + err = security_socket_getsockname(sock); 2118 + if (err) 2119 + return err; 2120 + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); 2121 + if (err < 0) 2122 + return err; 2123 + /* "err" is actually length in this case */ 2124 + return move_addr_to_user(&address, err, usockaddr, usockaddr_len); 2125 + } 2126 + 2127 + /* 2128 + * Get the remote or local address ('name') of a socket object. Move the 2129 + * obtained name to user space. 2130 + */ 2113 2131 int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, 2114 - int __user *usockaddr_len) 2132 + int __user *usockaddr_len, int peer) 2115 2133 { 2116 2134 struct socket *sock; 2117 - struct sockaddr_storage address; 2118 2135 CLASS(fd, f)(fd); 2119 - int err; 2120 2136 2121 2137 if (fd_empty(f)) 2122 2138 return -EBADF; 2123 2139 sock = sock_from_file(fd_file(f)); 2124 2140 if (unlikely(!sock)) 2125 2141 return -ENOTSOCK; 2126 - 2127 - err = security_socket_getsockname(sock); 2128 - if (err) 2129 - return err; 2130 - 2131 - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); 2132 - if (err < 0) 2133 - return err; 2134 - 2135 - /* "err" is actually length in this case */ 2136 - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); 2142 + return do_getsockname(sock, peer, usockaddr, usockaddr_len); 2137 2143 } 2138 2144 2139 2145 SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, 2140 2146 int __user *, usockaddr_len) 2141 2147 { 2142 - return __sys_getsockname(fd, usockaddr, usockaddr_len); 2143 - } 2144 - 2145 - /* 2146 - * Get the remote address ('name') of a socket object. Move the obtained 2147 - * name to user space. 
2148 - */ 2149 - 2150 - int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, 2151 - int __user *usockaddr_len) 2152 - { 2153 - struct socket *sock; 2154 - struct sockaddr_storage address; 2155 - CLASS(fd, f)(fd); 2156 - int err; 2157 - 2158 - if (fd_empty(f)) 2159 - return -EBADF; 2160 - sock = sock_from_file(fd_file(f)); 2161 - if (unlikely(!sock)) 2162 - return -ENOTSOCK; 2163 - 2164 - err = security_socket_getpeername(sock); 2165 - if (err) 2166 - return err; 2167 - 2168 - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1); 2169 - if (err < 0) 2170 - return err; 2171 - 2172 - /* "err" is actually length in this case */ 2173 - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); 2148 + return __sys_getsockname(fd, usockaddr, usockaddr_len, 0); 2174 2149 } 2175 2150 2176 2151 SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, 2177 2152 int __user *, usockaddr_len) 2178 2153 { 2179 - return __sys_getpeername(fd, usockaddr, usockaddr_len); 2154 + return __sys_getsockname(fd, usockaddr, usockaddr_len, 1); 2180 2155 } 2181 2156 2182 2157 /* ··· 3115 3140 case SYS_GETSOCKNAME: 3116 3141 err = 3117 3142 __sys_getsockname(a0, (struct sockaddr __user *)a1, 3118 - (int __user *)a[2]); 3143 + (int __user *)a[2], 0); 3119 3144 break; 3120 3145 case SYS_GETPEERNAME: 3121 3146 err = 3122 - __sys_getpeername(a0, (struct sockaddr __user *)a1, 3123 - (int __user *)a[2]); 3147 + __sys_getsockname(a0, (struct sockaddr __user *)a1, 3148 + (int __user *)a[2], 1); 3124 3149 break; 3125 3150 case SYS_SOCKETPAIR: 3126 3151 err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
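After this rewrite, getsockname(2) and getpeername(2) are served by the same do_getsockname() helper, with the new peer argument selecting which endpoint is reported. The snippet below is a plain userspace illustration of that unchanged contract and makes no assumptions about the new kernel internals.

/* Local vs. peer endpoint of a connected UDP socket: getsockname() maps to
 * peer == 0 in the kernel, getpeername() to peer == 1. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(53) };
	struct sockaddr_in local, remote;
	socklen_t len;
	int fd;

	inet_pton(AF_INET, "1.1.1.1", &dst.sin_addr);
	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		return 1;

	len = sizeof(local);
	getsockname(fd, (struct sockaddr *)&local, &len);	/* local address */
	len = sizeof(remote);
	getpeername(fd, (struct sockaddr *)&remote, &len);	/* remote address */

	printf("local port %u, remote port %u\n",
	       ntohs(local.sin_port), ntohs(remote.sin_port));
	close(fd);
	return 0;
}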