Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

- Unify how task_work cancelations are detected, placing it in the
task_work running state rather than needing to check the task state

- Series cleaning up and moving the cancelation code to where it
belongs, in cancel.c

- Cleanup of waitid and futex argument handling

- Add support for mixed-size SQEs. 6.18 added support for mixed-size
CQEs, improving flexibility and efficiency for workloads that need big
CQEs. This adds similar support for SQEs, where the occasional need
for a 128b SQE doesn't necessitate making all SQEs 128b in size

- Introduce zcrx and SQ/CQ layout queries. The former returns which zcrx
features are available, and both return ring size information to help
with allocation size calculation for user-provided rings, as used with
IORING_SETUP_NO_MMAP and IORING_MEM_REGION_TYPE_USER

- Zcrx updates for 6.19, including a bunch of small patches,
IORING_REGISTER_ZCRX_CTRL, RQ flushing, and David's work on sharing
zcrx between multiple io_uring instances

- Series cleaning up ring initializations, notably deduplicating ring
size and offset calculations. It also moves most of the checking
before doing any allocations, making the code simpler

- Add support for getsockname and getpeername, which is mostly a
trivial hookup after a bit of refactoring on the networking side

- Various fixes and cleanups

* tag 'for-6.19/io_uring-20251201' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
io_uring: Introduce getsockname io_uring cmd
socket: Split out a getsockname helper for io_uring
socket: Unify getsockname and getpeername implementation
io_uring/query: drop unused io_handle_query_entry() ctx arg
io_uring/kbuf: remove obsolete buf_nr_pages and update comments
io_uring/register: use correct location for io_rings_layout
io_uring/zcrx: share an ifq between rings
io_uring/zcrx: add io_fill_zcrx_offsets()
io_uring/zcrx: export zcrx via a file
io_uring/zcrx: move io_zcrx_scrub() and dependencies up
io_uring/zcrx: count zcrx users
io_uring/zcrx: add sync refill queue flushing
io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL
io_uring/zcrx: elide passing msg flags
io_uring/zcrx: use folio_nr_pages() instead of shift operation
io_uring/zcrx: convert to use netmem_desc
io_uring/query: introduce rings info query
io_uring/query: introduce zcrx query
io_uring: move cq/sq user offset init around
io_uring: pre-calculate scq layout
...

+1308 -865
+4 -2
block/ioctl.c
··· 769 769 bool nowait; 770 770 }; 771 771 772 - static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) 772 + static void blk_cmd_complete(struct io_tw_req tw_req, io_tw_token_t tw) 773 773 { 774 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 774 775 struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); 775 776 776 777 if (bic->res == -EAGAIN && bic->nowait) 777 778 io_uring_cmd_issue_blocking(cmd); 778 779 else 779 - io_uring_cmd_done(cmd, bic->res, issue_flags); 780 + io_uring_cmd_done(cmd, bic->res, 781 + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); 780 782 } 781 783 782 784 static void bio_cmd_bio_end_io(struct bio *bio)
+11 -11
drivers/block/ublk_drv.c
··· 1302 1302 return true; 1303 1303 } 1304 1304 1305 - static void ublk_dispatch_req(struct ublk_queue *ubq, 1306 - struct request *req, 1307 - unsigned int issue_flags) 1305 + static void ublk_dispatch_req(struct ublk_queue *ubq, struct request *req) 1308 1306 { 1307 + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 1309 1308 int tag = req->tag; 1310 1309 struct ublk_io *io = &ubq->ios[tag]; 1311 1310 ··· 1347 1348 ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags); 1348 1349 } 1349 1350 1350 - static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd, 1351 - unsigned int issue_flags) 1351 + static void ublk_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) 1352 1352 { 1353 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 1353 1354 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1354 1355 struct ublk_queue *ubq = pdu->ubq; 1355 1356 1356 - ublk_dispatch_req(ubq, pdu->req, issue_flags); 1357 + ublk_dispatch_req(ubq, pdu->req); 1357 1358 } 1358 1359 1359 1360 static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq) ··· 1365 1366 io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb); 1366 1367 } 1367 1368 1368 - static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd, 1369 - unsigned int issue_flags) 1369 + static void ublk_cmd_list_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw) 1370 1370 { 1371 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 1371 1372 struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); 1372 1373 struct request *rq = pdu->req_list; 1373 1374 struct request *next; ··· 1375 1376 do { 1376 1377 next = rq->rq_next; 1377 1378 rq->rq_next = NULL; 1378 - ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags); 1379 + ublk_dispatch_req(rq->mq_hctx->driver_data, rq); 1379 1380 rq = next; 1380 1381 } while (rq); 1381 1382 } ··· 2522 2523 return NULL; 2523 2524 } 2524 2525 2525 - static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd, 2526 - unsigned int issue_flags) 2526 + static void ublk_ch_uring_cmd_cb(struct io_tw_req tw_req, io_tw_token_t tw) 2527 2527 { 2528 + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 2529 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 2528 2530 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); 2529 2531 2530 2532 if (ret != -EIOCBQUEUED)
+4 -3
drivers/nvme/host/ioctl.c
··· 398 398 return io_uring_cmd_to_pdu(ioucmd, struct nvme_uring_cmd_pdu); 399 399 } 400 400 401 - static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd, 402 - unsigned issue_flags) 401 + static void nvme_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) 403 402 { 403 + struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req); 404 404 struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); 405 405 406 406 if (pdu->bio) 407 407 blk_rq_unmap_user(pdu->bio); 408 - io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); 408 + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 409 + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); 409 410 } 410 411 411 412 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
+3 -2
fs/btrfs/ioctl.c
··· 4632 4632 struct btrfs_uring_priv *priv; 4633 4633 }; 4634 4634 4635 - static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags) 4635 + static void btrfs_uring_read_finished(struct io_tw_req tw_req, io_tw_token_t tw) 4636 4636 { 4637 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 4637 4638 struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd); 4638 4639 struct btrfs_uring_priv *priv = bc->priv; 4639 4640 struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp)); ··· 4679 4678 btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); 4680 4679 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 4681 4680 4682 - io_uring_cmd_done(cmd, ret, issue_flags); 4681 + io_uring_cmd_done(cmd, ret, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); 4683 4682 add_rchar(current, ret); 4684 4683 4685 4684 for (index = 0; index < priv->nr_pages; index++)
+4 -3
fs/fuse/dev_uring.c
··· 1209 1209 * User buffers are not mapped yet - the application does not have permission 1210 1210 * to write to it - this has to be executed in ring task context. 1211 1211 */ 1212 - static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, 1213 - unsigned int issue_flags) 1212 + static void fuse_uring_send_in_task(struct io_tw_req tw_req, io_tw_token_t tw) 1214 1213 { 1214 + unsigned int issue_flags = IO_URING_CMD_TASK_WORK_ISSUE_FLAGS; 1215 + struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req); 1215 1216 struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); 1216 1217 struct fuse_ring_queue *queue = ent->queue; 1217 1218 int err; 1218 1219 1219 - if (!(issue_flags & IO_URING_F_TASK_DEAD)) { 1220 + if (!tw.cancel) { 1220 1221 err = fuse_uring_prepare_send(ent, ent->fuse_req); 1221 1222 if (err) { 1222 1223 fuse_uring_next_fuse_req(ent, queue, issue_flags);
+13 -9
include/linux/io_uring/cmd.h
··· 11 11 /* io_uring_cmd is being issued again */ 12 12 #define IORING_URING_CMD_REISSUE (1U << 31) 13 13 14 - typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, 15 - unsigned issue_flags); 16 - 17 14 struct io_uring_cmd { 18 15 struct file *file; 19 16 const struct io_uring_sqe *sqe; 20 - /* callback to defer completions to task context */ 21 - io_uring_cmd_tw_t task_work_cb; 22 17 u32 cmd_op; 23 18 u32 flags; 24 19 u8 pdu[32]; /* available inline for free use */ 20 + u8 unused[8]; 25 21 }; 26 22 27 23 static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) ··· 56 60 unsigned issue_flags, bool is_cqe32); 57 61 58 62 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 59 - io_uring_cmd_tw_t task_work_cb, 63 + io_req_tw_func_t task_work_cb, 60 64 unsigned flags); 61 65 62 66 /* ··· 105 109 { 106 110 } 107 111 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 108 - io_uring_cmd_tw_t task_work_cb, unsigned flags) 112 + io_req_tw_func_t task_work_cb, unsigned flags) 109 113 { 110 114 } 111 115 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, ··· 128 132 } 129 133 #endif 130 134 135 + static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) 136 + { 137 + return io_kiocb_to_cmd(tw_req.req, struct io_uring_cmd); 138 + } 139 + 140 + /* task_work executor checks the deferred list completion */ 141 + #define IO_URING_CMD_TASK_WORK_ISSUE_FLAGS IO_URING_F_COMPLETE_DEFER 142 + 131 143 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ 132 144 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 133 - io_uring_cmd_tw_t task_work_cb) 145 + io_req_tw_func_t task_work_cb) 134 146 { 135 147 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); 136 148 } 137 149 138 150 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, 139 - io_uring_cmd_tw_t task_work_cb) 151 + io_req_tw_func_t task_work_cb) 140 152 { 141 153 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); 142 154 }
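
A rough sketch of what the new callback shape looks like for a uring_cmd driver, following the pattern in the block/ublk/nvme hunks above. The my_cmd_* names and pdu layout are made up for illustration, and completing with -ECANCELED on tw.cancel is just one plausible policy, not mandated by the API:

#include <linux/io_uring/cmd.h>
#include <linux/errno.h>

struct my_cmd_pdu {
	int res;	/* assumed to be filled in by the completion path */
};

static void my_cmd_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	/* recover the command from the generic task_work request */
	struct io_uring_cmd *cmd = io_uring_cmd_from_tw(tw_req);
	struct my_cmd_pdu *pdu = io_uring_cmd_to_pdu(cmd, struct my_cmd_pdu);

	if (tw.cancel) {
		/* ring/task is going away; replaces the old
		 * IO_URING_F_TASK_DEAD issue_flags check */
		io_uring_cmd_done(cmd, -ECANCELED,
				  IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
		return;
	}
	/* issue_flags are no longer passed in: task_work completions use
	 * the fixed IO_URING_CMD_TASK_WORK_ISSUE_FLAGS value instead */
	io_uring_cmd_done(cmd, pdu->res, IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
}

/* from completion/IRQ context, defer the completion to task context */
static void my_complete(struct io_uring_cmd *cmd)
{
	io_uring_cmd_complete_in_task(cmd, my_cmd_tw_cb);
}
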
+8 -4
include/linux/io_uring_types.h
··· 39 39 /* set when uring wants to cancel a previously issued command */ 40 40 IO_URING_F_CANCEL = (1 << 11), 41 41 IO_URING_F_COMPAT = (1 << 12), 42 - IO_URING_F_TASK_DEAD = (1 << 13), 43 42 }; 44 43 45 44 struct io_wq_work_node { ··· 327 328 328 329 /* 329 330 * Modifications are protected by ->uring_lock and ->mmap_lock. 330 - * The flags, buf_pages and buf_nr_pages fields should be stable 331 - * once published. 331 + * The buffer list's io mapped region should be stable once 332 + * published. 332 333 */ 333 334 struct xarray io_bl_xa; 334 335 ··· 473 474 * ONLY core io_uring.c should instantiate this struct. 474 475 */ 475 476 struct io_tw_state { 477 + bool cancel; 476 478 }; 477 479 /* Alias to use in code that doesn't instantiate struct io_tw_state */ 478 480 typedef struct io_tw_state io_tw_token_t; ··· 614 614 REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), 615 615 }; 616 616 617 - typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); 617 + struct io_tw_req { 618 + struct io_kiocb *req; 619 + }; 620 + 621 + typedef void (*io_req_tw_func_t)(struct io_tw_req tw_req, io_tw_token_t tw); 618 622 619 623 struct io_task_work { 620 624 struct llist_node node;
+1
include/linux/netdevice.h
··· 3417 3417 struct net_device *__dev_get_by_index(struct net *net, int ifindex); 3418 3418 struct net_device *netdev_get_by_index(struct net *net, int ifindex, 3419 3419 netdevice_tracker *tracker, gfp_t gfp); 3420 + struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); 3420 3421 struct net_device *netdev_get_by_name(struct net *net, const char *name, 3421 3422 netdevice_tracker *tracker, gfp_t gfp); 3422 3423 struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+3 -3
include/linux/socket.h
··· 468 468 int addrlen); 469 469 extern int __sys_listen(int fd, int backlog); 470 470 extern int __sys_listen_socket(struct socket *sock, int backlog); 471 + extern int do_getsockname(struct socket *sock, int peer, 472 + struct sockaddr __user *usockaddr, int __user *usockaddr_len); 471 473 extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, 472 - int __user *usockaddr_len); 473 - extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, 474 - int __user *usockaddr_len); 474 + int __user *usockaddr_len, int peer); 475 475 extern int __sys_socketpair(int family, int type, int protocol, 476 476 int __user *usockvec); 477 477 extern int __sys_shutdown_sock(struct socket *sock, int how);
+43
include/uapi/linux/io_uring.h
··· 231 231 */ 232 232 #define IORING_SETUP_CQE_MIXED (1U << 18) 233 233 234 + /* 235 + * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have 236 + * a 128b opcode. 237 + */ 238 + #define IORING_SETUP_SQE_MIXED (1U << 19) 239 + 234 240 enum io_uring_op { 235 241 IORING_OP_NOP, 236 242 IORING_OP_READV, ··· 301 295 IORING_OP_READV_FIXED, 302 296 IORING_OP_WRITEV_FIXED, 303 297 IORING_OP_PIPE, 298 + IORING_OP_NOP128, 299 + IORING_OP_URING_CMD128, 304 300 305 301 /* this goes last, obviously */ 306 302 IORING_OP_LAST, ··· 697 689 /* query various aspects of io_uring, see linux/io_uring/query.h */ 698 690 IORING_REGISTER_QUERY = 35, 699 691 692 + /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */ 693 + IORING_REGISTER_ZCRX_CTRL = 36, 694 + 700 695 /* this goes last */ 701 696 IORING_REGISTER_LAST, 702 697 ··· 1009 998 SOCKET_URING_OP_GETSOCKOPT, 1010 999 SOCKET_URING_OP_SETSOCKOPT, 1011 1000 SOCKET_URING_OP_TX_TIMESTAMP, 1001 + SOCKET_URING_OP_GETSOCKNAME, 1012 1002 }; 1013 1003 1014 1004 /* ··· 1064 1052 __u64 __resv2[2]; 1065 1053 }; 1066 1054 1055 + enum zcrx_reg_flags { 1056 + ZCRX_REG_IMPORT = 1, 1057 + }; 1058 + 1067 1059 /* 1068 1060 * Argument for IORING_REGISTER_ZCRX_IFQ 1069 1061 */ ··· 1084 1068 __u32 zcrx_id; 1085 1069 __u32 __resv2; 1086 1070 __u64 __resv[3]; 1071 + }; 1072 + 1073 + enum zcrx_ctrl_op { 1074 + ZCRX_CTRL_FLUSH_RQ, 1075 + ZCRX_CTRL_EXPORT, 1076 + 1077 + __ZCRX_CTRL_LAST, 1078 + }; 1079 + 1080 + struct zcrx_ctrl_flush_rq { 1081 + __u64 __resv[6]; 1082 + }; 1083 + 1084 + struct zcrx_ctrl_export { 1085 + __u32 zcrx_fd; 1086 + __u32 __resv1[11]; 1087 + }; 1088 + 1089 + struct zcrx_ctrl { 1090 + __u32 zcrx_id; 1091 + __u32 op; /* see enum zcrx_ctrl_op */ 1092 + __u64 __resv[2]; 1093 + 1094 + union { 1095 + struct zcrx_ctrl_export zc_export; 1096 + struct zcrx_ctrl_flush_rq zc_flush; 1097 + }; 1087 1098 }; 1088 1099 1089 1100 #ifdef __cplusplus
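
As a hedged illustration of the new setup flag (not taken from liburing, which may grow its own helper), a ring that accepts both 64B and 128B SQEs can be created with the plain io_uring_setup() syscall:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

/* Returns a ring fd, or -1 with errno set. On a mixed SQ, 128B opcodes
 * such as IORING_OP_NOP128 consume two consecutive 64B SQE slots; all
 * other opcodes keep using a single 64B slot. */
static int setup_mixed_sqe_ring(unsigned int entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQE_MIXED;
	return syscall(__NR_io_uring_setup, entries, &p);
}
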
+24
include/uapi/linux/io_uring/query.h
··· 18 18 19 19 enum { 20 20 IO_URING_QUERY_OPCODES = 0, 21 + IO_URING_QUERY_ZCRX = 1, 22 + IO_URING_QUERY_SCQ = 2, 21 23 22 24 __IO_URING_QUERY_MAX, 23 25 }; ··· 41 39 /* The number of available query opcodes */ 42 40 __u32 nr_query_opcodes; 43 41 __u32 __pad; 42 + }; 43 + 44 + struct io_uring_query_zcrx { 45 + /* Bitmask of supported ZCRX_REG_* flags, */ 46 + __u64 register_flags; 47 + /* Bitmask of all supported IORING_ZCRX_AREA_* flags */ 48 + __u64 area_flags; 49 + /* The number of supported ZCRX_CTRL_* opcodes */ 50 + __u32 nr_ctrl_opcodes; 51 + __u32 __resv1; 52 + /* The refill ring header size */ 53 + __u32 rq_hdr_size; 54 + /* The alignment for the header */ 55 + __u32 rq_hdr_alignment; 56 + __u64 __resv2; 57 + }; 58 + 59 + struct io_uring_query_scq { 60 + /* The SQ/CQ rings header size */ 61 + __u64 hdr_size; 62 + /* The alignment for the header */ 63 + __u64 hdr_alignment; 44 64 }; 45 65 46 66 #endif
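
A hedged sketch of how an application might use the new SCQ query when sizing a user-provided ring region. The query transport itself (IORING_REGISTER_QUERY) is not reproduced here; the only assumption made is that hdr_size bytes, aligned to hdr_alignment (taken to be a power of two), must be reserved for the ring header:

#include <linux/io_uring/query.h>
#include <stddef.h>

static size_t scq_hdr_region_bytes(const struct io_uring_query_scq *q)
{
	size_t align = q->hdr_alignment ? (size_t)q->hdr_alignment : 1;

	/* round the reported header size up to the reported alignment */
	return ((size_t)q->hdr_size + align - 1) & ~(align - 1);
}
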
+270
io_uring/cancel.c
··· 14 14 #include "filetable.h" 15 15 #include "io_uring.h" 16 16 #include "tctx.h" 17 + #include "sqpoll.h" 18 + #include "uring_cmd.h" 17 19 #include "poll.h" 18 20 #include "timeout.h" 19 21 #include "waitid.h" ··· 385 383 } 386 384 io_ring_submit_unlock(ctx, issue_flags); 387 385 return nr ?: -ENOENT; 386 + } 387 + 388 + static bool io_match_linked(struct io_kiocb *head) 389 + { 390 + struct io_kiocb *req; 391 + 392 + io_for_each_link(req, head) { 393 + if (req->flags & REQ_F_INFLIGHT) 394 + return true; 395 + } 396 + return false; 397 + } 398 + 399 + /* 400 + * As io_match_task() but protected against racing with linked timeouts. 401 + * User must not hold timeout_lock. 402 + */ 403 + bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 404 + bool cancel_all) 405 + { 406 + bool matched; 407 + 408 + if (tctx && head->tctx != tctx) 409 + return false; 410 + if (cancel_all) 411 + return true; 412 + 413 + if (head->flags & REQ_F_LINK_TIMEOUT) { 414 + struct io_ring_ctx *ctx = head->ctx; 415 + 416 + /* protect against races with linked timeouts */ 417 + raw_spin_lock_irq(&ctx->timeout_lock); 418 + matched = io_match_linked(head); 419 + raw_spin_unlock_irq(&ctx->timeout_lock); 420 + } else { 421 + matched = io_match_linked(head); 422 + } 423 + return matched; 424 + } 425 + 426 + void __io_uring_cancel(bool cancel_all) 427 + { 428 + io_uring_unreg_ringfd(); 429 + io_uring_cancel_generic(cancel_all, NULL); 430 + } 431 + 432 + struct io_task_cancel { 433 + struct io_uring_task *tctx; 434 + bool all; 435 + }; 436 + 437 + static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 438 + { 439 + struct io_kiocb *req = container_of(work, struct io_kiocb, work); 440 + struct io_task_cancel *cancel = data; 441 + 442 + return io_match_task_safe(req, cancel->tctx, cancel->all); 443 + } 444 + 445 + static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, 446 + struct io_uring_task *tctx, 447 + bool cancel_all) 448 + { 449 + struct io_defer_entry *de; 450 + LIST_HEAD(list); 451 + 452 + list_for_each_entry_reverse(de, &ctx->defer_list, list) { 453 + if (io_match_task_safe(de->req, tctx, cancel_all)) { 454 + list_cut_position(&list, &ctx->defer_list, &de->list); 455 + break; 456 + } 457 + } 458 + if (list_empty(&list)) 459 + return false; 460 + 461 + while (!list_empty(&list)) { 462 + de = list_first_entry(&list, struct io_defer_entry, list); 463 + list_del_init(&de->list); 464 + ctx->nr_drained -= io_linked_nr(de->req); 465 + io_req_task_queue_fail(de->req, -ECANCELED); 466 + kfree(de); 467 + } 468 + return true; 469 + } 470 + 471 + __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 472 + { 473 + struct io_kiocb *req = container_of(work, struct io_kiocb, work); 474 + 475 + return req->ctx == data; 476 + } 477 + 478 + static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 479 + { 480 + struct io_tctx_node *node; 481 + enum io_wq_cancel cret; 482 + bool ret = false; 483 + 484 + mutex_lock(&ctx->uring_lock); 485 + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 486 + struct io_uring_task *tctx = node->task->io_uring; 487 + 488 + /* 489 + * io_wq will stay alive while we hold uring_lock, because it's 490 + * killed after ctx nodes, which requires to take the lock. 
491 + */ 492 + if (!tctx || !tctx->io_wq) 493 + continue; 494 + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 495 + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 496 + } 497 + mutex_unlock(&ctx->uring_lock); 498 + 499 + return ret; 500 + } 501 + 502 + __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 503 + struct io_uring_task *tctx, 504 + bool cancel_all, bool is_sqpoll_thread) 505 + { 506 + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; 507 + enum io_wq_cancel cret; 508 + bool ret = false; 509 + 510 + /* set it so io_req_local_work_add() would wake us up */ 511 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 512 + atomic_set(&ctx->cq_wait_nr, 1); 513 + smp_mb(); 514 + } 515 + 516 + /* failed during ring init, it couldn't have issued any requests */ 517 + if (!ctx->rings) 518 + return false; 519 + 520 + if (!tctx) { 521 + ret |= io_uring_try_cancel_iowq(ctx); 522 + } else if (tctx->io_wq) { 523 + /* 524 + * Cancels requests of all rings, not only @ctx, but 525 + * it's fine as the task is in exit/exec. 526 + */ 527 + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 528 + &cancel, true); 529 + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 530 + } 531 + 532 + /* SQPOLL thread does its own polling */ 533 + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 534 + is_sqpoll_thread) { 535 + while (!wq_list_empty(&ctx->iopoll_list)) { 536 + io_iopoll_try_reap_events(ctx); 537 + ret = true; 538 + cond_resched(); 539 + } 540 + } 541 + 542 + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 543 + io_allowed_defer_tw_run(ctx)) 544 + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; 545 + mutex_lock(&ctx->uring_lock); 546 + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); 547 + ret |= io_poll_remove_all(ctx, tctx, cancel_all); 548 + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); 549 + ret |= io_futex_remove_all(ctx, tctx, cancel_all); 550 + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); 551 + mutex_unlock(&ctx->uring_lock); 552 + ret |= io_kill_timeouts(ctx, tctx, cancel_all); 553 + if (tctx) 554 + ret |= io_run_task_work() > 0; 555 + else 556 + ret |= flush_delayed_work(&ctx->fallback_work); 557 + return ret; 558 + } 559 + 560 + static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 561 + { 562 + if (tracked) 563 + return atomic_read(&tctx->inflight_tracked); 564 + return percpu_counter_sum(&tctx->inflight); 565 + } 566 + 567 + /* 568 + * Find any io_uring ctx that this task has registered or done IO on, and cancel 569 + * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
570 + */ 571 + __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 572 + { 573 + struct io_uring_task *tctx = current->io_uring; 574 + struct io_ring_ctx *ctx; 575 + struct io_tctx_node *node; 576 + unsigned long index; 577 + s64 inflight; 578 + DEFINE_WAIT(wait); 579 + 580 + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); 581 + 582 + if (!current->io_uring) 583 + return; 584 + if (tctx->io_wq) 585 + io_wq_exit_start(tctx->io_wq); 586 + 587 + atomic_inc(&tctx->in_cancel); 588 + do { 589 + bool loop = false; 590 + 591 + io_uring_drop_tctx_refs(current); 592 + if (!tctx_inflight(tctx, !cancel_all)) 593 + break; 594 + 595 + /* read completions before cancelations */ 596 + inflight = tctx_inflight(tctx, false); 597 + if (!inflight) 598 + break; 599 + 600 + if (!sqd) { 601 + xa_for_each(&tctx->xa, index, node) { 602 + /* sqpoll task will cancel all its requests */ 603 + if (node->ctx->sq_data) 604 + continue; 605 + loop |= io_uring_try_cancel_requests(node->ctx, 606 + current->io_uring, 607 + cancel_all, 608 + false); 609 + } 610 + } else { 611 + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 612 + loop |= io_uring_try_cancel_requests(ctx, 613 + current->io_uring, 614 + cancel_all, 615 + true); 616 + } 617 + 618 + if (loop) { 619 + cond_resched(); 620 + continue; 621 + } 622 + 623 + prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); 624 + io_run_task_work(); 625 + io_uring_drop_tctx_refs(current); 626 + xa_for_each(&tctx->xa, index, node) { 627 + if (io_local_work_pending(node->ctx)) { 628 + WARN_ON_ONCE(node->ctx->submitter_task && 629 + node->ctx->submitter_task != current); 630 + goto end_wait; 631 + } 632 + } 633 + /* 634 + * If we've seen completions, retry without waiting. This 635 + * avoids a race where a completion comes in before we did 636 + * prepare_to_wait(). 637 + */ 638 + if (inflight == tctx_inflight(tctx, !cancel_all)) 639 + schedule(); 640 + end_wait: 641 + finish_wait(&tctx->wait, &wait); 642 + } while (1); 643 + 644 + io_uring_clean_tctx(tctx); 645 + if (cancel_all) { 646 + /* 647 + * We shouldn't run task_works after cancel, so just leave 648 + * ->in_cancel set for normal exit. 649 + */ 650 + atomic_dec(&tctx->in_cancel); 651 + /* for exec all current's requests should be gone, kill tctx */ 652 + __io_uring_free(current); 653 + } 388 654 }
+7 -1
io_uring/cancel.h
··· 23 23 24 24 int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); 25 25 bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); 26 + bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 27 + bool cancel_all); 26 28 27 29 bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 28 30 struct hlist_head *list, bool cancel_all, 29 31 bool (*cancel)(struct io_kiocb *)); 30 - 31 32 int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, 32 33 unsigned int issue_flags, struct hlist_head *list, 33 34 bool (*cancel)(struct io_kiocb *)); 35 + __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 36 + struct io_uring_task *tctx, 37 + bool cancel_all, bool is_sqpoll_thread); 38 + __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 39 + __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data); 34 40 35 41 static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) 36 42 {
+22
io_uring/cmd_net.c
··· 132 132 return -EAGAIN; 133 133 } 134 134 135 + static int io_uring_cmd_getsockname(struct socket *sock, 136 + struct io_uring_cmd *cmd, 137 + unsigned int issue_flags) 138 + { 139 + const struct io_uring_sqe *sqe = cmd->sqe; 140 + struct sockaddr __user *uaddr; 141 + unsigned int peer; 142 + int __user *ulen; 143 + 144 + if (sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags) 145 + return -EINVAL; 146 + 147 + uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr)); 148 + ulen = u64_to_user_ptr(sqe->addr3); 149 + peer = READ_ONCE(sqe->optlen); 150 + if (peer > 1) 151 + return -EINVAL; 152 + return do_getsockname(sock, peer, uaddr, ulen); 153 + } 154 + 135 155 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) 136 156 { 137 157 struct socket *sock = cmd->file->private_data; ··· 179 159 return io_uring_cmd_setsockopt(sock, cmd, issue_flags); 180 160 case SOCKET_URING_OP_TX_TIMESTAMP: 181 161 return io_uring_cmd_timestamp(sock, cmd, issue_flags); 162 + case SOCKET_URING_OP_GETSOCKNAME: 163 + return io_uring_cmd_getsockname(sock, cmd, issue_flags); 182 164 default: 183 165 return -EOPNOTSUPP; 184 166 }
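
Mirroring the sqe fields the kernel parses above, a userspace prep helper for the new socket command could look roughly like this; prep_getsockname() is an illustrative helper, not a liburing API, and the exact sqe union members are as in the current uapi header:

#include <linux/io_uring.h>
#include <sys/socket.h>
#include <string.h>

static void prep_getsockname(struct io_uring_sqe *sqe, int sockfd,
			     struct sockaddr_storage *addr, int *addrlen,
			     int peer)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = sockfd;
	sqe->cmd_op = SOCKET_URING_OP_GETSOCKNAME;
	sqe->addr = (unsigned long)addr;	/* sockaddr buffer */
	sqe->addr3 = (unsigned long)addrlen;	/* in/out length pointer */
	sqe->optlen = peer;			/* 0 = local name, 1 = peer */
	/* user_data, flags etc. set by the caller as usual */
}
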
+32 -5
io_uring/fdinfo.c
··· 5 5 #include <linux/file.h> 6 6 #include <linux/proc_fs.h> 7 7 #include <linux/seq_file.h> 8 + #include <linux/nospec.h> 8 9 #include <linux/io_uring.h> 9 10 10 11 #include <uapi/linux/io_uring.h> ··· 15 14 #include "fdinfo.h" 16 15 #include "cancel.h" 17 16 #include "rsrc.h" 17 + #include "opdef.h" 18 18 19 19 #ifdef CONFIG_NET_RX_BUSY_POLL 20 20 static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, ··· 95 93 unsigned int entry = i + sq_head; 96 94 struct io_uring_sqe *sqe; 97 95 unsigned int sq_idx; 96 + bool sqe128 = false; 97 + u8 opcode; 98 98 99 99 if (ctx->flags & IORING_SETUP_NO_SQARRAY) 100 - break; 101 - sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 100 + sq_idx = entry & sq_mask; 101 + else 102 + sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); 102 103 if (sq_idx > sq_mask) 103 104 continue; 105 + 104 106 sqe = &ctx->sq_sqes[sq_idx << sq_shift]; 107 + opcode = READ_ONCE(sqe->opcode); 108 + if (opcode >= IORING_OP_LAST) 109 + continue; 110 + opcode = array_index_nospec(opcode, IORING_OP_LAST); 111 + if (sq_shift) { 112 + sqe128 = true; 113 + } else if (io_issue_defs[opcode].is_128) { 114 + if (!(ctx->flags & IORING_SETUP_SQE_MIXED)) { 115 + seq_printf(m, 116 + "%5u: invalid sqe, 128B entry on non-mixed sq\n", 117 + sq_idx); 118 + break; 119 + } 120 + if ((++sq_head & sq_mask) == 0) { 121 + seq_printf(m, 122 + "%5u: corrupted sqe, wrapping 128B entry\n", 123 + sq_idx); 124 + break; 125 + } 126 + sqe128 = true; 127 + } 105 128 seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " 106 129 "addr:0x%llx, rw_flags:0x%x, buf_index:%d " 107 130 "user_data:%llu", 108 - sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd, 131 + sq_idx, io_uring_get_opcode(opcode), sqe->fd, 109 132 sqe->flags, (unsigned long long) sqe->off, 110 133 (unsigned long long) sqe->addr, sqe->rw_flags, 111 134 sqe->buf_index, sqe->user_data); 112 - if (sq_shift) { 135 + if (sqe128) { 113 136 u64 *sqeb = (void *) (sqe + 1); 114 137 int size = sizeof(struct io_uring_sqe) / sizeof(u64); 115 138 int j; ··· 155 128 cqe = &r->cqes[(cq_head & cq_mask)]; 156 129 if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) 157 130 cqe32 = true; 158 - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", 131 + seq_printf(m, "%5u: user_data:%llu, res:%d, flags:%x", 159 132 cq_head & cq_mask, cqe->user_data, cqe->res, 160 133 cqe->flags); 161 134 if (cqe32)
+31 -26
io_uring/futex.c
··· 17 17 void __user *uaddr; 18 18 unsigned long futex_val; 19 19 unsigned long futex_mask; 20 - unsigned long futexv_owned; 21 20 u32 futex_flags; 22 21 unsigned int futex_nr; 23 22 bool futexv_unqueued; ··· 25 26 struct io_futex_data { 26 27 struct futex_q q; 27 28 struct io_kiocb *req; 29 + }; 30 + 31 + struct io_futexv_data { 32 + unsigned long owned; 33 + struct futex_vector futexv[]; 28 34 }; 29 35 30 36 #define IO_FUTEX_ALLOC_CACHE_MAX 32 ··· 45 41 io_alloc_cache_free(&ctx->futex_cache, kfree); 46 42 } 47 43 48 - static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 44 + static void __io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) 49 45 { 50 - hlist_del_init(&req->hash_node); 51 - io_req_task_complete(req, tw); 46 + hlist_del_init(&tw_req.req->hash_node); 47 + io_req_task_complete(tw_req, tw); 52 48 } 53 49 54 - static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 50 + static void io_futex_complete(struct io_tw_req tw_req, io_tw_token_t tw) 55 51 { 52 + struct io_kiocb *req = tw_req.req; 56 53 struct io_ring_ctx *ctx = req->ctx; 57 54 58 55 io_tw_lock(ctx, tw); 59 56 io_cache_free(&ctx->futex_cache, req->async_data); 60 57 io_req_async_data_clear(req, 0); 61 - __io_futex_complete(req, tw); 58 + __io_futex_complete(tw_req, tw); 62 59 } 63 60 64 - static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) 61 + static void io_futexv_complete(struct io_tw_req tw_req, io_tw_token_t tw) 65 62 { 63 + struct io_kiocb *req = tw_req.req; 66 64 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 67 - struct futex_vector *futexv = req->async_data; 65 + struct io_futexv_data *ifd = req->async_data; 68 66 69 67 io_tw_lock(req->ctx, tw); 70 68 71 69 if (!iof->futexv_unqueued) { 72 70 int res; 73 71 74 - res = futex_unqueue_multiple(futexv, iof->futex_nr); 72 + res = futex_unqueue_multiple(ifd->futexv, iof->futex_nr); 75 73 if (res != -1) 76 74 io_req_set_res(req, res, 0); 77 75 } 78 76 79 77 io_req_async_data_free(req); 80 - __io_futex_complete(req, tw); 78 + __io_futex_complete(tw_req, tw); 81 79 } 82 80 83 - static bool io_futexv_claim(struct io_futex *iof) 81 + static bool io_futexv_claim(struct io_futexv_data *ifd) 84 82 { 85 - if (test_bit(0, &iof->futexv_owned) || 86 - test_and_set_bit_lock(0, &iof->futexv_owned)) 83 + if (test_bit(0, &ifd->owned) || test_and_set_bit_lock(0, &ifd->owned)) 87 84 return false; 88 85 return true; 89 86 } ··· 99 94 return false; 100 95 req->io_task_work.func = io_futex_complete; 101 96 } else { 102 - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 97 + struct io_futexv_data *ifd = req->async_data; 103 98 104 - if (!io_futexv_claim(iof)) 99 + if (!io_futexv_claim(ifd)) 105 100 return false; 106 101 req->io_task_work.func = io_futexv_complete; 107 102 } ··· 157 152 static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q) 158 153 { 159 154 struct io_kiocb *req = q->wake_data; 160 - struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 155 + struct io_futexv_data *ifd = req->async_data; 161 156 162 - if (!io_futexv_claim(iof)) 157 + if (!io_futexv_claim(ifd)) 163 158 return; 164 159 if (unlikely(!__futex_wake_mark(q))) 165 160 return; ··· 172 167 int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 173 168 { 174 169 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 175 - struct futex_vector *futexv; 170 + struct io_futexv_data *ifd; 176 171 int ret; 177 172 178 173 /* No flags or mask supported for waitv */ ··· 185 180 if 
(!iof->futex_nr || iof->futex_nr > FUTEX_WAITV_MAX) 186 181 return -EINVAL; 187 182 188 - futexv = kcalloc(iof->futex_nr, sizeof(*futexv), GFP_KERNEL); 189 - if (!futexv) 183 + ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv, iof->futex_nr), 184 + GFP_KERNEL); 185 + if (!ifd) 190 186 return -ENOMEM; 191 187 192 - ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr, 188 + ret = futex_parse_waitv(ifd->futexv, iof->uaddr, iof->futex_nr, 193 189 io_futex_wakev_fn, req); 194 190 if (ret) { 195 - kfree(futexv); 191 + kfree(ifd); 196 192 return ret; 197 193 } 198 194 199 195 /* Mark as inflight, so file exit cancelation will find it */ 200 196 io_req_track_inflight(req); 201 - iof->futexv_owned = 0; 202 197 iof->futexv_unqueued = 0; 203 198 req->flags |= REQ_F_ASYNC_DATA; 204 - req->async_data = futexv; 199 + req->async_data = ifd; 205 200 return 0; 206 201 } 207 202 ··· 221 216 int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) 222 217 { 223 218 struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); 224 - struct futex_vector *futexv = req->async_data; 219 + struct io_futexv_data *ifd = req->async_data; 225 220 struct io_ring_ctx *ctx = req->ctx; 226 221 int ret, woken = -1; 227 222 228 223 io_ring_submit_lock(ctx, issue_flags); 229 224 230 - ret = futex_wait_multiple_setup(futexv, iof->futex_nr, &woken); 225 + ret = futex_wait_multiple_setup(ifd->futexv, iof->futex_nr, &woken); 231 226 232 227 /* 233 228 * Error case, ret is < 0. Mark the request as failed.
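
The futexv conversion above folds the claim bit and the vector array into a single async_data allocation. The allocation pattern, shown here with made-up names as a generic illustration, is roughly:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct vec_data {
	unsigned long owned;	/* claim bit, like ifd->owned above */
	u64 vec[];		/* flexible array sized at allocation time */
};

static struct vec_data *alloc_vec_data(unsigned int nr)
{
	/* struct_size_t() computes sizeof(struct vec_data) +
	 * nr * sizeof(u64) with overflow checking */
	return kzalloc(struct_size_t(struct vec_data, vec, nr), GFP_KERNEL);
}
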
+163 -386
io_uring/io_uring.c
··· 124 124 #define IO_REQ_ALLOC_BATCH 8 125 125 #define IO_LOCAL_TW_DEFAULT_MAX 20 126 126 127 - struct io_defer_entry { 128 - struct list_head list; 129 - struct io_kiocb *req; 130 - }; 131 - 132 127 /* requests with any of those set should undergo io_disarm_next() */ 133 128 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 134 129 ··· 134 139 #define IO_CQ_WAKE_INIT (-1U) 135 140 /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 136 141 #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 137 - 138 - static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 139 - struct io_uring_task *tctx, 140 - bool cancel_all, 141 - bool is_sqpoll_thread); 142 142 143 143 static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); 144 144 static void __io_req_caches_free(struct io_ring_ctx *ctx); ··· 197 207 return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); 198 208 } 199 209 200 - static bool io_match_linked(struct io_kiocb *head) 201 - { 202 - struct io_kiocb *req; 203 - 204 - io_for_each_link(req, head) { 205 - if (req->flags & REQ_F_INFLIGHT) 206 - return true; 207 - } 208 - return false; 209 - } 210 - 211 - /* 212 - * As io_match_task() but protected against racing with linked timeouts. 213 - * User must not hold timeout_lock. 214 - */ 215 - bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 216 - bool cancel_all) 217 - { 218 - bool matched; 219 - 220 - if (tctx && head->tctx != tctx) 221 - return false; 222 - if (cancel_all) 223 - return true; 224 - 225 - if (head->flags & REQ_F_LINK_TIMEOUT) { 226 - struct io_ring_ctx *ctx = head->ctx; 227 - 228 - /* protect against races with linked timeouts */ 229 - raw_spin_lock_irq(&ctx->timeout_lock); 230 - matched = io_match_linked(head); 231 - raw_spin_unlock_irq(&ctx->timeout_lock); 232 - } else { 233 - matched = io_match_linked(head); 234 - } 235 - return matched; 236 - } 237 - 238 210 static inline void req_fail_link_node(struct io_kiocb *req, int res) 239 211 { 240 212 req_set_fail(req); ··· 217 265 complete(&ctx->ref_comp); 218 266 } 219 267 268 + /* 269 + * Terminate the request if either of these conditions are true: 270 + * 271 + * 1) It's being executed by the original task, but that task is marked 272 + * with PF_EXITING as it's exiting. 273 + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 274 + * our fallback task_work. 275 + * 3) The ring has been closed and is going away. 
276 + */ 277 + static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 278 + { 279 + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); 280 + } 281 + 220 282 static __cold void io_fallback_req_func(struct work_struct *work) 221 283 { 222 284 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, ··· 241 275 242 276 percpu_ref_get(&ctx->refs); 243 277 mutex_lock(&ctx->uring_lock); 278 + ts.cancel = io_should_terminate_tw(ctx); 244 279 llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 245 - req->io_task_work.func(req, ts); 280 + req->io_task_work.func((struct io_tw_req){req}, ts); 246 281 io_submit_flush_completions(ctx); 247 282 mutex_unlock(&ctx->uring_lock); 248 283 percpu_ref_put(&ctx->refs); ··· 491 524 io_wq_enqueue(tctx->io_wq, &req->work); 492 525 } 493 526 494 - static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) 527 + static void io_req_queue_iowq_tw(struct io_tw_req tw_req, io_tw_token_t tw) 495 528 { 496 - io_queue_iowq(req); 529 + io_queue_iowq(tw_req.req); 497 530 } 498 531 499 532 void io_req_queue_iowq(struct io_kiocb *req) ··· 502 535 io_req_task_work_add(req); 503 536 } 504 537 505 - static unsigned io_linked_nr(struct io_kiocb *req) 538 + unsigned io_linked_nr(struct io_kiocb *req) 506 539 { 507 540 struct io_kiocb *tmp; 508 541 unsigned nr = 0; ··· 673 706 tctx->cached_refs += refill; 674 707 } 675 708 676 - static __cold void io_uring_drop_tctx_refs(struct task_struct *task) 709 + __cold void io_uring_drop_tctx_refs(struct task_struct *task) 677 710 { 678 711 struct io_uring_task *tctx = task->io_uring; 679 712 unsigned int refs = tctx->cached_refs; ··· 884 917 } 885 918 886 919 /* 887 - * Must be called from inline task_work so we now a flush will happen later, 920 + * Must be called from inline task_work so we know a flush will happen later, 888 921 * and obviously with ctx->uring_lock held (tw always has that). 889 922 */ 890 923 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) ··· 1116 1149 ctx = req->ctx; 1117 1150 mutex_lock(&ctx->uring_lock); 1118 1151 percpu_ref_get(&ctx->refs); 1152 + ts.cancel = io_should_terminate_tw(ctx); 1119 1153 } 1120 1154 INDIRECT_CALL_2(req->io_task_work.func, 1121 1155 io_poll_task_func, io_req_rw_complete, 1122 - req, ts); 1156 + (struct io_tw_req){req}, ts); 1123 1157 node = next; 1124 1158 (*count)++; 1125 1159 if (unlikely(need_resched())) { ··· 1175 1207 { 1176 1208 struct llist_node *node; 1177 1209 1178 - if (unlikely(current->flags & PF_EXITING)) { 1179 - io_fallback_tw(tctx, true); 1180 - return NULL; 1181 - } 1182 - 1183 1210 node = llist_del_all(&tctx->task_list); 1184 1211 if (node) { 1185 1212 node = llist_reverse_order(node); ··· 1211 1248 BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); 1212 1249 1213 1250 /* 1214 - * We don't know how many reuqests is there in the link and whether 1251 + * We don't know how many requests there are in the link and whether 1215 1252 * they can even be queued lazily, fall back to non-lazy. 
1216 1253 */ 1217 1254 if (req->flags & IO_REQ_LINK_FLAGS) ··· 1343 1380 io_task_work.node); 1344 1381 INDIRECT_CALL_2(req->io_task_work.func, 1345 1382 io_poll_task_func, io_req_rw_complete, 1346 - req, tw); 1383 + (struct io_tw_req){req}, tw); 1347 1384 *node = next; 1348 1385 if (++ret >= events) 1349 1386 break; ··· 1364 1401 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1365 1402 atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1366 1403 again: 1404 + tw.cancel = io_should_terminate_tw(ctx); 1367 1405 min_events -= ret; 1368 1406 ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 1369 1407 if (ctx->retry_llist.first) ··· 1401 1437 max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 1402 1438 } 1403 1439 1404 - static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, 1405 - int max_events) 1440 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) 1406 1441 { 1407 1442 struct io_tw_state ts = {}; 1408 1443 int ret; ··· 1412 1449 return ret; 1413 1450 } 1414 1451 1415 - static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) 1452 + static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) 1416 1453 { 1454 + struct io_kiocb *req = tw_req.req; 1455 + 1417 1456 io_tw_lock(req->ctx, tw); 1418 1457 io_req_defer_failed(req, req->cqe.res); 1419 1458 } 1420 1459 1421 - void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) 1460 + void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw) 1422 1461 { 1462 + struct io_kiocb *req = tw_req.req; 1423 1463 struct io_ring_ctx *ctx = req->ctx; 1424 1464 1425 1465 io_tw_lock(ctx, tw); 1426 - if (unlikely(io_should_terminate_tw(ctx))) 1466 + if (unlikely(tw.cancel)) 1427 1467 io_req_defer_failed(req, -EFAULT); 1428 1468 else if (req->flags & REQ_F_FORCE_ASYNC) 1429 1469 io_queue_iowq(req); ··· 1555 1589 * We can't just wait for polled events to come to us, we have to actively 1556 1590 * find and complete them. 1557 1591 */ 1558 - static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 1592 + __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) 1559 1593 { 1560 1594 if (!(ctx->flags & IORING_SETUP_IOPOLL)) 1561 1595 return; ··· 1658 1692 return 0; 1659 1693 } 1660 1694 1661 - void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) 1695 + void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw) 1662 1696 { 1663 - io_req_complete_defer(req); 1697 + io_req_complete_defer(tw_req.req); 1664 1698 } 1665 1699 1666 1700 /* ··· 2119 2153 } 2120 2154 2121 2155 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 2122 - const struct io_uring_sqe *sqe) 2156 + const struct io_uring_sqe *sqe, unsigned int *left) 2123 2157 __must_hold(&ctx->uring_lock) 2124 2158 { 2125 2159 const struct io_issue_def *def; ··· 2145 2179 opcode = array_index_nospec(opcode, IORING_OP_LAST); 2146 2180 2147 2181 def = &io_issue_defs[opcode]; 2182 + if (def->is_128 && !(ctx->flags & IORING_SETUP_SQE128)) { 2183 + /* 2184 + * A 128b op on a non-128b SQ requires mixed SQE support as 2185 + * well as 2 contiguous entries. 2186 + */ 2187 + if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || 2188 + !(ctx->cached_sq_head & (ctx->sq_entries - 1))) 2189 + return io_init_fail_req(req, -EINVAL); 2190 + /* 2191 + * A 128b operation on a mixed SQ uses two entries, so we have 2192 + * to increment the head and cached refs, and decrement what's 2193 + * left. 
2194 + */ 2195 + current->io_uring->cached_refs++; 2196 + ctx->cached_sq_head++; 2197 + (*left)--; 2198 + } 2199 + 2148 2200 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { 2149 2201 /* enforce forwards compatibility on users */ 2150 2202 if (sqe_flags & ~SQE_VALID_FLAGS) ··· 2272 2288 } 2273 2289 2274 2290 static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 2275 - const struct io_uring_sqe *sqe) 2291 + const struct io_uring_sqe *sqe, unsigned int *left) 2276 2292 __must_hold(&ctx->uring_lock) 2277 2293 { 2278 2294 struct io_submit_link *link = &ctx->submit_state.link; 2279 2295 int ret; 2280 2296 2281 - ret = io_init_req(ctx, req, sqe); 2297 + ret = io_init_req(ctx, req, sqe, left); 2282 2298 if (unlikely(ret)) 2283 2299 return io_submit_fail_init(sqe, req, ret); 2284 2300 ··· 2408 2424 unsigned int left; 2409 2425 int ret; 2410 2426 2427 + entries = min(nr, entries); 2411 2428 if (unlikely(!entries)) 2412 2429 return 0; 2413 - /* make sure SQ entry isn't read before tail */ 2414 - ret = left = min(nr, entries); 2430 + 2431 + ret = left = entries; 2415 2432 io_get_task_refs(left); 2416 2433 io_submit_state_start(&ctx->submit_state, left); 2417 2434 ··· 2431 2446 * Continue submitting even for sqe failure if the 2432 2447 * ring was setup with IORING_SETUP_SUBMIT_ALL 2433 2448 */ 2434 - if (unlikely(io_submit_sqe(ctx, req, sqe)) && 2449 + if (unlikely(io_submit_sqe(ctx, req, sqe, &left)) && 2435 2450 !(ctx->flags & IORING_SETUP_SUBMIT_ALL)) { 2436 2451 left--; 2437 2452 break; ··· 2753 2768 2754 2769 static void io_rings_free(struct io_ring_ctx *ctx) 2755 2770 { 2756 - io_free_region(ctx, &ctx->sq_region); 2757 - io_free_region(ctx, &ctx->ring_region); 2771 + io_free_region(ctx->user, &ctx->sq_region); 2772 + io_free_region(ctx->user, &ctx->ring_region); 2758 2773 ctx->rings = NULL; 2759 2774 ctx->sq_sqes = NULL; 2760 2775 } 2761 2776 2762 - unsigned long rings_size(unsigned int flags, unsigned int sq_entries, 2763 - unsigned int cq_entries, size_t *sq_offset) 2777 + static int rings_size(unsigned int flags, unsigned int sq_entries, 2778 + unsigned int cq_entries, struct io_rings_layout *rl) 2764 2779 { 2765 2780 struct io_rings *rings; 2766 - size_t off, sq_array_size; 2781 + size_t sqe_size; 2782 + size_t off; 2767 2783 2768 - off = struct_size(rings, cqes, cq_entries); 2769 - if (off == SIZE_MAX) 2770 - return SIZE_MAX; 2771 - if (flags & IORING_SETUP_CQE32) { 2772 - if (check_shl_overflow(off, 1, &off)) 2773 - return SIZE_MAX; 2774 - } 2775 2784 if (flags & IORING_SETUP_CQE_MIXED) { 2776 2785 if (cq_entries < 2) 2777 - return SIZE_MAX; 2786 + return -EOVERFLOW; 2778 2787 } 2788 + if (flags & IORING_SETUP_SQE_MIXED) { 2789 + if (sq_entries < 2) 2790 + return -EOVERFLOW; 2791 + } 2792 + 2793 + rl->sq_array_offset = SIZE_MAX; 2794 + 2795 + sqe_size = sizeof(struct io_uring_sqe); 2796 + if (flags & IORING_SETUP_SQE128) 2797 + sqe_size *= 2; 2798 + 2799 + rl->sq_size = array_size(sqe_size, sq_entries); 2800 + if (rl->sq_size == SIZE_MAX) 2801 + return -EOVERFLOW; 2802 + 2803 + off = struct_size(rings, cqes, cq_entries); 2804 + if (flags & IORING_SETUP_CQE32) 2805 + off = size_mul(off, 2); 2806 + if (off == SIZE_MAX) 2807 + return -EOVERFLOW; 2779 2808 2780 2809 #ifdef CONFIG_SMP 2781 2810 off = ALIGN(off, SMP_CACHE_BYTES); 2782 2811 if (off == 0) 2783 - return SIZE_MAX; 2812 + return -EOVERFLOW; 2784 2813 #endif 2785 2814 2786 - if (flags & IORING_SETUP_NO_SQARRAY) { 2787 - *sq_offset = SIZE_MAX; 2788 - return off; 2815 + if (!(flags & IORING_SETUP_NO_SQARRAY)) { 
2816 + size_t sq_array_size; 2817 + 2818 + rl->sq_array_offset = off; 2819 + 2820 + sq_array_size = array_size(sizeof(u32), sq_entries); 2821 + off = size_add(off, sq_array_size); 2822 + if (off == SIZE_MAX) 2823 + return -EOVERFLOW; 2789 2824 } 2790 2825 2791 - *sq_offset = off; 2792 - 2793 - sq_array_size = array_size(sizeof(u32), sq_entries); 2794 - if (sq_array_size == SIZE_MAX) 2795 - return SIZE_MAX; 2796 - 2797 - if (check_add_overflow(off, sq_array_size, &off)) 2798 - return SIZE_MAX; 2799 - 2800 - return off; 2826 + rl->rings_size = off; 2827 + return 0; 2801 2828 } 2802 2829 2803 2830 static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) ··· 2847 2850 io_eventfd_unregister(ctx); 2848 2851 io_free_alloc_caches(ctx); 2849 2852 io_destroy_buffers(ctx); 2850 - io_free_region(ctx, &ctx->param_region); 2853 + io_free_region(ctx->user, &ctx->param_region); 2851 2854 mutex_unlock(&ctx->uring_lock); 2852 2855 if (ctx->sq_creds) 2853 2856 put_cred(ctx->sq_creds); ··· 2977 2980 complete(&work->completion); 2978 2981 } 2979 2982 2980 - static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) 2981 - { 2982 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 2983 - 2984 - return req->ctx == data; 2985 - } 2986 - 2987 2983 static __cold void io_ring_exit_work(struct work_struct *work) 2988 2984 { 2989 2985 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); ··· 2996 3006 if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { 2997 3007 mutex_lock(&ctx->uring_lock); 2998 3008 io_cqring_overflow_kill(ctx); 2999 - mutex_unlock(&ctx->uring_lock); 3000 - } 3001 - if (!xa_empty(&ctx->zcrx_ctxs)) { 3002 - mutex_lock(&ctx->uring_lock); 3003 - io_shutdown_zcrx_ifqs(ctx); 3004 3009 mutex_unlock(&ctx->uring_lock); 3005 3010 } 3006 3011 ··· 3103 3118 file->private_data = NULL; 3104 3119 io_ring_ctx_wait_and_kill(ctx); 3105 3120 return 0; 3106 - } 3107 - 3108 - struct io_task_cancel { 3109 - struct io_uring_task *tctx; 3110 - bool all; 3111 - }; 3112 - 3113 - static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 3114 - { 3115 - struct io_kiocb *req = container_of(work, struct io_kiocb, work); 3116 - struct io_task_cancel *cancel = data; 3117 - 3118 - return io_match_task_safe(req, cancel->tctx, cancel->all); 3119 - } 3120 - 3121 - static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, 3122 - struct io_uring_task *tctx, 3123 - bool cancel_all) 3124 - { 3125 - struct io_defer_entry *de; 3126 - LIST_HEAD(list); 3127 - 3128 - list_for_each_entry_reverse(de, &ctx->defer_list, list) { 3129 - if (io_match_task_safe(de->req, tctx, cancel_all)) { 3130 - list_cut_position(&list, &ctx->defer_list, &de->list); 3131 - break; 3132 - } 3133 - } 3134 - if (list_empty(&list)) 3135 - return false; 3136 - 3137 - while (!list_empty(&list)) { 3138 - de = list_first_entry(&list, struct io_defer_entry, list); 3139 - list_del_init(&de->list); 3140 - ctx->nr_drained -= io_linked_nr(de->req); 3141 - io_req_task_queue_fail(de->req, -ECANCELED); 3142 - kfree(de); 3143 - } 3144 - return true; 3145 - } 3146 - 3147 - static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) 3148 - { 3149 - struct io_tctx_node *node; 3150 - enum io_wq_cancel cret; 3151 - bool ret = false; 3152 - 3153 - mutex_lock(&ctx->uring_lock); 3154 - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { 3155 - struct io_uring_task *tctx = node->task->io_uring; 3156 - 3157 - /* 3158 - * io_wq will stay alive while we hold uring_lock, because it's 3159 - * killed 
after ctx nodes, which requires to take the lock. 3160 - */ 3161 - if (!tctx || !tctx->io_wq) 3162 - continue; 3163 - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_ctx_cb, ctx, true); 3164 - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 3165 - } 3166 - mutex_unlock(&ctx->uring_lock); 3167 - 3168 - return ret; 3169 - } 3170 - 3171 - static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, 3172 - struct io_uring_task *tctx, 3173 - bool cancel_all, 3174 - bool is_sqpoll_thread) 3175 - { 3176 - struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; 3177 - enum io_wq_cancel cret; 3178 - bool ret = false; 3179 - 3180 - /* set it so io_req_local_work_add() would wake us up */ 3181 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 3182 - atomic_set(&ctx->cq_wait_nr, 1); 3183 - smp_mb(); 3184 - } 3185 - 3186 - /* failed during ring init, it couldn't have issued any requests */ 3187 - if (!ctx->rings) 3188 - return false; 3189 - 3190 - if (!tctx) { 3191 - ret |= io_uring_try_cancel_iowq(ctx); 3192 - } else if (tctx->io_wq) { 3193 - /* 3194 - * Cancels requests of all rings, not only @ctx, but 3195 - * it's fine as the task is in exit/exec. 3196 - */ 3197 - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, 3198 - &cancel, true); 3199 - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); 3200 - } 3201 - 3202 - /* SQPOLL thread does its own polling */ 3203 - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 3204 - is_sqpoll_thread) { 3205 - while (!wq_list_empty(&ctx->iopoll_list)) { 3206 - io_iopoll_try_reap_events(ctx); 3207 - ret = true; 3208 - cond_resched(); 3209 - } 3210 - } 3211 - 3212 - if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 3213 - io_allowed_defer_tw_run(ctx)) 3214 - ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; 3215 - mutex_lock(&ctx->uring_lock); 3216 - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); 3217 - ret |= io_poll_remove_all(ctx, tctx, cancel_all); 3218 - ret |= io_waitid_remove_all(ctx, tctx, cancel_all); 3219 - ret |= io_futex_remove_all(ctx, tctx, cancel_all); 3220 - ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); 3221 - mutex_unlock(&ctx->uring_lock); 3222 - ret |= io_kill_timeouts(ctx, tctx, cancel_all); 3223 - if (tctx) 3224 - ret |= io_run_task_work() > 0; 3225 - else 3226 - ret |= flush_delayed_work(&ctx->fallback_work); 3227 - return ret; 3228 - } 3229 - 3230 - static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) 3231 - { 3232 - if (tracked) 3233 - return atomic_read(&tctx->inflight_tracked); 3234 - return percpu_counter_sum(&tctx->inflight); 3235 - } 3236 - 3237 - /* 3238 - * Find any io_uring ctx that this task has registered or done IO on, and cancel 3239 - * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation. 
3240 - */ 3241 - __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) 3242 - { 3243 - struct io_uring_task *tctx = current->io_uring; 3244 - struct io_ring_ctx *ctx; 3245 - struct io_tctx_node *node; 3246 - unsigned long index; 3247 - s64 inflight; 3248 - DEFINE_WAIT(wait); 3249 - 3250 - WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); 3251 - 3252 - if (!current->io_uring) 3253 - return; 3254 - if (tctx->io_wq) 3255 - io_wq_exit_start(tctx->io_wq); 3256 - 3257 - atomic_inc(&tctx->in_cancel); 3258 - do { 3259 - bool loop = false; 3260 - 3261 - io_uring_drop_tctx_refs(current); 3262 - if (!tctx_inflight(tctx, !cancel_all)) 3263 - break; 3264 - 3265 - /* read completions before cancelations */ 3266 - inflight = tctx_inflight(tctx, false); 3267 - if (!inflight) 3268 - break; 3269 - 3270 - if (!sqd) { 3271 - xa_for_each(&tctx->xa, index, node) { 3272 - /* sqpoll task will cancel all its requests */ 3273 - if (node->ctx->sq_data) 3274 - continue; 3275 - loop |= io_uring_try_cancel_requests(node->ctx, 3276 - current->io_uring, 3277 - cancel_all, 3278 - false); 3279 - } 3280 - } else { 3281 - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 3282 - loop |= io_uring_try_cancel_requests(ctx, 3283 - current->io_uring, 3284 - cancel_all, 3285 - true); 3286 - } 3287 - 3288 - if (loop) { 3289 - cond_resched(); 3290 - continue; 3291 - } 3292 - 3293 - prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); 3294 - io_run_task_work(); 3295 - io_uring_drop_tctx_refs(current); 3296 - xa_for_each(&tctx->xa, index, node) { 3297 - if (io_local_work_pending(node->ctx)) { 3298 - WARN_ON_ONCE(node->ctx->submitter_task && 3299 - node->ctx->submitter_task != current); 3300 - goto end_wait; 3301 - } 3302 - } 3303 - /* 3304 - * If we've seen completions, retry without waiting. This 3305 - * avoids a race where a completion comes in before we did 3306 - * prepare_to_wait(). 3307 - */ 3308 - if (inflight == tctx_inflight(tctx, !cancel_all)) 3309 - schedule(); 3310 - end_wait: 3311 - finish_wait(&tctx->wait, &wait); 3312 - } while (1); 3313 - 3314 - io_uring_clean_tctx(tctx); 3315 - if (cancel_all) { 3316 - /* 3317 - * We shouldn't run task_works after cancel, so just leave 3318 - * ->in_cancel set for normal exit. 
3319 - */ 3320 - atomic_dec(&tctx->in_cancel); 3321 - /* for exec all current's requests should be gone, kill tctx */ 3322 - __io_uring_free(current); 3323 - } 3324 - } 3325 - 3326 - void __io_uring_cancel(bool cancel_all) 3327 - { 3328 - io_uring_unreg_ringfd(); 3329 - io_uring_cancel_generic(cancel_all, NULL); 3330 3121 } 3331 3122 3332 3123 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, ··· 3355 3594 } 3356 3595 3357 3596 static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, 3358 - struct io_uring_params *p) 3597 + struct io_ctx_config *config) 3359 3598 { 3599 + struct io_uring_params *p = &config->p; 3600 + struct io_rings_layout *rl = &config->layout; 3360 3601 struct io_uring_region_desc rd; 3361 3602 struct io_rings *rings; 3362 - size_t size, sq_array_offset; 3363 3603 int ret; 3364 3604 3365 3605 /* make sure these are sane, as we already accounted them */ 3366 3606 ctx->sq_entries = p->sq_entries; 3367 3607 ctx->cq_entries = p->cq_entries; 3368 3608 3369 - size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, 3370 - &sq_array_offset); 3371 - if (size == SIZE_MAX) 3372 - return -EOVERFLOW; 3373 - 3374 3609 memset(&rd, 0, sizeof(rd)); 3375 - rd.size = PAGE_ALIGN(size); 3610 + rd.size = PAGE_ALIGN(rl->rings_size); 3376 3611 if (ctx->flags & IORING_SETUP_NO_MMAP) { 3377 3612 rd.user_addr = p->cq_off.user_addr; 3378 3613 rd.flags |= IORING_MEM_REGION_TYPE_USER; ··· 3377 3620 if (ret) 3378 3621 return ret; 3379 3622 ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); 3380 - 3381 3623 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 3382 - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); 3383 - rings->sq_ring_mask = p->sq_entries - 1; 3384 - rings->cq_ring_mask = p->cq_entries - 1; 3385 - rings->sq_ring_entries = p->sq_entries; 3386 - rings->cq_ring_entries = p->cq_entries; 3387 - 3388 - if (p->flags & IORING_SETUP_SQE128) 3389 - size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries); 3390 - else 3391 - size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); 3392 - if (size == SIZE_MAX) { 3393 - io_rings_free(ctx); 3394 - return -EOVERFLOW; 3395 - } 3624 + ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); 3396 3625 3397 3626 memset(&rd, 0, sizeof(rd)); 3398 - rd.size = PAGE_ALIGN(size); 3627 + rd.size = PAGE_ALIGN(rl->sq_size); 3399 3628 if (ctx->flags & IORING_SETUP_NO_MMAP) { 3400 3629 rd.user_addr = p->sq_off.user_addr; 3401 3630 rd.flags |= IORING_MEM_REGION_TYPE_USER; ··· 3392 3649 return ret; 3393 3650 } 3394 3651 ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); 3652 + 3653 + memset(rings, 0, sizeof(*rings)); 3654 + WRITE_ONCE(rings->sq_ring_mask, ctx->sq_entries - 1); 3655 + WRITE_ONCE(rings->cq_ring_mask, ctx->cq_entries - 1); 3656 + WRITE_ONCE(rings->sq_ring_entries, ctx->sq_entries); 3657 + WRITE_ONCE(rings->cq_ring_entries, ctx->cq_entries); 3395 3658 return 0; 3396 3659 } 3397 3660 ··· 3427 3678 static int io_uring_sanitise_params(struct io_uring_params *p) 3428 3679 { 3429 3680 unsigned flags = p->flags; 3681 + 3682 + if (flags & ~IORING_SETUP_FLAGS) 3683 + return -EINVAL; 3430 3684 3431 3685 /* There is no way to mmap rings without a real fd */ 3432 3686 if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && ··· 3469 3717 if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == 3470 3718 (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) 3471 3719 return -EINVAL; 3720 + /* 3721 + * Nonsensical to ask for SQE128 and mixed SQE support, it's not 3722 + * supported to post 64b SQEs on a ring 
setup with SQE128. 3723 + */ 3724 + if ((flags & (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) == 3725 + (IORING_SETUP_SQE128|IORING_SETUP_SQE_MIXED)) 3726 + return -EINVAL; 3472 3727 3473 3728 return 0; 3474 3729 } 3475 3730 3476 - int io_uring_fill_params(unsigned entries, struct io_uring_params *p) 3731 + static int io_uring_fill_params(struct io_uring_params *p) 3477 3732 { 3733 + unsigned entries = p->sq_entries; 3734 + 3478 3735 if (!entries) 3479 3736 return -EINVAL; 3480 3737 if (entries > IORING_MAX_ENTRIES) { ··· 3521 3760 p->cq_entries = 2 * p->sq_entries; 3522 3761 } 3523 3762 3763 + return 0; 3764 + } 3765 + 3766 + int io_prepare_config(struct io_ctx_config *config) 3767 + { 3768 + struct io_uring_params *p = &config->p; 3769 + int ret; 3770 + 3771 + ret = io_uring_sanitise_params(p); 3772 + if (ret) 3773 + return ret; 3774 + 3775 + ret = io_uring_fill_params(p); 3776 + if (ret) 3777 + return ret; 3778 + 3779 + ret = rings_size(p->flags, p->sq_entries, p->cq_entries, 3780 + &config->layout); 3781 + if (ret) 3782 + return ret; 3783 + 3524 3784 p->sq_off.head = offsetof(struct io_rings, sq.head); 3525 3785 p->sq_off.tail = offsetof(struct io_rings, sq.tail); 3526 3786 p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); ··· 3562 3780 p->cq_off.resv1 = 0; 3563 3781 if (!(p->flags & IORING_SETUP_NO_MMAP)) 3564 3782 p->cq_off.user_addr = 0; 3783 + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) 3784 + p->sq_off.array = config->layout.sq_array_offset; 3565 3785 3566 3786 return 0; 3567 3787 } 3568 3788 3569 - static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, 3570 - struct io_uring_params __user *params) 3789 + static __cold int io_uring_create(struct io_ctx_config *config) 3571 3790 { 3791 + struct io_uring_params *p = &config->p; 3572 3792 struct io_ring_ctx *ctx; 3573 3793 struct io_uring_task *tctx; 3574 3794 struct file *file; 3575 3795 int ret; 3576 3796 3577 - ret = io_uring_sanitise_params(p); 3797 + ret = io_prepare_config(config); 3578 3798 if (ret) 3579 - return ret; 3580 - 3581 - ret = io_uring_fill_params(entries, p); 3582 - if (unlikely(ret)) 3583 3799 return ret; 3584 3800 3585 3801 ctx = io_ring_ctx_alloc(p); ··· 3637 3857 mmgrab(current->mm); 3638 3858 ctx->mm_account = current->mm; 3639 3859 3640 - ret = io_allocate_scq_urings(ctx, p); 3860 + ret = io_allocate_scq_urings(ctx, config); 3641 3861 if (ret) 3642 3862 goto err; 3643 - 3644 - if (!(p->flags & IORING_SETUP_NO_SQARRAY)) 3645 - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; 3646 3863 3647 3864 ret = io_sq_offload_create(ctx, p); 3648 3865 if (ret) ··· 3647 3870 3648 3871 p->features = IORING_FEAT_FLAGS; 3649 3872 3650 - if (copy_to_user(params, p, sizeof(*p))) { 3873 + if (copy_to_user(config->uptr, p, sizeof(*p))) { 3651 3874 ret = -EFAULT; 3652 3875 goto err; 3653 3876 } ··· 3700 3923 */ 3701 3924 static long io_uring_setup(u32 entries, struct io_uring_params __user *params) 3702 3925 { 3703 - struct io_uring_params p; 3704 - int i; 3926 + struct io_ctx_config config; 3705 3927 3706 - if (copy_from_user(&p, params, sizeof(p))) 3928 + memset(&config, 0, sizeof(config)); 3929 + 3930 + if (copy_from_user(&config.p, params, sizeof(config.p))) 3707 3931 return -EFAULT; 3708 - for (i = 0; i < ARRAY_SIZE(p.resv); i++) { 3709 - if (p.resv[i]) 3710 - return -EINVAL; 3711 - } 3712 3932 3713 - if (p.flags & ~IORING_SETUP_FLAGS) 3933 + if (!mem_is_zero(&config.p.resv, sizeof(config.p.resv))) 3714 3934 return -EINVAL; 3715 - return io_uring_create(entries, &p, params); 
3935 + 3936 + config.p.sq_entries = entries; 3937 + config.uptr = params; 3938 + return io_uring_create(&config); 3716 3939 } 3717 3940 3718 3941 static inline int io_uring_allowed(void)
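The sanitise/fill split above also carries the new IORING_SETUP_SQE_MIXED checks. A minimal userspace sketch of the resulting behaviour (not part of the patch; it assumes a uapi io_uring.h that already defines IORING_SETUP_SQE_MIXED, and uses the raw setup syscall rather than liburing):

/*
 * Illustrative only: a mixed-SQE ring should set up on a kernel with this
 * series, while combining SQE128 with SQE_MIXED is rejected with -EINVAL
 * by io_uring_sanitise_params().
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>

static int ring_setup(unsigned entries, unsigned flags)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = flags;
	return syscall(__NR_io_uring_setup, entries, &p);
}

int main(void)
{
	/* mostly 64b SQEs, with the occasional 128b one */
	int fd = ring_setup(8, IORING_SETUP_SQE_MIXED);

	if (fd < 0)
		perror("io_uring_setup(SQE_MIXED)");
	else
		close(fd);

	/* SQE128 and SQE_MIXED are mutually exclusive */
	if (ring_setup(8, IORING_SETUP_SQE128 | IORING_SETUP_SQE_MIXED) < 0 &&
	    errno == EINVAL)
		printf("SQE128|SQE_MIXED rejected as expected\n");
	return 0;
}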
+29 -34
io_uring/io_uring.h
··· 17 17 #include <trace/events/io_uring.h> 18 18 #endif 19 19 20 + struct io_rings_layout { 21 + /* size of CQ + headers + SQ offset array */ 22 + size_t rings_size; 23 + size_t sq_size; 24 + 25 + size_t sq_array_offset; 26 + }; 27 + 28 + struct io_ctx_config { 29 + struct io_uring_params p; 30 + struct io_rings_layout layout; 31 + struct io_uring_params __user *uptr; 32 + }; 33 + 20 34 #define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ 21 35 IORING_FEAT_NODROP |\ 22 36 IORING_FEAT_SUBMIT_STABLE |\ ··· 68 54 IORING_SETUP_REGISTERED_FD_ONLY |\ 69 55 IORING_SETUP_NO_SQARRAY |\ 70 56 IORING_SETUP_HYBRID_IOPOLL |\ 71 - IORING_SETUP_CQE_MIXED) 57 + IORING_SETUP_CQE_MIXED |\ 58 + IORING_SETUP_SQE_MIXED) 72 59 73 60 #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ 74 61 IORING_ENTER_SQ_WAKEUP |\ ··· 110 95 IOU_REQUEUE = -3072, 111 96 }; 112 97 98 + struct io_defer_entry { 99 + struct list_head list; 100 + struct io_kiocb *req; 101 + }; 102 + 113 103 struct io_wait_queue { 114 104 struct wait_queue_entry wq; 115 105 struct io_ring_ctx *ctx; ··· 148 128 #define IORING_MAX_ENTRIES 32768 149 129 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) 150 130 151 - unsigned long rings_size(unsigned int flags, unsigned int sq_entries, 152 - unsigned int cq_entries, size_t *sq_offset); 153 - int io_uring_fill_params(unsigned entries, struct io_uring_params *p); 131 + int io_prepare_config(struct io_ctx_config *config); 132 + 154 133 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); 155 134 int io_run_task_work_sig(struct io_ring_ctx *ctx); 135 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); 156 136 void io_req_defer_failed(struct io_kiocb *req, s32 res); 157 137 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 158 138 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); ··· 160 140 bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]); 161 141 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 162 142 143 + unsigned io_linked_nr(struct io_kiocb *req); 163 144 void io_req_track_inflight(struct io_kiocb *req); 164 145 struct file *io_file_get_normal(struct io_kiocb *req, int fd); 165 146 struct file *io_file_get_fixed(struct io_kiocb *req, int fd, ··· 169 148 void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 170 149 void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); 171 150 void io_req_task_queue(struct io_kiocb *req); 172 - void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); 151 + void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); 173 152 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 174 - void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); 153 + void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); 175 154 struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 176 155 struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 177 156 void tctx_task_work(struct callback_head *cb); 178 - __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); 157 + __cold void io_uring_drop_tctx_refs(struct task_struct *task); 179 158 180 159 int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, 181 160 int start, int end); ··· 184 163 int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); 185 
164 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); 186 165 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); 166 + __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx); 187 167 void __io_submit_flush_completions(struct io_ring_ctx *ctx); 188 168 189 169 struct io_wq_work *io_wq_free_work(struct io_wq_work *work); ··· 194 172 void io_queue_next(struct io_kiocb *req); 195 173 void io_task_refs_refill(struct io_uring_task *tctx); 196 174 bool __io_alloc_req_refill(struct io_ring_ctx *ctx); 197 - 198 - bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, 199 - bool cancel_all); 200 175 201 176 void io_activate_pollwq(struct io_ring_ctx *ctx); 202 177 ··· 577 558 ctx->submitter_task == current); 578 559 } 579 560 580 - /* 581 - * Terminate the request if either of these conditions are true: 582 - * 583 - * 1) It's being executed by the original task, but that task is marked 584 - * with PF_EXITING as it's exiting. 585 - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 586 - * our fallback task_work. 587 - */ 588 - static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 589 - { 590 - return (current->flags & (PF_KTHREAD | PF_EXITING)) || percpu_ref_is_dying(&ctx->refs); 591 - } 592 - 593 561 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) 594 562 { 595 563 io_req_set_res(req, res, 0); 596 564 req->io_task_work.func = io_req_task_complete; 597 565 io_req_task_work_add(req); 598 - } 599 - 600 - /* 601 - * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each 602 - * slot. 603 - */ 604 - static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) 605 - { 606 - if (ctx->flags & IORING_SETUP_SQE128) 607 - return 2 * sizeof(struct io_uring_sqe); 608 - return sizeof(struct io_uring_sqe); 609 566 } 610 567 611 568 static inline bool io_file_can_poll(struct io_kiocb *req)
+3 -3
io_uring/kbuf.c
··· 428 428 static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) 429 429 { 430 430 if (bl->flags & IOBL_BUF_RING) 431 - io_free_region(ctx, &bl->region); 431 + io_free_region(ctx->user, &bl->region); 432 432 else 433 433 io_remove_buffers_legacy(ctx, bl, -1U); 434 434 ··· 641 641 rd.user_addr = reg.ring_addr; 642 642 rd.flags |= IORING_MEM_REGION_TYPE_USER; 643 643 } 644 - ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); 644 + ret = io_create_region(ctx, &bl->region, &rd, mmap_offset); 645 645 if (ret) 646 646 goto fail; 647 647 br = io_region_get_ptr(&bl->region); ··· 672 672 io_buffer_add_list(ctx, bl, reg.bgid); 673 673 return 0; 674 674 fail: 675 - io_free_region(ctx, &bl->region); 675 + io_free_region(ctx->user, &bl->region); 676 676 kfree(bl); 677 677 return ret; 678 678 }
+2 -3
io_uring/kbuf.h
··· 14 14 15 15 struct io_buffer_list { 16 16 /* 17 - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, 18 - * then these are classic provided buffers and ->buf_list is used. 17 + * If the IOBL_BUF_RING flag is set, then buf_ring is used. If not, then 18 + * these are classic provided buffers and ->buf_list is used. 19 19 */ 20 20 union { 21 21 struct list_head buf_list; ··· 27 27 __u16 bgid; 28 28 29 29 /* below is for ring provided buffers */ 30 - __u16 buf_nr_pages; 31 30 __u16 nr_entries; 32 31 __u16 head; 33 32 __u16 mask;
+17 -42
io_uring/memmap.c
··· 15 15 #include "rsrc.h" 16 16 #include "zcrx.h" 17 17 18 - static void *io_mem_alloc_compound(struct page **pages, int nr_pages, 19 - size_t size, gfp_t gfp) 18 + static bool io_mem_alloc_compound(struct page **pages, int nr_pages, 19 + size_t size, gfp_t gfp) 20 20 { 21 21 struct page *page; 22 22 int i, order; 23 23 24 24 order = get_order(size); 25 25 if (order > MAX_PAGE_ORDER) 26 - return ERR_PTR(-ENOMEM); 26 + return false; 27 27 else if (order) 28 28 gfp |= __GFP_COMP; 29 29 30 30 page = alloc_pages(gfp, order); 31 31 if (!page) 32 - return ERR_PTR(-ENOMEM); 32 + return false; 33 33 34 34 for (i = 0; i < nr_pages; i++) 35 35 pages[i] = page + i; 36 36 37 - return page_address(page); 37 + return true; 38 38 } 39 39 40 40 struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) ··· 88 88 IO_REGION_F_SINGLE_REF = 4, 89 89 }; 90 90 91 - void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) 91 + void io_free_region(struct user_struct *user, struct io_mapped_region *mr) 92 92 { 93 93 if (mr->pages) { 94 94 long nr_refs = mr->nr_pages; ··· 105 105 } 106 106 if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) 107 107 vunmap(mr->ptr); 108 - if (mr->nr_pages && ctx->user) 109 - __io_unaccount_mem(ctx->user, mr->nr_pages); 108 + if (mr->nr_pages && user) 109 + __io_unaccount_mem(user, mr->nr_pages); 110 110 111 111 memset(mr, 0, sizeof(*mr)); 112 112 } ··· 131 131 return 0; 132 132 } 133 133 134 - static int io_region_pin_pages(struct io_ring_ctx *ctx, 135 - struct io_mapped_region *mr, 136 - struct io_uring_region_desc *reg) 134 + static int io_region_pin_pages(struct io_mapped_region *mr, 135 + struct io_uring_region_desc *reg) 137 136 { 138 - unsigned long size = (size_t) mr->nr_pages << PAGE_SHIFT; 137 + size_t size = io_region_size(mr); 139 138 struct page **pages; 140 139 int nr_pages; 141 140 ··· 149 150 return 0; 150 151 } 151 152 152 - static int io_region_allocate_pages(struct io_ring_ctx *ctx, 153 - struct io_mapped_region *mr, 153 + static int io_region_allocate_pages(struct io_mapped_region *mr, 154 154 struct io_uring_region_desc *reg, 155 155 unsigned long mmap_offset) 156 156 { 157 157 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; 158 - size_t size = (size_t) mr->nr_pages << PAGE_SHIFT; 158 + size_t size = io_region_size(mr); 159 159 unsigned long nr_allocated; 160 160 struct page **pages; 161 - void *p; 162 161 163 162 pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); 164 163 if (!pages) 165 164 return -ENOMEM; 166 165 167 - p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); 168 - if (!IS_ERR(p)) { 166 + if (io_mem_alloc_compound(pages, mr->nr_pages, size, gfp)) { 169 167 mr->flags |= IO_REGION_F_SINGLE_REF; 170 168 goto done; 171 169 } ··· 215 219 mr->nr_pages = nr_pages; 216 220 217 221 if (reg->flags & IORING_MEM_REGION_TYPE_USER) 218 - ret = io_region_pin_pages(ctx, mr, reg); 222 + ret = io_region_pin_pages(mr, reg); 219 223 else 220 - ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); 224 + ret = io_region_allocate_pages(mr, reg, mmap_offset); 221 225 if (ret) 222 226 goto out_free; 223 227 ··· 226 230 goto out_free; 227 231 return 0; 228 232 out_free: 229 - io_free_region(ctx, mr); 233 + io_free_region(ctx->user, mr); 230 234 return ret; 231 - } 232 - 233 - int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 234 - struct io_uring_region_desc *reg, 235 - unsigned long mmap_offset) 236 - { 237 - struct io_mapped_region tmp_mr; 238 - int ret; 239 - 240 - 
memcpy(&tmp_mr, mr, sizeof(tmp_mr)); 241 - ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); 242 - if (ret) 243 - return ret; 244 - 245 - /* 246 - * Once published mmap can find it without holding only the ->mmap_lock 247 - * and not ->uring_lock. 248 - */ 249 - guard(mutex)(&ctx->mmap_lock); 250 - memcpy(mr, &tmp_mr, sizeof(tmp_mr)); 251 - return 0; 252 235 } 253 236 254 237 static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx,
+18 -6
io_uring/memmap.h
··· 16 16 unsigned long flags); 17 17 int io_uring_mmap(struct file *file, struct vm_area_struct *vma); 18 18 19 - void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); 19 + void io_free_region(struct user_struct *user, struct io_mapped_region *mr); 20 20 int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, 21 21 struct io_uring_region_desc *reg, 22 22 unsigned long mmap_offset); 23 - 24 - int io_create_region_mmap_safe(struct io_ring_ctx *ctx, 25 - struct io_mapped_region *mr, 26 - struct io_uring_region_desc *reg, 27 - unsigned long mmap_offset); 28 23 29 24 static inline void *io_region_get_ptr(struct io_mapped_region *mr) 30 25 { ··· 29 34 static inline bool io_region_is_set(struct io_mapped_region *mr) 30 35 { 31 36 return !!mr->nr_pages; 37 + } 38 + 39 + static inline void io_region_publish(struct io_ring_ctx *ctx, 40 + struct io_mapped_region *src_region, 41 + struct io_mapped_region *dst_region) 42 + { 43 + /* 44 + * Once published mmap can find it without holding only the ->mmap_lock 45 + * and not ->uring_lock. 46 + */ 47 + guard(mutex)(&ctx->mmap_lock); 48 + *dst_region = *src_region; 49 + } 50 + 51 + static inline size_t io_region_size(struct io_mapped_region *mr) 52 + { 53 + return (size_t) mr->nr_pages << PAGE_SHIFT; 32 54 } 33 55 34 56 #endif
+2 -1
io_uring/msg_ring.c
··· 70 70 return target_ctx->task_complete; 71 71 } 72 72 73 - static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) 73 + static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) 74 74 { 75 + struct io_kiocb *req = tw_req.req; 75 76 struct io_ring_ctx *ctx = req->ctx; 76 77 77 78 io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
+2 -5
io_uring/net.c
··· 110 110 111 111 struct io_recvzc { 112 112 struct file *file; 113 - unsigned msg_flags; 114 113 u16 flags; 115 114 u32 len; 116 115 struct io_zcrx_ifq *ifq; ··· 1252 1253 1253 1254 zc->len = READ_ONCE(sqe->len); 1254 1255 zc->flags = READ_ONCE(sqe->ioprio); 1255 - zc->msg_flags = READ_ONCE(sqe->msg_flags); 1256 - if (zc->msg_flags) 1256 + if (READ_ONCE(sqe->msg_flags)) 1257 1257 return -EINVAL; 1258 1258 if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) 1259 1259 return -EINVAL; ··· 1281 1283 return -ENOTSOCK; 1282 1284 1283 1285 len = zc->len; 1284 - ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, 1285 - issue_flags, &zc->len); 1286 + ret = io_zcrx_recv(req, zc->ifq, sock, 0, issue_flags, &zc->len); 1286 1287 if (len && zc->len == 0) { 1287 1288 io_req_set_res(req, 0, 0); 1288 1289
+4 -3
io_uring/notif.c
··· 11 11 12 12 static const struct ubuf_info_ops io_ubuf_ops; 13 13 14 - static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) 14 + static void io_notif_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw) 15 15 { 16 + struct io_kiocb *notif = tw_req.req; 16 17 struct io_notif_data *nd = io_notif_to_data(notif); 17 18 struct io_ring_ctx *ctx = notif->ctx; 18 19 ··· 35 34 } 36 35 37 36 nd = nd->next; 38 - io_req_task_complete(notif, tw); 37 + io_req_task_complete((struct io_tw_req){notif}, tw); 39 38 } while (nd); 40 39 } 41 40 ··· 93 92 prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); 94 93 prev_notif = cmd_to_io_kiocb(prev_nd); 95 94 96 - /* make sure all noifications can be finished in the same task_work */ 95 + /* make sure all notifications can be finished in the same task_work */ 97 96 if (unlikely(notif->ctx != prev_notif->ctx || 98 97 notif->tctx != prev_notif->tctx)) 99 98 return -EEXIST;
+26
io_uring/opdef.c
··· 575 575 .prep = io_pipe_prep, 576 576 .issue = io_pipe, 577 577 }, 578 + [IORING_OP_NOP128] = { 579 + .audit_skip = 1, 580 + .iopoll = 1, 581 + .is_128 = 1, 582 + .prep = io_nop_prep, 583 + .issue = io_nop, 584 + }, 585 + [IORING_OP_URING_CMD128] = { 586 + .buffer_select = 1, 587 + .needs_file = 1, 588 + .plug = 1, 589 + .iopoll = 1, 590 + .iopoll_queue = 1, 591 + .is_128 = 1, 592 + .async_size = sizeof(struct io_async_cmd), 593 + .prep = io_uring_cmd_prep, 594 + .issue = io_uring_cmd, 595 + }, 578 596 }; 579 597 580 598 const struct io_cold_def io_cold_defs[] = { ··· 842 824 }, 843 825 [IORING_OP_PIPE] = { 844 826 .name = "PIPE", 827 + }, 828 + [IORING_OP_NOP128] = { 829 + .name = "NOP128", 830 + }, 831 + [IORING_OP_URING_CMD128] = { 832 + .name = "URING_CMD128", 833 + .sqe_copy = io_uring_cmd_sqe_copy, 834 + .cleanup = io_uring_cmd_cleanup, 845 835 }, 846 836 }; 847 837
+2
io_uring/opdef.h
··· 27 27 unsigned iopoll_queue : 1; 28 28 /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ 29 29 unsigned vectored : 1; 30 + /* set to 1 if this opcode uses 128b sqes in a mixed sq */ 31 + unsigned is_128 : 1; 30 32 31 33 /* size of async data needed, if any */ 32 34 unsigned short async_size;
+7 -6
io_uring/poll.c
··· 224 224 { 225 225 int v; 226 226 227 - if (unlikely(io_should_terminate_tw(req->ctx))) 227 + if (unlikely(tw.cancel)) 228 228 return -ECANCELED; 229 229 230 230 do { ··· 310 310 return IOU_POLL_NO_ACTION; 311 311 } 312 312 313 - void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) 313 + void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw) 314 314 { 315 + struct io_kiocb *req = tw_req.req; 315 316 int ret; 316 317 317 318 ret = io_poll_check_events(req, tw); ··· 333 332 poll = io_kiocb_to_cmd(req, struct io_poll); 334 333 req->cqe.res = mangle_poll(req->cqe.res & poll->events); 335 334 } else if (ret == IOU_POLL_REISSUE) { 336 - io_req_task_submit(req, tw); 335 + io_req_task_submit(tw_req, tw); 337 336 return; 338 337 } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { 339 338 req->cqe.res = ret; ··· 341 340 } 342 341 343 342 io_req_set_res(req, req->cqe.res, 0); 344 - io_req_task_complete(req, tw); 343 + io_req_task_complete(tw_req, tw); 345 344 } else { 346 345 io_tw_lock(req->ctx, tw); 347 346 348 347 if (ret == IOU_POLL_REMOVE_POLL_USE_RES) 349 - io_req_task_complete(req, tw); 348 + io_req_task_complete(tw_req, tw); 350 349 else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) 351 - io_req_task_submit(req, tw); 350 + io_req_task_submit(tw_req, tw); 352 351 else 353 352 io_req_defer_failed(req, ret); 354 353 }
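As in msg_ring.c above and the other converted files, the task_work handler now receives the request wrapped in struct io_tw_req and reads the cancelation state straight off the token instead of probing the task state. A schematic, kernel-internal sketch of the converted callback shape (illustrative only):

static void example_tw_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
	struct io_kiocb *req = tw_req.req;

	/* cancelation now lives in the task_work token */
	if (unlikely(tw.cancel)) {
		io_req_set_res(req, -ECANCELED, 0);
		io_req_task_complete(tw_req, tw);
		return;
	}

	/* ... normal handling of req ... */
	io_req_task_complete(tw_req, tw);
}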
+1 -1
io_uring/poll.h
··· 46 46 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, 47 47 bool cancel_all); 48 48 49 - void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); 49 + void io_poll_task_func(struct io_tw_req tw_req, io_tw_token_t tw);
+44 -11
io_uring/query.c
··· 4 4 5 5 #include "query.h" 6 6 #include "io_uring.h" 7 + #include "zcrx.h" 7 8 8 - #define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) 9 + union io_query_data { 10 + struct io_uring_query_opcode opcodes; 11 + struct io_uring_query_zcrx zcrx; 12 + struct io_uring_query_scq scq; 13 + }; 14 + 15 + #define IO_MAX_QUERY_SIZE sizeof(union io_query_data) 9 16 #define IO_MAX_QUERY_ENTRIES 1000 10 17 11 - static ssize_t io_query_ops(void *data) 18 + static ssize_t io_query_ops(union io_query_data *data) 12 19 { 13 - struct io_uring_query_opcode *e = data; 14 - 15 - BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); 20 + struct io_uring_query_opcode *e = &data->opcodes; 16 21 17 22 e->nr_request_opcodes = IORING_OP_LAST; 18 23 e->nr_register_opcodes = IORING_REGISTER_LAST; ··· 30 25 return sizeof(*e); 31 26 } 32 27 33 - static int io_handle_query_entry(struct io_ring_ctx *ctx, 34 - void *data, void __user *uhdr, 28 + static ssize_t io_query_zcrx(union io_query_data *data) 29 + { 30 + struct io_uring_query_zcrx *e = &data->zcrx; 31 + 32 + e->register_flags = ZCRX_REG_IMPORT; 33 + e->area_flags = IORING_ZCRX_AREA_DMABUF; 34 + e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; 35 + e->rq_hdr_size = sizeof(struct io_uring); 36 + e->rq_hdr_alignment = L1_CACHE_BYTES; 37 + e->__resv1 = 0; 38 + e->__resv2 = 0; 39 + return sizeof(*e); 40 + } 41 + 42 + static ssize_t io_query_scq(union io_query_data *data) 43 + { 44 + struct io_uring_query_scq *e = &data->scq; 45 + 46 + e->hdr_size = sizeof(struct io_rings); 47 + e->hdr_alignment = SMP_CACHE_BYTES; 48 + return sizeof(*e); 49 + } 50 + 51 + static int io_handle_query_entry(union io_query_data *data, void __user *uhdr, 35 52 u64 *next_entry) 36 53 { 37 54 struct io_uring_query_hdr hdr; ··· 80 53 case IO_URING_QUERY_OPCODES: 81 54 ret = io_query_ops(data); 82 55 break; 56 + case IO_URING_QUERY_ZCRX: 57 + ret = io_query_zcrx(data); 58 + break; 59 + case IO_URING_QUERY_SCQ: 60 + ret = io_query_scq(data); 61 + break; 83 62 } 84 63 85 64 if (ret >= 0) { ··· 106 73 return 0; 107 74 } 108 75 109 - int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 76 + int io_query(void __user *arg, unsigned nr_args) 110 77 { 111 - char entry_buffer[IO_MAX_QUERY_SIZE]; 78 + union io_query_data entry_buffer; 112 79 void __user *uhdr = arg; 113 80 int ret, nr = 0; 114 81 115 - memset(entry_buffer, 0, sizeof(entry_buffer)); 82 + memset(&entry_buffer, 0, sizeof(entry_buffer)); 116 83 117 84 if (nr_args) 118 85 return -EINVAL; ··· 120 87 while (uhdr) { 121 88 u64 next_hdr; 122 89 123 - ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); 90 + ret = io_handle_query_entry(&entry_buffer, uhdr, &next_hdr); 124 91 if (ret) 125 92 return ret; 126 93 uhdr = u64_to_user_ptr(next_hdr);
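The two new query types can be probed from userspace through IORING_REGISTER_QUERY. A rough single-entry sketch for IO_URING_QUERY_SCQ, given an existing ring fd (not part of the patch; the io_uring_query_hdr field names used here are assumptions for illustration only, check the uapi header for the real layout):

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int query_scq(int ring_fd)
{
	struct io_uring_query_scq scq;
	struct io_uring_query_hdr hdr;

	memset(&scq, 0, sizeof(scq));
	memset(&hdr, 0, sizeof(hdr));
	hdr.query_op = IO_URING_QUERY_SCQ;		/* assumed field name */
	hdr.query_data = (__u64)(uintptr_t)&scq;	/* assumed field name */
	hdr.size = sizeof(scq);				/* assumed field name */
	hdr.next_entry = 0;				/* end of the chain */

	/* nr_args must be 0 for IORING_REGISTER_QUERY */
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_QUERY,
		    &hdr, 0) < 0)
		return -1;

	printf("struct io_rings header: %u bytes, %u byte aligned\n",
	       (unsigned)scq.hdr_size, (unsigned)scq.hdr_alignment);
	return 0;
}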
+1 -1
io_uring/query.h
··· 4 4 5 5 #include <linux/io_uring_types.h> 6 6 7 - int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); 7 + int io_query(void __user *arg, unsigned nr_args); 8 8 9 9 #endif
+48 -57
io_uring/register.c
··· 379 379 }; 380 380 381 381 static void io_register_free_rings(struct io_ring_ctx *ctx, 382 - struct io_uring_params *p, 383 382 struct io_ring_ctx_rings *r) 384 383 { 385 - io_free_region(ctx, &r->sq_region); 386 - io_free_region(ctx, &r->ring_region); 384 + io_free_region(ctx->user, &r->sq_region); 385 + io_free_region(ctx->user, &r->ring_region); 387 386 } 388 387 389 388 #define swap_old(ctx, o, n, field) \ ··· 394 395 #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) 395 396 #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ 396 397 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ 397 - IORING_SETUP_CQE_MIXED) 398 + IORING_SETUP_CQE_MIXED | IORING_SETUP_SQE_MIXED) 398 399 399 400 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 400 401 { 402 + struct io_ctx_config config; 401 403 struct io_uring_region_desc rd; 402 404 struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; 403 - size_t size, sq_array_offset; 404 405 unsigned i, tail, old_head; 405 - struct io_uring_params p; 406 + struct io_uring_params *p = &config.p; 407 + struct io_rings_layout *rl = &config.layout; 406 408 int ret; 409 + 410 + memset(&config, 0, sizeof(config)); 407 411 408 412 /* limited to DEFER_TASKRUN for now */ 409 413 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 410 414 return -EINVAL; 411 - if (copy_from_user(&p, arg, sizeof(p))) 415 + if (copy_from_user(p, arg, sizeof(*p))) 412 416 return -EFAULT; 413 - if (p.flags & ~RESIZE_FLAGS) 417 + if (p->flags & ~RESIZE_FLAGS) 414 418 return -EINVAL; 415 419 416 420 /* properties that are always inherited */ 417 - p.flags |= (ctx->flags & COPY_FLAGS); 421 + p->flags |= (ctx->flags & COPY_FLAGS); 418 422 419 - ret = io_uring_fill_params(p.sq_entries, &p); 423 + ret = io_prepare_config(&config); 420 424 if (unlikely(ret)) 421 425 return ret; 422 426 423 - size = rings_size(p.flags, p.sq_entries, p.cq_entries, 424 - &sq_array_offset); 425 - if (size == SIZE_MAX) 426 - return -EOVERFLOW; 427 - 428 427 memset(&rd, 0, sizeof(rd)); 429 - rd.size = PAGE_ALIGN(size); 430 - if (p.flags & IORING_SETUP_NO_MMAP) { 431 - rd.user_addr = p.cq_off.user_addr; 428 + rd.size = PAGE_ALIGN(rl->rings_size); 429 + if (p->flags & IORING_SETUP_NO_MMAP) { 430 + rd.user_addr = p->cq_off.user_addr; 432 431 rd.flags |= IORING_MEM_REGION_TYPE_USER; 433 432 } 434 - ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); 435 - if (ret) { 436 - io_register_free_rings(ctx, &p, &n); 433 + ret = io_create_region(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); 434 + if (ret) 437 435 return ret; 438 - } 436 + 439 437 n.rings = io_region_get_ptr(&n.ring_region); 440 438 441 439 /* ··· 443 447 * intent... Use read/write once helpers from here on to indicate the 444 448 * shared nature of it. 
445 449 */ 446 - WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); 447 - WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); 448 - WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); 449 - WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); 450 + WRITE_ONCE(n.rings->sq_ring_mask, p->sq_entries - 1); 451 + WRITE_ONCE(n.rings->cq_ring_mask, p->cq_entries - 1); 452 + WRITE_ONCE(n.rings->sq_ring_entries, p->sq_entries); 453 + WRITE_ONCE(n.rings->cq_ring_entries, p->cq_entries); 450 454 451 - if (copy_to_user(arg, &p, sizeof(p))) { 452 - io_register_free_rings(ctx, &p, &n); 455 + if (copy_to_user(arg, p, sizeof(*p))) { 456 + io_register_free_rings(ctx, &n); 453 457 return -EFAULT; 454 458 } 455 459 456 - if (p.flags & IORING_SETUP_SQE128) 457 - size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); 458 - else 459 - size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); 460 - if (size == SIZE_MAX) { 461 - io_register_free_rings(ctx, &p, &n); 462 - return -EOVERFLOW; 463 - } 464 - 465 460 memset(&rd, 0, sizeof(rd)); 466 - rd.size = PAGE_ALIGN(size); 467 - if (p.flags & IORING_SETUP_NO_MMAP) { 468 - rd.user_addr = p.sq_off.user_addr; 461 + rd.size = PAGE_ALIGN(rl->sq_size); 462 + if (p->flags & IORING_SETUP_NO_MMAP) { 463 + rd.user_addr = p->sq_off.user_addr; 469 464 rd.flags |= IORING_MEM_REGION_TYPE_USER; 470 465 } 471 - ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); 466 + ret = io_create_region(ctx, &n.sq_region, &rd, IORING_OFF_SQES); 472 467 if (ret) { 473 - io_register_free_rings(ctx, &p, &n); 468 + io_register_free_rings(ctx, &n); 474 469 return ret; 475 470 } 476 471 n.sq_sqes = io_region_get_ptr(&n.sq_region); ··· 497 510 */ 498 511 tail = READ_ONCE(o.rings->sq.tail); 499 512 old_head = READ_ONCE(o.rings->sq.head); 500 - if (tail - old_head > p.sq_entries) 513 + if (tail - old_head > p->sq_entries) 501 514 goto overflow; 502 515 for (i = old_head; i < tail; i++) { 503 516 unsigned src_head = i & (ctx->sq_entries - 1); 504 - unsigned dst_head = i & (p.sq_entries - 1); 517 + unsigned dst_head = i & (p->sq_entries - 1); 505 518 506 519 n.sq_sqes[dst_head] = o.sq_sqes[src_head]; 507 520 } ··· 510 523 511 524 tail = READ_ONCE(o.rings->cq.tail); 512 525 old_head = READ_ONCE(o.rings->cq.head); 513 - if (tail - old_head > p.cq_entries) { 526 + if (tail - old_head > p->cq_entries) { 514 527 overflow: 515 528 /* restore old rings, and return -EOVERFLOW via cleanup path */ 516 529 ctx->rings = o.rings; ··· 521 534 } 522 535 for (i = old_head; i < tail; i++) { 523 536 unsigned src_head = i & (ctx->cq_entries - 1); 524 - unsigned dst_head = i & (p.cq_entries - 1); 537 + unsigned dst_head = i & (p->cq_entries - 1); 525 538 526 539 n.rings->cqes[dst_head] = o.rings->cqes[src_head]; 527 540 } ··· 537 550 538 551 /* all done, store old pointers and assign new ones */ 539 552 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 540 - ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); 553 + ctx->sq_array = (u32 *)((char *)n.rings + rl->sq_array_offset); 541 554 542 - ctx->sq_entries = p.sq_entries; 543 - ctx->cq_entries = p.cq_entries; 555 + ctx->sq_entries = p->sq_entries; 556 + ctx->cq_entries = p->cq_entries; 544 557 545 558 ctx->rings = n.rings; 546 559 ctx->sq_sqes = n.sq_sqes; ··· 551 564 out: 552 565 spin_unlock(&ctx->completion_lock); 553 566 mutex_unlock(&ctx->mmap_lock); 554 - io_register_free_rings(ctx, &p, to_free); 567 + io_register_free_rings(ctx, to_free); 555 568 556 569 if (ctx->sq_data) 557 570 io_sq_thread_unpark(ctx->sq_data); ··· 565 
578 struct io_uring_mem_region_reg reg; 566 579 struct io_uring_region_desc __user *rd_uptr; 567 580 struct io_uring_region_desc rd; 581 + struct io_mapped_region region = {}; 568 582 int ret; 569 583 570 584 if (io_region_is_set(&ctx->param_region)) ··· 589 601 !(ctx->flags & IORING_SETUP_R_DISABLED)) 590 602 return -EINVAL; 591 603 592 - ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, 593 - IORING_MAP_OFF_PARAM_REGION); 604 + ret = io_create_region(ctx, &region, &rd, IORING_MAP_OFF_PARAM_REGION); 594 605 if (ret) 595 606 return ret; 596 607 if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { 597 - guard(mutex)(&ctx->mmap_lock); 598 - io_free_region(ctx, &ctx->param_region); 608 + io_free_region(ctx->user, &region); 599 609 return -EFAULT; 600 610 } 601 611 602 612 if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { 603 - ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); 613 + ctx->cq_wait_arg = io_region_get_ptr(&region); 604 614 ctx->cq_wait_size = rd.size; 605 615 } 616 + 617 + io_region_publish(ctx, &region, &ctx->param_region); 606 618 return 0; 607 619 } 608 620 ··· 813 825 ret = io_register_mem_region(ctx, arg); 814 826 break; 815 827 case IORING_REGISTER_QUERY: 816 - ret = io_query(ctx, arg, nr_args); 828 + ret = io_query(arg, nr_args); 829 + break; 830 + case IORING_REGISTER_ZCRX_CTRL: 831 + ret = io_zcrx_ctrl(ctx, arg, nr_args); 817 832 break; 818 833 default: 819 834 ret = -EINVAL; ··· 888 897 case IORING_REGISTER_SEND_MSG_RING: 889 898 return io_uring_register_send_msg_ring(arg, nr_args); 890 899 case IORING_REGISTER_QUERY: 891 - return io_query(NULL, arg, nr_args); 900 + return io_query(arg, nr_args); 892 901 } 893 902 return -EINVAL; 894 903 }
+16 -14
io_uring/rsrc.c
··· 56 56 return 0; 57 57 } 58 58 59 - void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 59 + void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, 60 + unsigned long nr_pages) 60 61 { 61 - if (ctx->user) 62 - __io_unaccount_mem(ctx->user, nr_pages); 62 + if (user) 63 + __io_unaccount_mem(user, nr_pages); 63 64 64 - if (ctx->mm_account) 65 - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); 65 + if (mm_account) 66 + atomic64_sub(nr_pages, &mm_account->pinned_vm); 66 67 } 67 68 68 - int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) 69 + int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, 70 + unsigned long nr_pages) 69 71 { 70 72 int ret; 71 73 72 - if (ctx->user) { 73 - ret = __io_account_mem(ctx->user, nr_pages); 74 + if (user) { 75 + ret = __io_account_mem(user, nr_pages); 74 76 if (ret) 75 77 return ret; 76 78 } 77 79 78 - if (ctx->mm_account) 79 - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); 80 + if (mm_account) 81 + atomic64_add(nr_pages, &mm_account->pinned_vm); 80 82 81 83 return 0; 82 84 } ··· 147 145 } 148 146 149 147 if (imu->acct_pages) 150 - io_unaccount_mem(ctx, imu->acct_pages); 148 + io_unaccount_mem(ctx->user, ctx->mm_account, imu->acct_pages); 151 149 imu->release(imu->priv); 152 150 io_free_imu(ctx, imu); 153 151 } ··· 456 454 return -ENXIO; 457 455 458 456 for (done = 0; done < up->nr_args; done++) { 459 - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { 457 + if (get_user(fd, &fds[done])) { 460 458 ret = -EFAULT; 461 459 break; 462 460 } ··· 470 468 IORING_FILE_INDEX_ALLOC); 471 469 if (ret < 0) 472 470 break; 473 - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { 471 + if (put_user(ret, &fds[done])) { 474 472 __io_close_fixed(req->ctx, issue_flags, ret); 475 473 ret = -EFAULT; 476 474 break; ··· 686 684 if (!imu->acct_pages) 687 685 return 0; 688 686 689 - ret = io_account_mem(ctx, imu->acct_pages); 687 + ret = io_account_mem(ctx->user, ctx->mm_account, imu->acct_pages); 690 688 if (ret) 691 689 imu->acct_pages = 0; 692 690 return ret;
+4 -2
io_uring/rsrc.h
··· 120 120 int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 121 121 122 122 int __io_account_mem(struct user_struct *user, unsigned long nr_pages); 123 - int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); 124 - void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); 123 + int io_account_mem(struct user_struct *user, struct mm_struct *mm_account, 124 + unsigned long nr_pages); 125 + void io_unaccount_mem(struct user_struct *user, struct mm_struct *mm_account, 126 + unsigned long nr_pages); 125 127 126 128 static inline void __io_unaccount_mem(struct user_struct *user, 127 129 unsigned long nr_pages)
+7 -5
io_uring/rw.c
··· 186 186 * This is really a bug in the core code that does this, any issue 187 187 * path should assume that a successful (or -EIOCBQUEUED) return can 188 188 * mean that the underlying data can be gone at any time. But that 189 - * should be fixed seperately, and then this check could be killed. 189 + * should be fixed separately, and then this check could be killed. 190 190 */ 191 191 if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { 192 192 req->flags &= ~REQ_F_NEED_CLEANUP; ··· 348 348 349 349 /* 350 350 * Have to do this validation here, as this is in io_read() rw->len 351 - * might have chanaged due to buffer selection 351 + * might have changed due to buffer selection 352 352 */ 353 353 return io_iov_buffer_select_prep(req); 354 354 } ··· 566 566 return res; 567 567 } 568 568 569 - void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) 569 + void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw) 570 570 { 571 + struct io_kiocb *req = tw_req.req; 572 + 571 573 io_req_io_end(req); 572 574 573 575 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) 574 576 req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); 575 577 576 578 io_req_rw_cleanup(req, 0); 577 - io_req_task_complete(req, tw); 579 + io_req_task_complete(tw_req, tw); 578 580 } 579 581 580 582 static void io_complete_rw(struct kiocb *kiocb, long res) ··· 1012 1010 iov_iter_restore(&io->iter, &io->iter_state); 1013 1011 } while (ret > 0); 1014 1012 done: 1015 - /* it's faster to check here then delegate to kfree */ 1013 + /* it's faster to check here than delegate to kfree */ 1016 1014 return ret; 1017 1015 } 1018 1016
+1 -1
io_uring/rw.h
··· 46 46 int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); 47 47 void io_readv_writev_cleanup(struct io_kiocb *req); 48 48 void io_rw_fail(struct io_kiocb *req); 49 - void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); 49 + void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw); 50 50 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 51 51 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); 52 52 void io_rw_cache_free(const void *entry);
-18
io_uring/slist.h
··· 67 67 last->next = NULL; 68 68 } 69 69 70 - static inline void __wq_list_splice(struct io_wq_work_list *list, 71 - struct io_wq_work_node *to) 72 - { 73 - list->last->next = to->next; 74 - to->next = list->first; 75 - INIT_WQ_LIST(list); 76 - } 77 - 78 - static inline bool wq_list_splice(struct io_wq_work_list *list, 79 - struct io_wq_work_node *to) 80 - { 81 - if (!wq_list_empty(list)) { 82 - __wq_list_splice(list, to); 83 - return true; 84 - } 85 - return false; 86 - } 87 - 88 70 static inline void wq_stack_add_head(struct io_wq_work_node *node, 89 71 struct io_wq_work_node *stack) 90 72 {
+1
io_uring/sqpoll.c
··· 19 19 #include "io_uring.h" 20 20 #include "tctx.h" 21 21 #include "napi.h" 22 + #include "cancel.h" 22 23 #include "sqpoll.h" 23 24 24 25 #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
+12 -8
io_uring/timeout.c
··· 68 68 69 69 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); 70 70 71 - static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) 71 + static void io_timeout_complete(struct io_tw_req tw_req, io_tw_token_t tw) 72 72 { 73 + struct io_kiocb *req = tw_req.req; 73 74 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 74 75 struct io_timeout_data *data = req->async_data; 75 76 struct io_ring_ctx *ctx = req->ctx; ··· 86 85 } 87 86 } 88 87 89 - io_req_task_complete(req, tw); 88 + io_req_task_complete(tw_req, tw); 90 89 } 91 90 92 91 static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) ··· 158 157 io_flush_killed_timeouts(&list, 0); 159 158 } 160 159 161 - static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) 160 + static void io_req_tw_fail_links(struct io_tw_req tw_req, io_tw_token_t tw) 162 161 { 162 + struct io_kiocb *link = tw_req.req; 163 + 163 164 io_tw_lock(link->ctx, tw); 164 165 while (link) { 165 166 struct io_kiocb *nxt = link->link; ··· 171 168 res = link->cqe.res; 172 169 link->link = NULL; 173 170 io_req_set_res(link, res, 0); 174 - io_req_task_complete(link, tw); 171 + io_req_task_complete((struct io_tw_req){link}, tw); 175 172 link = nxt; 176 173 } 177 174 } ··· 320 317 return 0; 321 318 } 322 319 323 - static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) 320 + static void io_req_task_link_timeout(struct io_tw_req tw_req, io_tw_token_t tw) 324 321 { 322 + struct io_kiocb *req = tw_req.req; 325 323 struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); 326 324 struct io_kiocb *prev = timeout->prev; 327 325 int ret; 328 326 329 327 if (prev) { 330 - if (!io_should_terminate_tw(req->ctx)) { 328 + if (!tw.cancel) { 331 329 struct io_cancel_data cd = { 332 330 .ctx = req->ctx, 333 331 .data = prev->cqe.user_data, ··· 339 335 ret = -ECANCELED; 340 336 } 341 337 io_req_set_res(req, ret ?: -ETIME, 0); 342 - io_req_task_complete(req, tw); 338 + io_req_task_complete(tw_req, tw); 343 339 io_put_req(prev); 344 340 } else { 345 341 io_req_set_res(req, -ETIME, 0); 346 - io_req_task_complete(req, tw); 342 + io_req_task_complete(tw_req, tw); 347 343 } 348 344 } 349 345
+17 -17
io_uring/uring_cmd.c
··· 113 113 } 114 114 EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); 115 115 116 - static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) 117 - { 118 - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); 119 - unsigned int flags = IO_URING_F_COMPLETE_DEFER; 120 - 121 - if (io_should_terminate_tw(req->ctx)) 122 - flags |= IO_URING_F_TASK_DEAD; 123 - 124 - /* task_work executor checks the deffered list completion */ 125 - ioucmd->task_work_cb(ioucmd, flags); 126 - } 127 - 128 116 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 129 - io_uring_cmd_tw_t task_work_cb, 117 + io_req_tw_func_t task_work_cb, 130 118 unsigned flags) 131 119 { 132 120 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ··· 122 134 if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) 123 135 return; 124 136 125 - ioucmd->task_work_cb = task_work_cb; 126 - req->io_task_work.func = io_uring_cmd_work; 137 + req->io_task_work.func = task_work_cb; 127 138 __io_req_task_work_add(req, flags); 128 139 } 129 140 EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); ··· 203 216 return 0; 204 217 } 205 218 219 + /* 220 + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each 221 + * slot. 222 + */ 223 + static inline size_t uring_sqe_size(struct io_kiocb *req) 224 + { 225 + if (req->ctx->flags & IORING_SETUP_SQE128 || 226 + req->opcode == IORING_OP_URING_CMD128) 227 + return 2 * sizeof(struct io_uring_sqe); 228 + return sizeof(struct io_uring_sqe); 229 + } 230 + 206 231 void io_uring_cmd_sqe_copy(struct io_kiocb *req) 207 232 { 208 233 struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); ··· 223 224 /* Should not happen, as REQ_F_SQE_COPIED covers this */ 224 225 if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes)) 225 226 return; 226 - memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); 227 + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req)); 227 228 ioucmd->sqe = ac->sqes; 228 229 } 229 230 ··· 241 242 if (ret) 242 243 return ret; 243 244 244 - if (ctx->flags & IORING_SETUP_SQE128) 245 + if (ctx->flags & IORING_SETUP_SQE128 || 246 + req->opcode == IORING_OP_URING_CMD128) 245 247 issue_flags |= IO_URING_F_SQE128; 246 248 if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)) 247 249 issue_flags |= IO_URING_F_CQE32;
+35 -13
io_uring/waitid.c
··· 16 16 #include "waitid.h" 17 17 #include "../kernel/exit.h" 18 18 19 - static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); 19 + static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw); 20 20 21 21 #define IO_WAITID_CANCEL_FLAG BIT(31) 22 22 #define IO_WAITID_REF_MASK GENMASK(30, 0) ··· 109 109 return ret; 110 110 } 111 111 112 + static void io_waitid_remove_wq(struct io_kiocb *req) 113 + { 114 + struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 115 + struct wait_queue_head *head; 116 + 117 + head = READ_ONCE(iw->head); 118 + if (head) { 119 + struct io_waitid_async *iwa = req->async_data; 120 + 121 + iw->head = NULL; 122 + spin_lock_irq(&head->lock); 123 + list_del_init(&iwa->wo.child_wait.entry); 124 + spin_unlock_irq(&head->lock); 125 + } 126 + } 127 + 112 128 static void io_waitid_complete(struct io_kiocb *req, int ret) 113 129 { 114 130 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); ··· 135 119 lockdep_assert_held(&req->ctx->uring_lock); 136 120 137 121 hlist_del_init(&req->hash_node); 122 + io_waitid_remove_wq(req); 138 123 139 124 ret = io_waitid_finish(req, ret); 140 125 if (ret < 0) ··· 146 129 static bool __io_waitid_cancel(struct io_kiocb *req) 147 130 { 148 131 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 149 - struct io_waitid_async *iwa = req->async_data; 132 + 133 + lockdep_assert_held(&req->ctx->uring_lock); 150 134 151 135 /* 152 136 * Mark us canceled regardless of ownership. This will prevent a ··· 159 141 if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) 160 142 return false; 161 143 162 - spin_lock_irq(&iw->head->lock); 163 - list_del_init(&iwa->wo.child_wait.entry); 164 - spin_unlock_irq(&iw->head->lock); 165 144 io_waitid_complete(req, -ECANCELED); 166 145 io_req_queue_tw_complete(req, -ECANCELED); 167 146 return true; ··· 179 164 static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) 180 165 { 181 166 struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); 182 - struct io_waitid_async *iwa = req->async_data; 183 167 184 168 if (!atomic_sub_return(1, &iw->refs)) 185 169 return false; 170 + 171 + io_waitid_remove_wq(req); 186 172 187 173 /* 188 174 * Wakeup triggered, racing with us. 
It was prevented from ··· 191 175 */ 192 176 req->io_task_work.func = io_waitid_cb; 193 177 io_req_task_work_add(req); 194 - remove_wait_queue(iw->head, &iwa->wo.child_wait); 195 178 return true; 196 179 } 197 180 198 - static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) 181 + static void io_waitid_cb(struct io_tw_req tw_req, io_tw_token_t tw) 199 182 { 183 + struct io_kiocb *req = tw_req.req; 200 184 struct io_waitid_async *iwa = req->async_data; 201 185 struct io_ring_ctx *ctx = req->ctx; 202 186 int ret; ··· 225 209 io_waitid_drop_issue_ref(req); 226 210 return; 227 211 } 228 - 229 - remove_wait_queue(iw->head, &iwa->wo.child_wait); 212 + /* fall through to complete, will kill waitqueue */ 230 213 } 231 214 } 232 215 233 216 io_waitid_complete(req, ret); 234 - io_req_task_complete(req, tw); 217 + io_req_task_complete(tw_req, tw); 235 218 } 236 219 237 220 static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, ··· 246 231 return 0; 247 232 248 233 list_del_init(&wait->entry); 234 + iw->head = NULL; 249 235 250 236 /* cancel is in progress */ 251 237 if (atomic_fetch_inc(&iw->refs) & IO_WAITID_REF_MASK) ··· 273 257 iw->which = READ_ONCE(sqe->len); 274 258 iw->upid = READ_ONCE(sqe->fd); 275 259 iw->options = READ_ONCE(sqe->file_index); 260 + iw->head = NULL; 276 261 iw->infop = u64_to_user_ptr(READ_ONCE(sqe->addr2)); 277 262 return 0; 278 263 } ··· 304 287 * callback. 305 288 */ 306 289 io_ring_submit_lock(ctx, issue_flags); 290 + 291 + /* 292 + * iw->head is valid under the ring lock, and as long as the request 293 + * is on the waitid_list where cancelations may find it. 294 + */ 295 + iw->head = &current->signal->wait_chldexit; 307 296 hlist_add_head(&req->hash_node, &ctx->waitid_list); 308 297 309 298 init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); 310 299 iwa->wo.child_wait.private = req->tctx->task; 311 - iw->head = &current->signal->wait_chldexit; 312 300 add_wait_queue(iw->head, &iwa->wo.child_wait); 313 301 314 302 ret = __do_wait(&iwa->wo); ··· 336 314 } 337 315 338 316 hlist_del_init(&req->hash_node); 339 - remove_wait_queue(iw->head, &iwa->wo.child_wait); 317 + io_waitid_remove_wq(req); 340 318 ret = io_waitid_finish(req, ret); 341 319 342 320 io_ring_submit_unlock(ctx, issue_flags);
+324 -97
io_uring/zcrx.c
··· 8 8 #include <linux/netdevice.h> 9 9 #include <linux/rtnetlink.h> 10 10 #include <linux/skbuff_ref.h> 11 + #include <linux/anon_inodes.h> 11 12 12 13 #include <net/page_pool/helpers.h> 13 14 #include <net/page_pool/memory_provider.h> ··· 171 170 if (folio == last_folio) 172 171 continue; 173 172 last_folio = folio; 174 - res += 1UL << folio_order(folio); 173 + res += folio_nr_pages(folio); 175 174 } 176 175 return res; 177 176 } ··· 201 200 } 202 201 203 202 mem->account_pages = io_count_account_pages(pages, nr_pages); 204 - ret = io_account_mem(ifq->ctx, mem->account_pages); 203 + ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); 205 204 if (ret < 0) 206 205 mem->account_pages = 0; 207 206 ··· 345 344 atomic_inc(io_get_user_counter(niov)); 346 345 } 347 346 348 - static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, 347 + static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets) 348 + { 349 + offsets->head = offsetof(struct io_uring, head); 350 + offsets->tail = offsetof(struct io_uring, tail); 351 + offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 352 + } 353 + 354 + static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx, 355 + struct io_zcrx_ifq *ifq, 349 356 struct io_uring_zcrx_ifq_reg *reg, 350 357 struct io_uring_region_desc *rd, 351 358 u32 id) ··· 363 354 void *ptr; 364 355 int ret; 365 356 366 - off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 357 + io_fill_zcrx_offsets(&reg->offsets); 358 + off = reg->offsets.rqes; 367 359 size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 368 360 if (size > rd->size) 369 361 return -EINVAL; ··· 372 362 mmap_offset = IORING_MAP_OFF_ZCRX_REGION; 373 363 mmap_offset += id << IORING_OFF_PBUF_SHIFT; 374 364 375 - ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); 365 + ret = io_create_region(ctx, &ifq->region, rd, mmap_offset); 376 366 if (ret < 0) 377 367 return ret; 378 368 ··· 380 370 ifq->rq_ring = (struct io_uring *)ptr; 381 371 ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 382 372 383 - reg->offsets.head = offsetof(struct io_uring, head); 384 - reg->offsets.tail = offsetof(struct io_uring, tail); 385 - reg->offsets.rqes = off; 386 373 return 0; 387 374 } 388 375 389 376 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) 390 377 { 391 - io_free_region(ifq->ctx, &ifq->region); 378 + io_free_region(ifq->user, &ifq->region); 392 379 ifq->rq_ring = NULL; 393 380 ifq->rqes = NULL; 394 381 } 395 382 396 - static void io_zcrx_free_area(struct io_zcrx_area *area) 383 + static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, 384 + struct io_zcrx_area *area) 397 385 { 398 - io_zcrx_unmap_area(area->ifq, area); 386 + io_zcrx_unmap_area(ifq, area); 399 387 io_release_area_mem(&area->mem); 400 388 401 389 if (area->mem.account_pages) 402 - io_unaccount_mem(area->ifq->ctx, area->mem.account_pages); 390 + io_unaccount_mem(ifq->user, ifq->mm_account, 391 + area->mem.account_pages); 403 392 404 393 kvfree(area->freelist); 405 394 kvfree(area->nia.niovs); ··· 472 463 return 0; 473 464 err: 474 465 if (area) 475 - io_zcrx_free_area(area); 466 + io_zcrx_free_area(ifq, area); 476 467 return ret; 477 468 } 478 469 ··· 485 476 return NULL; 486 477 487 478 ifq->if_rxq = -1; 488 - ifq->ctx = ctx; 489 479 spin_lock_init(&ifq->rq_lock); 490 480 mutex_init(&ifq->pp_lock); 481 + refcount_set(&ifq->refs, 1); 482 + refcount_set(&ifq->user_refs, 1); 491 483 return ifq; 492 484 } 493 485 ··· 532 522 io_close_queue(ifq); 533 523 534 524 if (ifq->area) 535 - 
io_zcrx_free_area(ifq->area); 525 + io_zcrx_free_area(ifq, ifq->area); 526 + free_uid(ifq->user); 527 + if (ifq->mm_account) 528 + mmdrop(ifq->mm_account); 536 529 if (ifq->dev) 537 530 put_device(ifq->dev); 538 531 539 532 io_free_rbuf_ring(ifq); 540 533 mutex_destroy(&ifq->pp_lock); 541 534 kfree(ifq); 535 + } 536 + 537 + static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq) 538 + { 539 + if (refcount_dec_and_test(&ifq->refs)) 540 + io_zcrx_ifq_free(ifq); 541 + } 542 + 543 + static void io_zcrx_return_niov_freelist(struct net_iov *niov) 544 + { 545 + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 546 + 547 + spin_lock_bh(&area->freelist_lock); 548 + area->freelist[area->free_count++] = net_iov_idx(niov); 549 + spin_unlock_bh(&area->freelist_lock); 550 + } 551 + 552 + static void io_zcrx_return_niov(struct net_iov *niov) 553 + { 554 + netmem_ref netmem = net_iov_to_netmem(niov); 555 + 556 + if (!niov->desc.pp) { 557 + /* copy fallback allocated niovs */ 558 + io_zcrx_return_niov_freelist(niov); 559 + return; 560 + } 561 + page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false); 562 + } 563 + 564 + static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) 565 + { 566 + struct io_zcrx_area *area = ifq->area; 567 + int i; 568 + 569 + if (!area) 570 + return; 571 + 572 + /* Reclaim back all buffers given to the user space. */ 573 + for (i = 0; i < area->nia.num_niovs; i++) { 574 + struct net_iov *niov = &area->nia.niovs[i]; 575 + int nr; 576 + 577 + if (!atomic_read(io_get_user_counter(niov))) 578 + continue; 579 + nr = atomic_xchg(io_get_user_counter(niov), 0); 580 + if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) 581 + io_zcrx_return_niov(niov); 582 + } 583 + } 584 + 585 + static void zcrx_unregister(struct io_zcrx_ifq *ifq) 586 + { 587 + if (refcount_dec_and_test(&ifq->user_refs)) { 588 + io_close_queue(ifq); 589 + io_zcrx_scrub(ifq); 590 + } 591 + io_put_zcrx_ifq(ifq); 542 592 } 543 593 544 594 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, ··· 609 539 lockdep_assert_held(&ctx->mmap_lock); 610 540 611 541 return ifq ? 
&ifq->region : NULL; 542 + } 543 + 544 + static int zcrx_box_release(struct inode *inode, struct file *file) 545 + { 546 + struct io_zcrx_ifq *ifq = file->private_data; 547 + 548 + if (WARN_ON_ONCE(!ifq)) 549 + return -EFAULT; 550 + zcrx_unregister(ifq); 551 + return 0; 552 + } 553 + 554 + static const struct file_operations zcrx_box_fops = { 555 + .owner = THIS_MODULE, 556 + .release = zcrx_box_release, 557 + }; 558 + 559 + static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq, 560 + struct zcrx_ctrl *ctrl, void __user *arg) 561 + { 562 + struct zcrx_ctrl_export *ce = &ctrl->zc_export; 563 + struct file *file; 564 + int fd = -1; 565 + 566 + if (!mem_is_zero(ce, sizeof(*ce))) 567 + return -EINVAL; 568 + fd = get_unused_fd_flags(O_CLOEXEC); 569 + if (fd < 0) 570 + return fd; 571 + 572 + ce->zcrx_fd = fd; 573 + if (copy_to_user(arg, ctrl, sizeof(*ctrl))) { 574 + put_unused_fd(fd); 575 + return -EFAULT; 576 + } 577 + 578 + refcount_inc(&ifq->refs); 579 + refcount_inc(&ifq->user_refs); 580 + 581 + file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops, 582 + ifq, O_CLOEXEC, NULL); 583 + if (IS_ERR(file)) { 584 + put_unused_fd(fd); 585 + zcrx_unregister(ifq); 586 + return PTR_ERR(file); 587 + } 588 + 589 + fd_install(fd, file); 590 + return 0; 591 + } 592 + 593 + static int import_zcrx(struct io_ring_ctx *ctx, 594 + struct io_uring_zcrx_ifq_reg __user *arg, 595 + struct io_uring_zcrx_ifq_reg *reg) 596 + { 597 + struct io_zcrx_ifq *ifq; 598 + struct file *file; 599 + int fd, ret; 600 + u32 id; 601 + 602 + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 603 + return -EINVAL; 604 + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 605 + return -EINVAL; 606 + if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr) 607 + return -EINVAL; 608 + 609 + fd = reg->if_idx; 610 + CLASS(fd, f)(fd); 611 + if (fd_empty(f)) 612 + return -EBADF; 613 + 614 + file = fd_file(f); 615 + if (file->f_op != &zcrx_box_fops || !file->private_data) 616 + return -EBADF; 617 + 618 + ifq = file->private_data; 619 + refcount_inc(&ifq->refs); 620 + refcount_inc(&ifq->user_refs); 621 + 622 + scoped_guard(mutex, &ctx->mmap_lock) { 623 + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); 624 + if (ret) 625 + goto err; 626 + } 627 + 628 + reg->zcrx_id = id; 629 + io_fill_zcrx_offsets(&reg->offsets); 630 + if (copy_to_user(arg, reg, sizeof(*reg))) { 631 + ret = -EFAULT; 632 + goto err_xa_erase; 633 + } 634 + 635 + scoped_guard(mutex, &ctx->mmap_lock) { 636 + ret = -ENOMEM; 637 + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) 638 + goto err_xa_erase; 639 + } 640 + 641 + return 0; 642 + err_xa_erase: 643 + scoped_guard(mutex, &ctx->mmap_lock) 644 + xa_erase(&ctx->zcrx_ctxs, id); 645 + err: 646 + zcrx_unregister(ifq); 647 + return ret; 612 648 } 613 649 614 650 int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ··· 742 566 return -EINVAL; 743 567 if (copy_from_user(&reg, arg, sizeof(reg))) 744 568 return -EFAULT; 745 - if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 746 - return -EFAULT; 747 569 if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || 748 570 reg.__resv2 || reg.zcrx_id) 749 571 return -EINVAL; 572 + if (reg.flags & ZCRX_REG_IMPORT) 573 + return import_zcrx(ctx, arg, &reg); 574 + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 575 + return -EFAULT; 750 576 if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) 751 577 return -EINVAL; 752 578 if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { ··· 764 586 ifq = 
io_zcrx_ifq_alloc(ctx); 765 587 if (!ifq) 766 588 return -ENOMEM; 589 + 590 + if (ctx->user) { 591 + get_uid(ctx->user); 592 + ifq->user = ctx->user; 593 + } 594 + if (ctx->mm_account) { 595 + mmgrab(ctx->mm_account); 596 + ifq->mm_account = ctx->mm_account; 597 + } 767 598 ifq->rq_entries = reg.rq_entries; 768 599 769 600 scoped_guard(mutex, &ctx->mmap_lock) { ··· 782 595 goto ifq_free; 783 596 } 784 597 785 - ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id); 598 + ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id); 786 599 if (ret) 787 600 goto err; 788 601 789 - ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, 790 - &ifq->netdev_tracker, GFP_KERNEL); 602 + ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx); 791 603 if (!ifq->netdev) { 792 604 ret = -ENODEV; 793 605 goto err; 794 606 } 607 + netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); 795 608 796 609 ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); 797 610 if (!ifq->dev) { 798 611 ret = -EOPNOTSUPP; 799 - goto err; 612 + goto netdev_put_unlock; 800 613 } 801 614 get_device(ifq->dev); 802 615 803 616 ret = io_zcrx_create_area(ifq, &area); 804 617 if (ret) 805 - goto err; 618 + goto netdev_put_unlock; 806 619 807 620 mp_param.mp_ops = &io_uring_pp_zc_ops; 808 621 mp_param.mp_priv = ifq; 809 - ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); 622 + ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); 810 623 if (ret) 811 - goto err; 624 + goto netdev_put_unlock; 625 + netdev_unlock(ifq->netdev); 812 626 ifq->if_rxq = reg.if_rxq; 813 627 814 628 reg.zcrx_id = id; ··· 828 640 goto err; 829 641 } 830 642 return 0; 643 + netdev_put_unlock: 644 + netdev_put(ifq->netdev, &ifq->netdev_tracker); 645 + netdev_unlock(ifq->netdev); 831 646 err: 832 647 scoped_guard(mutex, &ctx->mmap_lock) 833 648 xa_erase(&ctx->zcrx_ctxs, id); 834 649 ifq_free: 835 650 io_zcrx_ifq_free(ifq); 836 651 return ret; 652 + } 653 + 654 + static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) 655 + { 656 + unsigned niov_idx; 657 + 658 + lockdep_assert_held(&area->freelist_lock); 659 + 660 + niov_idx = area->freelist[--area->free_count]; 661 + return &area->nia.niovs[niov_idx]; 837 662 } 838 663 839 664 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) ··· 865 664 } 866 665 if (!ifq) 867 666 break; 868 - io_zcrx_ifq_free(ifq); 667 + zcrx_unregister(ifq); 869 668 } 870 669 871 670 xa_destroy(&ctx->zcrx_ctxs); 872 - } 873 - 874 - static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) 875 - { 876 - unsigned niov_idx; 877 - 878 - lockdep_assert_held(&area->freelist_lock); 879 - 880 - niov_idx = area->freelist[--area->free_count]; 881 - return &area->nia.niovs[niov_idx]; 882 - } 883 - 884 - static void io_zcrx_return_niov_freelist(struct net_iov *niov) 885 - { 886 - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 887 - 888 - spin_lock_bh(&area->freelist_lock); 889 - area->freelist[area->free_count++] = net_iov_idx(niov); 890 - spin_unlock_bh(&area->freelist_lock); 891 - } 892 - 893 - static void io_zcrx_return_niov(struct net_iov *niov) 894 - { 895 - netmem_ref netmem = net_iov_to_netmem(niov); 896 - 897 - if (!niov->pp) { 898 - /* copy fallback allocated niovs */ 899 - io_zcrx_return_niov_freelist(niov); 900 - return; 901 - } 902 - page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false); 903 - } 904 - 905 - static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) 906 - { 907 - struct io_zcrx_area *area = ifq->area; 908 - int i; 909 - 
910 - if (!area) 911 - return; 912 - 913 - /* Reclaim back all buffers given to the user space. */ 914 - for (i = 0; i < area->nia.num_niovs; i++) { 915 - struct net_iov *niov = &area->nia.niovs[i]; 916 - int nr; 917 - 918 - if (!atomic_read(io_get_user_counter(niov))) 919 - continue; 920 - nr = atomic_xchg(io_get_user_counter(niov), 0); 921 - if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) 922 - io_zcrx_return_niov(niov); 923 - } 924 - } 925 - 926 - void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) 927 - { 928 - struct io_zcrx_ifq *ifq; 929 - unsigned long index; 930 - 931 - lockdep_assert_held(&ctx->uring_lock); 932 - 933 - xa_for_each(&ctx->zcrx_ctxs, index, ifq) { 934 - io_zcrx_scrub(ifq); 935 - io_close_queue(ifq); 936 - } 937 671 } 938 672 939 673 static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) ··· 936 800 if (!page_pool_unref_and_test(netmem)) 937 801 continue; 938 802 939 - if (unlikely(niov->pp != pp)) { 803 + if (unlikely(niov->desc.pp != pp)) { 940 804 io_zcrx_return_niov(niov); 941 805 continue; 942 806 } ··· 1016 880 if (ret) 1017 881 return ret; 1018 882 1019 - percpu_ref_get(&ifq->ctx->refs); 883 + refcount_inc(&ifq->refs); 1020 884 return 0; 1021 885 } 1022 886 1023 887 static void io_pp_zc_destroy(struct page_pool *pp) 1024 888 { 1025 - struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1026 - 1027 - percpu_ref_put(&ifq->ctx->refs); 889 + io_put_zcrx_ifq(io_pp_to_ifq(pp)); 1028 890 } 1029 891 1030 892 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, ··· 1061 927 .nl_fill = io_pp_nl_fill, 1062 928 .uninstall = io_pp_uninstall, 1063 929 }; 930 + 931 + static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, 932 + struct io_zcrx_ifq *zcrx) 933 + { 934 + unsigned int mask = zcrx->rq_entries - 1; 935 + unsigned int i; 936 + 937 + guard(spinlock_bh)(&zcrx->rq_lock); 938 + 939 + nr = min(nr, io_zcrx_rqring_entries(zcrx)); 940 + for (i = 0; i < nr; i++) { 941 + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask); 942 + struct net_iov *niov; 943 + 944 + if (!io_parse_rqe(rqe, zcrx, &niov)) 945 + break; 946 + netmem_array[i] = net_iov_to_netmem(niov); 947 + } 948 + 949 + smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head); 950 + return i; 951 + } 952 + 953 + #define ZCRX_FLUSH_BATCH 32 954 + 955 + static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr) 956 + { 957 + unsigned i; 958 + 959 + for (i = 0; i < nr; i++) { 960 + netmem_ref netmem = netmems[i]; 961 + struct net_iov *niov = netmem_to_net_iov(netmem); 962 + 963 + if (!io_zcrx_put_niov_uref(niov)) 964 + continue; 965 + if (!page_pool_unref_and_test(netmem)) 966 + continue; 967 + io_zcrx_return_niov(niov); 968 + } 969 + } 970 + 971 + static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx, 972 + struct zcrx_ctrl *ctrl) 973 + { 974 + struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush; 975 + netmem_ref netmems[ZCRX_FLUSH_BATCH]; 976 + unsigned total = 0; 977 + unsigned nr; 978 + 979 + if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv))) 980 + return -EINVAL; 981 + 982 + do { 983 + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx); 984 + 985 + zcrx_return_buffers(netmems, nr); 986 + total += nr; 987 + 988 + if (fatal_signal_pending(current)) 989 + break; 990 + cond_resched(); 991 + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries); 992 + 993 + return 0; 994 + } 995 + 996 + int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 997 + { 998 + struct zcrx_ctrl ctrl; 999 + struct io_zcrx_ifq *zcrx; 1000 
+ 1001 + if (nr_args) 1002 + return -EINVAL; 1003 + if (copy_from_user(&ctrl, arg, sizeof(ctrl))) 1004 + return -EFAULT; 1005 + if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv))) 1006 + return -EFAULT; 1007 + 1008 + zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id); 1009 + if (!zcrx) 1010 + return -ENXIO; 1011 + 1012 + switch (ctrl.op) { 1013 + case ZCRX_CTRL_FLUSH_RQ: 1014 + return zcrx_flush_rq(ctx, zcrx, &ctrl); 1015 + case ZCRX_CTRL_EXPORT: 1016 + return zcrx_export(ctx, zcrx, &ctrl, arg); 1017 + } 1018 + 1019 + return -EOPNOTSUPP; 1020 + } 1064 1021 1065 1022 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, 1066 1023 struct io_zcrx_ifq *ifq, int off, int len) ··· 1294 1069 const skb_frag_t *frag, int off, int len) 1295 1070 { 1296 1071 struct net_iov *niov; 1072 + struct page_pool *pp; 1297 1073 1298 1074 if (unlikely(!skb_frag_is_net_iov(frag))) 1299 1075 return io_zcrx_copy_frag(req, ifq, frag, off, len); 1300 1076 1301 1077 niov = netmem_to_net_iov(frag->netmem); 1302 - if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || 1303 - io_pp_to_ifq(niov->pp) != ifq) 1078 + pp = niov->desc.pp; 1079 + 1080 + if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq) 1304 1081 return -EFAULT; 1305 1082 1306 1083 if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
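The new io_zcrx_ctrl() entry point above is driven from userspace with a struct zcrx_ctrl payload and nr_args == 0, presumably through an io_uring_register(2) opcode added elsewhere in this series, and zcrx_export()/import_zcrx() let a second ring attach to an already-registered interface queue. The sketch below only illustrates that call sequence; the opcode value, the ZCRX_CTRL_EXPORT value and the struct layout are placeholders reusing names from this diff, not the final uapi definitions.

/*
 * Hypothetical userspace sketch of the export flow above.  Constants and
 * struct layout are placeholders; the real definitions belong in the
 * updated include/uapi/linux/io_uring.h.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define IORING_REGISTER_ZCRX_CTRL	0xffff	/* placeholder opcode value */
#define ZCRX_CTRL_EXPORT		1	/* placeholder op value */

struct zcrx_ctrl {				/* assumed layout, names from the diff */
	uint32_t zcrx_id;			/* id returned when the ifq was registered */
	uint32_t op;				/* ZCRX_CTRL_* operation */
	union {
		struct { uint32_t zcrx_fd; } zc_export;
	};
	uint64_t __resv[2];			/* must remain zero */
};

/* Ask ring_fd to export the ifq registered as zcrx_id; returns a zcrx fd. */
static int zcrx_export_fd(int ring_fd, uint32_t zcrx_id)
{
	struct zcrx_ctrl ctrl;

	memset(&ctrl, 0, sizeof(ctrl));
	ctrl.zcrx_id = zcrx_id;
	ctrl.op = ZCRX_CTRL_EXPORT;
	/* io_zcrx_ctrl() above rejects any non-zero nr_args */
	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZCRX_CTRL, &ctrl, 0) < 0)
		return -1;
	/* zcrx_export() writes the new fd back before installing it */
	return ctrl.zc_export.zcrx_fd;
}

A second ring would then import the same ifq by setting the import flag (ZCRX_REG_IMPORT) in struct io_uring_zcrx_ifq_reg, passing the exported fd in if_idx and leaving if_rxq, rq_entries, area_ptr and region_ptr zero, which is exactly what import_zcrx() validates; per the checks above, the importing ring must also be set up with IORING_SETUP_DEFER_TASKRUN and CQE32 or mixed CQEs.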
+11 -5
io_uring/zcrx.h
··· 39 39 }; 40 40 41 41 struct io_zcrx_ifq { 42 - struct io_ring_ctx *ctx; 43 42 struct io_zcrx_area *area; 44 43 unsigned niov_shift; 44 + struct user_struct *user; 45 + struct mm_struct *mm_account; 45 46 46 47 spinlock_t rq_lock ____cacheline_aligned_in_smp; 47 48 struct io_uring *rq_ring; ··· 54 53 struct device *dev; 55 54 struct net_device *netdev; 56 55 netdevice_tracker netdev_tracker; 56 + refcount_t refs; 57 + /* counts userspace facing users like io_uring */ 58 + refcount_t user_refs; 57 59 58 60 /* 59 61 * Page pool and net configuration lock, can be taken deeper in the ··· 67 63 }; 68 64 69 65 #if defined(CONFIG_IO_URING_ZCRX) 66 + int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); 70 67 int io_register_zcrx_ifq(struct io_ring_ctx *ctx, 71 68 struct io_uring_zcrx_ifq_reg __user *arg); 72 69 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); 73 - void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); 74 70 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 75 71 struct socket *sock, unsigned int flags, 76 72 unsigned issue_flags, unsigned int *len); ··· 85 81 static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) 86 82 { 87 83 } 88 - static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) 89 - { 90 - } 91 84 static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 92 85 struct socket *sock, unsigned int flags, 93 86 unsigned issue_flags, unsigned int *len) ··· 95 94 unsigned int id) 96 95 { 97 96 return NULL; 97 + } 98 + static inline int io_zcrx_ctrl(struct io_ring_ctx *ctx, 99 + void __user *arg, unsigned nr_arg) 100 + { 101 + return -EOPNOTSUPP; 98 102 } 99 103 #endif 100 104
+2 -2
net/compat.c
··· 460 460		ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
461 461		break;
462 462	case SYS_GETSOCKNAME:
463 -		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
463 +		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
464 464		break;
465 465	case SYS_GETPEERNAME:
466 -		ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
466 +		ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1);
467 467		break;
468 468	case SYS_SOCKETPAIR:
469 469		ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
-1
net/core/dev.h
··· 29 29	netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
30 30	struct net_device *dev_get_by_napi_id(unsigned int napi_id);
31 31
32 -	struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
33 32	struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
34 33	struct net_device *
35 34	netdev_xa_find_lock(struct net *net, struct net_device *dev,
+29 -54
net/socket.c
··· 2105 2105 return __sys_connect(fd, uservaddr, addrlen); 2106 2106 } 2107 2107 2108 - /* 2109 - * Get the local address ('name') of a socket object. Move the obtained 2110 - * name to user space. 2111 - */ 2108 + int do_getsockname(struct socket *sock, int peer, 2109 + struct sockaddr __user *usockaddr, int __user *usockaddr_len) 2110 + { 2111 + struct sockaddr_storage address; 2112 + int err; 2112 2113 2114 + if (peer) 2115 + err = security_socket_getpeername(sock); 2116 + else 2117 + err = security_socket_getsockname(sock); 2118 + if (err) 2119 + return err; 2120 + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); 2121 + if (err < 0) 2122 + return err; 2123 + /* "err" is actually length in this case */ 2124 + return move_addr_to_user(&address, err, usockaddr, usockaddr_len); 2125 + } 2126 + 2127 + /* 2128 + * Get the remote or local address ('name') of a socket object. Move the 2129 + * obtained name to user space. 2130 + */ 2113 2131 int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, 2114 - int __user *usockaddr_len) 2132 + int __user *usockaddr_len, int peer) 2115 2133 { 2116 2134 struct socket *sock; 2117 - struct sockaddr_storage address; 2118 2135 CLASS(fd, f)(fd); 2119 - int err; 2120 2136 2121 2137 if (fd_empty(f)) 2122 2138 return -EBADF; 2123 2139 sock = sock_from_file(fd_file(f)); 2124 2140 if (unlikely(!sock)) 2125 2141 return -ENOTSOCK; 2126 - 2127 - err = security_socket_getsockname(sock); 2128 - if (err) 2129 - return err; 2130 - 2131 - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); 2132 - if (err < 0) 2133 - return err; 2134 - 2135 - /* "err" is actually length in this case */ 2136 - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); 2142 + return do_getsockname(sock, peer, usockaddr, usockaddr_len); 2137 2143 } 2138 2144 2139 2145 SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, 2140 2146 int __user *, usockaddr_len) 2141 2147 { 2142 - return __sys_getsockname(fd, usockaddr, usockaddr_len); 2143 - } 2144 - 2145 - /* 2146 - * Get the remote address ('name') of a socket object. Move the obtained 2147 - * name to user space. 
2148 - */ 2149 - 2150 - int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, 2151 - int __user *usockaddr_len) 2152 - { 2153 - struct socket *sock; 2154 - struct sockaddr_storage address; 2155 - CLASS(fd, f)(fd); 2156 - int err; 2157 - 2158 - if (fd_empty(f)) 2159 - return -EBADF; 2160 - sock = sock_from_file(fd_file(f)); 2161 - if (unlikely(!sock)) 2162 - return -ENOTSOCK; 2163 - 2164 - err = security_socket_getpeername(sock); 2165 - if (err) 2166 - return err; 2167 - 2168 - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1); 2169 - if (err < 0) 2170 - return err; 2171 - 2172 - /* "err" is actually length in this case */ 2173 - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); 2148 + return __sys_getsockname(fd, usockaddr, usockaddr_len, 0); 2174 2149 } 2175 2150 2176 2151 SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, 2177 2152 int __user *, usockaddr_len) 2178 2153 { 2179 - return __sys_getpeername(fd, usockaddr, usockaddr_len); 2154 + return __sys_getsockname(fd, usockaddr, usockaddr_len, 1); 2180 2155 } 2181 2156 2182 2157 /* ··· 3115 3140 case SYS_GETSOCKNAME: 3116 3141 err = 3117 3142 __sys_getsockname(a0, (struct sockaddr __user *)a1, 3118 - (int __user *)a[2]); 3143 + (int __user *)a[2], 0); 3119 3144 break; 3120 3145 case SYS_GETPEERNAME: 3121 3146 err = 3122 - __sys_getpeername(a0, (struct sockaddr __user *)a1, 3123 - (int __user *)a[2]); 3147 + __sys_getsockname(a0, (struct sockaddr __user *)a1, 3148 + (int __user *)a[2], 1); 3124 3149 break; 3125 3150 case SYS_SOCKETPAIR: 3126 3151 err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
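After this rewrite, getsockname(2) and getpeername(2) are served by the same do_getsockname() helper, with the new peer argument selecting which endpoint is reported. The snippet below is a plain userspace illustration of that unchanged contract and makes no assumptions about the new kernel internals.

/* Local vs. peer endpoint of a connected UDP socket: getsockname() maps to
 * peer == 0 in the kernel, getpeername() to peer == 1. */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET, .sin_port = htons(53) };
	struct sockaddr_in local, remote;
	socklen_t len;
	int fd;

	inet_pton(AF_INET, "1.1.1.1", &dst.sin_addr);
	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		return 1;

	len = sizeof(local);
	getsockname(fd, (struct sockaddr *)&local, &len);	/* local address */
	len = sizeof(remote);
	getpeername(fd, (struct sockaddr *)&remote, &len);	/* remote address */

	printf("local port %u, remote port %u\n",
	       ntohs(local.sin_port), ntohs(remote.sin_port));
	close(fd);
	return 0;
}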