Merge tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

+1 -1

Documentation/networking/iou-zcrx.rst

··· 75 75 76 76 IORING_SETUP_SINGLE_ISSUER 77 77 IORING_SETUP_DEFER_TASKRUN 78 - IORING_SETUP_CQE32 78 + IORING_SETUP_CQE32 or IORING_SETUP_CQE_MIXED 79 79 80 80 Create memory area 81 81 ------------------

+1 -1

block/ioctl.c

··· 776 776 if (bic->res == -EAGAIN && bic->nowait) 777 777 io_uring_cmd_issue_blocking(cmd); 778 778 else 779 - io_uring_cmd_done(cmd, bic->res, 0, issue_flags); 779 + io_uring_cmd_done(cmd, bic->res, issue_flags); 780 780 } 781 781 782 782 static void bio_cmd_bio_end_io(struct bio *bio)

+3 -3

drivers/block/ublk_drv.c

··· 1189 1189 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); 1190 1190 1191 1191 /* tell ublksrv one io request is coming */ 1192 - io_uring_cmd_done(cmd, res, 0, issue_flags); 1192 + io_uring_cmd_done(cmd, res, issue_flags); 1193 1193 } 1194 1194 1195 1195 #define UBLK_REQUEUE_DELAY_MS 3 ··· 1873 1873 spin_unlock(&ubq->cancel_lock); 1874 1874 1875 1875 if (!done) 1876 - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags); 1876 + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); 1877 1877 } 1878 1878 1879 1879 /* ··· 2520 2520 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); 2521 2521 2522 2522 if (ret != -EIOCBQUEUED) 2523 - io_uring_cmd_done(cmd, ret, 0, issue_flags); 2523 + io_uring_cmd_done(cmd, ret, issue_flags); 2524 2524 } 2525 2525 2526 2526 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)

+1 -1

drivers/nvme/host/ioctl.c

··· 410 410 411 411 if (pdu->bio) 412 412 blk_rq_unmap_user(pdu->bio); 413 - io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); 413 + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); 414 414 } 415 415 416 416 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,

+1 -1

fs/btrfs/ioctl.c

··· 4695 4695 btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); 4696 4696 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 4697 4697 4698 - io_uring_cmd_done(cmd, ret, 0, issue_flags); 4698 + io_uring_cmd_done(cmd, ret, issue_flags); 4699 4699 add_rchar(current, ret); 4700 4700 4701 4701 for (index = 0; index < priv->nr_pages; index++)

+4 -4

fs/fuse/dev_uring.c

··· 351 351 spin_unlock(&queue->lock); 352 352 353 353 if (cmd) 354 - io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); 354 + io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); 355 355 356 356 if (req) 357 357 fuse_uring_stop_fuse_req_end(req); ··· 518 518 519 519 if (need_cmd_done) { 520 520 /* no queue lock to avoid lock order issues */ 521 - io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); 521 + io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); 522 522 } 523 523 } 524 524 ··· 733 733 list_move_tail(&ent->list, &queue->ent_in_userspace); 734 734 spin_unlock(&queue->lock); 735 735 736 - io_uring_cmd_done(cmd, 0, 0, issue_flags); 736 + io_uring_cmd_done(cmd, 0, issue_flags); 737 737 return 0; 738 738 } 739 739 ··· 1200 1200 ent->cmd = NULL; 1201 1201 spin_unlock(&queue->lock); 1202 1202 1203 - io_uring_cmd_done(cmd, ret, 0, issue_flags); 1203 + io_uring_cmd_done(cmd, ret, issue_flags); 1204 1204 } 1205 1205 1206 1206 /*

+49 -20

include/linux/io_uring/cmd.h

··· 11 11 /* io_uring_cmd is being issued again */ 12 12 #define IORING_URING_CMD_REISSUE (1U << 31) 13 13 14 + typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, 15 + unsigned issue_flags); 16 + 14 17 struct io_uring_cmd { 15 18 struct file *file; 16 19 const struct io_uring_sqe *sqe; 17 20 /* callback to defer completions to task context */ 18 - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); 21 + io_uring_cmd_tw_t task_work_cb; 19 22 u32 cmd_op; 20 23 u32 flags; 21 24 u8 pdu[32]; /* available inline for free use */ ··· 56 53 * Note: the caller should never hard code @issue_flags and is only allowed 57 54 * to pass the mask provided by the core io_uring code. 58 55 */ 59 - void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, 60 - unsigned issue_flags); 56 + void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, 57 + unsigned issue_flags, bool is_cqe32); 61 58 62 59 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 63 - void (*task_work_cb)(struct io_uring_cmd *, unsigned), 60 + io_uring_cmd_tw_t task_work_cb, 64 61 unsigned flags); 65 62 66 63 /* ··· 72 69 73 70 /* Execute the request from a blocking context */ 74 71 void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); 72 + 73 + /* 74 + * Select a buffer from the provided buffer group for multishot uring_cmd. 75 + * Returns the selected buffer address and size. 76 + */ 77 + struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, 78 + unsigned buf_group, size_t *len, 79 + unsigned int issue_flags); 80 + 81 + /* 82 + * Complete a multishot uring_cmd event. This will post a CQE to the completion 83 + * queue and update the provided buffer. 84 + */ 85 + bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, 86 + struct io_br_sel *sel, unsigned int issue_flags); 75 87 76 88 #else 77 89 static inline int ··· 104 86 { 105 87 return -EOPNOTSUPP; 106 88 } 107 - static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, 108 - u64 ret2, unsigned issue_flags) 89 + static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, 90 + u64 ret2, unsigned issue_flags, bool is_cqe32) 109 91 { 110 92 } 111 93 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 112 - void (*task_work_cb)(struct io_uring_cmd *, unsigned), 113 - unsigned flags) 94 + io_uring_cmd_tw_t task_work_cb, unsigned flags) 114 95 { 115 96 } 116 97 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, ··· 119 102 static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) 120 103 { 121 104 } 122 - #endif 123 - 124 - /* 125 - * Polled completions must ensure they are coming from a poll queue, and 126 - * hence are completed inside the usual poll handling loops. 127 - */ 128 - static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, 129 - ssize_t ret, ssize_t res2) 105 + static inline struct io_br_sel 106 + io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, 107 + size_t *len, unsigned int issue_flags) 130 108 { 131 - lockdep_assert(in_task()); 132 - io_uring_cmd_done(ioucmd, ret, res2, 0); 109 + return (struct io_br_sel) { .val = -EOPNOTSUPP }; 133 110 } 111 + static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, 112 + struct io_br_sel *sel, unsigned int issue_flags) 113 + { 114 + return true; 115 + } 116 + #endif 134 117 135 118 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ 136 119 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 137 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 120 + io_uring_cmd_tw_t task_work_cb) 138 121 { 139 122 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); 140 123 } 141 124 142 125 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, 143 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 126 + io_uring_cmd_tw_t task_work_cb) 144 127 { 145 128 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); 146 129 } ··· 157 140 static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) 158 141 { 159 142 return cmd_to_io_kiocb(cmd)->ctx; 143 + } 144 + 145 + static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, 146 + unsigned issue_flags) 147 + { 148 + return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false); 149 + } 150 + 151 + static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, 152 + u64 res2, unsigned issue_flags) 153 + { 154 + return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true); 160 155 } 161 156 162 157 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,

+19 -12

include/linux/io_uring_types.h

··· 86 86 }; 87 87 88 88 /* 89 + * Return value from io_buffer_list selection, to avoid stashing it in 90 + * struct io_kiocb. For legacy/classic provided buffers, keeping a reference 91 + * across execution contexts are fine. But for ring provided buffers, the 92 + * list may go away as soon as ->uring_lock is dropped. As the io_kiocb 93 + * persists, it's better to just keep the buffer local for those cases. 94 + */ 95 + struct io_br_sel { 96 + struct io_buffer_list *buf_list; 97 + /* 98 + * Some selection parts return the user address, others return an error. 99 + */ 100 + union { 101 + void __user *addr; 102 + ssize_t val; 103 + }; 104 + }; 105 + 106 + 107 + /* 89 108 * Arbitrary limit, can be raised if need be 90 109 */ 91 110 #define IO_RINGFD_REG_MAX 16 ··· 690 671 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 691 672 struct io_buffer *kbuf; 692 673 693 - /* 694 - * stores buffer ID for ring provided buffers, valid IFF 695 - * REQ_F_BUFFER_RING is set. 696 - */ 697 - struct io_buffer_list *buf_list; 698 - 699 674 struct io_rsrc_node *buf_node; 700 675 }; 701 676 ··· 737 724 struct list_head list; 738 725 struct io_uring_cqe cqe; 739 726 }; 740 - 741 - static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx) 742 - { 743 - return ctx->flags & IORING_SETUP_CQE32; 744 - } 745 - 746 727 #endif

+3

include/linux/poison.h

··· 90 90 /********** lib/stackdepot.c **********/ 91 91 #define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) 92 92 93 + /********** io_uring/ **********/ 94 + #define IO_URING_PTR_POISON ((void *)(0x1091UL + POISON_POINTER_DELTA)) 95 + 93 96 #endif

+2 -2

include/trace/events/io_uring.h

··· 340 340 __entry->user_data = cqe->user_data; 341 341 __entry->res = cqe->res; 342 342 __entry->cflags = cqe->flags; 343 - __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0; 344 - __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0; 343 + __entry->extra1 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[0] : 0; 344 + __entry->extra2 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[1] : 0; 345 345 ), 346 346 347 347 TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "

+37 -1

include/uapi/linux/io_uring.h

··· 225 225 /* Use hybrid poll in iopoll process */ 226 226 #define IORING_SETUP_HYBRID_IOPOLL (1U << 17) 227 227 228 + /* 229 + * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have 230 + * IORING_CQE_F_32 set in cqe->flags. 231 + */ 232 + #define IORING_SETUP_CQE_MIXED (1U << 18) 233 + 228 234 enum io_uring_op { 229 235 IORING_OP_NOP, 230 236 IORING_OP_READV, ··· 304 298 * sqe->uring_cmd_flags top 8bits aren't available for userspace 305 299 * IORING_URING_CMD_FIXED use registered buffer; pass this flag 306 300 * along with setting sqe->buf_index. 301 + * IORING_URING_CMD_MULTISHOT must be used with buffer select, like other 302 + * multishot commands. Not compatible with 303 + * IORING_URING_CMD_FIXED, for now. 307 304 */ 308 305 #define IORING_URING_CMD_FIXED (1U << 0) 309 - #define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED 306 + #define IORING_URING_CMD_MULTISHOT (1U << 1) 307 + #define IORING_URING_CMD_MASK (IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT) 310 308 311 309 312 310 /* ··· 464 454 #define IORING_NOP_FIXED_FILE (1U << 2) 465 455 #define IORING_NOP_FIXED_BUFFER (1U << 3) 466 456 #define IORING_NOP_TW (1U << 4) 457 + #define IORING_NOP_CQE32 (1U << 5) 467 458 468 459 /* 469 460 * IO completion data structure (Completion Queue Entry) ··· 498 487 * other provided buffer type, all completions with a 499 488 * buffer passed back is automatically returned to the 500 489 * application. 490 + * IORING_CQE_F_SKIP If set, then the application/liburing must ignore this 491 + * CQE. It's only purpose is to fill a gap in the ring, 492 + * if a large CQE is attempted posted when the ring has 493 + * just a single small CQE worth of space left before 494 + * wrapping. 495 + * IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings 496 + * setup in a mixed CQE mode, where both 16b and 32b 497 + * CQEs may be posted to the CQ ring. 501 498 */ 502 499 #define IORING_CQE_F_BUFFER (1U << 0) 503 500 #define IORING_CQE_F_MORE (1U << 1) 504 501 #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) 505 502 #define IORING_CQE_F_NOTIF (1U << 3) 506 503 #define IORING_CQE_F_BUF_MORE (1U << 4) 504 + #define IORING_CQE_F_SKIP (1U << 5) 505 + #define IORING_CQE_F_32 (1U << 15) 507 506 508 507 #define IORING_CQE_BUFFER_SHIFT 16 509 508 ··· 685 664 IORING_REGISTER_RESIZE_RINGS = 33, 686 665 687 666 IORING_REGISTER_MEM_REGION = 34, 667 + 668 + /* query various aspects of io_uring, see linux/io_uring/query.h */ 669 + IORING_REGISTER_QUERY = 35, 670 + 671 + /* return zcrx buffers back into circulation */ 672 + IORING_REGISTER_ZCRX_REFILL = 36, 688 673 689 674 /* this goes last */ 690 675 IORING_REGISTER_LAST, ··· 1071 1044 __u32 zcrx_id; 1072 1045 __u32 __resv2; 1073 1046 __u64 __resv[3]; 1047 + }; 1048 + 1049 + struct io_uring_zcrx_sync_refill { 1050 + __u32 zcrx_id; 1051 + /* the number of entries to return */ 1052 + __u32 nr_entries; 1053 + /* pointer to an array of struct io_uring_zcrx_rqe */ 1054 + __u64 rqes; 1055 + __u64 __resv[2]; 1074 1056 }; 1075 1057 1076 1058 #ifdef __cplusplus

+41

include/uapi/linux/io_uring/query.h

··· 1 + /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ 2 + /* 3 + * Header file for the io_uring query interface. 4 + */ 5 + #ifndef LINUX_IO_URING_QUERY_H 6 + #define LINUX_IO_URING_QUERY_H 7 + 8 + #include <linux/types.h> 9 + 10 + struct io_uring_query_hdr { 11 + __u64 next_entry; 12 + __u64 query_data; 13 + __u32 query_op; 14 + __u32 size; 15 + __s32 result; 16 + __u32 __resv[3]; 17 + }; 18 + 19 + enum { 20 + IO_URING_QUERY_OPCODES = 0, 21 + 22 + __IO_URING_QUERY_MAX, 23 + }; 24 + 25 + /* Doesn't require a ring */ 26 + struct io_uring_query_opcode { 27 + /* The number of supported IORING_OP_* opcodes */ 28 + __u32 nr_request_opcodes; 29 + /* The number of supported IORING_[UN]REGISTER_* opcodes */ 30 + __u32 nr_register_opcodes; 31 + /* Bitmask of all supported IORING_FEAT_* flags */ 32 + __u64 feature_flags; 33 + /* Bitmask of all supported IORING_SETUP_* flags */ 34 + __u64 ring_setup_flags; 35 + /* Bitmask of all supported IORING_ENTER_** flags */ 36 + __u64 enter_flags; 37 + /* Bitmask of all supported IOSQE_* flags */ 38 + __u64 sqe_flags; 39 + }; 40 + 41 + #endif

+1 -1

io_uring/Makefile

··· 13 13 sync.o msg_ring.o advise.o openclose.o \ 14 14 statx.o timeout.o cancel.o \ 15 15 waitid.o register.o truncate.o \ 16 - memmap.o alloc_cache.o 16 + memmap.o alloc_cache.o query.o 17 17 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o 18 18 obj-$(CONFIG_IO_WQ) += io-wq.o 19 19 obj-$(CONFIG_FUTEX) += futex.o

+1

io_uring/cancel.c

··· 11 11 12 12 #include <uapi/linux/io_uring.h> 13 13 14 + #include "filetable.h" 14 15 #include "io_uring.h" 15 16 #include "tctx.h" 16 17 #include "poll.h"

+2 -1

io_uring/cmd_net.c

··· 4 4 #include <net/sock.h> 5 5 6 6 #include "uring_cmd.h" 7 + #include "io_uring.h" 7 8 8 9 static inline int io_uring_cmd_getsockopt(struct socket *sock, 9 10 struct io_uring_cmd *cmd, ··· 74 73 75 74 cqe->user_data = 0; 76 75 cqe->res = tskey; 77 - cqe->flags = IORING_CQE_F_MORE; 76 + cqe->flags = IORING_CQE_F_MORE | ctx_cqe32_flags(cmd_to_io_kiocb(cmd)->ctx); 78 77 cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; 79 78 if (ret == SOF_TIMESTAMPING_TX_HARDWARE) 80 79 cqe->flags |= IORING_CQE_F_TSTAMP_HW;

+13 -11

io_uring/fdinfo.c

··· 9 9 10 10 #include <uapi/linux/io_uring.h> 11 11 12 - #include "io_uring.h" 12 + #include "filetable.h" 13 13 #include "sqpoll.h" 14 14 #include "fdinfo.h" 15 15 #include "cancel.h" ··· 65 65 unsigned int sq_tail = READ_ONCE(r->sq.tail); 66 66 unsigned int cq_head = READ_ONCE(r->cq.head); 67 67 unsigned int cq_tail = READ_ONCE(r->cq.tail); 68 - unsigned int cq_shift = 0; 69 68 unsigned int sq_shift = 0; 70 - unsigned int sq_entries, cq_entries; 69 + unsigned int sq_entries; 71 70 int sq_pid = -1, sq_cpu = -1; 72 71 u64 sq_total_time = 0, sq_work_time = 0; 73 72 unsigned int i; 74 73 75 - if (ctx->flags & IORING_SETUP_CQE32) 76 - cq_shift = 1; 77 74 if (ctx->flags & IORING_SETUP_SQE128) 78 75 sq_shift = 1; 79 76 ··· 122 125 seq_printf(m, "\n"); 123 126 } 124 127 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); 125 - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); 126 - for (i = 0; i < cq_entries; i++) { 127 - unsigned int entry = i + cq_head; 128 - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; 128 + while (cq_head < cq_tail) { 129 + struct io_uring_cqe *cqe; 130 + bool cqe32 = false; 129 131 132 + cqe = &r->cqes[(cq_head & cq_mask)]; 133 + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) 134 + cqe32 = true; 130 135 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", 131 - entry & cq_mask, cqe->user_data, cqe->res, 136 + cq_head & cq_mask, cqe->user_data, cqe->res, 132 137 cqe->flags); 133 - if (cq_shift) 138 + if (cqe32) 134 139 seq_printf(m, ", extra1:%llu, extra2:%llu\n", 135 140 cqe->big_cqe[0], cqe->big_cqe[1]); 136 141 seq_printf(m, "\n"); 142 + cq_head++; 143 + if (cqe32) 144 + cq_head++; 137 145 } 138 146 139 147 if (ctx->flags & IORING_SETUP_SQPOLL) {

+4 -9

io_uring/futex.c

··· 43 43 44 44 static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 45 45 { 46 - req->async_data = NULL; 47 46 hlist_del_init(&req->hash_node); 48 47 io_req_task_complete(req, tw); 49 48 } ··· 53 54 54 55 io_tw_lock(ctx, tw); 55 56 io_cache_free(&ctx->futex_cache, req->async_data); 57 + io_req_async_data_clear(req, 0); 56 58 __io_futex_complete(req, tw); 57 59 } 58 60 ··· 72 72 io_req_set_res(req, res, 0); 73 73 } 74 74 75 - kfree(req->async_data); 76 - req->flags &= ~REQ_F_ASYNC_DATA; 75 + io_req_async_data_free(req); 77 76 __io_futex_complete(req, tw); 78 77 } 79 78 ··· 231 232 io_ring_submit_unlock(ctx, issue_flags); 232 233 req_set_fail(req); 233 234 io_req_set_res(req, ret, 0); 234 - kfree(futexv); 235 - req->async_data = NULL; 236 - req->flags &= ~REQ_F_ASYNC_DATA; 235 + io_req_async_data_free(req); 237 236 return IOU_COMPLETE; 238 237 } 239 238 ··· 307 310 if (ret < 0) 308 311 req_set_fail(req); 309 312 io_req_set_res(req, ret, 0); 310 - req->async_data = NULL; 311 - req->flags &= ~REQ_F_ASYNC_DATA; 312 - kfree(ifd); 313 + io_req_async_data_free(req); 313 314 return IOU_COMPLETE; 314 315 } 315 316

+96 -49

io_uring/io_uring.c

··· 79 79 80 80 #include "io-wq.h" 81 81 82 + #include "filetable.h" 82 83 #include "io_uring.h" 83 84 #include "opdef.h" 84 85 #include "refs.h" ··· 108 107 109 108 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 110 109 IOSQE_IO_HARDLINK | IOSQE_ASYNC) 111 - 112 - #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ 113 - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) 114 110 115 111 #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 116 112 ··· 177 179 }; 178 180 #endif 179 181 182 + static void io_poison_cached_req(struct io_kiocb *req) 183 + { 184 + req->ctx = IO_URING_PTR_POISON; 185 + req->tctx = IO_URING_PTR_POISON; 186 + req->file = IO_URING_PTR_POISON; 187 + req->creds = IO_URING_PTR_POISON; 188 + req->io_task_work.func = IO_URING_PTR_POISON; 189 + req->apoll = IO_URING_PTR_POISON; 190 + } 191 + 192 + static void io_poison_req(struct io_kiocb *req) 193 + { 194 + io_poison_cached_req(req); 195 + req->async_data = IO_URING_PTR_POISON; 196 + req->kbuf = IO_URING_PTR_POISON; 197 + req->comp_list.next = IO_URING_PTR_POISON; 198 + req->file_node = IO_URING_PTR_POISON; 199 + req->link = IO_URING_PTR_POISON; 200 + } 201 + 180 202 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 181 203 { 182 204 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); ··· 253 235 254 236 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) 255 237 { 238 + if (IS_ENABLED(CONFIG_KASAN)) 239 + io_poison_cached_req(req); 256 240 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 257 241 } 258 242 ··· 614 594 615 595 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) 616 596 { 617 - size_t cqe_size = sizeof(struct io_uring_cqe); 618 - 619 597 lockdep_assert_held(&ctx->uring_lock); 620 598 621 599 /* don't abort if we're dying, entries must get freed */ 622 600 if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) 623 601 return; 624 602 625 - if (ctx->flags & IORING_SETUP_CQE32) 626 - cqe_size <<= 1; 627 - 628 603 io_cq_lock(ctx); 629 604 while (!list_empty(&ctx->cq_overflow_list)) { 605 + size_t cqe_size = sizeof(struct io_uring_cqe); 630 606 struct io_uring_cqe *cqe; 631 607 struct io_overflow_cqe *ocqe; 608 + bool is_cqe32 = false; 632 609 633 610 ocqe = list_first_entry(&ctx->cq_overflow_list, 634 611 struct io_overflow_cqe, list); 612 + if (ocqe->cqe.flags & IORING_CQE_F_32 || 613 + ctx->flags & IORING_SETUP_CQE32) { 614 + is_cqe32 = true; 615 + cqe_size <<= 1; 616 + } 635 617 636 618 if (!dying) { 637 - if (!io_get_cqe_overflow(ctx, &cqe, true)) 619 + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) 638 620 break; 639 621 memcpy(cqe, &ocqe->cqe, cqe_size); 640 622 } ··· 748 726 { 749 727 struct io_overflow_cqe *ocqe; 750 728 size_t ocq_size = sizeof(struct io_overflow_cqe); 751 - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); 729 + bool is_cqe32 = false; 752 730 753 - if (is_cqe32) 731 + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { 732 + is_cqe32 = true; 754 733 ocq_size += sizeof(struct io_uring_cqe); 734 + } 755 735 756 736 ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); 757 737 trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); ··· 772 748 } 773 749 774 750 /* 751 + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE 752 + * because the ring is a single 16b entry away from wrapping. 753 + */ 754 + static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) 755 + { 756 + if (__io_cqring_events(ctx) < ctx->cq_entries) { 757 + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; 758 + 759 + cqe->user_data = 0; 760 + cqe->res = 0; 761 + cqe->flags = IORING_CQE_F_SKIP; 762 + ctx->cached_cq_tail++; 763 + return true; 764 + } 765 + return false; 766 + } 767 + 768 + /* 775 769 * writes to the cq entry need to come after reading head; the 776 770 * control dependency is enough as we're using WRITE_ONCE to 777 771 * fill the cq entry 778 772 */ 779 - bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) 773 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) 780 774 { 781 775 struct io_rings *rings = ctx->rings; 782 776 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); ··· 808 766 if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) 809 767 return false; 810 768 769 + /* 770 + * Post dummy CQE if a 32b CQE is needed and there's only room for a 771 + * 16b CQE before the ring wraps. 772 + */ 773 + if (cqe32 && off + 1 == ctx->cq_entries) { 774 + if (!io_fill_nop_cqe(ctx, off)) 775 + return false; 776 + off = 0; 777 + } 778 + 811 779 /* userspace may cheat modifying the tail, be safe and do min */ 812 780 queued = min(__io_cqring_events(ctx), ctx->cq_entries); 813 781 free = ctx->cq_entries - queued; 814 782 /* we need a contiguous range, limit based on the current array offset */ 815 783 len = min(free, ctx->cq_entries - off); 816 - if (!len) 784 + if (len < (cqe32 + 1)) 817 785 return false; 818 786 819 787 if (ctx->flags & IORING_SETUP_CQE32) { ··· 841 789 { 842 790 struct io_uring_cqe *cqe; 843 791 844 - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) 792 + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) 845 793 return false; 846 - if (unlikely(!io_get_cqe(ctx, &cqe))) 794 + if (unlikely(!io_get_cqe(ctx, &cqe, true))) 847 795 return false; 848 796 849 797 memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); ··· 854 802 static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, 855 803 u32 cflags) 856 804 { 805 + bool cqe32 = cflags & IORING_CQE_F_32; 857 806 struct io_uring_cqe *cqe; 858 807 859 - if (likely(io_get_cqe(ctx, &cqe))) { 808 + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { 860 809 WRITE_ONCE(cqe->user_data, user_data); 861 810 WRITE_ONCE(cqe->res, res); 862 811 WRITE_ONCE(cqe->flags, cflags); 863 812 864 - if (ctx->flags & IORING_SETUP_CQE32) { 813 + if (cqe32) { 865 814 WRITE_ONCE(cqe->big_cqe[0], 0); 866 815 WRITE_ONCE(cqe->big_cqe[1], 0); 867 816 } ··· 1034 981 lockdep_assert_held(&req->ctx->uring_lock); 1035 982 1036 983 req_set_fail(req); 1037 - io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); 984 + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); 1038 985 if (def->fail) 1039 986 def->fail(req); 1040 987 io_req_complete_defer(req); ··· 2054 2001 2055 2002 switch (io_arm_poll_handler(req, 0)) { 2056 2003 case IO_APOLL_READY: 2057 - io_kbuf_recycle(req, 0); 2058 2004 io_req_task_queue(req); 2059 2005 break; 2060 2006 case IO_APOLL_ABORTED: 2061 - io_kbuf_recycle(req, 0); 2062 2007 io_queue_iowq(req); 2063 2008 break; 2064 2009 case IO_APOLL_OK: ··· 2785 2734 if (check_shl_overflow(off, 1, &off)) 2786 2735 return SIZE_MAX; 2787 2736 } 2737 + if (flags & IORING_SETUP_CQE_MIXED) { 2738 + if (cq_entries < 2) 2739 + return SIZE_MAX; 2740 + } 2788 2741 2789 2742 #ifdef CONFIG_SMP 2790 2743 off = ALIGN(off, SMP_CACHE_BYTES); ··· 2820 2765 2821 2766 while (!io_req_cache_empty(ctx)) { 2822 2767 req = io_extract_req(ctx); 2768 + io_poison_req(req); 2823 2769 kmem_cache_free(req_cachep, req); 2824 2770 nr++; 2825 2771 } ··· 3101 3045 3102 3046 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 3103 3047 /* 3104 - * Use system_unbound_wq to avoid spawning tons of event kworkers 3048 + * Use system_dfl_wq to avoid spawning tons of event kworkers 3105 3049 * if we're exiting a ton of rings at the same time. It just adds 3106 3050 * noise and overhead, there's no discernable change in runtime 3107 - * over using system_wq. 3051 + * over using system_percpu_wq. 3108 3052 */ 3109 3053 queue_work(iou_wq, &ctx->exit_work); 3110 3054 } ··· 3458 3402 struct file *file; 3459 3403 long ret; 3460 3404 3461 - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 3462 - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | 3463 - IORING_ENTER_REGISTERED_RING | 3464 - IORING_ENTER_ABS_TIMER | 3465 - IORING_ENTER_EXT_ARG_REG | 3466 - IORING_ENTER_NO_IOWAIT))) 3405 + if (unlikely(flags & ~IORING_ENTER_FLAGS)) 3467 3406 return -EINVAL; 3468 3407 3469 3408 /* ··· 3708 3657 !(flags & IORING_SETUP_SINGLE_ISSUER)) 3709 3658 return -EINVAL; 3710 3659 3660 + /* 3661 + * Nonsensical to ask for CQE32 and mixed CQE support, it's not 3662 + * supported to post 16b CQEs on a ring setup with CQE32. 3663 + */ 3664 + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == 3665 + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) 3666 + return -EINVAL; 3667 + 3711 3668 return 0; 3712 3669 } 3713 3670 ··· 3866 3807 if (ret) 3867 3808 goto err; 3868 3809 3869 - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 3870 - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 3871 - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 3872 - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 3873 - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 3874 - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | 3875 - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | 3876 - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | 3877 - IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT; 3810 + p->features = IORING_FEAT_FLAGS; 3878 3811 3879 3812 if (copy_to_user(params, p, sizeof(*p))) { 3880 3813 ret = -EFAULT; ··· 3874 3823 } 3875 3824 3876 3825 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER 3877 - && !(ctx->flags & IORING_SETUP_R_DISABLED)) 3878 - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); 3826 + && !(ctx->flags & IORING_SETUP_R_DISABLED)) { 3827 + /* 3828 + * Unlike io_register_enable_rings(), don't need WRITE_ONCE() 3829 + * since ctx isn't yet accessible from other tasks 3830 + */ 3831 + ctx->submitter_task = get_task_struct(current); 3832 + } 3879 3833 3880 3834 file = io_uring_get_file(ctx); 3881 3835 if (IS_ERR(file)) { ··· 3931 3875 return -EINVAL; 3932 3876 } 3933 3877 3934 - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 3935 - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 3936 - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 3937 - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | 3938 - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | 3939 - IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | 3940 - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | 3941 - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | 3942 - IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) 3878 + if (p.flags & ~IORING_SETUP_FLAGS) 3943 3879 return -EINVAL; 3944 - 3945 3880 return io_uring_create(entries, &p, params); 3946 3881 } 3947 3882

+105 -15

io_uring/io_uring.h

··· 11 11 #include "alloc_cache.h" 12 12 #include "io-wq.h" 13 13 #include "slist.h" 14 - #include "filetable.h" 15 14 #include "opdef.h" 16 15 17 16 #ifndef CREATE_TRACE_POINTS 18 17 #include <trace/events/io_uring.h> 19 18 #endif 19 + 20 + #define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ 21 + IORING_FEAT_NODROP |\ 22 + IORING_FEAT_SUBMIT_STABLE |\ 23 + IORING_FEAT_RW_CUR_POS |\ 24 + IORING_FEAT_CUR_PERSONALITY |\ 25 + IORING_FEAT_FAST_POLL |\ 26 + IORING_FEAT_POLL_32BITS |\ 27 + IORING_FEAT_SQPOLL_NONFIXED |\ 28 + IORING_FEAT_EXT_ARG |\ 29 + IORING_FEAT_NATIVE_WORKERS |\ 30 + IORING_FEAT_RSRC_TAGS |\ 31 + IORING_FEAT_CQE_SKIP |\ 32 + IORING_FEAT_LINKED_FILE |\ 33 + IORING_FEAT_REG_REG_RING |\ 34 + IORING_FEAT_RECVSEND_BUNDLE |\ 35 + IORING_FEAT_MIN_TIMEOUT |\ 36 + IORING_FEAT_RW_ATTR |\ 37 + IORING_FEAT_NO_IOWAIT) 38 + 39 + #define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\ 40 + IORING_SETUP_SQPOLL |\ 41 + IORING_SETUP_SQ_AFF |\ 42 + IORING_SETUP_CQSIZE |\ 43 + IORING_SETUP_CLAMP |\ 44 + IORING_SETUP_ATTACH_WQ |\ 45 + IORING_SETUP_R_DISABLED |\ 46 + IORING_SETUP_SUBMIT_ALL |\ 47 + IORING_SETUP_COOP_TASKRUN |\ 48 + IORING_SETUP_TASKRUN_FLAG |\ 49 + IORING_SETUP_SQE128 |\ 50 + IORING_SETUP_CQE32 |\ 51 + IORING_SETUP_SINGLE_ISSUER |\ 52 + IORING_SETUP_DEFER_TASKRUN |\ 53 + IORING_SETUP_NO_MMAP |\ 54 + IORING_SETUP_REGISTERED_FD_ONLY |\ 55 + IORING_SETUP_NO_SQARRAY |\ 56 + IORING_SETUP_HYBRID_IOPOLL |\ 57 + IORING_SETUP_CQE_MIXED) 58 + 59 + #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ 60 + IORING_ENTER_SQ_WAKEUP |\ 61 + IORING_ENTER_SQ_WAIT |\ 62 + IORING_ENTER_EXT_ARG |\ 63 + IORING_ENTER_REGISTERED_RING |\ 64 + IORING_ENTER_ABS_TIMER |\ 65 + IORING_ENTER_EXT_ARG_REG |\ 66 + IORING_ENTER_NO_IOWAIT) 67 + 68 + 69 + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE |\ 70 + IOSQE_IO_DRAIN |\ 71 + IOSQE_IO_LINK |\ 72 + IOSQE_IO_HARDLINK |\ 73 + IOSQE_ASYNC |\ 74 + IOSQE_BUFFER_SELECT |\ 75 + IOSQE_CQE_SKIP_SUCCESS) 20 76 21 77 enum { 22 78 IOU_COMPLETE = 0, ··· 131 75 unsigned long rings_size(unsigned int flags, unsigned int sq_entries, 132 76 unsigned int cq_entries, size_t *sq_offset); 133 77 int io_uring_fill_params(unsigned entries, struct io_uring_params *p); 134 - bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); 78 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); 135 79 int io_run_task_work_sig(struct io_ring_ctx *ctx); 136 80 void io_req_defer_failed(struct io_kiocb *req, s32 res); 137 81 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); ··· 225 169 226 170 static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, 227 171 struct io_uring_cqe **ret, 228 - bool overflow) 172 + bool overflow, bool cqe32) 229 173 { 230 174 io_lockdep_assert_cq_locked(ctx); 231 175 232 - if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { 233 - if (unlikely(!io_cqe_cache_refill(ctx, overflow))) 176 + if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { 177 + if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) 234 178 return false; 235 179 } 236 180 *ret = ctx->cqe_cached; 237 181 ctx->cached_cq_tail++; 238 182 ctx->cqe_cached++; 239 - if (ctx->flags & IORING_SETUP_CQE32) 183 + if (ctx->flags & IORING_SETUP_CQE32) { 240 184 ctx->cqe_cached++; 185 + } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { 186 + ctx->cqe_cached++; 187 + ctx->cached_cq_tail++; 188 + } 189 + WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); 241 190 return true; 242 191 } 243 192 244 - static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) 193 + static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, 194 + bool cqe32) 245 195 { 246 - return io_get_cqe_overflow(ctx, ret, false); 196 + return io_get_cqe_overflow(ctx, ret, false, cqe32); 247 197 } 248 198 249 199 static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, ··· 258 196 io_lockdep_assert_cq_locked(ctx); 259 197 260 198 ctx->submit_state.cq_flush = true; 261 - return io_get_cqe(ctx, cqe_ret); 199 + return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); 262 200 } 263 201 264 202 static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, 265 203 struct io_kiocb *req) 266 204 { 205 + bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; 267 206 struct io_uring_cqe *cqe; 268 207 269 208 /* 270 - * If we can't get a cq entry, userspace overflowed the 271 - * submission (by quite a lot). Increment the overflow count in 272 - * the ring. 209 + * If we can't get a cq entry, userspace overflowed the submission 210 + * (by quite a lot). 273 211 */ 274 - if (unlikely(!io_get_cqe(ctx, &cqe))) 212 + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) 275 213 return false; 276 214 277 - 278 215 memcpy(cqe, &req->cqe, sizeof(*cqe)); 279 - if (ctx->flags & IORING_SETUP_CQE32) { 216 + if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) { 280 217 memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); 281 218 memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 282 219 } ··· 300 239 req->cqe.flags = cflags; 301 240 } 302 241 242 + static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) 243 + { 244 + if (ctx->flags & IORING_SETUP_CQE_MIXED) 245 + return IORING_CQE_F_32; 246 + return 0; 247 + } 248 + 249 + static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, 250 + __u64 extra1, __u64 extra2) 251 + { 252 + req->cqe.res = res; 253 + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); 254 + req->big_cqe.extra1 = extra1; 255 + req->big_cqe.extra2 = extra2; 256 + } 257 + 303 258 static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, 304 259 struct io_kiocb *req) 305 260 { ··· 335 258 static inline bool req_has_async_data(struct io_kiocb *req) 336 259 { 337 260 return req->flags & REQ_F_ASYNC_DATA; 261 + } 262 + 263 + static inline void io_req_async_data_clear(struct io_kiocb *req, 264 + io_req_flags_t extra_flags) 265 + { 266 + req->flags &= ~(REQ_F_ASYNC_DATA|extra_flags); 267 + req->async_data = NULL; 268 + } 269 + 270 + static inline void io_req_async_data_free(struct io_kiocb *req) 271 + { 272 + kfree(req->async_data); 273 + io_req_async_data_clear(req, 0); 338 274 } 339 275 340 276 static inline void io_put_file(struct io_kiocb *req)

+36 -31

io_uring/kbuf.c

··· 155 155 return 1; 156 156 } 157 157 158 - static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, 159 - struct io_buffer_list *bl, 160 - unsigned int issue_flags) 158 + static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, 159 + struct io_buffer_list *bl, 160 + unsigned int issue_flags) 161 161 { 162 162 struct io_uring_buf_ring *br = bl->buf_ring; 163 163 __u16 tail, head = bl->head; 164 + struct io_br_sel sel = { }; 164 165 struct io_uring_buf *buf; 165 - void __user *ret; 166 166 u32 buf_len; 167 167 168 168 tail = smp_load_acquire(&br->tail); 169 169 if (unlikely(tail == head)) 170 - return NULL; 170 + return sel; 171 171 172 172 if (head + 1 == tail) 173 173 req->flags |= REQ_F_BL_EMPTY; ··· 177 177 if (*len == 0 || *len > buf_len) 178 178 *len = buf_len; 179 179 req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; 180 - req->buf_list = bl; 181 180 req->buf_index = buf->bid; 182 - ret = u64_to_user_ptr(buf->addr); 181 + sel.buf_list = bl; 182 + sel.addr = u64_to_user_ptr(buf->addr); 183 183 184 184 if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { 185 185 /* ··· 192 192 * the transfer completes (or if we get -EAGAIN and must poll of 193 193 * retry). 194 194 */ 195 - io_kbuf_commit(req, bl, *len, 1); 196 - req->buf_list = NULL; 195 + io_kbuf_commit(req, sel.buf_list, *len, 1); 196 + sel.buf_list = NULL; 197 197 } 198 - return ret; 198 + return sel; 199 199 } 200 200 201 - void __user *io_buffer_select(struct io_kiocb *req, size_t *len, 202 - unsigned buf_group, unsigned int issue_flags) 201 + struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, 202 + unsigned buf_group, unsigned int issue_flags) 203 203 { 204 204 struct io_ring_ctx *ctx = req->ctx; 205 + struct io_br_sel sel = { }; 205 206 struct io_buffer_list *bl; 206 - void __user *ret = NULL; 207 207 208 208 io_ring_submit_lock(req->ctx, issue_flags); 209 209 210 210 bl = io_buffer_get_list(ctx, buf_group); 211 211 if (likely(bl)) { 212 212 if (bl->flags & IOBL_BUF_RING) 213 - ret = io_ring_buffer_select(req, len, bl, issue_flags); 213 + sel = io_ring_buffer_select(req, len, bl, issue_flags); 214 214 else 215 - ret = io_provided_buffer_select(req, len, bl); 215 + sel.addr = io_provided_buffer_select(req, len, bl); 216 216 } 217 217 io_ring_submit_unlock(req->ctx, issue_flags); 218 - return ret; 218 + return sel; 219 219 } 220 220 221 221 /* cap it at a reasonable 256, will be one page even for 4K */ ··· 300 300 req->flags |= REQ_F_BL_EMPTY; 301 301 302 302 req->flags |= REQ_F_BUFFER_RING; 303 - req->buf_list = bl; 304 303 return iov - arg->iovs; 305 304 } 306 305 307 306 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, 308 - unsigned int issue_flags) 307 + struct io_br_sel *sel, unsigned int issue_flags) 309 308 { 310 309 struct io_ring_ctx *ctx = req->ctx; 311 - struct io_buffer_list *bl; 312 310 int ret = -ENOENT; 313 311 314 312 io_ring_submit_lock(ctx, issue_flags); 315 - bl = io_buffer_get_list(ctx, arg->buf_group); 316 - if (unlikely(!bl)) 313 + sel->buf_list = io_buffer_get_list(ctx, arg->buf_group); 314 + if (unlikely(!sel->buf_list)) 317 315 goto out_unlock; 318 316 319 - if (bl->flags & IOBL_BUF_RING) { 320 - ret = io_ring_buffers_peek(req, arg, bl); 317 + if (sel->buf_list->flags & IOBL_BUF_RING) { 318 + ret = io_ring_buffers_peek(req, arg, sel->buf_list); 321 319 /* 322 320 * Don't recycle these buffers if we need to go through poll. 323 321 * Nobody else can use them anyway, and holding on to provided ··· 325 327 */ 326 328 if (ret > 0) { 327 329 req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; 328 - io_kbuf_commit(req, bl, arg->out_len, ret); 330 + io_kbuf_commit(req, sel->buf_list, arg->out_len, ret); 329 331 } 330 332 } else { 331 - ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); 333 + ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); 332 334 } 333 335 out_unlock: 334 - io_ring_submit_unlock(ctx, issue_flags); 336 + if (issue_flags & IO_URING_F_UNLOCKED) { 337 + sel->buf_list = NULL; 338 + mutex_unlock(&ctx->uring_lock); 339 + } 335 340 return ret; 336 341 } 337 342 338 - int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) 343 + int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, 344 + struct io_br_sel *sel) 339 345 { 340 346 struct io_ring_ctx *ctx = req->ctx; 341 347 struct io_buffer_list *bl; ··· 355 353 ret = io_ring_buffers_peek(req, arg, bl); 356 354 if (ret > 0) 357 355 req->flags |= REQ_F_BUFFERS_COMMIT; 356 + sel->buf_list = bl; 358 357 return ret; 359 358 } 360 359 361 360 /* don't support multiple buffer selections for legacy */ 361 + sel->buf_list = NULL; 362 362 return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); 363 363 } 364 364 365 - static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) 365 + static inline bool __io_put_kbuf_ring(struct io_kiocb *req, 366 + struct io_buffer_list *bl, int len, int nr) 366 367 { 367 - struct io_buffer_list *bl = req->buf_list; 368 368 bool ret = true; 369 369 370 370 if (bl) ··· 376 372 return ret; 377 373 } 378 374 379 - unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) 375 + unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, 376 + int len, int nbufs) 380 377 { 381 378 unsigned int ret; 382 379 ··· 388 383 return ret; 389 384 } 390 385 391 - if (!__io_put_kbuf_ring(req, len, nbufs)) 386 + if (!__io_put_kbuf_ring(req, bl, len, nbufs)) 392 387 ret |= IORING_CQE_F_BUF_MORE; 393 388 return ret; 394 389 }

+18 -21

io_uring/kbuf.h

··· 62 62 unsigned short partial_map; 63 63 }; 64 64 65 - void __user *io_buffer_select(struct io_kiocb *req, size_t *len, 66 - unsigned buf_group, unsigned int issue_flags); 65 + struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, 66 + unsigned buf_group, unsigned int issue_flags); 67 67 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, 68 - unsigned int issue_flags); 69 - int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); 68 + struct io_br_sel *sel, unsigned int issue_flags); 69 + int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, 70 + struct io_br_sel *sel); 70 71 void io_destroy_buffers(struct io_ring_ctx *ctx); 71 72 72 73 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); ··· 81 80 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); 82 81 void io_kbuf_drop_legacy(struct io_kiocb *req); 83 82 84 - unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); 83 + unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, 84 + int len, int nbufs); 85 85 bool io_kbuf_commit(struct io_kiocb *req, 86 86 struct io_buffer_list *bl, int len, int nr); 87 87 88 88 struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, 89 89 unsigned int bgid); 90 90 91 - static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) 91 + static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, 92 + struct io_buffer_list *bl) 92 93 { 93 - /* 94 - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear 95 - * the flag and hence ensure that bl->head doesn't get incremented. 96 - * If the tail has already been incremented, hang on to it. 97 - * The exception is partial io, that case we should increment bl->head 98 - * to monopolize the buffer. 99 - */ 100 - if (req->buf_list) { 94 + if (bl) { 101 95 req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); 102 96 return true; 103 97 } ··· 106 110 return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); 107 111 } 108 112 109 - static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) 113 + static inline bool io_kbuf_recycle(struct io_kiocb *req, struct io_buffer_list *bl, 114 + unsigned issue_flags) 110 115 { 111 116 if (req->flags & REQ_F_BL_NO_RECYCLE) 112 117 return false; 118 + if (req->flags & REQ_F_BUFFER_RING) 119 + return io_kbuf_recycle_ring(req, bl); 113 120 if (req->flags & REQ_F_BUFFER_SELECTED) 114 121 return io_kbuf_recycle_legacy(req, issue_flags); 115 - if (req->flags & REQ_F_BUFFER_RING) 116 - return io_kbuf_recycle_ring(req); 117 122 return false; 118 123 } 119 124 120 125 static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, 121 - unsigned issue_flags) 126 + struct io_buffer_list *bl) 122 127 { 123 128 if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 124 129 return 0; 125 - return __io_put_kbufs(req, len, 1); 130 + return __io_put_kbufs(req, bl, len, 1); 126 131 } 127 132 128 133 static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, 129 - int nbufs, unsigned issue_flags) 134 + struct io_buffer_list *bl, int nbufs) 130 135 { 131 136 if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 132 137 return 0; 133 - return __io_put_kbufs(req, len, nbufs); 138 + return __io_put_kbufs(req, bl, len, nbufs); 134 139 } 135 140 #endif

+75 -85

io_uring/net.c

··· 10 10 11 11 #include <uapi/linux/io_uring.h> 12 12 13 + #include "filetable.h" 13 14 #include "io_uring.h" 14 15 #include "kbuf.h" 15 16 #include "alloc_cache.h" ··· 179 178 if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP) 180 179 io_vec_free(&hdr->vec); 181 180 182 - if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { 183 - req->async_data = NULL; 184 - req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); 185 - } 181 + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) 182 + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); 186 183 } 187 184 188 185 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) ··· 432 433 if (req->opcode == IORING_OP_SENDMSG) 433 434 return -EINVAL; 434 435 sr->msg_flags |= MSG_WAITALL; 435 - req->buf_list = NULL; 436 436 req->flags |= REQ_F_MULTISHOT; 437 437 } 438 438 ··· 492 494 return nbufs; 493 495 } 494 496 495 - static int io_net_kbuf_recyle(struct io_kiocb *req, 497 + static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl, 496 498 struct io_async_msghdr *kmsg, int len) 497 499 { 498 500 req->flags |= REQ_F_BL_NO_RECYCLE; 499 501 if (req->flags & REQ_F_BUFFERS_COMMIT) 500 - io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len)); 502 + io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len)); 501 503 return IOU_RETRY; 502 504 } 503 505 504 - static inline bool io_send_finish(struct io_kiocb *req, int *ret, 506 + static inline bool io_send_finish(struct io_kiocb *req, 505 507 struct io_async_msghdr *kmsg, 506 - unsigned issue_flags) 508 + struct io_br_sel *sel) 507 509 { 508 510 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 509 - bool bundle_finished = *ret <= 0; 511 + bool bundle_finished = sel->val <= 0; 510 512 unsigned int cflags; 511 513 512 514 if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { 513 - cflags = io_put_kbuf(req, *ret, issue_flags); 515 + cflags = io_put_kbuf(req, sel->val, sel->buf_list); 514 516 goto finish; 515 517 } 516 518 517 - cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); 519 + cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); 518 520 519 521 if (bundle_finished || req->flags & REQ_F_BL_EMPTY) 520 522 goto finish; ··· 523 525 * Fill CQE for this receive and see if we should keep trying to 524 526 * receive from this socket. 525 527 */ 526 - if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 528 + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { 527 529 io_mshot_prep_retry(req, kmsg); 528 530 return false; 529 531 } 530 532 531 533 /* Otherwise stop bundle and use the current result. */ 532 534 finish: 533 - io_req_set_res(req, *ret, cflags); 534 - *ret = IOU_COMPLETE; 535 + io_req_set_res(req, sel->val, cflags); 536 + sel->val = IOU_COMPLETE; 535 537 return true; 536 538 } 537 539 ··· 569 571 kmsg->msg.msg_controllen = 0; 570 572 kmsg->msg.msg_control = NULL; 571 573 sr->done_io += ret; 572 - return io_net_kbuf_recyle(req, kmsg, ret); 574 + return -EAGAIN; 573 575 } 574 576 if (ret == -ERESTARTSYS) 575 577 ret = -EINTR; ··· 585 587 } 586 588 587 589 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, 588 - struct io_async_msghdr *kmsg) 590 + struct io_br_sel *sel, struct io_async_msghdr *kmsg) 589 591 { 590 592 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 591 - 592 - int ret; 593 593 struct buf_sel_arg arg = { 594 594 .iovs = &kmsg->fast_iov, 595 595 .max_len = min_not_zero(sr->len, INT_MAX), 596 596 .nr_iovs = 1, 597 597 .buf_group = sr->buf_group, 598 598 }; 599 + int ret; 599 600 600 601 if (kmsg->vec.iovec) { 601 602 arg.nr_iovs = kmsg->vec.nr; ··· 607 610 else 608 611 arg.mode |= KBUF_MODE_EXPAND; 609 612 610 - ret = io_buffers_select(req, &arg, issue_flags); 613 + ret = io_buffers_select(req, &arg, sel, issue_flags); 611 614 if (unlikely(ret < 0)) 612 615 return ret; 613 616 ··· 636 639 { 637 640 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 638 641 struct io_async_msghdr *kmsg = req->async_data; 642 + struct io_br_sel sel = { }; 639 643 struct socket *sock; 640 644 unsigned flags; 641 645 int min_ret = 0; ··· 655 657 flags |= MSG_DONTWAIT; 656 658 657 659 retry_bundle: 660 + sel.buf_list = NULL; 658 661 if (io_do_buffer_select(req)) { 659 - ret = io_send_select_buffer(req, issue_flags, kmsg); 662 + ret = io_send_select_buffer(req, issue_flags, &sel, kmsg); 660 663 if (ret) 661 664 return ret; 662 665 } ··· 681 682 sr->len -= ret; 682 683 sr->buf += ret; 683 684 sr->done_io += ret; 684 - return io_net_kbuf_recyle(req, kmsg, ret); 685 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 685 686 } 686 687 if (ret == -ERESTARTSYS) 687 688 ret = -EINTR; ··· 692 693 else if (sr->done_io) 693 694 ret = sr->done_io; 694 695 695 - if (!io_send_finish(req, &ret, kmsg, issue_flags)) 696 + sel.val = ret; 697 + if (!io_send_finish(req, kmsg, &sel)) 696 698 goto retry_bundle; 697 699 698 700 io_req_msg_cleanup(req, issue_flags); 699 - return ret; 701 + return sel.val; 700 702 } 701 703 702 704 static int io_recvmsg_mshot_prep(struct io_kiocb *req, ··· 794 794 req->flags |= REQ_F_NOWAIT; 795 795 if (sr->msg_flags & MSG_ERRQUEUE) 796 796 req->flags |= REQ_F_CLEAR_POLLIN; 797 - if (req->flags & REQ_F_BUFFER_SELECT) { 798 - /* 799 - * Store the buffer group for this multishot receive separately, 800 - * as if we end up doing an io-wq based issue that selects a 801 - * buffer, it has to be committed immediately and that will 802 - * clear ->buf_list. This means we lose the link to the buffer 803 - * list, and the eventual buffer put on completion then cannot 804 - * restore it. 805 - */ 797 + if (req->flags & REQ_F_BUFFER_SELECT) 806 798 sr->buf_group = req->buf_index; 807 - req->buf_list = NULL; 808 - } 809 799 sr->mshot_total_len = sr->mshot_len = 0; 810 800 if (sr->flags & IORING_RECV_MULTISHOT) { 811 801 if (!(req->flags & REQ_F_BUFFER_SELECT)) ··· 836 846 * Returns true if it is actually finished, or false if it should run 837 847 * again (for multishot). 838 848 */ 839 - static inline bool io_recv_finish(struct io_kiocb *req, int *ret, 849 + static inline bool io_recv_finish(struct io_kiocb *req, 840 850 struct io_async_msghdr *kmsg, 841 - bool mshot_finished, unsigned issue_flags) 851 + struct io_br_sel *sel, bool mshot_finished, 852 + unsigned issue_flags) 842 853 { 843 854 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 844 855 unsigned int cflags = 0; ··· 847 856 if (kmsg->msg.msg_inq > 0) 848 857 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 849 858 850 - if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { 859 + if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { 851 860 /* 852 861 * If sr->len hits zero, the limit has been reached. Mark 853 862 * mshot as finished, and flag MSHOT_DONE as well to prevent 854 863 * a potential bundle from being retried. 855 864 */ 856 - sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len); 865 + sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len); 857 866 if (!sr->mshot_total_len) { 858 867 sr->flags |= IORING_RECV_MSHOT_DONE; 859 868 mshot_finished = true; ··· 861 870 } 862 871 863 872 if (sr->flags & IORING_RECVSEND_BUNDLE) { 864 - size_t this_ret = *ret - sr->done_io; 873 + size_t this_ret = sel->val - sr->done_io; 865 874 866 - cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), 867 - issue_flags); 875 + cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret)); 868 876 if (sr->flags & IORING_RECV_RETRY) 869 877 cflags = req->cqe.flags | (cflags & CQE_F_MASK); 870 - if (sr->mshot_len && *ret >= sr->mshot_len) 878 + if (sr->mshot_len && sel->val >= sr->mshot_len) 871 879 sr->flags |= IORING_RECV_MSHOT_CAP; 872 880 /* bundle with no more immediate buffers, we're done */ 873 881 if (req->flags & REQ_F_BL_EMPTY) ··· 885 895 return false; 886 896 } 887 897 } else { 888 - cflags |= io_put_kbuf(req, *ret, issue_flags); 898 + cflags |= io_put_kbuf(req, sel->val, sel->buf_list); 889 899 } 890 900 891 901 /* ··· 893 903 * receive from this socket. 894 904 */ 895 905 if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && 896 - io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 897 - *ret = IOU_RETRY; 906 + io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { 907 + sel->val = IOU_RETRY; 898 908 io_mshot_prep_retry(req, kmsg); 899 909 /* Known not-empty or unknown state, retry */ 900 910 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { ··· 906 916 sr->nr_multishot_loops = 0; 907 917 sr->flags &= ~IORING_RECV_MSHOT_CAP; 908 918 if (issue_flags & IO_URING_F_MULTISHOT) 909 - *ret = IOU_REQUEUE; 919 + sel->val = IOU_REQUEUE; 910 920 } 911 921 return true; 912 922 } 913 923 914 924 /* Finish the request / stop multishot. */ 915 925 finish: 916 - io_req_set_res(req, *ret, cflags); 917 - *ret = IOU_COMPLETE; 926 + io_req_set_res(req, sel->val, cflags); 927 + sel->val = IOU_COMPLETE; 918 928 io_req_msg_cleanup(req, issue_flags); 919 929 return true; 920 930 } ··· 1007 1017 { 1008 1018 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1009 1019 struct io_async_msghdr *kmsg = req->async_data; 1020 + struct io_br_sel sel = { }; 1010 1021 struct socket *sock; 1011 1022 unsigned flags; 1012 1023 int ret, min_ret = 0; ··· 1027 1036 flags |= MSG_DONTWAIT; 1028 1037 1029 1038 retry_multishot: 1039 + sel.buf_list = NULL; 1030 1040 if (io_do_buffer_select(req)) { 1031 - void __user *buf; 1032 1041 size_t len = sr->len; 1033 1042 1034 - buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); 1035 - if (!buf) 1043 + sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); 1044 + if (!sel.addr) 1036 1045 return -ENOBUFS; 1037 1046 1038 1047 if (req->flags & REQ_F_APOLL_MULTISHOT) { 1039 - ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); 1048 + ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len); 1040 1049 if (ret) { 1041 - io_kbuf_recycle(req, issue_flags); 1050 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1042 1051 return ret; 1043 1052 } 1044 1053 } 1045 1054 1046 - iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); 1055 + iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len); 1047 1056 } 1048 1057 1049 1058 kmsg->msg.msg_get_inq = 1; ··· 1062 1071 1063 1072 if (ret < min_ret) { 1064 1073 if (ret == -EAGAIN && force_nonblock) { 1065 - if (issue_flags & IO_URING_F_MULTISHOT) 1066 - io_kbuf_recycle(req, issue_flags); 1067 - 1074 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1068 1075 return IOU_RETRY; 1069 1076 } 1070 1077 if (ret > 0 && io_net_retry(sock, flags)) { 1071 1078 sr->done_io += ret; 1072 - return io_net_kbuf_recyle(req, kmsg, ret); 1079 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 1073 1080 } 1074 1081 if (ret == -ERESTARTSYS) 1075 1082 ret = -EINTR; ··· 1081 1092 else if (sr->done_io) 1082 1093 ret = sr->done_io; 1083 1094 else 1084 - io_kbuf_recycle(req, issue_flags); 1095 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1085 1096 1086 - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) 1097 + sel.val = ret; 1098 + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) 1087 1099 goto retry_multishot; 1088 1100 1089 - return ret; 1101 + return sel.val; 1090 1102 } 1091 1103 1092 1104 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, 1093 - size_t *len, unsigned int issue_flags) 1105 + struct io_br_sel *sel, unsigned int issue_flags) 1094 1106 { 1095 1107 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1096 1108 int ret; ··· 1116 1126 arg.mode |= KBUF_MODE_FREE; 1117 1127 } 1118 1128 1119 - if (*len) 1120 - arg.max_len = *len; 1129 + if (sel->val) 1130 + arg.max_len = sel->val; 1121 1131 else if (kmsg->msg.msg_inq > 1) 1122 - arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq); 1132 + arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq); 1123 1133 1124 1134 /* if mshot limited, ensure we don't go over */ 1125 1135 if (sr->flags & IORING_RECV_MSHOT_LIM) 1126 1136 arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len); 1127 - ret = io_buffers_peek(req, &arg); 1137 + ret = io_buffers_peek(req, &arg, sel); 1128 1138 if (unlikely(ret < 0)) 1129 1139 return ret; 1130 1140 ··· 1145 1155 iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, 1146 1156 arg.out_len); 1147 1157 } else { 1148 - void __user *buf; 1158 + size_t len = sel->val; 1149 1159 1150 - *len = sr->len; 1151 - buf = io_buffer_select(req, len, sr->buf_group, issue_flags); 1152 - if (!buf) 1160 + *sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); 1161 + if (!sel->addr) 1153 1162 return -ENOBUFS; 1154 - sr->buf = buf; 1155 - sr->len = *len; 1163 + sr->buf = sel->addr; 1164 + sr->len = len; 1156 1165 map_ubuf: 1157 1166 ret = import_ubuf(ITER_DEST, sr->buf, sr->len, 1158 1167 &kmsg->msg.msg_iter); ··· 1166 1177 { 1167 1178 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1168 1179 struct io_async_msghdr *kmsg = req->async_data; 1180 + struct io_br_sel sel; 1169 1181 struct socket *sock; 1170 1182 unsigned flags; 1171 1183 int ret, min_ret = 0; 1172 1184 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1173 - size_t len = sr->len; 1174 1185 bool mshot_finished; 1175 1186 1176 1187 if (!(req->flags & REQ_F_POLLED) && ··· 1186 1197 flags |= MSG_DONTWAIT; 1187 1198 1188 1199 retry_multishot: 1200 + sel.buf_list = NULL; 1189 1201 if (io_do_buffer_select(req)) { 1190 - ret = io_recv_buf_select(req, kmsg, &len, issue_flags); 1191 - if (unlikely(ret)) { 1202 + sel.val = sr->len; 1203 + ret = io_recv_buf_select(req, kmsg, &sel, issue_flags); 1204 + if (unlikely(ret < 0)) { 1192 1205 kmsg->msg.msg_inq = -1; 1193 1206 goto out_free; 1194 1207 } ··· 1206 1215 ret = sock_recvmsg(sock, &kmsg->msg, flags); 1207 1216 if (ret < min_ret) { 1208 1217 if (ret == -EAGAIN && force_nonblock) { 1209 - if (issue_flags & IO_URING_F_MULTISHOT) 1210 - io_kbuf_recycle(req, issue_flags); 1211 - 1218 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1212 1219 return IOU_RETRY; 1213 1220 } 1214 1221 if (ret > 0 && io_net_retry(sock, flags)) { 1215 1222 sr->len -= ret; 1216 1223 sr->buf += ret; 1217 1224 sr->done_io += ret; 1218 - return io_net_kbuf_recyle(req, kmsg, ret); 1225 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 1219 1226 } 1220 1227 if (ret == -ERESTARTSYS) 1221 1228 ret = -EINTR; ··· 1229 1240 else if (sr->done_io) 1230 1241 ret = sr->done_io; 1231 1242 else 1232 - io_kbuf_recycle(req, issue_flags); 1243 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1233 1244 1234 - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) 1245 + sel.val = ret; 1246 + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) 1235 1247 goto retry_multishot; 1236 1248 1237 - return ret; 1249 + return sel.val; 1238 1250 } 1239 1251 1240 1252 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ··· 1495 1505 zc->len -= ret; 1496 1506 zc->buf += ret; 1497 1507 zc->done_io += ret; 1498 - return io_net_kbuf_recyle(req, kmsg, ret); 1508 + return -EAGAIN; 1499 1509 } 1500 1510 if (ret == -ERESTARTSYS) 1501 1511 ret = -EINTR; ··· 1565 1575 1566 1576 if (ret > 0 && io_net_retry(sock, flags)) { 1567 1577 sr->done_io += ret; 1568 - return io_net_kbuf_recyle(req, kmsg, ret); 1578 + return -EAGAIN; 1569 1579 } 1570 1580 if (ret == -ERESTARTSYS) 1571 1581 ret = -EINTR;

+15 -2

io_uring/nop.c

··· 17 17 int result; 18 18 int fd; 19 19 unsigned int flags; 20 + __u64 extra1; 21 + __u64 extra2; 20 22 }; 21 23 22 24 #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ 23 25 IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ 24 - IORING_NOP_TW) 26 + IORING_NOP_TW | IORING_NOP_CQE32) 25 27 26 28 int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 27 29 { ··· 43 41 nop->fd = -1; 44 42 if (nop->flags & IORING_NOP_FIXED_BUFFER) 45 43 req->buf_index = READ_ONCE(sqe->buf_index); 44 + if (nop->flags & IORING_NOP_CQE32) { 45 + struct io_ring_ctx *ctx = req->ctx; 46 + 47 + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 48 + return -EINVAL; 49 + nop->extra1 = READ_ONCE(sqe->off); 50 + nop->extra2 = READ_ONCE(sqe->addr); 51 + } 46 52 return 0; 47 53 } 48 54 ··· 78 68 done: 79 69 if (ret < 0) 80 70 req_set_fail(req); 81 - io_req_set_res(req, nop->result, 0); 71 + if (nop->flags & IORING_NOP_CQE32) 72 + io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2); 73 + else 74 + io_req_set_res(req, nop->result, 0); 82 75 if (nop->flags & IORING_NOP_TW) { 83 76 req->io_task_work.func = io_req_task_complete; 84 77 io_req_task_work_add(req);

+5

io_uring/notif.c

··· 14 14 static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) 15 15 { 16 16 struct io_notif_data *nd = io_notif_to_data(notif); 17 + struct io_ring_ctx *ctx = notif->ctx; 18 + 19 + lockdep_assert_held(&ctx->uring_lock); 17 20 18 21 do { 19 22 notif = cmd_to_io_kiocb(nd); 20 23 24 + if (WARN_ON_ONCE(ctx != notif->ctx)) 25 + return; 21 26 lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); 22 27 23 28 if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used))

+1

io_uring/opdef.c

··· 413 413 #endif 414 414 }, 415 415 [IORING_OP_URING_CMD] = { 416 + .buffer_select = 1, 416 417 .needs_file = 1, 417 418 .plug = 1, 418 419 .iopoll = 1,

+1

io_uring/openclose.c

··· 14 14 15 15 #include "../fs/internal.h" 16 16 17 + #include "filetable.h" 17 18 #include "io_uring.h" 18 19 #include "rsrc.h" 19 20 #include "openclose.h"

-4

io_uring/poll.c

··· 316 316 317 317 ret = io_poll_check_events(req, tw); 318 318 if (ret == IOU_POLL_NO_ACTION) { 319 - io_kbuf_recycle(req, 0); 320 319 return; 321 320 } else if (ret == IOU_POLL_REQUEUE) { 322 - io_kbuf_recycle(req, 0); 323 321 __io_poll_execute(req, 0); 324 322 return; 325 323 } ··· 683 685 req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL); 684 686 req->flags |= REQ_F_POLLED; 685 687 ipt.pt._qproc = io_async_queue_proc; 686 - 687 - io_kbuf_recycle(req, issue_flags); 688 688 689 689 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); 690 690 if (ret)

+101

io_uring/query.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "linux/io_uring/query.h" 4 + 5 + #include "query.h" 6 + #include "io_uring.h" 7 + 8 + #define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) 9 + #define IO_MAX_QUERY_ENTRIES 1000 10 + 11 + static ssize_t io_query_ops(void *data) 12 + { 13 + struct io_uring_query_opcode *e = data; 14 + 15 + BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); 16 + 17 + e->nr_request_opcodes = IORING_OP_LAST; 18 + e->nr_register_opcodes = IORING_REGISTER_LAST; 19 + e->feature_flags = IORING_FEAT_FLAGS; 20 + e->ring_setup_flags = IORING_SETUP_FLAGS; 21 + e->enter_flags = IORING_ENTER_FLAGS; 22 + e->sqe_flags = SQE_VALID_FLAGS; 23 + return sizeof(*e); 24 + } 25 + 26 + static int io_handle_query_entry(struct io_ring_ctx *ctx, 27 + void *data, void __user *uhdr, 28 + u64 *next_entry) 29 + { 30 + struct io_uring_query_hdr hdr; 31 + size_t usize, res_size = 0; 32 + ssize_t ret = -EINVAL; 33 + void __user *udata; 34 + 35 + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) 36 + return -EFAULT; 37 + usize = hdr.size; 38 + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); 39 + udata = u64_to_user_ptr(hdr.query_data); 40 + 41 + if (hdr.query_op >= __IO_URING_QUERY_MAX) { 42 + ret = -EOPNOTSUPP; 43 + goto out; 44 + } 45 + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) 46 + goto out; 47 + if (copy_from_user(data, udata, hdr.size)) 48 + return -EFAULT; 49 + 50 + switch (hdr.query_op) { 51 + case IO_URING_QUERY_OPCODES: 52 + ret = io_query_ops(data); 53 + break; 54 + } 55 + 56 + if (ret >= 0) { 57 + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) 58 + return -EFAULT; 59 + res_size = ret; 60 + ret = 0; 61 + } 62 + out: 63 + hdr.result = ret; 64 + hdr.size = min_t(size_t, usize, res_size); 65 + 66 + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) 67 + return -EFAULT; 68 + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) 69 + return -EFAULT; 70 + *next_entry = hdr.next_entry; 71 + return 0; 72 + } 73 + 74 + int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 75 + { 76 + char entry_buffer[IO_MAX_QUERY_SIZE]; 77 + void __user *uhdr = arg; 78 + int ret, nr = 0; 79 + 80 + memset(entry_buffer, 0, sizeof(entry_buffer)); 81 + 82 + if (nr_args) 83 + return -EINVAL; 84 + 85 + while (uhdr) { 86 + u64 next_hdr; 87 + 88 + ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); 89 + if (ret) 90 + return ret; 91 + uhdr = u64_to_user_ptr(next_hdr); 92 + 93 + /* Have some limit to avoid a potential cycle */ 94 + if (++nr >= IO_MAX_QUERY_ENTRIES) 95 + return -ERANGE; 96 + if (fatal_signal_pending(current)) 97 + return -EINTR; 98 + cond_resched(); 99 + } 100 + return 0; 101 + }

+9

io_uring/query.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IORING_QUERY_H 3 + #define IORING_QUERY_H 4 + 5 + #include <linux/io_uring_types.h> 6 + 7 + int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); 8 + 9 + #endif

+34 -26

io_uring/register.c

··· 18 18 #include <linux/io_uring.h> 19 19 #include <linux/io_uring_types.h> 20 20 21 + #include "filetable.h" 21 22 #include "io_uring.h" 22 23 #include "opdef.h" 23 24 #include "tctx.h" ··· 32 31 #include "msg_ring.h" 33 32 #include "memmap.h" 34 33 #include "zcrx.h" 34 + #include "query.h" 35 35 36 36 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 37 37 IORING_REGISTER_LAST + IORING_OP_LAST) ··· 48 46 nr_args = IORING_OP_LAST; 49 47 50 48 size = struct_size(p, ops, nr_args); 51 - p = kzalloc(size, GFP_KERNEL); 52 - if (!p) 53 - return -ENOMEM; 54 - 55 - ret = -EFAULT; 56 - if (copy_from_user(p, arg, size)) 57 - goto out; 49 + p = memdup_user(arg, size); 50 + if (IS_ERR(p)) 51 + return PTR_ERR(p); 58 52 ret = -EINVAL; 59 53 if (memchr_inv(p, 0, size)) 60 54 goto out; ··· 394 396 395 397 #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) 396 398 #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ 397 - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) 399 + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ 400 + IORING_SETUP_CQE_MIXED) 398 401 399 402 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 400 403 { ··· 406 407 struct io_uring_params p; 407 408 int ret; 408 409 409 - /* for single issuer, must be owner resizing */ 410 - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && 411 - current != ctx->submitter_task) 412 - return -EEXIST; 413 410 /* limited to DEFER_TASKRUN for now */ 414 411 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 415 412 return -EINVAL; ··· 830 835 break; 831 836 ret = io_register_mem_region(ctx, arg); 832 837 break; 838 + case IORING_REGISTER_QUERY: 839 + ret = io_query(ctx, arg, nr_args); 840 + break; 841 + case IORING_REGISTER_ZCRX_REFILL: 842 + ret = io_zcrx_return_bufs(ctx, arg, nr_args); 843 + break; 833 844 default: 834 845 ret = -EINVAL; 835 846 break; ··· 878 877 return ERR_PTR(-EOPNOTSUPP); 879 878 } 880 879 880 + static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args) 881 + { 882 + struct io_uring_sqe sqe; 883 + 884 + if (!arg || nr_args != 1) 885 + return -EINVAL; 886 + if (copy_from_user(&sqe, arg, sizeof(sqe))) 887 + return -EFAULT; 888 + /* no flags supported */ 889 + if (sqe.flags) 890 + return -EINVAL; 891 + if (sqe.opcode != IORING_OP_MSG_RING) 892 + return -EINVAL; 893 + 894 + return io_uring_sync_msg_ring(&sqe); 895 + } 896 + 881 897 /* 882 898 * "blind" registration opcodes are ones where there's no ring given, and 883 899 * hence the source fd must be -1. ··· 903 885 unsigned int nr_args) 904 886 { 905 887 switch (opcode) { 906 - case IORING_REGISTER_SEND_MSG_RING: { 907 - struct io_uring_sqe sqe; 908 - 909 - if (!arg || nr_args != 1) 910 - return -EINVAL; 911 - if (copy_from_user(&sqe, arg, sizeof(sqe))) 912 - return -EFAULT; 913 - /* no flags supported */ 914 - if (sqe.flags) 915 - return -EINVAL; 916 - if (sqe.opcode == IORING_OP_MSG_RING) 917 - return io_uring_sync_msg_ring(&sqe); 918 - } 888 + case IORING_REGISTER_SEND_MSG_RING: 889 + return io_uring_register_send_msg_ring(arg, nr_args); 890 + case IORING_REGISTER_QUERY: 891 + return io_query(NULL, arg, nr_args); 919 892 } 920 - 921 893 return -EINVAL; 922 894 } 923 895

+8

io_uring/rsrc.c

··· 13 13 14 14 #include <uapi/linux/io_uring.h> 15 15 16 + #include "filetable.h" 16 17 #include "io_uring.h" 17 18 #include "openclose.h" 18 19 #include "rsrc.h" ··· 1300 1299 if (src_ctx != ctx) { 1301 1300 mutex_unlock(&ctx->uring_lock); 1302 1301 lock_two_rings(ctx, src_ctx); 1302 + 1303 + if (src_ctx->submitter_task && 1304 + src_ctx->submitter_task != current) { 1305 + ret = -EEXIST; 1306 + goto out; 1307 + } 1303 1308 } 1304 1309 1305 1310 ret = io_clone_buffers(ctx, src_ctx, &buf); 1306 1311 1312 + out: 1307 1313 if (src_ctx != ctx) 1308 1314 mutex_unlock(&src_ctx->uring_lock); 1309 1315

+35 -28

io_uring/rw.c

··· 15 15 16 16 #include <uapi/linux/io_uring.h> 17 17 18 + #include "filetable.h" 18 19 #include "io_uring.h" 19 20 #include "opdef.h" 20 21 #include "kbuf.h" ··· 108 107 } 109 108 110 109 static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, 111 - struct io_async_rw *io, 112 - unsigned int issue_flags) 110 + struct io_async_rw *io, struct io_br_sel *sel, 111 + unsigned int issue_flags) 113 112 { 114 113 const struct io_issue_def *def = &io_issue_defs[req->opcode]; 115 114 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 116 - void __user *buf = u64_to_user_ptr(rw->addr); 117 115 size_t sqe_len = rw->len; 118 116 117 + sel->addr = u64_to_user_ptr(rw->addr); 119 118 if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) 120 - return io_import_vec(ddir, req, io, buf, sqe_len); 119 + return io_import_vec(ddir, req, io, sel->addr, sqe_len); 121 120 122 121 if (io_do_buffer_select(req)) { 123 - buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); 124 - if (!buf) 122 + *sel = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); 123 + if (!sel->addr) 125 124 return -ENOBUFS; 126 - rw->addr = (unsigned long) buf; 125 + rw->addr = (unsigned long) sel->addr; 127 126 rw->len = sqe_len; 128 127 } 129 - return import_ubuf(ddir, buf, sqe_len, &io->iter); 128 + return import_ubuf(ddir, sel->addr, sqe_len, &io->iter); 130 129 } 131 130 132 131 static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, 133 132 struct io_async_rw *io, 133 + struct io_br_sel *sel, 134 134 unsigned int issue_flags) 135 135 { 136 136 int ret; 137 137 138 - ret = __io_import_rw_buffer(rw, req, io, issue_flags); 138 + ret = __io_import_rw_buffer(rw, req, io, sel, issue_flags); 139 139 if (unlikely(ret < 0)) 140 140 return ret; 141 141 ··· 155 153 if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) 156 154 io_vec_free(&rw->vec); 157 155 158 - if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { 159 - req->async_data = NULL; 160 - req->flags &= ~REQ_F_ASYNC_DATA; 161 - } 156 + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) 157 + io_req_async_data_clear(req, 0); 162 158 } 163 159 164 160 static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) ··· 306 306 307 307 static int io_rw_do_import(struct io_kiocb *req, int ddir) 308 308 { 309 + struct io_br_sel sel = { }; 310 + 309 311 if (io_do_buffer_select(req)) 310 312 return 0; 311 313 312 - return io_import_rw_buffer(ddir, req, req->async_data, 0); 314 + return io_import_rw_buffer(ddir, req, req->async_data, &sel, 0); 313 315 } 314 316 315 317 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, ··· 578 576 io_req_io_end(req); 579 577 580 578 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) 581 - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); 579 + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); 582 580 583 581 io_req_rw_cleanup(req, 0); 584 582 io_req_task_complete(req, tw); ··· 647 645 } 648 646 649 647 static int kiocb_done(struct io_kiocb *req, ssize_t ret, 650 - unsigned int issue_flags) 648 + struct io_br_sel *sel, unsigned int issue_flags) 651 649 { 652 650 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 653 651 unsigned final_ret = io_fixup_rw_res(req, ret); ··· 661 659 * from the submission path. 662 660 */ 663 661 io_req_io_end(req); 664 - io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); 662 + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list)); 665 663 io_req_rw_cleanup(req, issue_flags); 666 664 return IOU_COMPLETE; 667 665 } else { ··· 904 902 return 0; 905 903 } 906 904 907 - static int __io_read(struct io_kiocb *req, unsigned int issue_flags) 905 + static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, 906 + unsigned int issue_flags) 908 907 { 909 908 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 910 909 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); ··· 919 916 if (unlikely(ret)) 920 917 return ret; 921 918 } else if (io_do_buffer_select(req)) { 922 - ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); 919 + ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags); 923 920 if (unlikely(ret < 0)) 924 921 return ret; 925 922 } ··· 1021 1018 1022 1019 int io_read(struct io_kiocb *req, unsigned int issue_flags) 1023 1020 { 1021 + struct io_br_sel sel = { }; 1024 1022 int ret; 1025 1023 1026 - ret = __io_read(req, issue_flags); 1024 + ret = __io_read(req, &sel, issue_flags); 1027 1025 if (ret >= 0) 1028 - return kiocb_done(req, ret, issue_flags); 1026 + return kiocb_done(req, ret, &sel, issue_flags); 1029 1027 1028 + if (req->flags & REQ_F_BUFFERS_COMMIT) 1029 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1030 1030 return ret; 1031 1031 } 1032 1032 1033 1033 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) 1034 1034 { 1035 1035 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1036 + struct io_br_sel sel = { }; 1036 1037 unsigned int cflags = 0; 1037 1038 int ret; 1038 1039 ··· 1048 1041 1049 1042 /* make it sync, multishot doesn't support async execution */ 1050 1043 rw->kiocb.ki_complete = NULL; 1051 - ret = __io_read(req, issue_flags); 1044 + ret = __io_read(req, &sel, issue_flags); 1052 1045 1053 1046 /* 1054 1047 * If we get -EAGAIN, recycle our buffer and just let normal poll ··· 1059 1052 * Reset rw->len to 0 again to avoid clamping future mshot 1060 1053 * reads, in case the buffer size varies. 1061 1054 */ 1062 - if (io_kbuf_recycle(req, issue_flags)) 1055 + if (io_kbuf_recycle(req, sel.buf_list, issue_flags)) 1063 1056 rw->len = 0; 1064 1057 return IOU_RETRY; 1065 1058 } else if (ret <= 0) { 1066 - io_kbuf_recycle(req, issue_flags); 1059 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1067 1060 if (ret < 0) 1068 1061 req_set_fail(req); 1069 1062 } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { 1070 - cflags = io_put_kbuf(req, ret, issue_flags); 1063 + cflags = io_put_kbuf(req, ret, sel.buf_list); 1071 1064 } else { 1072 1065 /* 1073 1066 * Any successful return value will keep the multishot read ··· 1075 1068 * we fail to post a CQE, or multishot is no longer set, then 1076 1069 * jump to the termination path. This request is then done. 1077 1070 */ 1078 - cflags = io_put_kbuf(req, ret, issue_flags); 1071 + cflags = io_put_kbuf(req, ret, sel.buf_list); 1079 1072 rw->len = 0; /* similarly to above, reset len to 0 */ 1080 1073 1081 1074 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { ··· 1204 1197 return -EAGAIN; 1205 1198 } 1206 1199 done: 1207 - return kiocb_done(req, ret2, issue_flags); 1200 + return kiocb_done(req, ret2, NULL, issue_flags); 1208 1201 } else { 1209 1202 ret_eagain: 1210 1203 iov_iter_restore(&io->iter, &io->iter_state); ··· 1372 1365 if (!smp_load_acquire(&req->iopoll_completed)) 1373 1366 break; 1374 1367 nr_events++; 1375 - req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); 1368 + req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); 1376 1369 if (req->opcode != IORING_OP_URING_CMD) 1377 1370 io_req_rw_cleanup(req, 0); 1378 1371 }

+1

io_uring/splice.c

··· 11 11 12 12 #include <uapi/linux/io_uring.h> 13 13 14 + #include "filetable.h" 14 15 #include "io_uring.h" 15 16 #include "splice.h" 16 17

+74 -9

io_uring/uring_cmd.c

··· 11 11 #include "io_uring.h" 12 12 #include "alloc_cache.h" 13 13 #include "rsrc.h" 14 + #include "kbuf.h" 14 15 #include "uring_cmd.h" 15 16 #include "poll.h" 16 17 ··· 37 36 38 37 if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) { 39 38 ioucmd->sqe = NULL; 40 - req->async_data = NULL; 41 - req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); 39 + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); 42 40 } 43 41 } 44 42 ··· 126 126 } 127 127 128 128 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 129 - void (*task_work_cb)(struct io_uring_cmd *, unsigned), 129 + io_uring_cmd_tw_t task_work_cb, 130 130 unsigned flags) 131 131 { 132 132 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ··· 151 151 * Called by consumers of io_uring_cmd, if they originally returned 152 152 * -EIOCBQUEUED upon receiving the command. 153 153 */ 154 - void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, 155 - unsigned issue_flags) 154 + void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, 155 + unsigned issue_flags, bool is_cqe32) 156 156 { 157 157 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 158 158 ··· 165 165 req_set_fail(req); 166 166 167 167 io_req_set_res(req, ret, 0); 168 - if (req->ctx->flags & IORING_SETUP_CQE32) 168 + if (is_cqe32) { 169 + if (req->ctx->flags & IORING_SETUP_CQE_MIXED) 170 + req->cqe.flags |= IORING_CQE_F_32; 169 171 io_req_set_cqe32_extra(req, res2, 0); 172 + } 170 173 io_req_uring_cleanup(req, issue_flags); 171 174 if (req->ctx->flags & IORING_SETUP_IOPOLL) { 172 175 /* order with io_iopoll_req_issued() checking ->iopoll_complete */ ··· 183 180 io_req_task_work_add(req); 184 181 } 185 182 } 186 - EXPORT_SYMBOL_GPL(io_uring_cmd_done); 183 + EXPORT_SYMBOL_GPL(__io_uring_cmd_done); 187 184 188 185 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 189 186 { ··· 197 194 if (ioucmd->flags & ~IORING_URING_CMD_MASK) 198 195 return -EINVAL; 199 196 200 - if (ioucmd->flags & IORING_URING_CMD_FIXED) 197 + if (ioucmd->flags & IORING_URING_CMD_FIXED) { 198 + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) 199 + return -EINVAL; 201 200 req->buf_index = READ_ONCE(sqe->buf_index); 201 + } 202 + 203 + if (!!(ioucmd->flags & IORING_URING_CMD_MULTISHOT) != 204 + !!(req->flags & REQ_F_BUFFER_SELECT)) 205 + return -EINVAL; 202 206 203 207 ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); 204 208 ··· 244 234 245 235 if (ctx->flags & IORING_SETUP_SQE128) 246 236 issue_flags |= IO_URING_F_SQE128; 247 - if (ctx->flags & IORING_SETUP_CQE32) 237 + if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)) 248 238 issue_flags |= IO_URING_F_CQE32; 249 239 if (io_is_compat(ctx)) 250 240 issue_flags |= IO_URING_F_COMPAT; ··· 261 251 } 262 252 263 253 ret = file->f_op->uring_cmd(ioucmd, issue_flags); 254 + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { 255 + if (ret >= 0) 256 + return IOU_ISSUE_SKIP_COMPLETE; 257 + } 264 258 if (ret == -EAGAIN) { 265 259 ioucmd->flags |= IORING_URING_CMD_REISSUE; 266 260 return ret; ··· 347 333 return false; 348 334 return io_req_post_cqe32(req, cqe); 349 335 } 336 + 337 + /* 338 + * Work with io_uring_mshot_cmd_post_cqe() together for committing the 339 + * provided buffer upfront 340 + */ 341 + struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, 342 + unsigned buf_group, size_t *len, 343 + unsigned int issue_flags) 344 + { 345 + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 346 + 347 + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) 348 + return (struct io_br_sel) { .val = -EINVAL }; 349 + 350 + if (WARN_ON_ONCE(!io_do_buffer_select(req))) 351 + return (struct io_br_sel) { .val = -EINVAL }; 352 + 353 + return io_buffer_select(req, len, buf_group, issue_flags); 354 + } 355 + EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); 356 + 357 + /* 358 + * Return true if this multishot uring_cmd needs to be completed, otherwise 359 + * the event CQE is posted successfully. 360 + * 361 + * This function must use `struct io_br_sel` returned from 362 + * io_uring_cmd_buffer_select() for committing the buffer in the same 363 + * uring_cmd submission context. 364 + */ 365 + bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, 366 + struct io_br_sel *sel, unsigned int issue_flags) 367 + { 368 + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 369 + unsigned int cflags = 0; 370 + 371 + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) 372 + return true; 373 + 374 + if (sel->val > 0) { 375 + cflags = io_put_kbuf(req, sel->val, sel->buf_list); 376 + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) 377 + return false; 378 + } 379 + 380 + io_kbuf_recycle(req, sel->buf_list, issue_flags); 381 + if (sel->val < 0) 382 + req_set_fail(req); 383 + io_req_set_res(req, sel->val, cflags); 384 + return true; 385 + } 386 + EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe);

+1 -3

io_uring/waitid.c

··· 37 37 struct io_waitid_async *iwa = req->async_data; 38 38 39 39 put_pid(iwa->wo.wo_pid); 40 - kfree(req->async_data); 41 - req->async_data = NULL; 42 - req->flags &= ~REQ_F_ASYNC_DATA; 40 + io_req_async_data_free(req); 43 41 } 44 42 45 43 static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)

+188 -107

io_uring/zcrx.c

··· 26 26 #include "zcrx.h" 27 27 #include "rsrc.h" 28 28 29 + #define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) 30 + 29 31 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) 30 32 31 33 static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) ··· 45 43 static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) 46 44 { 47 45 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 46 + unsigned niov_pages_shift; 48 47 49 48 lockdep_assert(!area->mem.is_dmabuf); 50 49 51 - return area->mem.pages[net_iov_idx(niov)]; 50 + niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT; 51 + return area->mem.pages[net_iov_idx(niov) << niov_pages_shift]; 52 52 } 53 53 54 54 static int io_populate_area_dma(struct io_zcrx_ifq *ifq, 55 - struct io_zcrx_area *area, 56 - struct sg_table *sgt, unsigned long off) 55 + struct io_zcrx_area *area) 57 56 { 57 + unsigned niov_size = 1U << ifq->niov_shift; 58 + struct sg_table *sgt = area->mem.sgt; 58 59 struct scatterlist *sg; 59 60 unsigned i, niov_idx = 0; 60 61 61 62 for_each_sgtable_dma_sg(sgt, sg, i) { 62 63 dma_addr_t dma = sg_dma_address(sg); 63 64 unsigned long sg_len = sg_dma_len(sg); 64 - unsigned long sg_off = min(sg_len, off); 65 65 66 - off -= sg_off; 67 - sg_len -= sg_off; 68 - dma += sg_off; 66 + if (WARN_ON_ONCE(sg_len % niov_size)) 67 + return -EINVAL; 69 68 70 69 while (sg_len && niov_idx < area->nia.num_niovs) { 71 70 struct net_iov *niov = &area->nia.niovs[niov_idx]; 72 71 73 72 if (net_mp_niov_set_dma_addr(niov, dma)) 74 73 return -EFAULT; 75 - sg_len -= PAGE_SIZE; 76 - dma += PAGE_SIZE; 74 + sg_len -= niov_size; 75 + dma += niov_size; 77 76 niov_idx++; 78 77 } 79 78 } 79 + 80 + if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs)) 81 + return -EFAULT; 80 82 return 0; 81 83 } 82 84 ··· 150 144 goto err; 151 145 } 152 146 153 - mem->dmabuf_offset = off; 154 147 mem->size = len; 155 148 return 0; 156 149 err: 157 150 io_release_dmabuf(mem); 158 151 return ret; 159 - } 160 - 161 - static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) 162 - { 163 - if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) 164 - return -EINVAL; 165 - return io_populate_area_dma(ifq, area, area->mem.sgt, 166 - area->mem.dmabuf_offset); 167 152 } 168 153 169 154 static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages) ··· 203 206 if (ret < 0) 204 207 mem->account_pages = 0; 205 208 209 + mem->sgt = &mem->page_sg_table; 206 210 mem->pages = pages; 207 211 mem->nr_folios = nr_pages; 208 212 mem->size = area_reg->len; ··· 218 220 } 219 221 if (mem->pages) { 220 222 unpin_user_pages(mem->pages, mem->nr_folios); 221 - sg_free_table(&mem->page_sg_table); 223 + sg_free_table(mem->sgt); 224 + mem->sgt = NULL; 222 225 kvfree(mem->pages); 223 226 } 224 227 } ··· 229 230 struct io_uring_zcrx_area_reg *area_reg) 230 231 { 231 232 int ret; 233 + 234 + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) 235 + return -EINVAL; 236 + if (area_reg->rq_area_token) 237 + return -EINVAL; 238 + if (area_reg->__resv2[0] || area_reg->__resv2[1]) 239 + return -EINVAL; 232 240 233 241 ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); 234 242 if (ret) ··· 253 247 { 254 248 int i; 255 249 256 - guard(mutex)(&ifq->dma_lock); 250 + guard(mutex)(&ifq->pp_lock); 257 251 if (!area->is_mapped) 258 252 return; 259 253 area->is_mapped = false; ··· 269 263 } 270 264 } 271 265 272 - static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) 273 - { 274 - int ret; 275 - 276 - ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, 277 - DMA_FROM_DEVICE, IO_DMA_ATTR); 278 - if (ret < 0) 279 - return ret; 280 - return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0); 281 - } 282 - 283 266 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) 284 267 { 285 268 int ret; 286 269 287 - guard(mutex)(&ifq->dma_lock); 270 + guard(mutex)(&ifq->pp_lock); 288 271 if (area->is_mapped) 289 272 return 0; 290 273 291 - if (area->mem.is_dmabuf) 292 - ret = io_zcrx_map_area_dmabuf(ifq, area); 293 - else 294 - ret = io_zcrx_map_area_umem(ifq, area); 274 + if (!area->mem.is_dmabuf) { 275 + ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, 276 + DMA_FROM_DEVICE, IO_DMA_ATTR); 277 + if (ret < 0) 278 + return ret; 279 + } 295 280 281 + ret = io_populate_area_dma(ifq, area); 296 282 if (ret == 0) 297 283 area->is_mapped = true; 298 284 return ret; 299 285 } 300 286 301 - static void io_zcrx_sync_for_device(const struct page_pool *pool, 287 + static void io_zcrx_sync_for_device(struct page_pool *pool, 302 288 struct net_iov *niov) 303 289 { 304 290 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) 305 291 dma_addr_t dma_addr; 306 292 293 + unsigned niov_size; 294 + 307 295 if (!dma_dev_need_sync(pool->p.dev)) 308 296 return; 309 297 298 + niov_size = 1U << io_pp_to_ifq(pool)->niov_shift; 310 299 dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); 311 300 __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, 312 - PAGE_SIZE, pool->p.dma_dir); 301 + niov_size, pool->p.dma_dir); 313 302 #endif 314 303 } 315 304 ··· 353 352 void *ptr; 354 353 int ret; 355 354 356 - off = sizeof(struct io_uring); 355 + off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 357 356 size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 358 357 if (size > rd->size) 359 358 return -EINVAL; ··· 368 367 ptr = io_region_get_ptr(&ifq->region); 369 368 ifq->rq_ring = (struct io_uring *)ptr; 370 369 ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 370 + 371 + reg->offsets.head = offsetof(struct io_uring, head); 372 + reg->offsets.tail = offsetof(struct io_uring, tail); 373 + reg->offsets.rqes = off; 371 374 return 0; 372 375 } 373 376 ··· 396 391 kfree(area); 397 392 } 398 393 399 - #define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) 394 + static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, 395 + struct io_zcrx_area *area) 396 + { 397 + if (ifq->area) 398 + return -EINVAL; 399 + ifq->area = area; 400 + return 0; 401 + } 400 402 401 403 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, 402 - struct io_zcrx_area **res, 403 404 struct io_uring_zcrx_area_reg *area_reg) 404 405 { 405 406 struct io_zcrx_area *area; 406 407 unsigned nr_iovs; 407 408 int i, ret; 408 - 409 - if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) 410 - return -EINVAL; 411 - if (area_reg->rq_area_token) 412 - return -EINVAL; 413 - if (area_reg->__resv2[0] || area_reg->__resv2[1]) 414 - return -EINVAL; 415 409 416 410 ret = -ENOMEM; 417 411 area = kzalloc(sizeof(*area), GFP_KERNEL); ··· 422 418 if (ret) 423 419 goto err; 424 420 425 - nr_iovs = area->mem.size >> PAGE_SHIFT; 421 + ifq->niov_shift = PAGE_SHIFT; 422 + nr_iovs = area->mem.size >> ifq->niov_shift; 426 423 area->nia.num_niovs = nr_iovs; 427 424 428 425 ret = -ENOMEM; 429 426 area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), 430 - GFP_KERNEL | __GFP_ZERO); 427 + GFP_KERNEL_ACCOUNT | __GFP_ZERO); 431 428 if (!area->nia.niovs) 432 429 goto err; 433 430 434 431 area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), 435 - GFP_KERNEL | __GFP_ZERO); 432 + GFP_KERNEL_ACCOUNT | __GFP_ZERO); 436 433 if (!area->freelist) 437 434 goto err; 438 435 439 436 area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), 440 - GFP_KERNEL | __GFP_ZERO); 437 + GFP_KERNEL_ACCOUNT | __GFP_ZERO); 441 438 if (!area->user_refs) 442 439 goto err; 443 440 ··· 456 451 area->area_id = 0; 457 452 area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; 458 453 spin_lock_init(&area->freelist_lock); 459 - *res = area; 460 - return 0; 454 + 455 + ret = io_zcrx_append_area(ifq, area); 456 + if (!ret) 457 + return 0; 461 458 err: 462 459 if (area) 463 460 io_zcrx_free_area(area); ··· 476 469 477 470 ifq->if_rxq = -1; 478 471 ifq->ctx = ctx; 479 - spin_lock_init(&ifq->lock); 480 472 spin_lock_init(&ifq->rq_lock); 481 - mutex_init(&ifq->dma_lock); 473 + mutex_init(&ifq->pp_lock); 482 474 return ifq; 483 475 } 484 476 485 477 static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) 486 478 { 487 - spin_lock(&ifq->lock); 488 - if (ifq->netdev) { 489 - netdev_put(ifq->netdev, &ifq->netdev_tracker); 490 - ifq->netdev = NULL; 491 - } 492 - spin_unlock(&ifq->lock); 479 + guard(mutex)(&ifq->pp_lock); 480 + 481 + if (!ifq->netdev) 482 + return; 483 + netdev_put(ifq->netdev, &ifq->netdev_tracker); 484 + ifq->netdev = NULL; 493 485 } 494 486 495 487 static void io_close_queue(struct io_zcrx_ifq *ifq) ··· 503 497 if (ifq->if_rxq == -1) 504 498 return; 505 499 506 - spin_lock(&ifq->lock); 507 - netdev = ifq->netdev; 508 - netdev_tracker = ifq->netdev_tracker; 509 - ifq->netdev = NULL; 510 - spin_unlock(&ifq->lock); 500 + scoped_guard(mutex, &ifq->pp_lock) { 501 + netdev = ifq->netdev; 502 + netdev_tracker = ifq->netdev_tracker; 503 + ifq->netdev = NULL; 504 + } 511 505 512 506 if (netdev) { 513 507 net_mp_close_rxq(netdev, ifq->if_rxq, &p); ··· 519 513 static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) 520 514 { 521 515 io_close_queue(ifq); 522 - io_zcrx_drop_netdev(ifq); 523 516 524 517 if (ifq->area) 525 518 io_zcrx_free_area(ifq->area); ··· 526 521 put_device(ifq->dev); 527 522 528 523 io_free_rbuf_ring(ifq); 529 - mutex_destroy(&ifq->dma_lock); 524 + mutex_destroy(&ifq->pp_lock); 530 525 kfree(ifq); 531 526 } 532 527 ··· 559 554 return -EPERM; 560 555 561 556 /* mandatory io_uring features for zc rx */ 562 - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && 563 - ctx->flags & IORING_SETUP_CQE32)) 557 + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 558 + return -EINVAL; 559 + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 564 560 return -EINVAL; 565 561 if (copy_from_user(&reg, arg, sizeof(reg))) 566 562 return -EFAULT; 567 563 if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 568 564 return -EFAULT; 569 - if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) || 565 + if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || 570 566 reg.__resv2 || reg.zcrx_id) 571 567 return -EINVAL; 572 568 if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) ··· 612 606 } 613 607 get_device(ifq->dev); 614 608 615 - ret = io_zcrx_create_area(ifq, &ifq->area, &area); 609 + ret = io_zcrx_create_area(ifq, &area); 616 610 if (ret) 617 611 goto err; 618 612 ··· 623 617 goto err; 624 618 ifq->if_rxq = reg.if_rxq; 625 619 626 - reg.offsets.rqes = sizeof(struct io_uring); 627 - reg.offsets.head = offsetof(struct io_uring, head); 628 - reg.offsets.tail = offsetof(struct io_uring, tail); 629 620 reg.zcrx_id = id; 630 621 631 622 scoped_guard(mutex, &ctx->mmap_lock) { ··· 750 747 return &ifq->rqes[idx]; 751 748 } 752 749 750 + static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, 751 + struct io_zcrx_ifq *ifq, 752 + struct net_iov **ret_niov) 753 + { 754 + unsigned niov_idx, area_idx; 755 + struct io_zcrx_area *area; 756 + 757 + area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; 758 + niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; 759 + 760 + if (unlikely(rqe->__pad || area_idx)) 761 + return false; 762 + area = ifq->area; 763 + 764 + if (unlikely(niov_idx >= area->nia.num_niovs)) 765 + return false; 766 + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); 767 + 768 + *ret_niov = &area->nia.niovs[niov_idx]; 769 + return true; 770 + } 771 + 753 772 static void io_zcrx_ring_refill(struct page_pool *pp, 754 773 struct io_zcrx_ifq *ifq) 755 774 { 756 775 unsigned int mask = ifq->rq_entries - 1; 757 776 unsigned int entries; 758 - netmem_ref netmem; 759 777 760 - spin_lock_bh(&ifq->rq_lock); 778 + guard(spinlock_bh)(&ifq->rq_lock); 761 779 762 780 entries = io_zcrx_rqring_entries(ifq); 763 - entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count); 764 - if (unlikely(!entries)) { 765 - spin_unlock_bh(&ifq->rq_lock); 781 + entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL); 782 + if (unlikely(!entries)) 766 783 return; 767 - } 768 784 769 785 do { 770 786 struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); 771 - struct io_zcrx_area *area; 772 787 struct net_iov *niov; 773 - unsigned niov_idx, area_idx; 788 + netmem_ref netmem; 774 789 775 - area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; 776 - niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT; 777 - 778 - if (unlikely(rqe->__pad || area_idx)) 790 + if (!io_parse_rqe(rqe, ifq, &niov)) 779 791 continue; 780 - area = ifq->area; 781 - 782 - if (unlikely(niov_idx >= area->nia.num_niovs)) 783 - continue; 784 - niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); 785 - 786 - niov = &area->nia.niovs[niov_idx]; 787 792 if (!io_zcrx_put_niov_uref(niov)) 788 793 continue; 789 794 790 795 netmem = net_iov_to_netmem(niov); 791 - if (page_pool_unref_netmem(netmem, 1) != 0) 796 + if (!page_pool_unref_and_test(netmem)) 792 797 continue; 793 798 794 799 if (unlikely(niov->pp != pp)) { ··· 809 798 } while (--entries); 810 799 811 800 smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); 812 - spin_unlock_bh(&ifq->rq_lock); 813 801 } 814 802 815 803 static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) ··· 870 860 return -EINVAL; 871 861 if (WARN_ON_ONCE(!pp->dma_map)) 872 862 return -EOPNOTSUPP; 873 - if (pp->p.order != 0) 874 - return -EOPNOTSUPP; 863 + if (pp->p.order + PAGE_SHIFT != ifq->niov_shift) 864 + return -EINVAL; 875 865 if (pp->p.dma_dir != DMA_FROM_DEVICE) 876 866 return -EOPNOTSUPP; 877 867 ··· 927 917 .uninstall = io_pp_uninstall, 928 918 }; 929 919 920 + #define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) 921 + #define IO_ZCRX_SYS_REFILL_BATCH 32 922 + 923 + static void io_return_buffers(struct io_zcrx_ifq *ifq, 924 + struct io_uring_zcrx_rqe *rqes, unsigned nr) 925 + { 926 + int i; 927 + 928 + for (i = 0; i < nr; i++) { 929 + struct net_iov *niov; 930 + netmem_ref netmem; 931 + 932 + if (!io_parse_rqe(&rqes[i], ifq, &niov)) 933 + continue; 934 + 935 + scoped_guard(spinlock_bh, &ifq->rq_lock) { 936 + if (!io_zcrx_put_niov_uref(niov)) 937 + continue; 938 + } 939 + 940 + netmem = net_iov_to_netmem(niov); 941 + if (!page_pool_unref_and_test(netmem)) 942 + continue; 943 + io_zcrx_return_niov(niov); 944 + } 945 + } 946 + 947 + int io_zcrx_return_bufs(struct io_ring_ctx *ctx, 948 + void __user *arg, unsigned nr_arg) 949 + { 950 + struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; 951 + struct io_uring_zcrx_rqe __user *user_rqes; 952 + struct io_uring_zcrx_sync_refill zr; 953 + struct io_zcrx_ifq *ifq; 954 + unsigned nr, i; 955 + 956 + if (nr_arg) 957 + return -EINVAL; 958 + if (copy_from_user(&zr, arg, sizeof(zr))) 959 + return -EFAULT; 960 + if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) 961 + return -EINVAL; 962 + if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) 963 + return -EINVAL; 964 + 965 + ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); 966 + if (!ifq) 967 + return -EINVAL; 968 + nr = zr.nr_entries; 969 + user_rqes = u64_to_user_ptr(zr.rqes); 970 + 971 + for (i = 0; i < nr;) { 972 + unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); 973 + size_t size = batch * sizeof(rqes[0]); 974 + 975 + if (copy_from_user(rqes, user_rqes + i, size)) 976 + return i ? i : -EFAULT; 977 + io_return_buffers(ifq, rqes, batch); 978 + 979 + i += batch; 980 + 981 + if (fatal_signal_pending(current)) 982 + return i; 983 + cond_resched(); 984 + } 985 + return nr; 986 + } 987 + 930 988 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, 931 989 struct io_zcrx_ifq *ifq, int off, int len) 932 990 { 991 + struct io_ring_ctx *ctx = req->ctx; 933 992 struct io_uring_zcrx_cqe *rcqe; 934 993 struct io_zcrx_area *area; 935 994 struct io_uring_cqe *cqe; 936 995 u64 offset; 937 996 938 - if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) 997 + if (!io_defer_get_uncommited_cqe(ctx, &cqe)) 939 998 return false; 940 999 941 1000 cqe->user_data = req->cqe.user_data; 942 1001 cqe->res = len; 943 1002 cqe->flags = IORING_CQE_F_MORE; 1003 + if (ctx->flags & IORING_SETUP_CQE_MIXED) 1004 + cqe->flags |= IORING_CQE_F_32; 944 1005 945 1006 area = io_zcrx_iov_to_area(niov); 946 - offset = off + (net_iov_idx(niov) << PAGE_SHIFT); 1007 + offset = off + (net_iov_idx(niov) << ifq->niov_shift); 947 1008 rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); 948 1009 rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); 949 1010 rcqe->__pad = 0; 950 1011 return true; 951 1012 } 952 1013 953 - static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) 1014 + static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) 954 1015 { 1016 + struct io_zcrx_area *area = ifq->area; 955 1017 struct net_iov *niov = NULL; 1018 + 1019 + if (area->mem.is_dmabuf) 1020 + return NULL; 956 1021 957 1022 spin_lock_bh(&area->freelist_lock); 958 1023 if (area->free_count) ··· 1088 1003 struct page *src_page, unsigned int src_offset, 1089 1004 size_t len) 1090 1005 { 1091 - struct io_zcrx_area *area = ifq->area; 1092 1006 size_t copied = 0; 1093 1007 int ret = 0; 1094 - 1095 - if (area->mem.is_dmabuf) 1096 - return -EFAULT; 1097 1008 1098 1009 while (len) { 1099 1010 struct io_copy_cache cc; 1100 1011 struct net_iov *niov; 1101 1012 size_t n; 1102 1013 1103 - niov = io_zcrx_alloc_fallback(area); 1014 + niov = io_alloc_fallback_niov(ifq); 1104 1015 if (!niov) { 1105 1016 ret = -ENOMEM; 1106 1017 break;

+15 -4

io_uring/zcrx.h

··· 16 16 unsigned long nr_folios; 17 17 struct sg_table page_sg_table; 18 18 unsigned long account_pages; 19 + struct sg_table *sgt; 19 20 20 21 struct dma_buf_attachment *attach; 21 22 struct dma_buf *dmabuf; 22 - struct sg_table *sgt; 23 - unsigned long dmabuf_offset; 24 23 }; 25 24 26 25 struct io_zcrx_area { ··· 41 42 struct io_zcrx_ifq { 42 43 struct io_ring_ctx *ctx; 43 44 struct io_zcrx_area *area; 45 + unsigned niov_shift; 44 46 45 47 spinlock_t rq_lock ____cacheline_aligned_in_smp; 46 48 struct io_uring *rq_ring; ··· 53 53 struct device *dev; 54 54 struct net_device *netdev; 55 55 netdevice_tracker netdev_tracker; 56 - spinlock_t lock; 57 - struct mutex dma_lock; 56 + 57 + /* 58 + * Page pool and net configuration lock, can be taken deeper in the 59 + * net stack. 60 + */ 61 + struct mutex pp_lock; 58 62 struct io_mapped_region region; 59 63 }; 60 64 61 65 #if defined(CONFIG_IO_URING_ZCRX) 66 + int io_zcrx_return_bufs(struct io_ring_ctx *ctx, 67 + void __user *arg, unsigned nr_arg); 62 68 int io_register_zcrx_ifq(struct io_ring_ctx *ctx, 63 69 struct io_uring_zcrx_ifq_reg __user *arg); 64 70 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); ··· 96 90 unsigned int id) 97 91 { 98 92 return NULL; 93 + } 94 + static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, 95 + void __user *arg, unsigned nr_arg) 96 + { 97 + return -EOPNOTSUPP; 99 98 } 100 99 #endif 101 100

Configure Feed

Configure Feed