Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

- Store ring provided buffers locally for the users, rather than stuff
them into struct io_kiocb.

These types of buffers must always be fully consumed or recycled in
the current context, and leaving them in struct io_kiocb is hence not
a good idea as that struct has a vastly different lifetime.

Basically just an architecture cleanup that can help prevent issues
with ring provided buffers in the future.

- Support for mixed CQE sizes in the same ring.

Before this change, a CQ ring either used the default 16b CQEs, or it
was set up with 32b CQEs using IORING_SETUP_CQE32. For use cases where
a few 32b CQEs were needed, this caused everything else to use big
CQEs. This is wasteful both in terms of memory usage and memory
bandwidth for the posted CQEs.

With IORING_SETUP_CQE_MIXED, applications may use request types that
post both normal 16b and big 32b CQEs on the same ring.

- Add helpers for async data management, to make it harder for opcode
handlers to mess it up.

- Add support for multishot for uring_cmd, which ublk can use. This
helps improve efficiency, by providing a persistent request type that
can trigger multiple CQEs.

- Add initial support for ring feature querying.

We had basic support for probe operations, but the API isn't great.
Rather than expand that, add support for QUERY which is easily
expandable and can cover a lot more cases than the existing probe
support. This will help applications get a better idea of what
operations are supported on a given host.

- zcrx improvements from Pavel:
- Improve refill entry alignment for better caching
- Various cleanups, especially around deduplicating normal
memory vs dmabuf setup.
- Generalisation of the niov size (Patch 12). It's still hard
coded to PAGE_SIZE on init, but will let the user specify
the rx buffer length on setup.
- Syscall / synchronous buffer return. It'll be used as a slow
fallback path for returning buffers when the refill queue is
full. Useful for tolerating slight queue size misconfiguration
or with inconsistent load.
- Accounting more memory to cgroups.
- Additional independent cleanups that will also be useful for
multi-area support.

- Various fixes and cleanups

* tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits)
io_uring/cmd: drop unused res2 param from io_uring_cmd_done()
io_uring: fix nvme's 32b cqes on mixed cq
io_uring/query: cap number of queries
io_uring/query: prevent infinite loops
io_uring/zcrx: account niov arrays to cgroup
io_uring/zcrx: allow synchronous buffer return
io_uring/zcrx: introduce io_parse_rqe()
io_uring/zcrx: don't adjust free cache space
io_uring/zcrx: use guards for the refill lock
io_uring/zcrx: reduce netmem scope in refill
io_uring/zcrx: protect netdev with pp_lock
io_uring/zcrx: rename dma lock
io_uring/zcrx: make niov size variable
io_uring/zcrx: set sgt for umem area
io_uring/zcrx: remove dmabuf_offset
io_uring/zcrx: deduplicate area mapping
io_uring/zcrx: pass ifq to io_zcrx_alloc_fallback()
io_uring/zcrx: check all niovs filled with dma addresses
io_uring/zcrx: move area reg checks into io_import_area
io_uring/zcrx: don't pass slot to io_zcrx_create_area
...

+1001 -452
+1 -1
Documentation/networking/iou-zcrx.rst
··· 75 75 76 76 IORING_SETUP_SINGLE_ISSUER 77 77 IORING_SETUP_DEFER_TASKRUN 78 - IORING_SETUP_CQE32 78 + IORING_SETUP_CQE32 or IORING_SETUP_CQE_MIXED 79 79 80 80 Create memory area 81 81 ------------------
+1 -1
block/ioctl.c
··· 776 776 if (bic->res == -EAGAIN && bic->nowait) 777 777 io_uring_cmd_issue_blocking(cmd); 778 778 else 779 - io_uring_cmd_done(cmd, bic->res, 0, issue_flags); 779 + io_uring_cmd_done(cmd, bic->res, issue_flags); 780 780 } 781 781 782 782 static void bio_cmd_bio_end_io(struct bio *bio)
+3 -3
drivers/block/ublk_drv.c
··· 1189 1189 struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req); 1190 1190 1191 1191 /* tell ublksrv one io request is coming */ 1192 - io_uring_cmd_done(cmd, res, 0, issue_flags); 1192 + io_uring_cmd_done(cmd, res, issue_flags); 1193 1193 } 1194 1194 1195 1195 #define UBLK_REQUEUE_DELAY_MS 3 ··· 1873 1873 spin_unlock(&ubq->cancel_lock); 1874 1874 1875 1875 if (!done) 1876 - io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags); 1876 + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags); 1877 1877 } 1878 1878 1879 1879 /* ··· 2520 2520 int ret = ublk_ch_uring_cmd_local(cmd, issue_flags); 2521 2521 2522 2522 if (ret != -EIOCBQUEUED) 2523 - io_uring_cmd_done(cmd, ret, 0, issue_flags); 2523 + io_uring_cmd_done(cmd, ret, issue_flags); 2524 2524 } 2525 2525 2526 2526 static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+1 -1
drivers/nvme/host/ioctl.c
··· 410 410 411 411 if (pdu->bio) 412 412 blk_rq_unmap_user(pdu->bio); 413 - io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags); 413 + io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags); 414 414 } 415 415 416 416 static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
+1 -1
fs/btrfs/ioctl.c
··· 4695 4695 btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); 4696 4696 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); 4697 4697 4698 - io_uring_cmd_done(cmd, ret, 0, issue_flags); 4698 + io_uring_cmd_done(cmd, ret, issue_flags); 4699 4699 add_rchar(current, ret); 4700 4700 4701 4701 for (index = 0; index < priv->nr_pages; index++)
+4 -4
fs/fuse/dev_uring.c
··· 351 351 spin_unlock(&queue->lock); 352 352 353 353 if (cmd) 354 - io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); 354 + io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED); 355 355 356 356 if (req) 357 357 fuse_uring_stop_fuse_req_end(req); ··· 518 518 519 519 if (need_cmd_done) { 520 520 /* no queue lock to avoid lock order issues */ 521 - io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); 521 + io_uring_cmd_done(cmd, -ENOTCONN, issue_flags); 522 522 } 523 523 } 524 524 ··· 733 733 list_move_tail(&ent->list, &queue->ent_in_userspace); 734 734 spin_unlock(&queue->lock); 735 735 736 - io_uring_cmd_done(cmd, 0, 0, issue_flags); 736 + io_uring_cmd_done(cmd, 0, issue_flags); 737 737 return 0; 738 738 } 739 739 ··· 1200 1200 ent->cmd = NULL; 1201 1201 spin_unlock(&queue->lock); 1202 1202 1203 - io_uring_cmd_done(cmd, ret, 0, issue_flags); 1203 + io_uring_cmd_done(cmd, ret, issue_flags); 1204 1204 } 1205 1205 1206 1206 /*
+49 -20
include/linux/io_uring/cmd.h
··· 11 11 /* io_uring_cmd is being issued again */ 12 12 #define IORING_URING_CMD_REISSUE (1U << 31) 13 13 14 + typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, 15 + unsigned issue_flags); 16 + 14 17 struct io_uring_cmd { 15 18 struct file *file; 16 19 const struct io_uring_sqe *sqe; 17 20 /* callback to defer completions to task context */ 18 - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); 21 + io_uring_cmd_tw_t task_work_cb; 19 22 u32 cmd_op; 20 23 u32 flags; 21 24 u8 pdu[32]; /* available inline for free use */ ··· 56 53 * Note: the caller should never hard code @issue_flags and is only allowed 57 54 * to pass the mask provided by the core io_uring code. 58 55 */ 59 - void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2, 60 - unsigned issue_flags); 56 + void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, 57 + unsigned issue_flags, bool is_cqe32); 61 58 62 59 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 63 - void (*task_work_cb)(struct io_uring_cmd *, unsigned), 60 + io_uring_cmd_tw_t task_work_cb, 64 61 unsigned flags); 65 62 66 63 /* ··· 72 69 73 70 /* Execute the request from a blocking context */ 74 71 void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); 72 + 73 + /* 74 + * Select a buffer from the provided buffer group for multishot uring_cmd. 75 + * Returns the selected buffer address and size. 76 + */ 77 + struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, 78 + unsigned buf_group, size_t *len, 79 + unsigned int issue_flags); 80 + 81 + /* 82 + * Complete a multishot uring_cmd event. This will post a CQE to the completion 83 + * queue and update the provided buffer. 
84 + */ 85 + bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, 86 + struct io_br_sel *sel, unsigned int issue_flags); 75 87 76 88 #else 77 89 static inline int ··· 104 86 { 105 87 return -EOPNOTSUPP; 106 88 } 107 - static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, 108 - u64 ret2, unsigned issue_flags) 89 + static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, 90 + u64 ret2, unsigned issue_flags, bool is_cqe32) 109 91 { 110 92 } 111 93 static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 112 - void (*task_work_cb)(struct io_uring_cmd *, unsigned), 113 - unsigned flags) 94 + io_uring_cmd_tw_t task_work_cb, unsigned flags) 114 95 { 115 96 } 116 97 static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, ··· 119 102 static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) 120 103 { 121 104 } 122 - #endif 123 - 124 - /* 125 - * Polled completions must ensure they are coming from a poll queue, and 126 - * hence are completed inside the usual poll handling loops. 
127 - */ 128 - static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, 129 - ssize_t ret, ssize_t res2) 105 + static inline struct io_br_sel 106 + io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group, 107 + size_t *len, unsigned int issue_flags) 130 108 { 131 - lockdep_assert(in_task()); 132 - io_uring_cmd_done(ioucmd, ret, res2, 0); 109 + return (struct io_br_sel) { .val = -EOPNOTSUPP }; 133 110 } 111 + static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, 112 + struct io_br_sel *sel, unsigned int issue_flags) 113 + { 114 + return true; 115 + } 116 + #endif 134 117 135 118 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ 136 119 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, 137 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 120 + io_uring_cmd_tw_t task_work_cb) 138 121 { 139 122 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); 140 123 } 141 124 142 125 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, 143 - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) 126 + io_uring_cmd_tw_t task_work_cb) 144 127 { 145 128 __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); 146 129 } ··· 157 140 static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd) 158 141 { 159 142 return cmd_to_io_kiocb(cmd)->ctx; 143 + } 144 + 145 + static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, 146 + unsigned issue_flags) 147 + { 148 + return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false); 149 + } 150 + 151 + static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret, 152 + u64 res2, unsigned issue_flags) 153 + { 154 + return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true); 160 155 } 161 156 162 157 int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
+19 -12
include/linux/io_uring_types.h
··· 86 86 }; 87 87 88 88 /* 89 + * Return value from io_buffer_list selection, to avoid stashing it in 90 + * struct io_kiocb. For legacy/classic provided buffers, keeping a reference 91 + * across execution contexts are fine. But for ring provided buffers, the 92 + * list may go away as soon as ->uring_lock is dropped. As the io_kiocb 93 + * persists, it's better to just keep the buffer local for those cases. 94 + */ 95 + struct io_br_sel { 96 + struct io_buffer_list *buf_list; 97 + /* 98 + * Some selection parts return the user address, others return an error. 99 + */ 100 + union { 101 + void __user *addr; 102 + ssize_t val; 103 + }; 104 + }; 105 + 106 + 107 + /* 89 108 * Arbitrary limit, can be raised if need be 90 109 */ 91 110 #define IO_RINGFD_REG_MAX 16 ··· 690 671 /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ 691 672 struct io_buffer *kbuf; 692 673 693 - /* 694 - * stores buffer ID for ring provided buffers, valid IFF 695 - * REQ_F_BUFFER_RING is set. 696 - */ 697 - struct io_buffer_list *buf_list; 698 - 699 674 struct io_rsrc_node *buf_node; 700 675 }; 701 676 ··· 737 724 struct list_head list; 738 725 struct io_uring_cqe cqe; 739 726 }; 740 - 741 - static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx) 742 - { 743 - return ctx->flags & IORING_SETUP_CQE32; 744 - } 745 - 746 727 #endif
+3
include/linux/poison.h
··· 90 90 /********** lib/stackdepot.c **********/ 91 91 #define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA)) 92 92 93 + /********** io_uring/ **********/ 94 + #define IO_URING_PTR_POISON ((void *)(0x1091UL + POISON_POINTER_DELTA)) 95 + 93 96 #endif
+2 -2
include/trace/events/io_uring.h
··· 340 340 __entry->user_data = cqe->user_data; 341 341 __entry->res = cqe->res; 342 342 __entry->cflags = cqe->flags; 343 - __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0; 344 - __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0; 343 + __entry->extra1 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[0] : 0; 344 + __entry->extra2 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[1] : 0; 345 345 ), 346 346 347 347 TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
+37 -1
include/uapi/linux/io_uring.h
··· 225 225 /* Use hybrid poll in iopoll process */ 226 226 #define IORING_SETUP_HYBRID_IOPOLL (1U << 17) 227 227 228 + /* 229 + * Allow both 16b and 32b CQEs. If a 32b CQE is posted, it will have 230 + * IORING_CQE_F_32 set in cqe->flags. 231 + */ 232 + #define IORING_SETUP_CQE_MIXED (1U << 18) 233 + 228 234 enum io_uring_op { 229 235 IORING_OP_NOP, 230 236 IORING_OP_READV, ··· 304 298 * sqe->uring_cmd_flags top 8bits aren't available for userspace 305 299 * IORING_URING_CMD_FIXED use registered buffer; pass this flag 306 300 * along with setting sqe->buf_index. 301 + * IORING_URING_CMD_MULTISHOT must be used with buffer select, like other 302 + * multishot commands. Not compatible with 303 + * IORING_URING_CMD_FIXED, for now. 307 304 */ 308 305 #define IORING_URING_CMD_FIXED (1U << 0) 309 - #define IORING_URING_CMD_MASK IORING_URING_CMD_FIXED 306 + #define IORING_URING_CMD_MULTISHOT (1U << 1) 307 + #define IORING_URING_CMD_MASK (IORING_URING_CMD_FIXED | IORING_URING_CMD_MULTISHOT) 310 308 311 309 312 310 /* ··· 464 454 #define IORING_NOP_FIXED_FILE (1U << 2) 465 455 #define IORING_NOP_FIXED_BUFFER (1U << 3) 466 456 #define IORING_NOP_TW (1U << 4) 457 + #define IORING_NOP_CQE32 (1U << 5) 467 458 468 459 /* 469 460 * IO completion data structure (Completion Queue Entry) ··· 498 487 * other provided buffer type, all completions with a 499 488 * buffer passed back is automatically returned to the 500 489 * application. 490 + * IORING_CQE_F_SKIP If set, then the application/liburing must ignore this 491 + * CQE. It's only purpose is to fill a gap in the ring, 492 + * if a large CQE is attempted posted when the ring has 493 + * just a single small CQE worth of space left before 494 + * wrapping. 495 + * IORING_CQE_F_32 If set, this is a 32b/big-cqe posting. Use with rings 496 + * setup in a mixed CQE mode, where both 16b and 32b 497 + * CQEs may be posted to the CQ ring. 
501 498 */ 502 499 #define IORING_CQE_F_BUFFER (1U << 0) 503 500 #define IORING_CQE_F_MORE (1U << 1) 504 501 #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) 505 502 #define IORING_CQE_F_NOTIF (1U << 3) 506 503 #define IORING_CQE_F_BUF_MORE (1U << 4) 504 + #define IORING_CQE_F_SKIP (1U << 5) 505 + #define IORING_CQE_F_32 (1U << 15) 507 506 508 507 #define IORING_CQE_BUFFER_SHIFT 16 509 508 ··· 685 664 IORING_REGISTER_RESIZE_RINGS = 33, 686 665 687 666 IORING_REGISTER_MEM_REGION = 34, 667 + 668 + /* query various aspects of io_uring, see linux/io_uring/query.h */ 669 + IORING_REGISTER_QUERY = 35, 670 + 671 + /* return zcrx buffers back into circulation */ 672 + IORING_REGISTER_ZCRX_REFILL = 36, 688 673 689 674 /* this goes last */ 690 675 IORING_REGISTER_LAST, ··· 1071 1044 __u32 zcrx_id; 1072 1045 __u32 __resv2; 1073 1046 __u64 __resv[3]; 1047 + }; 1048 + 1049 + struct io_uring_zcrx_sync_refill { 1050 + __u32 zcrx_id; 1051 + /* the number of entries to return */ 1052 + __u32 nr_entries; 1053 + /* pointer to an array of struct io_uring_zcrx_rqe */ 1054 + __u64 rqes; 1055 + __u64 __resv[2]; 1074 1056 }; 1075 1057 1076 1058 #ifdef __cplusplus
+41
include/uapi/linux/io_uring/query.h
··· 1 + /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ 2 + /* 3 + * Header file for the io_uring query interface. 4 + */ 5 + #ifndef LINUX_IO_URING_QUERY_H 6 + #define LINUX_IO_URING_QUERY_H 7 + 8 + #include <linux/types.h> 9 + 10 + struct io_uring_query_hdr { 11 + __u64 next_entry; 12 + __u64 query_data; 13 + __u32 query_op; 14 + __u32 size; 15 + __s32 result; 16 + __u32 __resv[3]; 17 + }; 18 + 19 + enum { 20 + IO_URING_QUERY_OPCODES = 0, 21 + 22 + __IO_URING_QUERY_MAX, 23 + }; 24 + 25 + /* Doesn't require a ring */ 26 + struct io_uring_query_opcode { 27 + /* The number of supported IORING_OP_* opcodes */ 28 + __u32 nr_request_opcodes; 29 + /* The number of supported IORING_[UN]REGISTER_* opcodes */ 30 + __u32 nr_register_opcodes; 31 + /* Bitmask of all supported IORING_FEAT_* flags */ 32 + __u64 feature_flags; 33 + /* Bitmask of all supported IORING_SETUP_* flags */ 34 + __u64 ring_setup_flags; 35 + /* Bitmask of all supported IORING_ENTER_** flags */ 36 + __u64 enter_flags; 37 + /* Bitmask of all supported IOSQE_* flags */ 38 + __u64 sqe_flags; 39 + }; 40 + 41 + #endif
+1 -1
io_uring/Makefile
··· 13 13 sync.o msg_ring.o advise.o openclose.o \ 14 14 statx.o timeout.o cancel.o \ 15 15 waitid.o register.o truncate.o \ 16 - memmap.o alloc_cache.o 16 + memmap.o alloc_cache.o query.o 17 17 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o 18 18 obj-$(CONFIG_IO_WQ) += io-wq.o 19 19 obj-$(CONFIG_FUTEX) += futex.o
+1
io_uring/cancel.c
··· 11 11 12 12 #include <uapi/linux/io_uring.h> 13 13 14 + #include "filetable.h" 14 15 #include "io_uring.h" 15 16 #include "tctx.h" 16 17 #include "poll.h"
+2 -1
io_uring/cmd_net.c
··· 4 4 #include <net/sock.h> 5 5 6 6 #include "uring_cmd.h" 7 + #include "io_uring.h" 7 8 8 9 static inline int io_uring_cmd_getsockopt(struct socket *sock, 9 10 struct io_uring_cmd *cmd, ··· 74 73 75 74 cqe->user_data = 0; 76 75 cqe->res = tskey; 77 - cqe->flags = IORING_CQE_F_MORE; 76 + cqe->flags = IORING_CQE_F_MORE | ctx_cqe32_flags(cmd_to_io_kiocb(cmd)->ctx); 78 77 cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; 79 78 if (ret == SOF_TIMESTAMPING_TX_HARDWARE) 80 79 cqe->flags |= IORING_CQE_F_TSTAMP_HW;
+13 -11
io_uring/fdinfo.c
··· 9 9 10 10 #include <uapi/linux/io_uring.h> 11 11 12 - #include "io_uring.h" 12 + #include "filetable.h" 13 13 #include "sqpoll.h" 14 14 #include "fdinfo.h" 15 15 #include "cancel.h" ··· 65 65 unsigned int sq_tail = READ_ONCE(r->sq.tail); 66 66 unsigned int cq_head = READ_ONCE(r->cq.head); 67 67 unsigned int cq_tail = READ_ONCE(r->cq.tail); 68 - unsigned int cq_shift = 0; 69 68 unsigned int sq_shift = 0; 70 - unsigned int sq_entries, cq_entries; 69 + unsigned int sq_entries; 71 70 int sq_pid = -1, sq_cpu = -1; 72 71 u64 sq_total_time = 0, sq_work_time = 0; 73 72 unsigned int i; 74 73 75 - if (ctx->flags & IORING_SETUP_CQE32) 76 - cq_shift = 1; 77 74 if (ctx->flags & IORING_SETUP_SQE128) 78 75 sq_shift = 1; 79 76 ··· 122 125 seq_printf(m, "\n"); 123 126 } 124 127 seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); 125 - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); 126 - for (i = 0; i < cq_entries; i++) { 127 - unsigned int entry = i + cq_head; 128 - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; 128 + while (cq_head < cq_tail) { 129 + struct io_uring_cqe *cqe; 130 + bool cqe32 = false; 129 131 132 + cqe = &r->cqes[(cq_head & cq_mask)]; 133 + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) 134 + cqe32 = true; 130 135 seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", 131 - entry & cq_mask, cqe->user_data, cqe->res, 136 + cq_head & cq_mask, cqe->user_data, cqe->res, 132 137 cqe->flags); 133 - if (cq_shift) 138 + if (cqe32) 134 139 seq_printf(m, ", extra1:%llu, extra2:%llu\n", 135 140 cqe->big_cqe[0], cqe->big_cqe[1]); 136 141 seq_printf(m, "\n"); 142 + cq_head++; 143 + if (cqe32) 144 + cq_head++; 137 145 } 138 146 139 147 if (ctx->flags & IORING_SETUP_SQPOLL) {
+4 -9
io_uring/futex.c
··· 43 43 44 44 static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) 45 45 { 46 - req->async_data = NULL; 47 46 hlist_del_init(&req->hash_node); 48 47 io_req_task_complete(req, tw); 49 48 } ··· 53 54 54 55 io_tw_lock(ctx, tw); 55 56 io_cache_free(&ctx->futex_cache, req->async_data); 57 + io_req_async_data_clear(req, 0); 56 58 __io_futex_complete(req, tw); 57 59 } 58 60 ··· 72 72 io_req_set_res(req, res, 0); 73 73 } 74 74 75 - kfree(req->async_data); 76 - req->flags &= ~REQ_F_ASYNC_DATA; 75 + io_req_async_data_free(req); 77 76 __io_futex_complete(req, tw); 78 77 } 79 78 ··· 231 232 io_ring_submit_unlock(ctx, issue_flags); 232 233 req_set_fail(req); 233 234 io_req_set_res(req, ret, 0); 234 - kfree(futexv); 235 - req->async_data = NULL; 236 - req->flags &= ~REQ_F_ASYNC_DATA; 235 + io_req_async_data_free(req); 237 236 return IOU_COMPLETE; 238 237 } 239 238 ··· 307 310 if (ret < 0) 308 311 req_set_fail(req); 309 312 io_req_set_res(req, ret, 0); 310 - req->async_data = NULL; 311 - req->flags &= ~REQ_F_ASYNC_DATA; 312 - kfree(ifd); 313 + io_req_async_data_free(req); 313 314 return IOU_COMPLETE; 314 315 } 315 316
+96 -49
io_uring/io_uring.c
··· 79 79 80 80 #include "io-wq.h" 81 81 82 + #include "filetable.h" 82 83 #include "io_uring.h" 83 84 #include "opdef.h" 84 85 #include "refs.h" ··· 108 107 109 108 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 110 109 IOSQE_IO_HARDLINK | IOSQE_ASYNC) 111 - 112 - #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ 113 - IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) 114 110 115 111 #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 116 112 ··· 177 179 }; 178 180 #endif 179 181 182 + static void io_poison_cached_req(struct io_kiocb *req) 183 + { 184 + req->ctx = IO_URING_PTR_POISON; 185 + req->tctx = IO_URING_PTR_POISON; 186 + req->file = IO_URING_PTR_POISON; 187 + req->creds = IO_URING_PTR_POISON; 188 + req->io_task_work.func = IO_URING_PTR_POISON; 189 + req->apoll = IO_URING_PTR_POISON; 190 + } 191 + 192 + static void io_poison_req(struct io_kiocb *req) 193 + { 194 + io_poison_cached_req(req); 195 + req->async_data = IO_URING_PTR_POISON; 196 + req->kbuf = IO_URING_PTR_POISON; 197 + req->comp_list.next = IO_URING_PTR_POISON; 198 + req->file_node = IO_URING_PTR_POISON; 199 + req->link = IO_URING_PTR_POISON; 200 + } 201 + 180 202 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 181 203 { 182 204 return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); ··· 253 235 254 236 static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) 255 237 { 238 + if (IS_ENABLED(CONFIG_KASAN)) 239 + io_poison_cached_req(req); 256 240 wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); 257 241 } 258 242 ··· 614 594 615 595 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) 616 596 { 617 - size_t cqe_size = sizeof(struct io_uring_cqe); 618 - 619 597 lockdep_assert_held(&ctx->uring_lock); 620 598 621 599 /* don't abort if we're dying, entries must get freed */ 622 600 if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) 623 601 return; 624 602 625 - if 
(ctx->flags & IORING_SETUP_CQE32) 626 - cqe_size <<= 1; 627 - 628 603 io_cq_lock(ctx); 629 604 while (!list_empty(&ctx->cq_overflow_list)) { 605 + size_t cqe_size = sizeof(struct io_uring_cqe); 630 606 struct io_uring_cqe *cqe; 631 607 struct io_overflow_cqe *ocqe; 608 + bool is_cqe32 = false; 632 609 633 610 ocqe = list_first_entry(&ctx->cq_overflow_list, 634 611 struct io_overflow_cqe, list); 612 + if (ocqe->cqe.flags & IORING_CQE_F_32 || 613 + ctx->flags & IORING_SETUP_CQE32) { 614 + is_cqe32 = true; 615 + cqe_size <<= 1; 616 + } 635 617 636 618 if (!dying) { 637 - if (!io_get_cqe_overflow(ctx, &cqe, true)) 619 + if (!io_get_cqe_overflow(ctx, &cqe, true, is_cqe32)) 638 620 break; 639 621 memcpy(cqe, &ocqe->cqe, cqe_size); 640 622 } ··· 748 726 { 749 727 struct io_overflow_cqe *ocqe; 750 728 size_t ocq_size = sizeof(struct io_overflow_cqe); 751 - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); 729 + bool is_cqe32 = false; 752 730 753 - if (is_cqe32) 731 + if (cqe->flags & IORING_CQE_F_32 || ctx->flags & IORING_SETUP_CQE32) { 732 + is_cqe32 = true; 754 733 ocq_size += sizeof(struct io_uring_cqe); 734 + } 755 735 756 736 ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); 757 737 trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); ··· 772 748 } 773 749 774 750 /* 751 + * Fill an empty dummy CQE, in case alignment is off for posting a 32b CQE 752 + * because the ring is a single 16b entry away from wrapping. 
753 + */ 754 + static bool io_fill_nop_cqe(struct io_ring_ctx *ctx, unsigned int off) 755 + { 756 + if (__io_cqring_events(ctx) < ctx->cq_entries) { 757 + struct io_uring_cqe *cqe = &ctx->rings->cqes[off]; 758 + 759 + cqe->user_data = 0; 760 + cqe->res = 0; 761 + cqe->flags = IORING_CQE_F_SKIP; 762 + ctx->cached_cq_tail++; 763 + return true; 764 + } 765 + return false; 766 + } 767 + 768 + /* 775 769 * writes to the cq entry need to come after reading head; the 776 770 * control dependency is enough as we're using WRITE_ONCE to 777 771 * fill the cq entry 778 772 */ 779 - bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) 773 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32) 780 774 { 781 775 struct io_rings *rings = ctx->rings; 782 776 unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); ··· 808 766 if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) 809 767 return false; 810 768 769 + /* 770 + * Post dummy CQE if a 32b CQE is needed and there's only room for a 771 + * 16b CQE before the ring wraps. 
772 + */ 773 + if (cqe32 && off + 1 == ctx->cq_entries) { 774 + if (!io_fill_nop_cqe(ctx, off)) 775 + return false; 776 + off = 0; 777 + } 778 + 811 779 /* userspace may cheat modifying the tail, be safe and do min */ 812 780 queued = min(__io_cqring_events(ctx), ctx->cq_entries); 813 781 free = ctx->cq_entries - queued; 814 782 /* we need a contiguous range, limit based on the current array offset */ 815 783 len = min(free, ctx->cq_entries - off); 816 - if (!len) 784 + if (len < (cqe32 + 1)) 817 785 return false; 818 786 819 787 if (ctx->flags & IORING_SETUP_CQE32) { ··· 841 789 { 842 790 struct io_uring_cqe *cqe; 843 791 844 - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) 792 + if (WARN_ON_ONCE(!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))) 845 793 return false; 846 - if (unlikely(!io_get_cqe(ctx, &cqe))) 794 + if (unlikely(!io_get_cqe(ctx, &cqe, true))) 847 795 return false; 848 796 849 797 memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); ··· 854 802 static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, 855 803 u32 cflags) 856 804 { 805 + bool cqe32 = cflags & IORING_CQE_F_32; 857 806 struct io_uring_cqe *cqe; 858 807 859 - if (likely(io_get_cqe(ctx, &cqe))) { 808 + if (likely(io_get_cqe(ctx, &cqe, cqe32))) { 860 809 WRITE_ONCE(cqe->user_data, user_data); 861 810 WRITE_ONCE(cqe->res, res); 862 811 WRITE_ONCE(cqe->flags, cflags); 863 812 864 - if (ctx->flags & IORING_SETUP_CQE32) { 813 + if (cqe32) { 865 814 WRITE_ONCE(cqe->big_cqe[0], 0); 866 815 WRITE_ONCE(cqe->big_cqe[1], 0); 867 816 } ··· 1034 981 lockdep_assert_held(&req->ctx->uring_lock); 1035 982 1036 983 req_set_fail(req); 1037 - io_req_set_res(req, res, io_put_kbuf(req, res, IO_URING_F_UNLOCKED)); 984 + io_req_set_res(req, res, io_put_kbuf(req, res, NULL)); 1038 985 if (def->fail) 1039 986 def->fail(req); 1040 987 io_req_complete_defer(req); ··· 2054 2001 2055 2002 switch (io_arm_poll_handler(req, 0)) { 2056 2003 case IO_APOLL_READY: 2057 - io_kbuf_recycle(req, 0); 
2058 2004 io_req_task_queue(req); 2059 2005 break; 2060 2006 case IO_APOLL_ABORTED: 2061 - io_kbuf_recycle(req, 0); 2062 2007 io_queue_iowq(req); 2063 2008 break; 2064 2009 case IO_APOLL_OK: ··· 2785 2734 if (check_shl_overflow(off, 1, &off)) 2786 2735 return SIZE_MAX; 2787 2736 } 2737 + if (flags & IORING_SETUP_CQE_MIXED) { 2738 + if (cq_entries < 2) 2739 + return SIZE_MAX; 2740 + } 2788 2741 2789 2742 #ifdef CONFIG_SMP 2790 2743 off = ALIGN(off, SMP_CACHE_BYTES); ··· 2820 2765 2821 2766 while (!io_req_cache_empty(ctx)) { 2822 2767 req = io_extract_req(ctx); 2768 + io_poison_req(req); 2823 2769 kmem_cache_free(req_cachep, req); 2824 2770 nr++; 2825 2771 } ··· 3101 3045 3102 3046 INIT_WORK(&ctx->exit_work, io_ring_exit_work); 3103 3047 /* 3104 - * Use system_unbound_wq to avoid spawning tons of event kworkers 3048 + * Use system_dfl_wq to avoid spawning tons of event kworkers 3105 3049 * if we're exiting a ton of rings at the same time. It just adds 3106 3050 * noise and overhead, there's no discernable change in runtime 3107 - * over using system_wq. 3051 + * over using system_percpu_wq. 3108 3052 */ 3109 3053 queue_work(iou_wq, &ctx->exit_work); 3110 3054 } ··· 3458 3402 struct file *file; 3459 3403 long ret; 3460 3404 3461 - if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | 3462 - IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | 3463 - IORING_ENTER_REGISTERED_RING | 3464 - IORING_ENTER_ABS_TIMER | 3465 - IORING_ENTER_EXT_ARG_REG | 3466 - IORING_ENTER_NO_IOWAIT))) 3405 + if (unlikely(flags & ~IORING_ENTER_FLAGS)) 3467 3406 return -EINVAL; 3468 3407 3469 3408 /* ··· 3708 3657 !(flags & IORING_SETUP_SINGLE_ISSUER)) 3709 3658 return -EINVAL; 3710 3659 3660 + /* 3661 + * Nonsensical to ask for CQE32 and mixed CQE support, it's not 3662 + * supported to post 16b CQEs on a ring setup with CQE32. 
3663 + */ 3664 + if ((flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) == 3665 + (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)) 3666 + return -EINVAL; 3667 + 3711 3668 return 0; 3712 3669 } 3713 3670 ··· 3866 3807 if (ret) 3867 3808 goto err; 3868 3809 3869 - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | 3870 - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | 3871 - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | 3872 - IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | 3873 - IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | 3874 - IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | 3875 - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | 3876 - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | 3877 - IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT; 3810 + p->features = IORING_FEAT_FLAGS; 3878 3811 3879 3812 if (copy_to_user(params, p, sizeof(*p))) { 3880 3813 ret = -EFAULT; ··· 3874 3823 } 3875 3824 3876 3825 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER 3877 - && !(ctx->flags & IORING_SETUP_R_DISABLED)) 3878 - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); 3826 + && !(ctx->flags & IORING_SETUP_R_DISABLED)) { 3827 + /* 3828 + * Unlike io_register_enable_rings(), don't need WRITE_ONCE() 3829 + * since ctx isn't yet accessible from other tasks 3830 + */ 3831 + ctx->submitter_task = get_task_struct(current); 3832 + } 3879 3833 3880 3834 file = io_uring_get_file(ctx); 3881 3835 if (IS_ERR(file)) { ··· 3931 3875 return -EINVAL; 3932 3876 } 3933 3877 3934 - if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | 3935 - IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE | 3936 - IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | 3937 - IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | 3938 - IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | 3939 - IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | 3940 - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | 3941 - IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | 3942 - 
IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) 3878 + if (p.flags & ~IORING_SETUP_FLAGS) 3943 3879 return -EINVAL; 3944 - 3945 3880 return io_uring_create(entries, &p, params); 3946 3881 } 3947 3882
+105 -15
io_uring/io_uring.h
··· 11 11 #include "alloc_cache.h" 12 12 #include "io-wq.h" 13 13 #include "slist.h" 14 - #include "filetable.h" 15 14 #include "opdef.h" 16 15 17 16 #ifndef CREATE_TRACE_POINTS 18 17 #include <trace/events/io_uring.h> 19 18 #endif 19 + 20 + #define IORING_FEAT_FLAGS (IORING_FEAT_SINGLE_MMAP |\ 21 + IORING_FEAT_NODROP |\ 22 + IORING_FEAT_SUBMIT_STABLE |\ 23 + IORING_FEAT_RW_CUR_POS |\ 24 + IORING_FEAT_CUR_PERSONALITY |\ 25 + IORING_FEAT_FAST_POLL |\ 26 + IORING_FEAT_POLL_32BITS |\ 27 + IORING_FEAT_SQPOLL_NONFIXED |\ 28 + IORING_FEAT_EXT_ARG |\ 29 + IORING_FEAT_NATIVE_WORKERS |\ 30 + IORING_FEAT_RSRC_TAGS |\ 31 + IORING_FEAT_CQE_SKIP |\ 32 + IORING_FEAT_LINKED_FILE |\ 33 + IORING_FEAT_REG_REG_RING |\ 34 + IORING_FEAT_RECVSEND_BUNDLE |\ 35 + IORING_FEAT_MIN_TIMEOUT |\ 36 + IORING_FEAT_RW_ATTR |\ 37 + IORING_FEAT_NO_IOWAIT) 38 + 39 + #define IORING_SETUP_FLAGS (IORING_SETUP_IOPOLL |\ 40 + IORING_SETUP_SQPOLL |\ 41 + IORING_SETUP_SQ_AFF |\ 42 + IORING_SETUP_CQSIZE |\ 43 + IORING_SETUP_CLAMP |\ 44 + IORING_SETUP_ATTACH_WQ |\ 45 + IORING_SETUP_R_DISABLED |\ 46 + IORING_SETUP_SUBMIT_ALL |\ 47 + IORING_SETUP_COOP_TASKRUN |\ 48 + IORING_SETUP_TASKRUN_FLAG |\ 49 + IORING_SETUP_SQE128 |\ 50 + IORING_SETUP_CQE32 |\ 51 + IORING_SETUP_SINGLE_ISSUER |\ 52 + IORING_SETUP_DEFER_TASKRUN |\ 53 + IORING_SETUP_NO_MMAP |\ 54 + IORING_SETUP_REGISTERED_FD_ONLY |\ 55 + IORING_SETUP_NO_SQARRAY |\ 56 + IORING_SETUP_HYBRID_IOPOLL |\ 57 + IORING_SETUP_CQE_MIXED) 58 + 59 + #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ 60 + IORING_ENTER_SQ_WAKEUP |\ 61 + IORING_ENTER_SQ_WAIT |\ 62 + IORING_ENTER_EXT_ARG |\ 63 + IORING_ENTER_REGISTERED_RING |\ 64 + IORING_ENTER_ABS_TIMER |\ 65 + IORING_ENTER_EXT_ARG_REG |\ 66 + IORING_ENTER_NO_IOWAIT) 67 + 68 + 69 + #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE |\ 70 + IOSQE_IO_DRAIN |\ 71 + IOSQE_IO_LINK |\ 72 + IOSQE_IO_HARDLINK |\ 73 + IOSQE_ASYNC |\ 74 + IOSQE_BUFFER_SELECT |\ 75 + IOSQE_CQE_SKIP_SUCCESS) 20 76 21 77 enum { 22 78 IOU_COMPLETE = 0, ··· 131 
75 unsigned long rings_size(unsigned int flags, unsigned int sq_entries, 132 76 unsigned int cq_entries, size_t *sq_offset); 133 77 int io_uring_fill_params(unsigned entries, struct io_uring_params *p); 134 - bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); 78 + bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); 135 79 int io_run_task_work_sig(struct io_ring_ctx *ctx); 136 80 void io_req_defer_failed(struct io_kiocb *req, s32 res); 137 81 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); ··· 225 169 226 170 static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, 227 171 struct io_uring_cqe **ret, 228 - bool overflow) 172 + bool overflow, bool cqe32) 229 173 { 230 174 io_lockdep_assert_cq_locked(ctx); 231 175 232 - if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { 233 - if (unlikely(!io_cqe_cache_refill(ctx, overflow))) 176 + if (unlikely(ctx->cqe_sentinel - ctx->cqe_cached < (cqe32 + 1))) { 177 + if (unlikely(!io_cqe_cache_refill(ctx, overflow, cqe32))) 234 178 return false; 235 179 } 236 180 *ret = ctx->cqe_cached; 237 181 ctx->cached_cq_tail++; 238 182 ctx->cqe_cached++; 239 - if (ctx->flags & IORING_SETUP_CQE32) 183 + if (ctx->flags & IORING_SETUP_CQE32) { 240 184 ctx->cqe_cached++; 185 + } else if (cqe32 && ctx->flags & IORING_SETUP_CQE_MIXED) { 186 + ctx->cqe_cached++; 187 + ctx->cached_cq_tail++; 188 + } 189 + WARN_ON_ONCE(ctx->cqe_cached > ctx->cqe_sentinel); 241 190 return true; 242 191 } 243 192 244 - static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) 193 + static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret, 194 + bool cqe32) 245 195 { 246 - return io_get_cqe_overflow(ctx, ret, false); 196 + return io_get_cqe_overflow(ctx, ret, false, cqe32); 247 197 } 248 198 249 199 static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, ··· 258 196 io_lockdep_assert_cq_locked(ctx); 259 197 260 198 
ctx->submit_state.cq_flush = true; 261 - return io_get_cqe(ctx, cqe_ret); 199 + return io_get_cqe(ctx, cqe_ret, ctx->flags & IORING_SETUP_CQE_MIXED); 262 200 } 263 201 264 202 static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, 265 203 struct io_kiocb *req) 266 204 { 205 + bool is_cqe32 = req->cqe.flags & IORING_CQE_F_32; 267 206 struct io_uring_cqe *cqe; 268 207 269 208 /* 270 - * If we can't get a cq entry, userspace overflowed the 271 - * submission (by quite a lot). Increment the overflow count in 272 - * the ring. 209 + * If we can't get a cq entry, userspace overflowed the submission 210 + * (by quite a lot). 273 211 */ 274 - if (unlikely(!io_get_cqe(ctx, &cqe))) 212 + if (unlikely(!io_get_cqe(ctx, &cqe, is_cqe32))) 275 213 return false; 276 214 277 - 278 215 memcpy(cqe, &req->cqe, sizeof(*cqe)); 279 - if (ctx->flags & IORING_SETUP_CQE32) { 216 + if (ctx->flags & IORING_SETUP_CQE32 || is_cqe32) { 280 217 memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); 281 218 memset(&req->big_cqe, 0, sizeof(req->big_cqe)); 282 219 } ··· 300 239 req->cqe.flags = cflags; 301 240 } 302 241 242 + static inline u32 ctx_cqe32_flags(struct io_ring_ctx *ctx) 243 + { 244 + if (ctx->flags & IORING_SETUP_CQE_MIXED) 245 + return IORING_CQE_F_32; 246 + return 0; 247 + } 248 + 249 + static inline void io_req_set_res32(struct io_kiocb *req, s32 res, u32 cflags, 250 + __u64 extra1, __u64 extra2) 251 + { 252 + req->cqe.res = res; 253 + req->cqe.flags = cflags | ctx_cqe32_flags(req->ctx); 254 + req->big_cqe.extra1 = extra1; 255 + req->big_cqe.extra2 = extra2; 256 + } 257 + 303 258 static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, 304 259 struct io_kiocb *req) 305 260 { ··· 335 258 static inline bool req_has_async_data(struct io_kiocb *req) 336 259 { 337 260 return req->flags & REQ_F_ASYNC_DATA; 261 + } 262 + 263 + static inline void io_req_async_data_clear(struct io_kiocb *req, 264 + io_req_flags_t extra_flags) 265 + { 266 + req->flags &= 
~(REQ_F_ASYNC_DATA|extra_flags); 267 + req->async_data = NULL; 268 + } 269 + 270 + static inline void io_req_async_data_free(struct io_kiocb *req) 271 + { 272 + kfree(req->async_data); 273 + io_req_async_data_clear(req, 0); 338 274 } 339 275 340 276 static inline void io_put_file(struct io_kiocb *req)
+36 -31
io_uring/kbuf.c
··· 155 155 return 1; 156 156 } 157 157 158 - static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, 159 - struct io_buffer_list *bl, 160 - unsigned int issue_flags) 158 + static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len, 159 + struct io_buffer_list *bl, 160 + unsigned int issue_flags) 161 161 { 162 162 struct io_uring_buf_ring *br = bl->buf_ring; 163 163 __u16 tail, head = bl->head; 164 + struct io_br_sel sel = { }; 164 165 struct io_uring_buf *buf; 165 - void __user *ret; 166 166 u32 buf_len; 167 167 168 168 tail = smp_load_acquire(&br->tail); 169 169 if (unlikely(tail == head)) 170 - return NULL; 170 + return sel; 171 171 172 172 if (head + 1 == tail) 173 173 req->flags |= REQ_F_BL_EMPTY; ··· 177 177 if (*len == 0 || *len > buf_len) 178 178 *len = buf_len; 179 179 req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; 180 - req->buf_list = bl; 181 180 req->buf_index = buf->bid; 182 - ret = u64_to_user_ptr(buf->addr); 181 + sel.buf_list = bl; 182 + sel.addr = u64_to_user_ptr(buf->addr); 183 183 184 184 if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { 185 185 /* ··· 192 192 * the transfer completes (or if we get -EAGAIN and must poll of 193 193 * retry). 
194 194 */ 195 - io_kbuf_commit(req, bl, *len, 1); 196 - req->buf_list = NULL; 195 + io_kbuf_commit(req, sel.buf_list, *len, 1); 196 + sel.buf_list = NULL; 197 197 } 198 - return ret; 198 + return sel; 199 199 } 200 200 201 - void __user *io_buffer_select(struct io_kiocb *req, size_t *len, 202 - unsigned buf_group, unsigned int issue_flags) 201 + struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, 202 + unsigned buf_group, unsigned int issue_flags) 203 203 { 204 204 struct io_ring_ctx *ctx = req->ctx; 205 + struct io_br_sel sel = { }; 205 206 struct io_buffer_list *bl; 206 - void __user *ret = NULL; 207 207 208 208 io_ring_submit_lock(req->ctx, issue_flags); 209 209 210 210 bl = io_buffer_get_list(ctx, buf_group); 211 211 if (likely(bl)) { 212 212 if (bl->flags & IOBL_BUF_RING) 213 - ret = io_ring_buffer_select(req, len, bl, issue_flags); 213 + sel = io_ring_buffer_select(req, len, bl, issue_flags); 214 214 else 215 - ret = io_provided_buffer_select(req, len, bl); 215 + sel.addr = io_provided_buffer_select(req, len, bl); 216 216 } 217 217 io_ring_submit_unlock(req->ctx, issue_flags); 218 - return ret; 218 + return sel; 219 219 } 220 220 221 221 /* cap it at a reasonable 256, will be one page even for 4K */ ··· 300 300 req->flags |= REQ_F_BL_EMPTY; 301 301 302 302 req->flags |= REQ_F_BUFFER_RING; 303 - req->buf_list = bl; 304 303 return iov - arg->iovs; 305 304 } 306 305 307 306 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, 308 - unsigned int issue_flags) 307 + struct io_br_sel *sel, unsigned int issue_flags) 309 308 { 310 309 struct io_ring_ctx *ctx = req->ctx; 311 - struct io_buffer_list *bl; 312 310 int ret = -ENOENT; 313 311 314 312 io_ring_submit_lock(ctx, issue_flags); 315 - bl = io_buffer_get_list(ctx, arg->buf_group); 316 - if (unlikely(!bl)) 313 + sel->buf_list = io_buffer_get_list(ctx, arg->buf_group); 314 + if (unlikely(!sel->buf_list)) 317 315 goto out_unlock; 318 316 319 - if (bl->flags & IOBL_BUF_RING) { 320 - 
ret = io_ring_buffers_peek(req, arg, bl); 317 + if (sel->buf_list->flags & IOBL_BUF_RING) { 318 + ret = io_ring_buffers_peek(req, arg, sel->buf_list); 321 319 /* 322 320 * Don't recycle these buffers if we need to go through poll. 323 321 * Nobody else can use them anyway, and holding on to provided ··· 325 327 */ 326 328 if (ret > 0) { 327 329 req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE; 328 - io_kbuf_commit(req, bl, arg->out_len, ret); 330 + io_kbuf_commit(req, sel->buf_list, arg->out_len, ret); 329 331 } 330 332 } else { 331 - ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); 333 + ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs); 332 334 } 333 335 out_unlock: 334 - io_ring_submit_unlock(ctx, issue_flags); 336 + if (issue_flags & IO_URING_F_UNLOCKED) { 337 + sel->buf_list = NULL; 338 + mutex_unlock(&ctx->uring_lock); 339 + } 335 340 return ret; 336 341 } 337 342 338 - int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) 343 + int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, 344 + struct io_br_sel *sel) 339 345 { 340 346 struct io_ring_ctx *ctx = req->ctx; 341 347 struct io_buffer_list *bl; ··· 355 353 ret = io_ring_buffers_peek(req, arg, bl); 356 354 if (ret > 0) 357 355 req->flags |= REQ_F_BUFFERS_COMMIT; 356 + sel->buf_list = bl; 358 357 return ret; 359 358 } 360 359 361 360 /* don't support multiple buffer selections for legacy */ 361 + sel->buf_list = NULL; 362 362 return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); 363 363 } 364 364 365 - static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) 365 + static inline bool __io_put_kbuf_ring(struct io_kiocb *req, 366 + struct io_buffer_list *bl, int len, int nr) 366 367 { 367 - struct io_buffer_list *bl = req->buf_list; 368 368 bool ret = true; 369 369 370 370 if (bl) ··· 376 372 return ret; 377 373 } 378 374 379 - unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int 
nbufs) 375 + unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, 376 + int len, int nbufs) 380 377 { 381 378 unsigned int ret; 382 379 ··· 388 383 return ret; 389 384 } 390 385 391 - if (!__io_put_kbuf_ring(req, len, nbufs)) 386 + if (!__io_put_kbuf_ring(req, bl, len, nbufs)) 392 387 ret |= IORING_CQE_F_BUF_MORE; 393 388 return ret; 394 389 }
+18 -21
io_uring/kbuf.h
··· 62 62 unsigned short partial_map; 63 63 }; 64 64 65 - void __user *io_buffer_select(struct io_kiocb *req, size_t *len, 66 - unsigned buf_group, unsigned int issue_flags); 65 + struct io_br_sel io_buffer_select(struct io_kiocb *req, size_t *len, 66 + unsigned buf_group, unsigned int issue_flags); 67 67 int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, 68 - unsigned int issue_flags); 69 - int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); 68 + struct io_br_sel *sel, unsigned int issue_flags); 69 + int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, 70 + struct io_br_sel *sel); 70 71 void io_destroy_buffers(struct io_ring_ctx *ctx); 71 72 72 73 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); ··· 81 80 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); 82 81 void io_kbuf_drop_legacy(struct io_kiocb *req); 83 82 84 - unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); 83 + unsigned int __io_put_kbufs(struct io_kiocb *req, struct io_buffer_list *bl, 84 + int len, int nbufs); 85 85 bool io_kbuf_commit(struct io_kiocb *req, 86 86 struct io_buffer_list *bl, int len, int nr); 87 87 88 88 struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, 89 89 unsigned int bgid); 90 90 91 - static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) 91 + static inline bool io_kbuf_recycle_ring(struct io_kiocb *req, 92 + struct io_buffer_list *bl) 92 93 { 93 - /* 94 - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear 95 - * the flag and hence ensure that bl->head doesn't get incremented. 96 - * If the tail has already been incremented, hang on to it. 97 - * The exception is partial io, that case we should increment bl->head 98 - * to monopolize the buffer. 
99 - */ 100 - if (req->buf_list) { 94 + if (bl) { 101 95 req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); 102 96 return true; 103 97 } ··· 106 110 return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); 107 111 } 108 112 109 - static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) 113 + static inline bool io_kbuf_recycle(struct io_kiocb *req, struct io_buffer_list *bl, 114 + unsigned issue_flags) 110 115 { 111 116 if (req->flags & REQ_F_BL_NO_RECYCLE) 112 117 return false; 118 + if (req->flags & REQ_F_BUFFER_RING) 119 + return io_kbuf_recycle_ring(req, bl); 113 120 if (req->flags & REQ_F_BUFFER_SELECTED) 114 121 return io_kbuf_recycle_legacy(req, issue_flags); 115 - if (req->flags & REQ_F_BUFFER_RING) 116 - return io_kbuf_recycle_ring(req); 117 122 return false; 118 123 } 119 124 120 125 static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, 121 - unsigned issue_flags) 126 + struct io_buffer_list *bl) 122 127 { 123 128 if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 124 129 return 0; 125 - return __io_put_kbufs(req, len, 1); 130 + return __io_put_kbufs(req, bl, len, 1); 126 131 } 127 132 128 133 static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, 129 - int nbufs, unsigned issue_flags) 134 + struct io_buffer_list *bl, int nbufs) 130 135 { 131 136 if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) 132 137 return 0; 133 - return __io_put_kbufs(req, len, nbufs); 138 + return __io_put_kbufs(req, bl, len, nbufs); 134 139 } 135 140 #endif
+75 -85
io_uring/net.c
··· 10 10 11 11 #include <uapi/linux/io_uring.h> 12 12 13 + #include "filetable.h" 13 14 #include "io_uring.h" 14 15 #include "kbuf.h" 15 16 #include "alloc_cache.h" ··· 179 178 if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP) 180 179 io_vec_free(&hdr->vec); 181 180 182 - if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { 183 - req->async_data = NULL; 184 - req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); 185 - } 181 + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) 182 + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); 186 183 } 187 184 188 185 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) ··· 432 433 if (req->opcode == IORING_OP_SENDMSG) 433 434 return -EINVAL; 434 435 sr->msg_flags |= MSG_WAITALL; 435 - req->buf_list = NULL; 436 436 req->flags |= REQ_F_MULTISHOT; 437 437 } 438 438 ··· 492 494 return nbufs; 493 495 } 494 496 495 - static int io_net_kbuf_recyle(struct io_kiocb *req, 497 + static int io_net_kbuf_recyle(struct io_kiocb *req, struct io_buffer_list *bl, 496 498 struct io_async_msghdr *kmsg, int len) 497 499 { 498 500 req->flags |= REQ_F_BL_NO_RECYCLE; 499 501 if (req->flags & REQ_F_BUFFERS_COMMIT) 500 - io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len)); 502 + io_kbuf_commit(req, bl, len, io_bundle_nbufs(kmsg, len)); 501 503 return IOU_RETRY; 502 504 } 503 505 504 - static inline bool io_send_finish(struct io_kiocb *req, int *ret, 506 + static inline bool io_send_finish(struct io_kiocb *req, 505 507 struct io_async_msghdr *kmsg, 506 - unsigned issue_flags) 508 + struct io_br_sel *sel) 507 509 { 508 510 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 509 - bool bundle_finished = *ret <= 0; 511 + bool bundle_finished = sel->val <= 0; 510 512 unsigned int cflags; 511 513 512 514 if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { 513 - cflags = io_put_kbuf(req, *ret, issue_flags); 515 + cflags = io_put_kbuf(req, sel->val, sel->buf_list); 514 516 goto finish; 515 517 } 516 518 517 - cflags = 
io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags); 519 + cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); 518 520 519 521 if (bundle_finished || req->flags & REQ_F_BL_EMPTY) 520 522 goto finish; ··· 523 525 * Fill CQE for this receive and see if we should keep trying to 524 526 * receive from this socket. 525 527 */ 526 - if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 528 + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { 527 529 io_mshot_prep_retry(req, kmsg); 528 530 return false; 529 531 } 530 532 531 533 /* Otherwise stop bundle and use the current result. */ 532 534 finish: 533 - io_req_set_res(req, *ret, cflags); 534 - *ret = IOU_COMPLETE; 535 + io_req_set_res(req, sel->val, cflags); 536 + sel->val = IOU_COMPLETE; 535 537 return true; 536 538 } 537 539 ··· 569 571 kmsg->msg.msg_controllen = 0; 570 572 kmsg->msg.msg_control = NULL; 571 573 sr->done_io += ret; 572 - return io_net_kbuf_recyle(req, kmsg, ret); 574 + return -EAGAIN; 573 575 } 574 576 if (ret == -ERESTARTSYS) 575 577 ret = -EINTR; ··· 585 587 } 586 588 587 589 static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, 588 - struct io_async_msghdr *kmsg) 590 + struct io_br_sel *sel, struct io_async_msghdr *kmsg) 589 591 { 590 592 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 591 - 592 - int ret; 593 593 struct buf_sel_arg arg = { 594 594 .iovs = &kmsg->fast_iov, 595 595 .max_len = min_not_zero(sr->len, INT_MAX), 596 596 .nr_iovs = 1, 597 597 .buf_group = sr->buf_group, 598 598 }; 599 + int ret; 599 600 600 601 if (kmsg->vec.iovec) { 601 602 arg.nr_iovs = kmsg->vec.nr; ··· 607 610 else 608 611 arg.mode |= KBUF_MODE_EXPAND; 609 612 610 - ret = io_buffers_select(req, &arg, issue_flags); 613 + ret = io_buffers_select(req, &arg, sel, issue_flags); 611 614 if (unlikely(ret < 0)) 612 615 return ret; 613 616 ··· 636 639 { 637 640 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct 
io_sr_msg); 638 641 struct io_async_msghdr *kmsg = req->async_data; 642 + struct io_br_sel sel = { }; 639 643 struct socket *sock; 640 644 unsigned flags; 641 645 int min_ret = 0; ··· 655 657 flags |= MSG_DONTWAIT; 656 658 657 659 retry_bundle: 660 + sel.buf_list = NULL; 658 661 if (io_do_buffer_select(req)) { 659 - ret = io_send_select_buffer(req, issue_flags, kmsg); 662 + ret = io_send_select_buffer(req, issue_flags, &sel, kmsg); 660 663 if (ret) 661 664 return ret; 662 665 } ··· 681 682 sr->len -= ret; 682 683 sr->buf += ret; 683 684 sr->done_io += ret; 684 - return io_net_kbuf_recyle(req, kmsg, ret); 685 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 685 686 } 686 687 if (ret == -ERESTARTSYS) 687 688 ret = -EINTR; ··· 692 693 else if (sr->done_io) 693 694 ret = sr->done_io; 694 695 695 - if (!io_send_finish(req, &ret, kmsg, issue_flags)) 696 + sel.val = ret; 697 + if (!io_send_finish(req, kmsg, &sel)) 696 698 goto retry_bundle; 697 699 698 700 io_req_msg_cleanup(req, issue_flags); 699 - return ret; 701 + return sel.val; 700 702 } 701 703 702 704 static int io_recvmsg_mshot_prep(struct io_kiocb *req, ··· 794 794 req->flags |= REQ_F_NOWAIT; 795 795 if (sr->msg_flags & MSG_ERRQUEUE) 796 796 req->flags |= REQ_F_CLEAR_POLLIN; 797 - if (req->flags & REQ_F_BUFFER_SELECT) { 798 - /* 799 - * Store the buffer group for this multishot receive separately, 800 - * as if we end up doing an io-wq based issue that selects a 801 - * buffer, it has to be committed immediately and that will 802 - * clear ->buf_list. This means we lose the link to the buffer 803 - * list, and the eventual buffer put on completion then cannot 804 - * restore it. 
805 - */ 797 + if (req->flags & REQ_F_BUFFER_SELECT) 806 798 sr->buf_group = req->buf_index; 807 - req->buf_list = NULL; 808 - } 809 799 sr->mshot_total_len = sr->mshot_len = 0; 810 800 if (sr->flags & IORING_RECV_MULTISHOT) { 811 801 if (!(req->flags & REQ_F_BUFFER_SELECT)) ··· 836 846 * Returns true if it is actually finished, or false if it should run 837 847 * again (for multishot). 838 848 */ 839 - static inline bool io_recv_finish(struct io_kiocb *req, int *ret, 849 + static inline bool io_recv_finish(struct io_kiocb *req, 840 850 struct io_async_msghdr *kmsg, 841 - bool mshot_finished, unsigned issue_flags) 851 + struct io_br_sel *sel, bool mshot_finished, 852 + unsigned issue_flags) 842 853 { 843 854 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 844 855 unsigned int cflags = 0; ··· 847 856 if (kmsg->msg.msg_inq > 0) 848 857 cflags |= IORING_CQE_F_SOCK_NONEMPTY; 849 858 850 - if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { 859 + if (sel->val > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { 851 860 /* 852 861 * If sr->len hits zero, the limit has been reached. Mark 853 862 * mshot as finished, and flag MSHOT_DONE as well to prevent 854 863 * a potential bundle from being retried. 
855 864 */ 856 - sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len); 865 + sr->mshot_total_len -= min_t(int, sel->val, sr->mshot_total_len); 857 866 if (!sr->mshot_total_len) { 858 867 sr->flags |= IORING_RECV_MSHOT_DONE; 859 868 mshot_finished = true; ··· 861 870 } 862 871 863 872 if (sr->flags & IORING_RECVSEND_BUNDLE) { 864 - size_t this_ret = *ret - sr->done_io; 873 + size_t this_ret = sel->val - sr->done_io; 865 874 866 - cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), 867 - issue_flags); 875 + cflags |= io_put_kbufs(req, this_ret, sel->buf_list, io_bundle_nbufs(kmsg, this_ret)); 868 876 if (sr->flags & IORING_RECV_RETRY) 869 877 cflags = req->cqe.flags | (cflags & CQE_F_MASK); 870 - if (sr->mshot_len && *ret >= sr->mshot_len) 878 + if (sr->mshot_len && sel->val >= sr->mshot_len) 871 879 sr->flags |= IORING_RECV_MSHOT_CAP; 872 880 /* bundle with no more immediate buffers, we're done */ 873 881 if (req->flags & REQ_F_BL_EMPTY) ··· 885 895 return false; 886 896 } 887 897 } else { 888 - cflags |= io_put_kbuf(req, *ret, issue_flags); 898 + cflags |= io_put_kbuf(req, sel->val, sel->buf_list); 889 899 } 890 900 891 901 /* ··· 893 903 * receive from this socket. 894 904 */ 895 905 if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && 896 - io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { 897 - *ret = IOU_RETRY; 906 + io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) { 907 + sel->val = IOU_RETRY; 898 908 io_mshot_prep_retry(req, kmsg); 899 909 /* Known not-empty or unknown state, retry */ 900 910 if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { ··· 906 916 sr->nr_multishot_loops = 0; 907 917 sr->flags &= ~IORING_RECV_MSHOT_CAP; 908 918 if (issue_flags & IO_URING_F_MULTISHOT) 909 - *ret = IOU_REQUEUE; 919 + sel->val = IOU_REQUEUE; 910 920 } 911 921 return true; 912 922 } 913 923 914 924 /* Finish the request / stop multishot. 
*/ 915 925 finish: 916 - io_req_set_res(req, *ret, cflags); 917 - *ret = IOU_COMPLETE; 926 + io_req_set_res(req, sel->val, cflags); 927 + sel->val = IOU_COMPLETE; 918 928 io_req_msg_cleanup(req, issue_flags); 919 929 return true; 920 930 } ··· 1007 1017 { 1008 1018 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1009 1019 struct io_async_msghdr *kmsg = req->async_data; 1020 + struct io_br_sel sel = { }; 1010 1021 struct socket *sock; 1011 1022 unsigned flags; 1012 1023 int ret, min_ret = 0; ··· 1027 1036 flags |= MSG_DONTWAIT; 1028 1037 1029 1038 retry_multishot: 1039 + sel.buf_list = NULL; 1030 1040 if (io_do_buffer_select(req)) { 1031 - void __user *buf; 1032 1041 size_t len = sr->len; 1033 1042 1034 - buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); 1035 - if (!buf) 1043 + sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); 1044 + if (!sel.addr) 1036 1045 return -ENOBUFS; 1037 1046 1038 1047 if (req->flags & REQ_F_APOLL_MULTISHOT) { 1039 - ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); 1048 + ret = io_recvmsg_prep_multishot(kmsg, sr, &sel.addr, &len); 1040 1049 if (ret) { 1041 - io_kbuf_recycle(req, issue_flags); 1050 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1042 1051 return ret; 1043 1052 } 1044 1053 } 1045 1054 1046 - iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, buf, len); 1055 + iov_iter_ubuf(&kmsg->msg.msg_iter, ITER_DEST, sel.addr, len); 1047 1056 } 1048 1057 1049 1058 kmsg->msg.msg_get_inq = 1; ··· 1062 1071 1063 1072 if (ret < min_ret) { 1064 1073 if (ret == -EAGAIN && force_nonblock) { 1065 - if (issue_flags & IO_URING_F_MULTISHOT) 1066 - io_kbuf_recycle(req, issue_flags); 1067 - 1074 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1068 1075 return IOU_RETRY; 1069 1076 } 1070 1077 if (ret > 0 && io_net_retry(sock, flags)) { 1071 1078 sr->done_io += ret; 1072 - return io_net_kbuf_recyle(req, kmsg, ret); 1079 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 1073 1080 } 1074 1081 if 
(ret == -ERESTARTSYS) 1075 1082 ret = -EINTR; ··· 1081 1092 else if (sr->done_io) 1082 1093 ret = sr->done_io; 1083 1094 else 1084 - io_kbuf_recycle(req, issue_flags); 1095 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1085 1096 1086 - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) 1097 + sel.val = ret; 1098 + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) 1087 1099 goto retry_multishot; 1088 1100 1089 - return ret; 1101 + return sel.val; 1090 1102 } 1091 1103 1092 1104 static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, 1093 - size_t *len, unsigned int issue_flags) 1105 + struct io_br_sel *sel, unsigned int issue_flags) 1094 1106 { 1095 1107 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1096 1108 int ret; ··· 1116 1126 arg.mode |= KBUF_MODE_FREE; 1117 1127 } 1118 1128 1119 - if (*len) 1120 - arg.max_len = *len; 1129 + if (sel->val) 1130 + arg.max_len = sel->val; 1121 1131 else if (kmsg->msg.msg_inq > 1) 1122 - arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq); 1132 + arg.max_len = min_not_zero(sel->val, (ssize_t) kmsg->msg.msg_inq); 1123 1133 1124 1134 /* if mshot limited, ensure we don't go over */ 1125 1135 if (sr->flags & IORING_RECV_MSHOT_LIM) 1126 1136 arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len); 1127 - ret = io_buffers_peek(req, &arg); 1137 + ret = io_buffers_peek(req, &arg, sel); 1128 1138 if (unlikely(ret < 0)) 1129 1139 return ret; 1130 1140 ··· 1145 1155 iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, 1146 1156 arg.out_len); 1147 1157 } else { 1148 - void __user *buf; 1158 + size_t len = sel->val; 1149 1159 1150 - *len = sr->len; 1151 - buf = io_buffer_select(req, len, sr->buf_group, issue_flags); 1152 - if (!buf) 1160 + *sel = io_buffer_select(req, &len, sr->buf_group, issue_flags); 1161 + if (!sel->addr) 1153 1162 return -ENOBUFS; 1154 - sr->buf = buf; 1155 - sr->len = *len; 1163 + sr->buf = sel->addr; 1164 + sr->len = 
len; 1156 1165 map_ubuf: 1157 1166 ret = import_ubuf(ITER_DEST, sr->buf, sr->len, 1158 1167 &kmsg->msg.msg_iter); ··· 1166 1177 { 1167 1178 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1168 1179 struct io_async_msghdr *kmsg = req->async_data; 1180 + struct io_br_sel sel; 1169 1181 struct socket *sock; 1170 1182 unsigned flags; 1171 1183 int ret, min_ret = 0; 1172 1184 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 1173 - size_t len = sr->len; 1174 1185 bool mshot_finished; 1175 1186 1176 1187 if (!(req->flags & REQ_F_POLLED) && ··· 1186 1197 flags |= MSG_DONTWAIT; 1187 1198 1188 1199 retry_multishot: 1200 + sel.buf_list = NULL; 1189 1201 if (io_do_buffer_select(req)) { 1190 - ret = io_recv_buf_select(req, kmsg, &len, issue_flags); 1191 - if (unlikely(ret)) { 1202 + sel.val = sr->len; 1203 + ret = io_recv_buf_select(req, kmsg, &sel, issue_flags); 1204 + if (unlikely(ret < 0)) { 1192 1205 kmsg->msg.msg_inq = -1; 1193 1206 goto out_free; 1194 1207 } ··· 1206 1215 ret = sock_recvmsg(sock, &kmsg->msg, flags); 1207 1216 if (ret < min_ret) { 1208 1217 if (ret == -EAGAIN && force_nonblock) { 1209 - if (issue_flags & IO_URING_F_MULTISHOT) 1210 - io_kbuf_recycle(req, issue_flags); 1211 - 1218 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1212 1219 return IOU_RETRY; 1213 1220 } 1214 1221 if (ret > 0 && io_net_retry(sock, flags)) { 1215 1222 sr->len -= ret; 1216 1223 sr->buf += ret; 1217 1224 sr->done_io += ret; 1218 - return io_net_kbuf_recyle(req, kmsg, ret); 1225 + return io_net_kbuf_recyle(req, sel.buf_list, kmsg, ret); 1219 1226 } 1220 1227 if (ret == -ERESTARTSYS) 1221 1228 ret = -EINTR; ··· 1229 1240 else if (sr->done_io) 1230 1241 ret = sr->done_io; 1231 1242 else 1232 - io_kbuf_recycle(req, issue_flags); 1243 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1233 1244 1234 - if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) 1245 + sel.val = ret; 1246 + if (!io_recv_finish(req, kmsg, &sel, mshot_finished, issue_flags)) 
1235 1247 goto retry_multishot; 1236 1248 1237 - return ret; 1249 + return sel.val; 1238 1250 } 1239 1251 1240 1252 int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ··· 1495 1505 zc->len -= ret; 1496 1506 zc->buf += ret; 1497 1507 zc->done_io += ret; 1498 - return io_net_kbuf_recyle(req, kmsg, ret); 1508 + return -EAGAIN; 1499 1509 } 1500 1510 if (ret == -ERESTARTSYS) 1501 1511 ret = -EINTR; ··· 1565 1575 1566 1576 if (ret > 0 && io_net_retry(sock, flags)) { 1567 1577 sr->done_io += ret; 1568 - return io_net_kbuf_recyle(req, kmsg, ret); 1578 + return -EAGAIN; 1569 1579 } 1570 1580 if (ret == -ERESTARTSYS) 1571 1581 ret = -EINTR;
+15 -2
io_uring/nop.c
··· 17 17 int result; 18 18 int fd; 19 19 unsigned int flags; 20 + __u64 extra1; 21 + __u64 extra2; 20 22 }; 21 23 22 24 #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ 23 25 IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ 24 - IORING_NOP_TW) 26 + IORING_NOP_TW | IORING_NOP_CQE32) 25 27 26 28 int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 27 29 { ··· 43 41 nop->fd = -1; 44 42 if (nop->flags & IORING_NOP_FIXED_BUFFER) 45 43 req->buf_index = READ_ONCE(sqe->buf_index); 44 + if (nop->flags & IORING_NOP_CQE32) { 45 + struct io_ring_ctx *ctx = req->ctx; 46 + 47 + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 48 + return -EINVAL; 49 + nop->extra1 = READ_ONCE(sqe->off); 50 + nop->extra2 = READ_ONCE(sqe->addr); 51 + } 46 52 return 0; 47 53 } 48 54 ··· 78 68 done: 79 69 if (ret < 0) 80 70 req_set_fail(req); 81 - io_req_set_res(req, nop->result, 0); 71 + if (nop->flags & IORING_NOP_CQE32) 72 + io_req_set_res32(req, nop->result, 0, nop->extra1, nop->extra2); 73 + else 74 + io_req_set_res(req, nop->result, 0); 82 75 if (nop->flags & IORING_NOP_TW) { 83 76 req->io_task_work.func = io_req_task_complete; 84 77 io_req_task_work_add(req);
+5
io_uring/notif.c
··· 14 14 static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) 15 15 { 16 16 struct io_notif_data *nd = io_notif_to_data(notif); 17 + struct io_ring_ctx *ctx = notif->ctx; 18 + 19 + lockdep_assert_held(&ctx->uring_lock); 17 20 18 21 do { 19 22 notif = cmd_to_io_kiocb(nd); 20 23 24 + if (WARN_ON_ONCE(ctx != notif->ctx)) 25 + return; 21 26 lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); 22 27 23 28 if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used))
+1
io_uring/opdef.c
··· 413 413 #endif 414 414 }, 415 415 [IORING_OP_URING_CMD] = { 416 + .buffer_select = 1, 416 417 .needs_file = 1, 417 418 .plug = 1, 418 419 .iopoll = 1,
+1
io_uring/openclose.c
··· 14 14 15 15 #include "../fs/internal.h" 16 16 17 + #include "filetable.h" 17 18 #include "io_uring.h" 18 19 #include "rsrc.h" 19 20 #include "openclose.h"
-4
io_uring/poll.c
··· 316 316 317 317 ret = io_poll_check_events(req, tw); 318 318 if (ret == IOU_POLL_NO_ACTION) { 319 - io_kbuf_recycle(req, 0); 320 319 return; 321 320 } else if (ret == IOU_POLL_REQUEUE) { 322 - io_kbuf_recycle(req, 0); 323 321 __io_poll_execute(req, 0); 324 322 return; 325 323 } ··· 683 685 req->flags &= ~(REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL); 684 686 req->flags |= REQ_F_POLLED; 685 687 ipt.pt._qproc = io_async_queue_proc; 686 - 687 - io_kbuf_recycle(req, issue_flags); 688 688 689 689 ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); 690 690 if (ret)
+101
io_uring/query.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include "linux/io_uring/query.h" 4 + 5 + #include "query.h" 6 + #include "io_uring.h" 7 + 8 + #define IO_MAX_QUERY_SIZE (sizeof(struct io_uring_query_opcode)) 9 + #define IO_MAX_QUERY_ENTRIES 1000 10 + 11 + static ssize_t io_query_ops(void *data) 12 + { 13 + struct io_uring_query_opcode *e = data; 14 + 15 + BUILD_BUG_ON(sizeof(*e) > IO_MAX_QUERY_SIZE); 16 + 17 + e->nr_request_opcodes = IORING_OP_LAST; 18 + e->nr_register_opcodes = IORING_REGISTER_LAST; 19 + e->feature_flags = IORING_FEAT_FLAGS; 20 + e->ring_setup_flags = IORING_SETUP_FLAGS; 21 + e->enter_flags = IORING_ENTER_FLAGS; 22 + e->sqe_flags = SQE_VALID_FLAGS; 23 + return sizeof(*e); 24 + } 25 + 26 + static int io_handle_query_entry(struct io_ring_ctx *ctx, 27 + void *data, void __user *uhdr, 28 + u64 *next_entry) 29 + { 30 + struct io_uring_query_hdr hdr; 31 + size_t usize, res_size = 0; 32 + ssize_t ret = -EINVAL; 33 + void __user *udata; 34 + 35 + if (copy_from_user(&hdr, uhdr, sizeof(hdr))) 36 + return -EFAULT; 37 + usize = hdr.size; 38 + hdr.size = min(hdr.size, IO_MAX_QUERY_SIZE); 39 + udata = u64_to_user_ptr(hdr.query_data); 40 + 41 + if (hdr.query_op >= __IO_URING_QUERY_MAX) { 42 + ret = -EOPNOTSUPP; 43 + goto out; 44 + } 45 + if (!mem_is_zero(hdr.__resv, sizeof(hdr.__resv)) || hdr.result || !hdr.size) 46 + goto out; 47 + if (copy_from_user(data, udata, hdr.size)) 48 + return -EFAULT; 49 + 50 + switch (hdr.query_op) { 51 + case IO_URING_QUERY_OPCODES: 52 + ret = io_query_ops(data); 53 + break; 54 + } 55 + 56 + if (ret >= 0) { 57 + if (WARN_ON_ONCE(ret > IO_MAX_QUERY_SIZE)) 58 + return -EFAULT; 59 + res_size = ret; 60 + ret = 0; 61 + } 62 + out: 63 + hdr.result = ret; 64 + hdr.size = min_t(size_t, usize, res_size); 65 + 66 + if (copy_struct_to_user(udata, usize, data, hdr.size, NULL)) 67 + return -EFAULT; 68 + if (copy_to_user(uhdr, &hdr, sizeof(hdr))) 69 + return -EFAULT; 70 + *next_entry = hdr.next_entry; 71 + return 0; 72 + } 73 + 74 + int 
io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) 75 + { 76 + char entry_buffer[IO_MAX_QUERY_SIZE]; 77 + void __user *uhdr = arg; 78 + int ret, nr = 0; 79 + 80 + memset(entry_buffer, 0, sizeof(entry_buffer)); 81 + 82 + if (nr_args) 83 + return -EINVAL; 84 + 85 + while (uhdr) { 86 + u64 next_hdr; 87 + 88 + ret = io_handle_query_entry(ctx, entry_buffer, uhdr, &next_hdr); 89 + if (ret) 90 + return ret; 91 + uhdr = u64_to_user_ptr(next_hdr); 92 + 93 + /* Have some limit to avoid a potential cycle */ 94 + if (++nr >= IO_MAX_QUERY_ENTRIES) 95 + return -ERANGE; 96 + if (fatal_signal_pending(current)) 97 + return -EINTR; 98 + cond_resched(); 99 + } 100 + return 0; 101 + }
+9
io_uring/query.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IORING_QUERY_H 3 + #define IORING_QUERY_H 4 + 5 + #include <linux/io_uring_types.h> 6 + 7 + int io_query(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); 8 + 9 + #endif
+34 -26
io_uring/register.c
··· 18 18 #include <linux/io_uring.h> 19 19 #include <linux/io_uring_types.h> 20 20 21 + #include "filetable.h" 21 22 #include "io_uring.h" 22 23 #include "opdef.h" 23 24 #include "tctx.h" ··· 32 31 #include "msg_ring.h" 33 32 #include "memmap.h" 34 33 #include "zcrx.h" 34 + #include "query.h" 35 35 36 36 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ 37 37 IORING_REGISTER_LAST + IORING_OP_LAST) ··· 48 46 nr_args = IORING_OP_LAST; 49 47 50 48 size = struct_size(p, ops, nr_args); 51 - p = kzalloc(size, GFP_KERNEL); 52 - if (!p) 53 - return -ENOMEM; 54 - 55 - ret = -EFAULT; 56 - if (copy_from_user(p, arg, size)) 57 - goto out; 49 + p = memdup_user(arg, size); 50 + if (IS_ERR(p)) 51 + return PTR_ERR(p); 58 52 ret = -EINVAL; 59 53 if (memchr_inv(p, 0, size)) 60 54 goto out; ··· 394 396 395 397 #define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) 396 398 #define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ 397 - IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) 399 + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP | \ 400 + IORING_SETUP_CQE_MIXED) 398 401 399 402 static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) 400 403 { ··· 406 407 struct io_uring_params p; 407 408 int ret; 408 409 409 - /* for single issuer, must be owner resizing */ 410 - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && 411 - current != ctx->submitter_task) 412 - return -EEXIST; 413 410 /* limited to DEFER_TASKRUN for now */ 414 411 if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 415 412 return -EINVAL; ··· 830 835 break; 831 836 ret = io_register_mem_region(ctx, arg); 832 837 break; 838 + case IORING_REGISTER_QUERY: 839 + ret = io_query(ctx, arg, nr_args); 840 + break; 841 + case IORING_REGISTER_ZCRX_REFILL: 842 + ret = io_zcrx_return_bufs(ctx, arg, nr_args); 843 + break; 833 844 default: 834 845 ret = -EINVAL; 835 846 break; ··· 878 877 return ERR_PTR(-EOPNOTSUPP); 879 878 } 880 879 880 + static int io_uring_register_send_msg_ring(void __user 
*arg, unsigned int nr_args) 881 + { 882 + struct io_uring_sqe sqe; 883 + 884 + if (!arg || nr_args != 1) 885 + return -EINVAL; 886 + if (copy_from_user(&sqe, arg, sizeof(sqe))) 887 + return -EFAULT; 888 + /* no flags supported */ 889 + if (sqe.flags) 890 + return -EINVAL; 891 + if (sqe.opcode != IORING_OP_MSG_RING) 892 + return -EINVAL; 893 + 894 + return io_uring_sync_msg_ring(&sqe); 895 + } 896 + 881 897 /* 882 898 * "blind" registration opcodes are ones where there's no ring given, and 883 899 * hence the source fd must be -1. ··· 903 885 unsigned int nr_args) 904 886 { 905 887 switch (opcode) { 906 - case IORING_REGISTER_SEND_MSG_RING: { 907 - struct io_uring_sqe sqe; 908 - 909 - if (!arg || nr_args != 1) 910 - return -EINVAL; 911 - if (copy_from_user(&sqe, arg, sizeof(sqe))) 912 - return -EFAULT; 913 - /* no flags supported */ 914 - if (sqe.flags) 915 - return -EINVAL; 916 - if (sqe.opcode == IORING_OP_MSG_RING) 917 - return io_uring_sync_msg_ring(&sqe); 918 - } 888 + case IORING_REGISTER_SEND_MSG_RING: 889 + return io_uring_register_send_msg_ring(arg, nr_args); 890 + case IORING_REGISTER_QUERY: 891 + return io_query(NULL, arg, nr_args); 919 892 } 920 - 921 893 return -EINVAL; 922 894 } 923 895
+8
io_uring/rsrc.c
··· 13 13 14 14 #include <uapi/linux/io_uring.h> 15 15 16 + #include "filetable.h" 16 17 #include "io_uring.h" 17 18 #include "openclose.h" 18 19 #include "rsrc.h" ··· 1300 1299 if (src_ctx != ctx) { 1301 1300 mutex_unlock(&ctx->uring_lock); 1302 1301 lock_two_rings(ctx, src_ctx); 1302 + 1303 + if (src_ctx->submitter_task && 1304 + src_ctx->submitter_task != current) { 1305 + ret = -EEXIST; 1306 + goto out; 1307 + } 1303 1308 } 1304 1309 1305 1310 ret = io_clone_buffers(ctx, src_ctx, &buf); 1306 1311 1312 + out: 1307 1313 if (src_ctx != ctx) 1308 1314 mutex_unlock(&src_ctx->uring_lock); 1309 1315
+35 -28
io_uring/rw.c
··· 15 15 16 16 #include <uapi/linux/io_uring.h> 17 17 18 + #include "filetable.h" 18 19 #include "io_uring.h" 19 20 #include "opdef.h" 20 21 #include "kbuf.h" ··· 108 107 } 109 108 110 109 static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, 111 - struct io_async_rw *io, 112 - unsigned int issue_flags) 110 + struct io_async_rw *io, struct io_br_sel *sel, 111 + unsigned int issue_flags) 113 112 { 114 113 const struct io_issue_def *def = &io_issue_defs[req->opcode]; 115 114 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 116 - void __user *buf = u64_to_user_ptr(rw->addr); 117 115 size_t sqe_len = rw->len; 118 116 117 + sel->addr = u64_to_user_ptr(rw->addr); 119 118 if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) 120 - return io_import_vec(ddir, req, io, buf, sqe_len); 119 + return io_import_vec(ddir, req, io, sel->addr, sqe_len); 121 120 122 121 if (io_do_buffer_select(req)) { 123 - buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); 124 - if (!buf) 122 + *sel = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); 123 + if (!sel->addr) 125 124 return -ENOBUFS; 126 - rw->addr = (unsigned long) buf; 125 + rw->addr = (unsigned long) sel->addr; 127 126 rw->len = sqe_len; 128 127 } 129 - return import_ubuf(ddir, buf, sqe_len, &io->iter); 128 + return import_ubuf(ddir, sel->addr, sqe_len, &io->iter); 130 129 } 131 130 132 131 static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, 133 132 struct io_async_rw *io, 133 + struct io_br_sel *sel, 134 134 unsigned int issue_flags) 135 135 { 136 136 int ret; 137 137 138 - ret = __io_import_rw_buffer(rw, req, io, issue_flags); 138 + ret = __io_import_rw_buffer(rw, req, io, sel, issue_flags); 139 139 if (unlikely(ret < 0)) 140 140 return ret; 141 141 ··· 155 153 if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) 156 154 io_vec_free(&rw->vec); 157 155 158 - if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { 159 - req->async_data = NULL; 160 - req->flags &= ~REQ_F_ASYNC_DATA; 161 - 
} 156 + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) 157 + io_req_async_data_clear(req, 0); 162 158 } 163 159 164 160 static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) ··· 306 306 307 307 static int io_rw_do_import(struct io_kiocb *req, int ddir) 308 308 { 309 + struct io_br_sel sel = { }; 310 + 309 311 if (io_do_buffer_select(req)) 310 312 return 0; 311 313 312 - return io_import_rw_buffer(ddir, req, req->async_data, 0); 314 + return io_import_rw_buffer(ddir, req, req->async_data, &sel, 0); 313 315 } 314 316 315 317 static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, ··· 578 576 io_req_io_end(req); 579 577 580 578 if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) 581 - req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); 579 + req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL); 582 580 583 581 io_req_rw_cleanup(req, 0); 584 582 io_req_task_complete(req, tw); ··· 647 645 } 648 646 649 647 static int kiocb_done(struct io_kiocb *req, ssize_t ret, 650 - unsigned int issue_flags) 648 + struct io_br_sel *sel, unsigned int issue_flags) 651 649 { 652 650 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 653 651 unsigned final_ret = io_fixup_rw_res(req, ret); ··· 661 659 * from the submission path. 
662 660 */ 663 661 io_req_io_end(req); 664 - io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); 662 + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, sel->buf_list)); 665 663 io_req_rw_cleanup(req, issue_flags); 666 664 return IOU_COMPLETE; 667 665 } else { ··· 904 902 return 0; 905 903 } 906 904 907 - static int __io_read(struct io_kiocb *req, unsigned int issue_flags) 905 + static int __io_read(struct io_kiocb *req, struct io_br_sel *sel, 906 + unsigned int issue_flags) 908 907 { 909 908 bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; 910 909 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); ··· 919 916 if (unlikely(ret)) 920 917 return ret; 921 918 } else if (io_do_buffer_select(req)) { 922 - ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); 919 + ret = io_import_rw_buffer(ITER_DEST, req, io, sel, issue_flags); 923 920 if (unlikely(ret < 0)) 924 921 return ret; 925 922 } ··· 1021 1018 1022 1019 int io_read(struct io_kiocb *req, unsigned int issue_flags) 1023 1020 { 1021 + struct io_br_sel sel = { }; 1024 1022 int ret; 1025 1023 1026 - ret = __io_read(req, issue_flags); 1024 + ret = __io_read(req, &sel, issue_flags); 1027 1025 if (ret >= 0) 1028 - return kiocb_done(req, ret, issue_flags); 1026 + return kiocb_done(req, ret, &sel, issue_flags); 1029 1027 1028 + if (req->flags & REQ_F_BUFFERS_COMMIT) 1029 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1030 1030 return ret; 1031 1031 } 1032 1032 1033 1033 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) 1034 1034 { 1035 1035 struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); 1036 + struct io_br_sel sel = { }; 1036 1037 unsigned int cflags = 0; 1037 1038 int ret; 1038 1039 ··· 1048 1041 1049 1042 /* make it sync, multishot doesn't support async execution */ 1050 1043 rw->kiocb.ki_complete = NULL; 1051 - ret = __io_read(req, issue_flags); 1044 + ret = __io_read(req, &sel, issue_flags); 1052 1045 1053 1046 /* 1054 1047 * If we get -EAGAIN, recycle 
our buffer and just let normal poll ··· 1059 1052 * Reset rw->len to 0 again to avoid clamping future mshot 1060 1053 * reads, in case the buffer size varies. 1061 1054 */ 1062 - if (io_kbuf_recycle(req, issue_flags)) 1055 + if (io_kbuf_recycle(req, sel.buf_list, issue_flags)) 1063 1056 rw->len = 0; 1064 1057 return IOU_RETRY; 1065 1058 } else if (ret <= 0) { 1066 - io_kbuf_recycle(req, issue_flags); 1059 + io_kbuf_recycle(req, sel.buf_list, issue_flags); 1067 1060 if (ret < 0) 1068 1061 req_set_fail(req); 1069 1062 } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { 1070 - cflags = io_put_kbuf(req, ret, issue_flags); 1063 + cflags = io_put_kbuf(req, ret, sel.buf_list); 1071 1064 } else { 1072 1065 /* 1073 1066 * Any successful return value will keep the multishot read ··· 1075 1068 * we fail to post a CQE, or multishot is no longer set, then 1076 1069 * jump to the termination path. This request is then done. 1077 1070 */ 1078 - cflags = io_put_kbuf(req, ret, issue_flags); 1071 + cflags = io_put_kbuf(req, ret, sel.buf_list); 1079 1072 rw->len = 0; /* similarly to above, reset len to 0 */ 1080 1073 1081 1074 if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { ··· 1204 1197 return -EAGAIN; 1205 1198 } 1206 1199 done: 1207 - return kiocb_done(req, ret2, issue_flags); 1200 + return kiocb_done(req, ret2, NULL, issue_flags); 1208 1201 } else { 1209 1202 ret_eagain: 1210 1203 iov_iter_restore(&io->iter, &io->iter_state); ··· 1372 1365 if (!smp_load_acquire(&req->iopoll_completed)) 1373 1366 break; 1374 1367 nr_events++; 1375 - req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0); 1368 + req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); 1376 1369 if (req->opcode != IORING_OP_URING_CMD) 1377 1370 io_req_rw_cleanup(req, 0); 1378 1371 }
+1
io_uring/splice.c
··· 11 11 12 12 #include <uapi/linux/io_uring.h> 13 13 14 + #include "filetable.h" 14 15 #include "io_uring.h" 15 16 #include "splice.h" 16 17
+74 -9
io_uring/uring_cmd.c
··· 11 11 #include "io_uring.h" 12 12 #include "alloc_cache.h" 13 13 #include "rsrc.h" 14 + #include "kbuf.h" 14 15 #include "uring_cmd.h" 15 16 #include "poll.h" 16 17 ··· 37 36 38 37 if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) { 39 38 ioucmd->sqe = NULL; 40 - req->async_data = NULL; 41 - req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); 39 + io_req_async_data_clear(req, REQ_F_NEED_CLEANUP); 42 40 } 43 41 } 44 42 ··· 126 126 } 127 127 128 128 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, 129 - void (*task_work_cb)(struct io_uring_cmd *, unsigned), 129 + io_uring_cmd_tw_t task_work_cb, 130 130 unsigned flags) 131 131 { 132 132 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ··· 151 151 * Called by consumers of io_uring_cmd, if they originally returned 152 152 * -EIOCBQUEUED upon receiving the command. 153 153 */ 154 - void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, 155 - unsigned issue_flags) 154 + void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2, 155 + unsigned issue_flags, bool is_cqe32) 156 156 { 157 157 struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 158 158 ··· 165 165 req_set_fail(req); 166 166 167 167 io_req_set_res(req, ret, 0); 168 - if (req->ctx->flags & IORING_SETUP_CQE32) 168 + if (is_cqe32) { 169 + if (req->ctx->flags & IORING_SETUP_CQE_MIXED) 170 + req->cqe.flags |= IORING_CQE_F_32; 169 171 io_req_set_cqe32_extra(req, res2, 0); 172 + } 170 173 io_req_uring_cleanup(req, issue_flags); 171 174 if (req->ctx->flags & IORING_SETUP_IOPOLL) { 172 175 /* order with io_iopoll_req_issued() checking ->iopoll_complete */ ··· 183 180 io_req_task_work_add(req); 184 181 } 185 182 } 186 - EXPORT_SYMBOL_GPL(io_uring_cmd_done); 183 + EXPORT_SYMBOL_GPL(__io_uring_cmd_done); 187 184 188 185 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 189 186 { ··· 197 194 if (ioucmd->flags & ~IORING_URING_CMD_MASK) 198 195 return -EINVAL; 199 196 200 - if (ioucmd->flags & 
IORING_URING_CMD_FIXED) 197 + if (ioucmd->flags & IORING_URING_CMD_FIXED) { 198 + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) 199 + return -EINVAL; 201 200 req->buf_index = READ_ONCE(sqe->buf_index); 201 + } 202 + 203 + if (!!(ioucmd->flags & IORING_URING_CMD_MULTISHOT) != 204 + !!(req->flags & REQ_F_BUFFER_SELECT)) 205 + return -EINVAL; 202 206 203 207 ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); 204 208 ··· 244 234 245 235 if (ctx->flags & IORING_SETUP_SQE128) 246 236 issue_flags |= IO_URING_F_SQE128; 247 - if (ctx->flags & IORING_SETUP_CQE32) 237 + if (ctx->flags & (IORING_SETUP_CQE32 | IORING_SETUP_CQE_MIXED)) 248 238 issue_flags |= IO_URING_F_CQE32; 249 239 if (io_is_compat(ctx)) 250 240 issue_flags |= IO_URING_F_COMPAT; ··· 261 251 } 262 252 263 253 ret = file->f_op->uring_cmd(ioucmd, issue_flags); 254 + if (ioucmd->flags & IORING_URING_CMD_MULTISHOT) { 255 + if (ret >= 0) 256 + return IOU_ISSUE_SKIP_COMPLETE; 257 + } 264 258 if (ret == -EAGAIN) { 265 259 ioucmd->flags |= IORING_URING_CMD_REISSUE; 266 260 return ret; ··· 347 333 return false; 348 334 return io_req_post_cqe32(req, cqe); 349 335 } 336 + 337 + /* 338 + * Work with io_uring_mshot_cmd_post_cqe() together for committing the 339 + * provided buffer upfront 340 + */ 341 + struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, 342 + unsigned buf_group, size_t *len, 343 + unsigned int issue_flags) 344 + { 345 + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 346 + 347 + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) 348 + return (struct io_br_sel) { .val = -EINVAL }; 349 + 350 + if (WARN_ON_ONCE(!io_do_buffer_select(req))) 351 + return (struct io_br_sel) { .val = -EINVAL }; 352 + 353 + return io_buffer_select(req, len, buf_group, issue_flags); 354 + } 355 + EXPORT_SYMBOL_GPL(io_uring_cmd_buffer_select); 356 + 357 + /* 358 + * Return true if this multishot uring_cmd needs to be completed, otherwise 359 + * the event CQE is posted successfully. 
360 + * 361 + * This function must use `struct io_br_sel` returned from 362 + * io_uring_cmd_buffer_select() for committing the buffer in the same 363 + * uring_cmd submission context. 364 + */ 365 + bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, 366 + struct io_br_sel *sel, unsigned int issue_flags) 367 + { 368 + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); 369 + unsigned int cflags = 0; 370 + 371 + if (!(ioucmd->flags & IORING_URING_CMD_MULTISHOT)) 372 + return true; 373 + 374 + if (sel->val > 0) { 375 + cflags = io_put_kbuf(req, sel->val, sel->buf_list); 376 + if (io_req_post_cqe(req, sel->val, cflags | IORING_CQE_F_MORE)) 377 + return false; 378 + } 379 + 380 + io_kbuf_recycle(req, sel->buf_list, issue_flags); 381 + if (sel->val < 0) 382 + req_set_fail(req); 383 + io_req_set_res(req, sel->val, cflags); 384 + return true; 385 + } 386 + EXPORT_SYMBOL_GPL(io_uring_mshot_cmd_post_cqe);
+1 -3
io_uring/waitid.c
··· 37 37 struct io_waitid_async *iwa = req->async_data; 38 38 39 39 put_pid(iwa->wo.wo_pid); 40 - kfree(req->async_data); 41 - req->async_data = NULL; 42 - req->flags &= ~REQ_F_ASYNC_DATA; 40 + io_req_async_data_free(req); 43 41 } 44 42 45 43 static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo)
+188 -107
io_uring/zcrx.c
··· 26 26 #include "zcrx.h" 27 27 #include "rsrc.h" 28 28 29 + #define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) 30 + 29 31 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) 30 32 31 33 static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) ··· 45 43 static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) 46 44 { 47 45 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 46 + unsigned niov_pages_shift; 48 47 49 48 lockdep_assert(!area->mem.is_dmabuf); 50 49 51 - return area->mem.pages[net_iov_idx(niov)]; 50 + niov_pages_shift = area->ifq->niov_shift - PAGE_SHIFT; 51 + return area->mem.pages[net_iov_idx(niov) << niov_pages_shift]; 52 52 } 53 53 54 54 static int io_populate_area_dma(struct io_zcrx_ifq *ifq, 55 - struct io_zcrx_area *area, 56 - struct sg_table *sgt, unsigned long off) 55 + struct io_zcrx_area *area) 57 56 { 57 + unsigned niov_size = 1U << ifq->niov_shift; 58 + struct sg_table *sgt = area->mem.sgt; 58 59 struct scatterlist *sg; 59 60 unsigned i, niov_idx = 0; 60 61 61 62 for_each_sgtable_dma_sg(sgt, sg, i) { 62 63 dma_addr_t dma = sg_dma_address(sg); 63 64 unsigned long sg_len = sg_dma_len(sg); 64 - unsigned long sg_off = min(sg_len, off); 65 65 66 - off -= sg_off; 67 - sg_len -= sg_off; 68 - dma += sg_off; 66 + if (WARN_ON_ONCE(sg_len % niov_size)) 67 + return -EINVAL; 69 68 70 69 while (sg_len && niov_idx < area->nia.num_niovs) { 71 70 struct net_iov *niov = &area->nia.niovs[niov_idx]; 72 71 73 72 if (net_mp_niov_set_dma_addr(niov, dma)) 74 73 return -EFAULT; 75 - sg_len -= PAGE_SIZE; 76 - dma += PAGE_SIZE; 74 + sg_len -= niov_size; 75 + dma += niov_size; 77 76 niov_idx++; 78 77 } 79 78 } 79 + 80 + if (WARN_ON_ONCE(niov_idx != area->nia.num_niovs)) 81 + return -EFAULT; 80 82 return 0; 81 83 } 82 84 ··· 150 144 goto err; 151 145 } 152 146 153 - mem->dmabuf_offset = off; 154 147 mem->size = len; 155 148 return 0; 156 149 err: 157 150 io_release_dmabuf(mem); 158 151 return ret; 159 - } 160 
- 161 - static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) 162 - { 163 - if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) 164 - return -EINVAL; 165 - return io_populate_area_dma(ifq, area, area->mem.sgt, 166 - area->mem.dmabuf_offset); 167 152 } 168 153 169 154 static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages) ··· 203 206 if (ret < 0) 204 207 mem->account_pages = 0; 205 208 209 + mem->sgt = &mem->page_sg_table; 206 210 mem->pages = pages; 207 211 mem->nr_folios = nr_pages; 208 212 mem->size = area_reg->len; ··· 218 220 } 219 221 if (mem->pages) { 220 222 unpin_user_pages(mem->pages, mem->nr_folios); 221 - sg_free_table(&mem->page_sg_table); 223 + sg_free_table(mem->sgt); 224 + mem->sgt = NULL; 222 225 kvfree(mem->pages); 223 226 } 224 227 } ··· 229 230 struct io_uring_zcrx_area_reg *area_reg) 230 231 { 231 232 int ret; 233 + 234 + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) 235 + return -EINVAL; 236 + if (area_reg->rq_area_token) 237 + return -EINVAL; 238 + if (area_reg->__resv2[0] || area_reg->__resv2[1]) 239 + return -EINVAL; 232 240 233 241 ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); 234 242 if (ret) ··· 253 247 { 254 248 int i; 255 249 256 - guard(mutex)(&ifq->dma_lock); 250 + guard(mutex)(&ifq->pp_lock); 257 251 if (!area->is_mapped) 258 252 return; 259 253 area->is_mapped = false; ··· 269 263 } 270 264 } 271 265 272 - static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) 273 - { 274 - int ret; 275 - 276 - ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, 277 - DMA_FROM_DEVICE, IO_DMA_ATTR); 278 - if (ret < 0) 279 - return ret; 280 - return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0); 281 - } 282 - 283 266 static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) 284 267 { 285 268 int ret; 286 269 287 - guard(mutex)(&ifq->dma_lock); 270 + guard(mutex)(&ifq->pp_lock); 288 271 if 
(area->is_mapped) 289 272 return 0; 290 273 291 - if (area->mem.is_dmabuf) 292 - ret = io_zcrx_map_area_dmabuf(ifq, area); 293 - else 294 - ret = io_zcrx_map_area_umem(ifq, area); 274 + if (!area->mem.is_dmabuf) { 275 + ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, 276 + DMA_FROM_DEVICE, IO_DMA_ATTR); 277 + if (ret < 0) 278 + return ret; 279 + } 295 280 281 + ret = io_populate_area_dma(ifq, area); 296 282 if (ret == 0) 297 283 area->is_mapped = true; 298 284 return ret; 299 285 } 300 286 301 - static void io_zcrx_sync_for_device(const struct page_pool *pool, 287 + static void io_zcrx_sync_for_device(struct page_pool *pool, 302 288 struct net_iov *niov) 303 289 { 304 290 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) 305 291 dma_addr_t dma_addr; 306 292 293 + unsigned niov_size; 294 + 307 295 if (!dma_dev_need_sync(pool->p.dev)) 308 296 return; 309 297 298 + niov_size = 1U << io_pp_to_ifq(pool)->niov_shift; 310 299 dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); 311 300 __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, 312 - PAGE_SIZE, pool->p.dma_dir); 301 + niov_size, pool->p.dma_dir); 313 302 #endif 314 303 } 315 304 ··· 353 352 void *ptr; 354 353 int ret; 355 354 356 - off = sizeof(struct io_uring); 355 + off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES); 357 356 size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; 358 357 if (size > rd->size) 359 358 return -EINVAL; ··· 368 367 ptr = io_region_get_ptr(&ifq->region); 369 368 ifq->rq_ring = (struct io_uring *)ptr; 370 369 ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 370 + 371 + reg->offsets.head = offsetof(struct io_uring, head); 372 + reg->offsets.tail = offsetof(struct io_uring, tail); 373 + reg->offsets.rqes = off; 371 374 return 0; 372 375 } 373 376 ··· 396 391 kfree(area); 397 392 } 398 393 399 - #define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) 394 + static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, 395 
+ struct io_zcrx_area *area) 396 + { 397 + if (ifq->area) 398 + return -EINVAL; 399 + ifq->area = area; 400 + return 0; 401 + } 400 402 401 403 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, 402 - struct io_zcrx_area **res, 403 404 struct io_uring_zcrx_area_reg *area_reg) 404 405 { 405 406 struct io_zcrx_area *area; 406 407 unsigned nr_iovs; 407 408 int i, ret; 408 - 409 - if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) 410 - return -EINVAL; 411 - if (area_reg->rq_area_token) 412 - return -EINVAL; 413 - if (area_reg->__resv2[0] || area_reg->__resv2[1]) 414 - return -EINVAL; 415 409 416 410 ret = -ENOMEM; 417 411 area = kzalloc(sizeof(*area), GFP_KERNEL); ··· 422 418 if (ret) 423 419 goto err; 424 420 425 - nr_iovs = area->mem.size >> PAGE_SHIFT; 421 + ifq->niov_shift = PAGE_SHIFT; 422 + nr_iovs = area->mem.size >> ifq->niov_shift; 426 423 area->nia.num_niovs = nr_iovs; 427 424 428 425 ret = -ENOMEM; 429 426 area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), 430 - GFP_KERNEL | __GFP_ZERO); 427 + GFP_KERNEL_ACCOUNT | __GFP_ZERO); 431 428 if (!area->nia.niovs) 432 429 goto err; 433 430 434 431 area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), 435 - GFP_KERNEL | __GFP_ZERO); 432 + GFP_KERNEL_ACCOUNT | __GFP_ZERO); 436 433 if (!area->freelist) 437 434 goto err; 438 435 439 436 area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), 440 - GFP_KERNEL | __GFP_ZERO); 437 + GFP_KERNEL_ACCOUNT | __GFP_ZERO); 441 438 if (!area->user_refs) 442 439 goto err; 443 440 ··· 456 451 area->area_id = 0; 457 452 area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; 458 453 spin_lock_init(&area->freelist_lock); 459 - *res = area; 460 - return 0; 454 + 455 + ret = io_zcrx_append_area(ifq, area); 456 + if (!ret) 457 + return 0; 461 458 err: 462 459 if (area) 463 460 io_zcrx_free_area(area); ··· 476 469 477 470 ifq->if_rxq = -1; 478 471 ifq->ctx = ctx; 479 - spin_lock_init(&ifq->lock); 480 472 
spin_lock_init(&ifq->rq_lock); 481 - mutex_init(&ifq->dma_lock); 473 + mutex_init(&ifq->pp_lock); 482 474 return ifq; 483 475 } 484 476 485 477 static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) 486 478 { 487 - spin_lock(&ifq->lock); 488 - if (ifq->netdev) { 489 - netdev_put(ifq->netdev, &ifq->netdev_tracker); 490 - ifq->netdev = NULL; 491 - } 492 - spin_unlock(&ifq->lock); 479 + guard(mutex)(&ifq->pp_lock); 480 + 481 + if (!ifq->netdev) 482 + return; 483 + netdev_put(ifq->netdev, &ifq->netdev_tracker); 484 + ifq->netdev = NULL; 493 485 } 494 486 495 487 static void io_close_queue(struct io_zcrx_ifq *ifq) ··· 503 497 if (ifq->if_rxq == -1) 504 498 return; 505 499 506 - spin_lock(&ifq->lock); 507 - netdev = ifq->netdev; 508 - netdev_tracker = ifq->netdev_tracker; 509 - ifq->netdev = NULL; 510 - spin_unlock(&ifq->lock); 500 + scoped_guard(mutex, &ifq->pp_lock) { 501 + netdev = ifq->netdev; 502 + netdev_tracker = ifq->netdev_tracker; 503 + ifq->netdev = NULL; 504 + } 511 505 512 506 if (netdev) { 513 507 net_mp_close_rxq(netdev, ifq->if_rxq, &p); ··· 519 513 static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) 520 514 { 521 515 io_close_queue(ifq); 522 - io_zcrx_drop_netdev(ifq); 523 516 524 517 if (ifq->area) 525 518 io_zcrx_free_area(ifq->area); ··· 526 521 put_device(ifq->dev); 527 522 528 523 io_free_rbuf_ring(ifq); 529 - mutex_destroy(&ifq->dma_lock); 524 + mutex_destroy(&ifq->pp_lock); 530 525 kfree(ifq); 531 526 } 532 527 ··· 559 554 return -EPERM; 560 555 561 556 /* mandatory io_uring features for zc rx */ 562 - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && 563 - ctx->flags & IORING_SETUP_CQE32)) 557 + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 558 + return -EINVAL; 559 + if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED))) 564 560 return -EINVAL; 565 561 if (copy_from_user(&reg, arg, sizeof(reg))) 566 562 return -EFAULT; 567 563 if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 568 564 return -EFAULT; 569 - if 
(memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) || 565 + if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || 570 566 reg.__resv2 || reg.zcrx_id) 571 567 return -EINVAL; 572 568 if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) ··· 612 606 } 613 607 get_device(ifq->dev); 614 608 615 - ret = io_zcrx_create_area(ifq, &ifq->area, &area); 609 + ret = io_zcrx_create_area(ifq, &area); 616 610 if (ret) 617 611 goto err; 618 612 ··· 623 617 goto err; 624 618 ifq->if_rxq = reg.if_rxq; 625 619 626 - reg.offsets.rqes = sizeof(struct io_uring); 627 - reg.offsets.head = offsetof(struct io_uring, head); 628 - reg.offsets.tail = offsetof(struct io_uring, tail); 629 620 reg.zcrx_id = id; 630 621 631 622 scoped_guard(mutex, &ctx->mmap_lock) { ··· 750 747 return &ifq->rqes[idx]; 751 748 } 752 749 750 + static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, 751 + struct io_zcrx_ifq *ifq, 752 + struct net_iov **ret_niov) 753 + { 754 + unsigned niov_idx, area_idx; 755 + struct io_zcrx_area *area; 756 + 757 + area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; 758 + niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> ifq->niov_shift; 759 + 760 + if (unlikely(rqe->__pad || area_idx)) 761 + return false; 762 + area = ifq->area; 763 + 764 + if (unlikely(niov_idx >= area->nia.num_niovs)) 765 + return false; 766 + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); 767 + 768 + *ret_niov = &area->nia.niovs[niov_idx]; 769 + return true; 770 + } 771 + 753 772 static void io_zcrx_ring_refill(struct page_pool *pp, 754 773 struct io_zcrx_ifq *ifq) 755 774 { 756 775 unsigned int mask = ifq->rq_entries - 1; 757 776 unsigned int entries; 758 - netmem_ref netmem; 759 777 760 - spin_lock_bh(&ifq->rq_lock); 778 + guard(spinlock_bh)(&ifq->rq_lock); 761 779 762 780 entries = io_zcrx_rqring_entries(ifq); 763 - entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count); 764 - if (unlikely(!entries)) { 765 - spin_unlock_bh(&ifq->rq_lock); 781 + entries = min_t(unsigned, 
entries, PP_ALLOC_CACHE_REFILL); 782 + if (unlikely(!entries)) 766 783 return; 767 - } 768 784 769 785 do { 770 786 struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); 771 - struct io_zcrx_area *area; 772 787 struct net_iov *niov; 773 - unsigned niov_idx, area_idx; 788 + netmem_ref netmem; 774 789 775 - area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; 776 - niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT; 777 - 778 - if (unlikely(rqe->__pad || area_idx)) 790 + if (!io_parse_rqe(rqe, ifq, &niov)) 779 791 continue; 780 - area = ifq->area; 781 - 782 - if (unlikely(niov_idx >= area->nia.num_niovs)) 783 - continue; 784 - niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); 785 - 786 - niov = &area->nia.niovs[niov_idx]; 787 792 if (!io_zcrx_put_niov_uref(niov)) 788 793 continue; 789 794 790 795 netmem = net_iov_to_netmem(niov); 791 - if (page_pool_unref_netmem(netmem, 1) != 0) 796 + if (!page_pool_unref_and_test(netmem)) 792 797 continue; 793 798 794 799 if (unlikely(niov->pp != pp)) { ··· 809 798 } while (--entries); 810 799 811 800 smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); 812 - spin_unlock_bh(&ifq->rq_lock); 813 801 } 814 802 815 803 static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) ··· 870 860 return -EINVAL; 871 861 if (WARN_ON_ONCE(!pp->dma_map)) 872 862 return -EOPNOTSUPP; 873 - if (pp->p.order != 0) 874 - return -EOPNOTSUPP; 863 + if (pp->p.order + PAGE_SHIFT != ifq->niov_shift) 864 + return -EINVAL; 875 865 if (pp->p.dma_dir != DMA_FROM_DEVICE) 876 866 return -EOPNOTSUPP; 877 867 ··· 927 917 .uninstall = io_pp_uninstall, 928 918 }; 929 919 920 + #define IO_ZCRX_MAX_SYS_REFILL_BUFS (1 << 16) 921 + #define IO_ZCRX_SYS_REFILL_BATCH 32 922 + 923 + static void io_return_buffers(struct io_zcrx_ifq *ifq, 924 + struct io_uring_zcrx_rqe *rqes, unsigned nr) 925 + { 926 + int i; 927 + 928 + for (i = 0; i < nr; i++) { 929 + struct net_iov *niov; 930 + netmem_ref netmem; 931 + 932 + if 
(!io_parse_rqe(&rqes[i], ifq, &niov)) 933 + continue; 934 + 935 + scoped_guard(spinlock_bh, &ifq->rq_lock) { 936 + if (!io_zcrx_put_niov_uref(niov)) 937 + continue; 938 + } 939 + 940 + netmem = net_iov_to_netmem(niov); 941 + if (!page_pool_unref_and_test(netmem)) 942 + continue; 943 + io_zcrx_return_niov(niov); 944 + } 945 + } 946 + 947 + int io_zcrx_return_bufs(struct io_ring_ctx *ctx, 948 + void __user *arg, unsigned nr_arg) 949 + { 950 + struct io_uring_zcrx_rqe rqes[IO_ZCRX_SYS_REFILL_BATCH]; 951 + struct io_uring_zcrx_rqe __user *user_rqes; 952 + struct io_uring_zcrx_sync_refill zr; 953 + struct io_zcrx_ifq *ifq; 954 + unsigned nr, i; 955 + 956 + if (nr_arg) 957 + return -EINVAL; 958 + if (copy_from_user(&zr, arg, sizeof(zr))) 959 + return -EFAULT; 960 + if (!zr.nr_entries || zr.nr_entries > IO_ZCRX_MAX_SYS_REFILL_BUFS) 961 + return -EINVAL; 962 + if (!mem_is_zero(&zr.__resv, sizeof(zr.__resv))) 963 + return -EINVAL; 964 + 965 + ifq = xa_load(&ctx->zcrx_ctxs, zr.zcrx_id); 966 + if (!ifq) 967 + return -EINVAL; 968 + nr = zr.nr_entries; 969 + user_rqes = u64_to_user_ptr(zr.rqes); 970 + 971 + for (i = 0; i < nr;) { 972 + unsigned batch = min(nr - i, IO_ZCRX_SYS_REFILL_BATCH); 973 + size_t size = batch * sizeof(rqes[0]); 974 + 975 + if (copy_from_user(rqes, user_rqes + i, size)) 976 + return i ? 
i : -EFAULT; 977 + io_return_buffers(ifq, rqes, batch); 978 + 979 + i += batch; 980 + 981 + if (fatal_signal_pending(current)) 982 + return i; 983 + cond_resched(); 984 + } 985 + return nr; 986 + } 987 + 930 988 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, 931 989 struct io_zcrx_ifq *ifq, int off, int len) 932 990 { 991 + struct io_ring_ctx *ctx = req->ctx; 933 992 struct io_uring_zcrx_cqe *rcqe; 934 993 struct io_zcrx_area *area; 935 994 struct io_uring_cqe *cqe; 936 995 u64 offset; 937 996 938 - if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) 997 + if (!io_defer_get_uncommited_cqe(ctx, &cqe)) 939 998 return false; 940 999 941 1000 cqe->user_data = req->cqe.user_data; 942 1001 cqe->res = len; 943 1002 cqe->flags = IORING_CQE_F_MORE; 1003 + if (ctx->flags & IORING_SETUP_CQE_MIXED) 1004 + cqe->flags |= IORING_CQE_F_32; 944 1005 945 1006 area = io_zcrx_iov_to_area(niov); 946 - offset = off + (net_iov_idx(niov) << PAGE_SHIFT); 1007 + offset = off + (net_iov_idx(niov) << ifq->niov_shift); 947 1008 rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); 948 1009 rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); 949 1010 rcqe->__pad = 0; 950 1011 return true; 951 1012 } 952 1013 953 - static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) 1014 + static struct net_iov *io_alloc_fallback_niov(struct io_zcrx_ifq *ifq) 954 1015 { 1016 + struct io_zcrx_area *area = ifq->area; 955 1017 struct net_iov *niov = NULL; 1018 + 1019 + if (area->mem.is_dmabuf) 1020 + return NULL; 956 1021 957 1022 spin_lock_bh(&area->freelist_lock); 958 1023 if (area->free_count) ··· 1088 1003 struct page *src_page, unsigned int src_offset, 1089 1004 size_t len) 1090 1005 { 1091 - struct io_zcrx_area *area = ifq->area; 1092 1006 size_t copied = 0; 1093 1007 int ret = 0; 1094 - 1095 - if (area->mem.is_dmabuf) 1096 - return -EFAULT; 1097 1008 1098 1009 while (len) { 1099 1010 struct io_copy_cache cc; 1100 1011 struct net_iov *niov; 1101 1012 
size_t n; 1102 1013 1103 - niov = io_zcrx_alloc_fallback(area); 1014 + niov = io_alloc_fallback_niov(ifq); 1104 1015 if (!niov) { 1105 1016 ret = -ENOMEM; 1106 1017 break;
+15 -4
io_uring/zcrx.h
··· 16 16 unsigned long nr_folios; 17 17 struct sg_table page_sg_table; 18 18 unsigned long account_pages; 19 + struct sg_table *sgt; 19 20 20 21 struct dma_buf_attachment *attach; 21 22 struct dma_buf *dmabuf; 22 - struct sg_table *sgt; 23 - unsigned long dmabuf_offset; 24 23 }; 25 24 26 25 struct io_zcrx_area { ··· 41 42 struct io_zcrx_ifq { 42 43 struct io_ring_ctx *ctx; 43 44 struct io_zcrx_area *area; 45 + unsigned niov_shift; 44 46 45 47 spinlock_t rq_lock ____cacheline_aligned_in_smp; 46 48 struct io_uring *rq_ring; ··· 53 53 struct device *dev; 54 54 struct net_device *netdev; 55 55 netdevice_tracker netdev_tracker; 56 - spinlock_t lock; 57 - struct mutex dma_lock; 56 + 57 + /* 58 + * Page pool and net configuration lock, can be taken deeper in the 59 + * net stack. 60 + */ 61 + struct mutex pp_lock; 58 62 struct io_mapped_region region; 59 63 }; 60 64 61 65 #if defined(CONFIG_IO_URING_ZCRX) 66 + int io_zcrx_return_bufs(struct io_ring_ctx *ctx, 67 + void __user *arg, unsigned nr_arg); 62 68 int io_register_zcrx_ifq(struct io_ring_ctx *ctx, 63 69 struct io_uring_zcrx_ifq_reg __user *arg); 64 70 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); ··· 96 90 unsigned int id) 97 91 { 98 92 return NULL; 93 + } 94 + static inline int io_zcrx_return_bufs(struct io_ring_ctx *ctx, 95 + void __user *arg, unsigned nr_arg) 96 + { 97 + return -EOPNOTSUPP; 99 98 } 100 99 #endif 101 100