Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring updates from Jens Axboe:

- Add a callback-driven main loop for io_uring, and BPF struct_ops
on top to allow implementing custom event loop logic

- Decouple IOPOLL from being a ring-wide all-or-nothing setting,
allowing IOPOLL use cases to also issue certain whitelisted
non-polled opcodes

- Timeout improvements. Migrate internal timeout storage from
timespec64 to ktime_t for simpler arithmetic and to avoid copying
timespec data

- Zero-copy receive (zcrx) updates:

- Add a device-less mode (ZCRX_REG_NODEV) for testing and
experimentation where data flows through the copy fallback path

- Fix two-step unregistration regression, DMA length calculations,
xarray mark usage, and a potential 32-bit overflow in id
shifting

- Refactoring toward multi-area support: dedicated refill queue
struct, consolidated DMA syncing, netmem array refilling format,
and guard-based locking

- Zero-copy transmit (zctx) cleanup:

- Unify io_send_zc() and io_sendmsg_zc() into a single function

- Add vectorized registered buffer send for IORING_OP_SEND_ZC

- Add separate notification user_data via sqe->addr3 so
notification and completion CQEs can be distinguished without
extra reference counting

- Switch struct io_ring_ctx internal bitfields to explicit flag bits
with atomic-safe accessors, and annotate the known harmless races on
those flags

- Various optimizations caching ctx and other request fields in local
variables to avoid repeated loads, and cleanups for tctx setup, ring
fd registration, and read path early returns

* tag 'for-7.1/io_uring-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (58 commits)
io_uring: unify getting ctx from passed in file descriptor
io_uring/register: don't get a reference to the registered ring fd
io_uring/tctx: clean up __io_uring_add_tctx_node() error handling
io_uring/tctx: have io_uring_alloc_task_context() return tctx
io_uring/timeout: use 'ctx' consistently
io_uring/rw: clean up __io_read() obsolete comment and early returns
io_uring/zcrx: use correct mmap off constants
io_uring/zcrx: use dma_len for chunk size calculation
io_uring/zcrx: don't clear not allocated niovs
io_uring/zcrx: don't use mark0 for allocating xarray
io_uring: cast id to u64 before shifting in io_allocate_rbuf_ring()
io_uring/zcrx: reject REG_NODEV with large rx_buf_size
io_uring/cancel: validate opcode for IORING_ASYNC_CANCEL_OP
io_uring/rsrc: use io_cache_free() to free node
io_uring/zcrx: rename zcrx [un]register functions
io_uring/zcrx: check ctrl op payload struct sizes
io_uring/zcrx: cache fallback availability in zcrx ctx
io_uring/zcrx: warn on a repeated area append
io_uring/zcrx: consolidate dma syncing
io_uring/zcrx: netmem array as refiling format
...

+1197 -630
-4
drivers/nvme/host/ioctl.c
··· 786 786 struct nvme_ctrl *ctrl = ioucmd->file->private_data; 787 787 int ret; 788 788 789 - /* IOPOLL not supported yet */ 790 - if (issue_flags & IO_URING_F_IOPOLL) 791 - return -EOPNOTSUPP; 792 - 793 789 ret = nvme_uring_cmd_checks(issue_flags); 794 790 if (ret) 795 791 return ret;
+33 -14
include/linux/io_uring_types.h
··· 8 8 #include <linux/llist.h> 9 9 #include <uapi/linux/io_uring.h> 10 10 11 + struct iou_loop_params; 12 + struct io_uring_bpf_ops; 13 + 11 14 enum { 12 15 /* 13 16 * A hint to not wake right away but delay until there are enough of ··· 43 40 IO_URING_F_CANCEL = (1 << 11), 44 41 IO_URING_F_COMPAT = (1 << 12), 45 42 }; 43 + 44 + struct iou_loop_params; 46 45 47 46 struct io_wq_work_node { 48 47 struct io_wq_work_node *next; ··· 273 268 unsigned int init_clear; 274 269 }; 275 270 271 + enum { 272 + IO_RING_F_DRAIN_NEXT = BIT(0), 273 + IO_RING_F_OP_RESTRICTED = BIT(1), 274 + IO_RING_F_REG_RESTRICTED = BIT(2), 275 + IO_RING_F_OFF_TIMEOUT_USED = BIT(3), 276 + IO_RING_F_DRAIN_ACTIVE = BIT(4), 277 + IO_RING_F_HAS_EVFD = BIT(5), 278 + /* all CQEs should be posted only by the submitter task */ 279 + IO_RING_F_TASK_COMPLETE = BIT(6), 280 + IO_RING_F_LOCKLESS_CQ = BIT(7), 281 + IO_RING_F_SYSCALL_IOPOLL = BIT(8), 282 + IO_RING_F_POLL_ACTIVATED = BIT(9), 283 + IO_RING_F_DRAIN_DISABLED = BIT(10), 284 + IO_RING_F_COMPAT = BIT(11), 285 + IO_RING_F_IOWQ_LIMITS_SET = BIT(12), 286 + }; 287 + 276 288 struct io_ring_ctx { 277 289 /* const or read-mostly hot data */ 278 290 struct { 291 + /* ring setup flags */ 279 292 unsigned int flags; 280 - unsigned int drain_next: 1; 281 - unsigned int op_restricted: 1; 282 - unsigned int reg_restricted: 1; 283 - unsigned int off_timeout_used: 1; 284 - unsigned int drain_active: 1; 285 - unsigned int has_evfd: 1; 286 - /* all CQEs should be posted only by the submitter task */ 287 - unsigned int task_complete: 1; 288 - unsigned int lockless_cq: 1; 289 - unsigned int syscall_iopoll: 1; 290 - unsigned int poll_activated: 1; 291 - unsigned int drain_disabled: 1; 292 - unsigned int compat: 1; 293 - unsigned int iowq_limits_set : 1; 293 + /* internal state flags IO_RING_F_* flags , mostly read-only */ 294 + unsigned int int_flags; 294 295 295 296 struct task_struct *submitter_task; 296 297 struct io_rings *rings; ··· 365 354 struct io_alloc_cache 
netmsg_cache; 366 355 struct io_alloc_cache rw_cache; 367 356 struct io_alloc_cache cmd_cache; 357 + 358 + int (*loop_step)(struct io_ring_ctx *ctx, 359 + struct iou_loop_params *); 368 360 369 361 /* 370 362 * Any cancelable uring_cmd is added to this list in ··· 491 477 DECLARE_HASHTABLE(napi_ht, 4); 492 478 #endif 493 479 480 + struct io_uring_bpf_ops *bpf_ops; 481 + 494 482 /* 495 483 * Protection for resize vs mmap races - both the mmap and resize 496 484 * side will need to grab this lock, to prevent either side from ··· 561 545 REQ_F_HAS_METADATA_BIT, 562 546 REQ_F_IMPORT_BUFFER_BIT, 563 547 REQ_F_SQE_COPIED_BIT, 548 + REQ_F_IOPOLL_BIT, 564 549 565 550 /* not a real bit, just to check we're not overflowing the space */ 566 551 __REQ_F_LAST_BIT, ··· 655 638 REQ_F_IMPORT_BUFFER = IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT), 656 639 /* ->sqe_copy() has been called, if necessary */ 657 640 REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), 641 + /* request must be iopolled to completion (set in ->issue()) */ 642 + REQ_F_IOPOLL = IO_REQ_FLAG(REQ_F_IOPOLL_BIT), 658 643 }; 659 644 660 645 struct io_tw_req {
+7 -94
include/uapi/linux/io_uring.h
··· 10 10 11 11 #include <linux/fs.h> 12 12 #include <linux/types.h> 13 + #include <linux/io_uring/zcrx.h> 14 + 13 15 /* 14 16 * this file is shared with liburing and that has to autodetect 15 17 * if linux/time_types.h is available or not, it can ··· 343 341 344 342 /* 345 343 * sqe->timeout_flags 344 + * 345 + * IORING_TIMEOUT_IMMEDIATE_ARG: If set, sqe->addr stores the timeout 346 + * value in nanoseconds instead of 347 + * pointing to a timespec. 346 348 */ 347 349 #define IORING_TIMEOUT_ABS (1U << 0) 348 350 #define IORING_TIMEOUT_UPDATE (1U << 1) ··· 355 349 #define IORING_LINK_TIMEOUT_UPDATE (1U << 4) 356 350 #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) 357 351 #define IORING_TIMEOUT_MULTISHOT (1U << 6) 352 + #define IORING_TIMEOUT_IMMEDIATE_ARG (1U << 7) 358 353 #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) 359 354 #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) 360 355 /* ··· 1055 1048 struct io_timespec { 1056 1049 __u64 tv_sec; 1057 1050 __u64 tv_nsec; 1058 - }; 1059 - 1060 - /* Zero copy receive refill queue entry */ 1061 - struct io_uring_zcrx_rqe { 1062 - __u64 off; 1063 - __u32 len; 1064 - __u32 __pad; 1065 - }; 1066 - 1067 - struct io_uring_zcrx_cqe { 1068 - __u64 off; 1069 - __u64 __pad; 1070 - }; 1071 - 1072 - /* The bit from which area id is encoded into offsets */ 1073 - #define IORING_ZCRX_AREA_SHIFT 48 1074 - #define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) 1075 - 1076 - struct io_uring_zcrx_offsets { 1077 - __u32 head; 1078 - __u32 tail; 1079 - __u32 rqes; 1080 - __u32 __resv2; 1081 - __u64 __resv[2]; 1082 - }; 1083 - 1084 - enum io_uring_zcrx_area_flags { 1085 - IORING_ZCRX_AREA_DMABUF = 1, 1086 - }; 1087 - 1088 - struct io_uring_zcrx_area_reg { 1089 - __u64 addr; 1090 - __u64 len; 1091 - __u64 rq_area_token; 1092 - __u32 flags; 1093 - __u32 dmabuf_fd; 1094 - __u64 __resv2[2]; 1095 - }; 1096 - 1097 - enum zcrx_reg_flags { 1098 - 
ZCRX_REG_IMPORT = 1, 1099 - }; 1100 - 1101 - enum zcrx_features { 1102 - /* 1103 - * The user can ask for the desired rx page size by passing the 1104 - * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. 1105 - */ 1106 - ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, 1107 - }; 1108 - 1109 - /* 1110 - * Argument for IORING_REGISTER_ZCRX_IFQ 1111 - */ 1112 - struct io_uring_zcrx_ifq_reg { 1113 - __u32 if_idx; 1114 - __u32 if_rxq; 1115 - __u32 rq_entries; 1116 - __u32 flags; 1117 - 1118 - __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ 1119 - __u64 region_ptr; /* struct io_uring_region_desc * */ 1120 - 1121 - struct io_uring_zcrx_offsets offsets; 1122 - __u32 zcrx_id; 1123 - __u32 rx_buf_len; 1124 - __u64 __resv[3]; 1125 - }; 1126 - 1127 - enum zcrx_ctrl_op { 1128 - ZCRX_CTRL_FLUSH_RQ, 1129 - ZCRX_CTRL_EXPORT, 1130 - 1131 - __ZCRX_CTRL_LAST, 1132 - }; 1133 - 1134 - struct zcrx_ctrl_flush_rq { 1135 - __u64 __resv[6]; 1136 - }; 1137 - 1138 - struct zcrx_ctrl_export { 1139 - __u32 zcrx_fd; 1140 - __u32 __resv1[11]; 1141 - }; 1142 - 1143 - struct zcrx_ctrl { 1144 - __u32 zcrx_id; 1145 - __u32 op; /* see enum zcrx_ctrl_op */ 1146 - __u64 __resv[2]; 1147 - 1148 - union { 1149 - struct zcrx_ctrl_export zc_export; 1150 - struct zcrx_ctrl_flush_rq zc_flush; 1151 - }; 1152 1051 }; 1153 1052 1154 1053 #ifdef __cplusplus
+115
include/uapi/linux/io_uring/zcrx.h
··· 1 + /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ 2 + /* 3 + * Header file for the io_uring zerocopy receive (zcrx) interface. 4 + * 5 + * Copyright (C) 2026 Pavel Begunkov 6 + * Copyright (C) 2026 David Wei 7 + * Copyright (C) Meta Platforms, Inc. 8 + */ 9 + #ifndef LINUX_IO_ZCRX_H 10 + #define LINUX_IO_ZCRX_H 11 + 12 + #include <linux/types.h> 13 + 14 + /* Zero copy receive refill queue entry */ 15 + struct io_uring_zcrx_rqe { 16 + __u64 off; 17 + __u32 len; 18 + __u32 __pad; 19 + }; 20 + 21 + struct io_uring_zcrx_cqe { 22 + __u64 off; 23 + __u64 __pad; 24 + }; 25 + 26 + /* The bit from which area id is encoded into offsets */ 27 + #define IORING_ZCRX_AREA_SHIFT 48 28 + #define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) 29 + 30 + struct io_uring_zcrx_offsets { 31 + __u32 head; 32 + __u32 tail; 33 + __u32 rqes; 34 + __u32 __resv2; 35 + __u64 __resv[2]; 36 + }; 37 + 38 + enum io_uring_zcrx_area_flags { 39 + IORING_ZCRX_AREA_DMABUF = 1, 40 + }; 41 + 42 + struct io_uring_zcrx_area_reg { 43 + __u64 addr; 44 + __u64 len; 45 + __u64 rq_area_token; 46 + __u32 flags; 47 + __u32 dmabuf_fd; 48 + __u64 __resv2[2]; 49 + }; 50 + 51 + enum zcrx_reg_flags { 52 + ZCRX_REG_IMPORT = 1, 53 + 54 + /* 55 + * Register a zcrx instance without a net device. All data will be 56 + * copied. The refill queue entries might not be automatically 57 + * consumed and need to be flushed, see ZCRX_CTRL_FLUSH_RQ. 58 + */ 59 + ZCRX_REG_NODEV = 2, 60 + }; 61 + 62 + enum zcrx_features { 63 + /* 64 + * The user can ask for the desired rx page size by passing the 65 + * value in struct io_uring_zcrx_ifq_reg::rx_buf_len. 
66 + */ 67 + ZCRX_FEATURE_RX_PAGE_SIZE = 1 << 0, 68 + }; 69 + 70 + /* 71 + * Argument for IORING_REGISTER_ZCRX_IFQ 72 + */ 73 + struct io_uring_zcrx_ifq_reg { 74 + __u32 if_idx; 75 + __u32 if_rxq; 76 + __u32 rq_entries; 77 + __u32 flags; 78 + 79 + __u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg */ 80 + __u64 region_ptr; /* struct io_uring_region_desc * */ 81 + 82 + struct io_uring_zcrx_offsets offsets; 83 + __u32 zcrx_id; 84 + __u32 rx_buf_len; 85 + __u64 __resv[3]; 86 + }; 87 + 88 + enum zcrx_ctrl_op { 89 + ZCRX_CTRL_FLUSH_RQ, 90 + ZCRX_CTRL_EXPORT, 91 + 92 + __ZCRX_CTRL_LAST, 93 + }; 94 + 95 + struct zcrx_ctrl_flush_rq { 96 + __u64 __resv[6]; 97 + }; 98 + 99 + struct zcrx_ctrl_export { 100 + __u32 zcrx_fd; 101 + __u32 __resv1[11]; 102 + }; 103 + 104 + struct zcrx_ctrl { 105 + __u32 zcrx_id; 106 + __u32 op; /* see enum zcrx_ctrl_op */ 107 + __u64 __resv[2]; 108 + 109 + union { 110 + struct zcrx_ctrl_export zc_export; 111 + struct zcrx_ctrl_flush_rq zc_flush; 112 + }; 113 + }; 114 + 115 + #endif /* LINUX_IO_ZCRX_H */
+5
io_uring/Kconfig
··· 14 14 def_bool y 15 15 depends on BPF 16 16 depends on NET 17 + 18 + config IO_URING_BPF_OPS 19 + def_bool y 20 + depends on IO_URING 21 + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+2 -1
io_uring/Makefile
··· 14 14 advise.o openclose.o statx.o timeout.o \ 15 15 cancel.o waitid.o register.o \ 16 16 truncate.o memmap.o alloc_cache.o \ 17 - query.o 17 + query.o loop.o 18 18 19 19 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o 20 20 obj-$(CONFIG_IO_WQ) += io-wq.o ··· 25 25 obj-$(CONFIG_PROC_FS) += fdinfo.o 26 26 obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o 27 27 obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o 28 + obj-$(CONFIG_IO_URING_BPF_OPS) += bpf-ops.o
+270
io_uring/bpf-ops.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #include <linux/mutex.h> 3 + #include <linux/bpf.h> 4 + #include <linux/bpf_verifier.h> 5 + 6 + #include "io_uring.h" 7 + #include "register.h" 8 + #include "loop.h" 9 + #include "memmap.h" 10 + #include "bpf-ops.h" 11 + 12 + static DEFINE_MUTEX(io_bpf_ctrl_mutex); 13 + static const struct btf_type *loop_params_type; 14 + 15 + __bpf_kfunc_start_defs(); 16 + 17 + __bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx, u32 nr) 18 + { 19 + return io_submit_sqes(ctx, nr); 20 + } 21 + 22 + __bpf_kfunc 23 + __u8 *bpf_io_uring_get_region(struct io_ring_ctx *ctx, __u32 region_id, 24 + const size_t rdwr_buf_size) 25 + { 26 + struct io_mapped_region *r; 27 + 28 + lockdep_assert_held(&ctx->uring_lock); 29 + 30 + switch (region_id) { 31 + case IOU_REGION_MEM: 32 + r = &ctx->param_region; 33 + break; 34 + case IOU_REGION_CQ: 35 + r = &ctx->ring_region; 36 + break; 37 + case IOU_REGION_SQ: 38 + r = &ctx->sq_region; 39 + break; 40 + default: 41 + return NULL; 42 + } 43 + 44 + if (unlikely(rdwr_buf_size > io_region_size(r))) 45 + return NULL; 46 + return io_region_get_ptr(r); 47 + } 48 + 49 + __bpf_kfunc_end_defs(); 50 + 51 + BTF_KFUNCS_START(io_uring_kfunc_set) 52 + BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE); 53 + BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL); 54 + BTF_KFUNCS_END(io_uring_kfunc_set) 55 + 56 + static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = { 57 + .owner = THIS_MODULE, 58 + .set = &io_uring_kfunc_set, 59 + }; 60 + 61 + static int io_bpf_ops__loop_step(struct io_ring_ctx *ctx, 62 + struct iou_loop_params *lp) 63 + { 64 + return IOU_LOOP_STOP; 65 + } 66 + 67 + static struct io_uring_bpf_ops io_bpf_ops_stubs = { 68 + .loop_step = io_bpf_ops__loop_step, 69 + }; 70 + 71 + static bool bpf_io_is_valid_access(int off, int size, 72 + enum bpf_access_type type, 73 + const struct bpf_prog *prog, 74 + struct bpf_insn_access_aux *info) 75 + { 76 + if (type != BPF_READ) 77 + 
return false; 78 + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) 79 + return false; 80 + if (off % size != 0) 81 + return false; 82 + 83 + return btf_ctx_access(off, size, type, prog, info); 84 + } 85 + 86 + static int bpf_io_btf_struct_access(struct bpf_verifier_log *log, 87 + const struct bpf_reg_state *reg, int off, 88 + int size) 89 + { 90 + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); 91 + 92 + if (t == loop_params_type) { 93 + if (off + size <= offsetofend(struct iou_loop_params, cq_wait_idx)) 94 + return SCALAR_VALUE; 95 + } 96 + 97 + return -EACCES; 98 + } 99 + 100 + static const struct bpf_verifier_ops bpf_io_verifier_ops = { 101 + .get_func_proto = bpf_base_func_proto, 102 + .is_valid_access = bpf_io_is_valid_access, 103 + .btf_struct_access = bpf_io_btf_struct_access, 104 + }; 105 + 106 + static const struct btf_type * 107 + io_lookup_struct_type(struct btf *btf, const char *name) 108 + { 109 + s32 type_id; 110 + 111 + type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT); 112 + if (type_id < 0) 113 + return NULL; 114 + return btf_type_by_id(btf, type_id); 115 + } 116 + 117 + static int bpf_io_init(struct btf *btf) 118 + { 119 + int ret; 120 + 121 + loop_params_type = io_lookup_struct_type(btf, "iou_loop_params"); 122 + if (!loop_params_type) { 123 + pr_err("io_uring: Failed to locate iou_loop_params\n"); 124 + return -EINVAL; 125 + } 126 + 127 + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, 128 + &bpf_io_uring_kfunc_set); 129 + if (ret) { 130 + pr_err("io_uring: Failed to register kfuncs (%d)\n", ret); 131 + return ret; 132 + } 133 + return 0; 134 + } 135 + 136 + static int bpf_io_check_member(const struct btf_type *t, 137 + const struct btf_member *member, 138 + const struct bpf_prog *prog) 139 + { 140 + return 0; 141 + } 142 + 143 + static int bpf_io_init_member(const struct btf_type *t, 144 + const struct btf_member *member, 145 + void *kdata, const void *udata) 146 + { 147 + u32 moff = 
__btf_member_bit_offset(t, member) / 8; 148 + const struct io_uring_bpf_ops *uops = udata; 149 + struct io_uring_bpf_ops *ops = kdata; 150 + 151 + switch (moff) { 152 + case offsetof(struct io_uring_bpf_ops, ring_fd): 153 + ops->ring_fd = uops->ring_fd; 154 + return 1; 155 + } 156 + return 0; 157 + } 158 + 159 + static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops) 160 + { 161 + if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL)) 162 + return -EOPNOTSUPP; 163 + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 164 + return -EOPNOTSUPP; 165 + 166 + if (ctx->bpf_ops) 167 + return -EBUSY; 168 + if (WARN_ON_ONCE(!ops->loop_step)) 169 + return -EINVAL; 170 + 171 + ops->priv = ctx; 172 + ctx->bpf_ops = ops; 173 + ctx->loop_step = ops->loop_step; 174 + return 0; 175 + } 176 + 177 + static int bpf_io_reg(void *kdata, struct bpf_link *link) 178 + { 179 + struct io_uring_bpf_ops *ops = kdata; 180 + struct io_ring_ctx *ctx; 181 + struct file *file; 182 + int ret = -EBUSY; 183 + 184 + file = io_uring_ctx_get_file(ops->ring_fd, false); 185 + if (IS_ERR(file)) 186 + return PTR_ERR(file); 187 + ctx = file->private_data; 188 + 189 + scoped_guard(mutex, &io_bpf_ctrl_mutex) { 190 + guard(mutex)(&ctx->uring_lock); 191 + ret = io_install_bpf(ctx, ops); 192 + } 193 + 194 + fput(file); 195 + return ret; 196 + } 197 + 198 + static void io_eject_bpf(struct io_ring_ctx *ctx) 199 + { 200 + struct io_uring_bpf_ops *ops = ctx->bpf_ops; 201 + 202 + if (WARN_ON_ONCE(!ops)) 203 + return; 204 + if (WARN_ON_ONCE(ops->priv != ctx)) 205 + return; 206 + 207 + ops->priv = NULL; 208 + ctx->bpf_ops = NULL; 209 + ctx->loop_step = NULL; 210 + } 211 + 212 + static void bpf_io_unreg(void *kdata, struct bpf_link *link) 213 + { 214 + struct io_uring_bpf_ops *ops = kdata; 215 + struct io_ring_ctx *ctx; 216 + 217 + guard(mutex)(&io_bpf_ctrl_mutex); 218 + ctx = ops->priv; 219 + if (ctx) { 220 + guard(mutex)(&ctx->uring_lock); 221 + if (WARN_ON_ONCE(ctx->bpf_ops != ops)) 222 + 
return; 223 + 224 + io_eject_bpf(ctx); 225 + } 226 + } 227 + 228 + void io_unregister_bpf_ops(struct io_ring_ctx *ctx) 229 + { 230 + /* 231 + * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock, 232 + * and read protected by either. Try to avoid taking the global lock 233 + * for rings that never had any bpf installed. 234 + */ 235 + scoped_guard(mutex, &ctx->uring_lock) { 236 + if (!ctx->bpf_ops) 237 + return; 238 + } 239 + 240 + guard(mutex)(&io_bpf_ctrl_mutex); 241 + guard(mutex)(&ctx->uring_lock); 242 + if (ctx->bpf_ops) 243 + io_eject_bpf(ctx); 244 + } 245 + 246 + static struct bpf_struct_ops bpf_ring_ops = { 247 + .verifier_ops = &bpf_io_verifier_ops, 248 + .reg = bpf_io_reg, 249 + .unreg = bpf_io_unreg, 250 + .check_member = bpf_io_check_member, 251 + .init_member = bpf_io_init_member, 252 + .init = bpf_io_init, 253 + .cfi_stubs = &io_bpf_ops_stubs, 254 + .name = "io_uring_bpf_ops", 255 + .owner = THIS_MODULE, 256 + }; 257 + 258 + static int __init io_uring_bpf_init(void) 259 + { 260 + int ret; 261 + 262 + ret = register_bpf_struct_ops(&bpf_ring_ops, io_uring_bpf_ops); 263 + if (ret) { 264 + pr_err("io_uring: Failed to register struct_ops (%d)\n", ret); 265 + return ret; 266 + } 267 + 268 + return 0; 269 + } 270 + __initcall(io_uring_bpf_init);
+28
io_uring/bpf-ops.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IOU_BPF_OPS_H 3 + #define IOU_BPF_OPS_H 4 + 5 + #include <linux/io_uring_types.h> 6 + 7 + enum { 8 + IOU_REGION_MEM, 9 + IOU_REGION_CQ, 10 + IOU_REGION_SQ, 11 + }; 12 + 13 + struct io_uring_bpf_ops { 14 + int (*loop_step)(struct io_ring_ctx *ctx, struct iou_loop_params *lp); 15 + 16 + __u32 ring_fd; 17 + void *priv; 18 + }; 19 + 20 + #ifdef CONFIG_IO_URING_BPF_OPS 21 + void io_unregister_bpf_ops(struct io_ring_ctx *ctx); 22 + #else 23 + static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx) 24 + { 25 + } 26 + #endif 27 + 28 + #endif /* IOU_BPF_OPS_H */
+8 -1
io_uring/cancel.c
··· 156 156 cancel->fd = READ_ONCE(sqe->fd); 157 157 } 158 158 if (cancel->flags & IORING_ASYNC_CANCEL_OP) { 159 + u32 op; 160 + 159 161 if (cancel->flags & IORING_ASYNC_CANCEL_ANY) 160 162 return -EINVAL; 161 - cancel->opcode = READ_ONCE(sqe->len); 163 + 164 + op = READ_ONCE(sqe->len); 165 + if (op >= IORING_OP_LAST) 166 + return -EINVAL; 167 + 168 + cancel->opcode = op; 162 169 } 163 170 164 171 return 0;
+17 -17
io_uring/cmd_net.c
··· 7 7 #include "uring_cmd.h" 8 8 #include "io_uring.h" 9 9 10 + static int io_uring_cmd_get_sock_ioctl(struct socket *sock, int op) 11 + { 12 + struct sock *sk = sock->sk; 13 + struct proto *prot = READ_ONCE(sk->sk_prot); 14 + int ret, arg = 0; 15 + 16 + if (!prot || !prot->ioctl) 17 + return -EOPNOTSUPP; 18 + 19 + ret = prot->ioctl(sk, op, &arg); 20 + if (ret) 21 + return ret; 22 + return arg; 23 + } 24 + 10 25 static inline int io_uring_cmd_getsockopt(struct socket *sock, 11 26 struct io_uring_cmd *cmd, 12 27 unsigned int issue_flags) ··· 171 156 int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) 172 157 { 173 158 struct socket *sock = cmd->file->private_data; 174 - struct sock *sk = sock->sk; 175 - struct proto *prot = READ_ONCE(sk->sk_prot); 176 - int ret, arg = 0; 177 159 178 160 switch (cmd->cmd_op) { 179 161 case SOCKET_URING_OP_SIOCINQ: 180 - if (!prot || !prot->ioctl) 181 - return -EOPNOTSUPP; 182 - 183 - ret = prot->ioctl(sk, SIOCINQ, &arg); 184 - if (ret) 185 - return ret; 186 - return arg; 162 + return io_uring_cmd_get_sock_ioctl(sock, SIOCINQ); 187 163 case SOCKET_URING_OP_SIOCOUTQ: 188 - if (!prot || !prot->ioctl) 189 - return -EOPNOTSUPP; 190 - 191 - ret = prot->ioctl(sk, SIOCOUTQ, &arg); 192 - if (ret) 193 - return ret; 194 - return arg; 164 + return io_uring_cmd_get_sock_ioctl(sock, SIOCOUTQ); 195 165 case SOCKET_URING_OP_GETSOCKOPT: 196 166 return io_uring_cmd_getsockopt(sock, cmd, issue_flags); 197 167 case SOCKET_URING_OP_SETSOCKOPT:
+2 -2
io_uring/eventfd.c
··· 148 148 spin_unlock(&ctx->completion_lock); 149 149 150 150 ev_fd->eventfd_async = eventfd_async; 151 - ctx->has_evfd = true; 151 + ctx->int_flags |= IO_RING_F_HAS_EVFD; 152 152 refcount_set(&ev_fd->refs, 1); 153 153 atomic_set(&ev_fd->ops, 0); 154 154 rcu_assign_pointer(ctx->io_ev_fd, ev_fd); ··· 162 162 ev_fd = rcu_dereference_protected(ctx->io_ev_fd, 163 163 lockdep_is_held(&ctx->uring_lock)); 164 164 if (ev_fd) { 165 - ctx->has_evfd = false; 165 + ctx->int_flags &= ~IO_RING_F_HAS_EVFD; 166 166 rcu_assign_pointer(ctx->io_ev_fd, NULL); 167 167 io_eventfd_put(ev_fd); 168 168 return 0;
+107 -78
io_uring/io_uring.c
··· 87 87 #include "msg_ring.h" 88 88 #include "memmap.h" 89 89 #include "zcrx.h" 90 + #include "bpf-ops.h" 90 91 91 92 #include "timeout.h" 92 93 #include "poll.h" ··· 96 95 #include "eventfd.h" 97 96 #include "wait.h" 98 97 #include "bpf_filter.h" 98 + #include "loop.h" 99 99 100 100 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 101 101 IOSQE_IO_HARDLINK | IOSQE_ASYNC) ··· 358 356 static void io_prep_async_work(struct io_kiocb *req) 359 357 { 360 358 const struct io_issue_def *def = &io_issue_defs[req->opcode]; 361 - struct io_ring_ctx *ctx = req->ctx; 362 359 363 360 if (!(req->flags & REQ_F_CREDS)) { 364 361 req->flags |= REQ_F_CREDS; ··· 379 378 if (should_hash && (req->file->f_flags & O_DIRECT) && 380 379 (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE)) 381 380 should_hash = false; 382 - if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) 381 + if (should_hash || (req->flags & REQ_F_IOPOLL)) 383 382 io_wq_hash_work(&req->work, file_inode(req->file)); 384 383 } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { 385 384 if (def->unbound_nonreg_file) ··· 478 477 479 478 void __io_commit_cqring_flush(struct io_ring_ctx *ctx) 480 479 { 481 - if (ctx->poll_activated) 480 + if (ctx->int_flags & IO_RING_F_POLL_ACTIVATED) 482 481 io_poll_wq_wake(ctx); 483 - if (ctx->off_timeout_used) 482 + if (ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED) 484 483 io_flush_timeouts(ctx); 485 - if (ctx->has_evfd) 484 + if (ctx->int_flags & IO_RING_F_HAS_EVFD) 486 485 io_eventfd_signal(ctx, true); 487 486 } 488 487 489 488 static inline void __io_cq_lock(struct io_ring_ctx *ctx) 490 489 { 491 - if (!ctx->lockless_cq) 490 + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) 492 491 spin_lock(&ctx->completion_lock); 493 492 } 494 493 ··· 501 500 static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) 502 501 { 503 502 io_commit_cqring(ctx); 504 - if (!ctx->task_complete) { 505 - if (!ctx->lockless_cq) 503 + if (!(ctx->int_flags & 
IO_RING_F_TASK_COMPLETE)) { 504 + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) 506 505 spin_unlock(&ctx->completion_lock); 507 506 /* IOPOLL rings only need to wake up if it's also SQPOLL */ 508 - if (!ctx->syscall_iopoll) 507 + if (!(ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL)) 509 508 io_cqring_wake(ctx); 510 509 } 511 510 io_commit_cqring_flush(ctx); ··· 588 587 mutex_lock(&ctx->uring_lock); 589 588 __io_cqring_overflow_flush(ctx, false); 590 589 mutex_unlock(&ctx->uring_lock); 590 + } 591 + 592 + void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx) 593 + { 594 + __io_cqring_overflow_flush(ctx, false); 591 595 } 592 596 593 597 /* must to be called somewhat shortly after putting a request */ ··· 836 830 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) 837 831 { 838 832 lockdep_assert_held(&ctx->uring_lock); 839 - lockdep_assert(ctx->lockless_cq); 833 + lockdep_assert(ctx->int_flags & IO_RING_F_LOCKLESS_CQ); 840 834 841 835 if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { 842 836 struct io_cqe cqe = io_init_cqe(user_data, res, cflags); ··· 866 860 lockdep_assert(!io_wq_current_is_worker()); 867 861 lockdep_assert_held(&ctx->uring_lock); 868 862 869 - if (!ctx->lockless_cq) { 863 + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { 870 864 spin_lock(&ctx->completion_lock); 871 865 posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); 872 866 spin_unlock(&ctx->completion_lock); ··· 891 885 lockdep_assert_held(&ctx->uring_lock); 892 886 893 887 cqe[0].user_data = req->cqe.user_data; 894 - if (!ctx->lockless_cq) { 888 + if (!(ctx->int_flags & IO_RING_F_LOCKLESS_CQ)) { 895 889 spin_lock(&ctx->completion_lock); 896 890 posted = io_fill_cqe_aux32(ctx, cqe); 897 891 spin_unlock(&ctx->completion_lock); ··· 919 913 * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires 920 914 * the submitter task context, IOPOLL protects with uring_lock. 
921 915 */ 922 - if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { 916 + if ((ctx->int_flags & IO_RING_F_LOCKLESS_CQ) || (req->flags & REQ_F_REISSUE)) { 923 917 defer_complete: 924 918 req->io_task_work.func = io_req_task_complete; 925 919 io_req_task_work_add(req); ··· 1073 1067 1074 1068 static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) 1075 1069 { 1070 + struct io_ring_ctx *ctx = req->ctx; 1071 + 1076 1072 if (req->file_node) { 1077 - io_put_rsrc_node(req->ctx, req->file_node); 1073 + io_put_rsrc_node(ctx, req->file_node); 1078 1074 req->file_node = NULL; 1079 1075 } 1080 1076 if (req->flags & REQ_F_BUF_NODE) 1081 - io_put_rsrc_node(req->ctx, req->buf_node); 1077 + io_put_rsrc_node(ctx, req->buf_node); 1082 1078 } 1083 1079 1084 1080 static void io_free_batch_list(struct io_ring_ctx *ctx, ··· 1143 1135 */ 1144 1136 if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && 1145 1137 unlikely(!io_fill_cqe_req(ctx, req))) { 1146 - if (ctx->lockless_cq) 1138 + if (ctx->int_flags & IO_RING_F_LOCKLESS_CQ) 1147 1139 io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); 1148 1140 else 1149 1141 io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); ··· 1156 1148 INIT_WQ_LIST(&state->compl_reqs); 1157 1149 } 1158 1150 1159 - if (unlikely(ctx->drain_active)) 1151 + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) 1160 1152 io_queue_deferred(ctx); 1161 1153 1162 1154 ctx->submit_state.cq_flush = false; ··· 1195 1187 1196 1188 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) 1197 1189 { 1198 - unsigned int nr_events = 0; 1199 1190 unsigned long check_cq; 1200 1191 1201 1192 min_events = min(min_events, ctx->cq_entries); ··· 1237 1230 * very same mutex. 
1238 1231 */ 1239 1232 if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { 1240 - u32 tail = ctx->cached_cq_tail; 1241 - 1242 1233 (void) io_run_local_work_locked(ctx, min_events); 1243 1234 1244 1235 if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) { ··· 1245 1240 mutex_lock(&ctx->uring_lock); 1246 1241 } 1247 1242 /* some requests don't go through iopoll_list */ 1248 - if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list)) 1243 + if (list_empty(&ctx->iopoll_list)) 1249 1244 break; 1250 1245 } 1251 1246 ret = io_do_iopoll(ctx, !min_events); ··· 1256 1251 return -EINTR; 1257 1252 if (need_resched()) 1258 1253 break; 1259 - 1260 - nr_events += ret; 1261 - } while (nr_events < min_events); 1254 + } while (io_cqring_events(ctx) < min_events); 1262 1255 1263 1256 return 0; 1264 1257 } ··· 1347 1344 list_add_tail(&de->list, &ctx->defer_list); 1348 1345 io_queue_deferred(ctx); 1349 1346 if (!drain && list_empty(&ctx->defer_list)) 1350 - ctx->drain_active = false; 1347 + ctx->int_flags &= ~IO_RING_F_DRAIN_ACTIVE; 1351 1348 } 1352 1349 1353 1350 static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, ··· 1421 1418 if (ret == IOU_ISSUE_SKIP_COMPLETE) { 1422 1419 ret = 0; 1423 1420 1424 - /* If the op doesn't have a file, we're not polling for it */ 1425 - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) 1421 + if (req->flags & REQ_F_IOPOLL) 1426 1422 io_iopoll_req_issued(req, issue_flags); 1427 1423 } 1428 1424 return ret; ··· 1437 1435 io_tw_lock(req->ctx, tw); 1438 1436 1439 1437 WARN_ON_ONCE(!req->file); 1440 - if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) 1438 + if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL)) 1441 1439 return -EFAULT; 1442 1440 1443 1441 ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); ··· 1535 1533 * wait for request slots on the block side. 
1536 1534 */ 1537 1535 if (!needs_poll) { 1538 - if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) 1536 + if (!(req->flags & REQ_F_IOPOLL)) 1539 1537 break; 1540 1538 if (io_wq_worker_stopped()) 1541 1539 break; ··· 1657 1655 } else { 1658 1656 /* can't fail with IO_URING_F_INLINE */ 1659 1657 io_req_sqe_copy(req, IO_URING_F_INLINE); 1660 - if (unlikely(req->ctx->drain_active)) 1658 + if (unlikely(req->ctx->int_flags & IO_RING_F_DRAIN_ACTIVE)) 1661 1659 io_drain_req(req); 1662 1660 else 1663 1661 io_queue_iowq(req); ··· 1673 1671 struct io_kiocb *req, 1674 1672 unsigned int sqe_flags) 1675 1673 { 1676 - if (!ctx->op_restricted) 1674 + if (!(ctx->int_flags & IO_RING_F_OP_RESTRICTED)) 1677 1675 return true; 1678 1676 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 1679 1677 return false; ··· 1693 1691 { 1694 1692 struct io_kiocb *head = ctx->submit_state.link.head; 1695 1693 1696 - ctx->drain_active = true; 1694 + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; 1697 1695 if (head) { 1698 1696 /* 1699 1697 * If we need to drain a request in the middle of a link, drain ··· 1703 1701 * link. 
1704 1702 */ 1705 1703 head->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; 1706 - ctx->drain_next = true; 1704 + ctx->int_flags |= IO_RING_F_DRAIN_NEXT; 1707 1705 } 1708 1706 } 1709 1707 ··· 1769 1767 req->buf_index = READ_ONCE(sqe->buf_group); 1770 1768 } 1771 1769 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) 1772 - ctx->drain_disabled = true; 1770 + ctx->int_flags |= IO_RING_F_DRAIN_DISABLED; 1773 1771 if (sqe_flags & IOSQE_IO_DRAIN) { 1774 - if (ctx->drain_disabled) 1772 + if (ctx->int_flags & IO_RING_F_DRAIN_DISABLED) 1775 1773 return io_init_fail_req(req, -EOPNOTSUPP); 1776 1774 io_init_drain(ctx); 1777 1775 } 1778 1776 } 1779 - if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { 1777 + if (unlikely(ctx->int_flags & (IO_RING_F_OP_RESTRICTED | IO_RING_F_DRAIN_ACTIVE | IO_RING_F_DRAIN_NEXT))) { 1780 1778 if (!io_check_restriction(ctx, req, sqe_flags)) 1781 1779 return io_init_fail_req(req, -EACCES); 1782 1780 /* knock it to the slow queue path, will be drained there */ 1783 - if (ctx->drain_active) 1781 + if (ctx->int_flags & IO_RING_F_DRAIN_ACTIVE) 1784 1782 req->flags |= REQ_F_FORCE_ASYNC; 1785 1783 /* if there is no link, we're at "next" request and need to drain */ 1786 - if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { 1787 - ctx->drain_next = false; 1788 - ctx->drain_active = true; 1784 + if (unlikely(ctx->int_flags & IO_RING_F_DRAIN_NEXT) && !ctx->submit_state.link.head) { 1785 + ctx->int_flags &= ~IO_RING_F_DRAIN_NEXT; 1786 + ctx->int_flags |= IO_RING_F_DRAIN_ACTIVE; 1789 1787 req->flags |= REQ_F_IO_DRAIN | REQ_F_FORCE_ASYNC; 1790 1788 } 1791 1789 } ··· 2150 2148 2151 2149 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) 2152 2150 { 2151 + io_unregister_bpf_ops(ctx); 2153 2152 io_sq_thread_finish(ctx); 2154 2153 2155 2154 mutex_lock(&ctx->uring_lock); 2156 2155 io_sqe_buffers_unregister(ctx); 2157 2156 io_sqe_files_unregister(ctx); 2158 - io_unregister_zcrx_ifqs(ctx); 2157 + io_unregister_zcrx(ctx); 2159 2158 
io_cqring_overflow_kill(ctx); 2160 2159 io_eventfd_unregister(ctx); 2161 2160 io_free_alloc_caches(ctx); ··· 2207 2204 poll_wq_task_work); 2208 2205 2209 2206 mutex_lock(&ctx->uring_lock); 2210 - ctx->poll_activated = true; 2207 + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; 2211 2208 mutex_unlock(&ctx->uring_lock); 2212 2209 2213 2210 /* ··· 2222 2219 { 2223 2220 spin_lock(&ctx->completion_lock); 2224 2221 /* already activated or in progress */ 2225 - if (ctx->poll_activated || ctx->poll_wq_task_work.func) 2222 + if ((ctx->int_flags & IO_RING_F_POLL_ACTIVATED) || ctx->poll_wq_task_work.func) 2226 2223 goto out; 2227 - if (WARN_ON_ONCE(!ctx->task_complete)) 2224 + if (WARN_ON_ONCE(!(ctx->int_flags & IO_RING_F_TASK_COMPLETE))) 2228 2225 goto out; 2229 2226 if (!ctx->submitter_task) 2230 2227 goto out; ··· 2245 2242 struct io_ring_ctx *ctx = file->private_data; 2246 2243 __poll_t mask = 0; 2247 2244 2248 - if (unlikely(!ctx->poll_activated)) 2245 + if (unlikely(!(data_race(ctx->int_flags) & IO_RING_F_POLL_ACTIVATED))) 2249 2246 io_activate_pollwq(ctx); 2250 2247 /* 2251 2248 * provides mb() which pairs with barrier from wq_has_sleeper ··· 2310 2307 struct io_tctx_exit exit; 2311 2308 struct io_tctx_node *node; 2312 2309 int ret; 2310 + 2311 + mutex_lock(&ctx->uring_lock); 2312 + io_terminate_zcrx(ctx); 2313 + mutex_unlock(&ctx->uring_lock); 2313 2314 2314 2315 /* 2315 2316 * If we're doing polled IO and end up having requests being ··· 2546 2539 #endif 2547 2540 } 2548 2541 2542 + /* 2543 + * Given an 'fd' value, return the ctx associated with it. If 'registered' is 2544 + * true, then the registered index is used. Otherwise, the normal fd table. 2545 + * Caller must call fput() on the returned file if it isn't a registered file, 2546 + * unless it's an ERR_PTR.
2547 + */ 2548 + struct file *io_uring_ctx_get_file(unsigned int fd, bool registered) 2549 + { 2550 + struct file *file; 2551 + 2552 + if (registered) { 2553 + /* 2554 + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we 2555 + * need only dereference our task private array to find it. 2556 + */ 2557 + struct io_uring_task *tctx = current->io_uring; 2558 + 2559 + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) 2560 + return ERR_PTR(-EINVAL); 2561 + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); 2562 + file = tctx->registered_rings[fd]; 2563 + } else { 2564 + file = fget(fd); 2565 + } 2566 + 2567 + if (unlikely(!file)) 2568 + return ERR_PTR(-EBADF); 2569 + if (io_is_uring_fops(file)) 2570 + return file; 2571 + fput(file); 2572 + return ERR_PTR(-EOPNOTSUPP); 2573 + } 2574 + 2575 + 2549 2576 SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, 2550 2577 u32, min_complete, u32, flags, const void __user *, argp, 2551 2578 size_t, argsz) ··· 2591 2550 if (unlikely(flags & ~IORING_ENTER_FLAGS)) 2592 2551 return -EINVAL; 2593 2552 2594 - /* 2595 - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we 2596 - * need only dereference our task private array to find it. 
2597 - */ 2598 - if (flags & IORING_ENTER_REGISTERED_RING) { 2599 - struct io_uring_task *tctx = current->io_uring; 2600 - 2601 - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) 2602 - return -EINVAL; 2603 - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); 2604 - file = tctx->registered_rings[fd]; 2605 - if (unlikely(!file)) 2606 - return -EBADF; 2607 - } else { 2608 - file = fget(fd); 2609 - if (unlikely(!file)) 2610 - return -EBADF; 2611 - ret = -EOPNOTSUPP; 2612 - if (unlikely(!io_is_uring_fops(file))) 2613 - goto out; 2614 - } 2615 - 2553 + file = io_uring_ctx_get_file(fd, flags & IORING_ENTER_REGISTERED_RING); 2554 + if (IS_ERR(file)) 2555 + return PTR_ERR(file); 2616 2556 ctx = file->private_data; 2617 2557 ret = -EBADFD; 2618 2558 /* ··· 2602 2580 */ 2603 2581 if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) 2604 2582 goto out; 2583 + 2584 + if (io_has_loop_ops(ctx)) { 2585 + ret = io_run_loop(ctx); 2586 + goto out; 2587 + } 2605 2588 2606 2589 /* 2607 2590 * For SQ polling, the thread will do all submissions and completions. 
··· 2637 2610 goto out; 2638 2611 } 2639 2612 if (flags & IORING_ENTER_GETEVENTS) { 2640 - if (ctx->syscall_iopoll) 2613 + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) 2641 2614 goto iopoll_locked; 2642 2615 /* 2643 2616 * Ignore errors, we'll soon call io_cqring_wait() and ··· 2652 2625 if (flags & IORING_ENTER_GETEVENTS) { 2653 2626 int ret2; 2654 2627 2655 - if (ctx->syscall_iopoll) { 2628 + if (ctx->int_flags & IO_RING_F_SYSCALL_IOPOLL) { 2656 2629 /* 2657 2630 * We disallow the app entering submit/complete with 2658 2631 * polling, but we still need to lock the ring to ··· 2953 2926 if (dst->bpf_filters) 2954 2927 WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters); 2955 2928 if (dst->op_registered) 2956 - ctx->op_restricted = 1; 2929 + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; 2957 2930 if (dst->reg_registered) 2958 - ctx->reg_restricted = 1; 2931 + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; 2959 2932 } 2960 2933 2961 2934 static __cold int io_uring_create(struct io_ctx_config *config) ··· 2982 2955 2983 2956 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 2984 2957 !(ctx->flags & IORING_SETUP_IOPOLL)) 2985 - ctx->task_complete = true; 2958 + ctx->int_flags |= IO_RING_F_TASK_COMPLETE; 2986 2959 2987 - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) 2988 - ctx->lockless_cq = true; 2960 + if ((ctx->int_flags & IO_RING_F_TASK_COMPLETE) || 2961 + (ctx->flags & IORING_SETUP_IOPOLL)) 2962 + ctx->int_flags |= IO_RING_F_LOCKLESS_CQ; 2989 2963 2990 2964 /* 2991 2965 * lazy poll_wq activation relies on ->task_complete for synchronisation 2992 2966 * purposes, see io_activate_pollwq() 2993 2967 */ 2994 - if (!ctx->task_complete) 2995 - ctx->poll_activated = true; 2968 + if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) 2969 + ctx->int_flags |= IO_RING_F_POLL_ACTIVATED; 2996 2970 2997 2971 /* 2998 2972 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user ··· 3003 2975 */ 3004 2976 if (ctx->flags & IORING_SETUP_IOPOLL && 3005 2977 !(ctx->flags 
& IORING_SETUP_SQPOLL)) 3006 - ctx->syscall_iopoll = 1; 2978 + ctx->int_flags |= IO_RING_F_SYSCALL_IOPOLL; 3007 2979 3008 - ctx->compat = in_compat_syscall(); 2980 + if (in_compat_syscall()) 2981 + ctx->int_flags |= IO_RING_F_COMPAT; 3009 2982 if (!ns_capable_noaudit(&init_user_ns, CAP_IPC_LOCK)) 3010 2983 ctx->user = get_uid(current_user()); 3011 2984
+7 -4
io_uring/io_uring.h
··· 185 185 struct file *io_file_get_normal(struct io_kiocb *req, int fd); 186 186 struct file *io_file_get_fixed(struct io_kiocb *req, int fd, 187 187 unsigned issue_flags); 188 + struct file *io_uring_ctx_get_file(unsigned int fd, bool registered); 188 189 189 190 void io_req_task_queue(struct io_kiocb *req); 190 191 void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); ··· 224 223 225 224 if (ctx->flags & IORING_SETUP_IOPOLL) { 226 225 lockdep_assert_held(&ctx->uring_lock); 227 - } else if (!ctx->task_complete) { 226 + } else if (!(ctx->int_flags & IO_RING_F_TASK_COMPLETE)) { 228 227 lockdep_assert_held(&ctx->completion_lock); 229 228 } else if (ctx->submitter_task) { 230 229 /* ··· 241 240 242 241 static inline bool io_is_compat(struct io_ring_ctx *ctx) 243 242 { 244 - return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); 243 + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->int_flags & IO_RING_F_COMPAT); 245 244 } 246 245 247 246 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) ··· 495 494 wq_list_add_tail(&req->comp_list, &state->compl_reqs); 496 495 } 497 496 497 + #define SHOULD_FLUSH_MASK (IO_RING_F_OFF_TIMEOUT_USED | \ 498 + IO_RING_F_HAS_EVFD | IO_RING_F_POLL_ACTIVATED) 499 + 498 500 static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) 499 501 { 500 - if (unlikely(ctx->off_timeout_used || 501 - ctx->has_evfd || ctx->poll_activated)) 502 + if (unlikely(data_race(ctx->int_flags) & SHOULD_FLUSH_MASK)) 502 503 __io_commit_cqring_flush(ctx); 503 504 } 504 505
+2 -2
io_uring/kbuf.c
··· 230 230 struct io_br_sel sel = { }; 231 231 struct io_buffer_list *bl; 232 232 233 - io_ring_submit_lock(req->ctx, issue_flags); 233 + io_ring_submit_lock(ctx, issue_flags); 234 234 235 235 bl = io_buffer_get_list(ctx, buf_group); 236 236 if (likely(bl)) { ··· 239 239 else 240 240 sel.addr = io_provided_buffer_select(req, len, bl); 241 241 } 242 - io_ring_submit_unlock(req->ctx, issue_flags); 242 + io_ring_submit_unlock(ctx, issue_flags); 243 243 return sel; 244 244 } 245 245
+91
io_uring/loop.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #include "io_uring.h" 3 + #include "wait.h" 4 + #include "loop.h" 5 + 6 + static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx, 7 + const struct iou_loop_params *lp) 8 + { 9 + return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail); 10 + } 11 + 12 + static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait) 13 + { 14 + atomic_set(&ctx->cq_wait_nr, nr_wait); 15 + set_current_state(TASK_INTERRUPTIBLE); 16 + } 17 + 18 + static inline void io_loop_wait_finish(struct io_ring_ctx *ctx) 19 + { 20 + __set_current_state(TASK_RUNNING); 21 + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 22 + } 23 + 24 + static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp, 25 + unsigned nr_wait) 26 + { 27 + io_loop_wait_start(ctx, nr_wait); 28 + 29 + if (unlikely(io_local_work_pending(ctx) || 30 + io_loop_nr_cqes(ctx, lp) <= 0) || 31 + READ_ONCE(ctx->check_cq)) { 32 + io_loop_wait_finish(ctx); 33 + return; 34 + } 35 + 36 + mutex_unlock(&ctx->uring_lock); 37 + schedule(); 38 + io_loop_wait_finish(ctx); 39 + mutex_lock(&ctx->uring_lock); 40 + } 41 + 42 + static int __io_run_loop(struct io_ring_ctx *ctx) 43 + { 44 + struct iou_loop_params lp = {}; 45 + 46 + while (true) { 47 + int nr_wait, step_res; 48 + 49 + if (unlikely(!ctx->loop_step)) 50 + return -EFAULT; 51 + 52 + step_res = ctx->loop_step(ctx, &lp); 53 + if (step_res == IOU_LOOP_STOP) 54 + break; 55 + if (step_res != IOU_LOOP_CONTINUE) 56 + return -EINVAL; 57 + 58 + nr_wait = io_loop_nr_cqes(ctx, &lp); 59 + if (nr_wait > 0) 60 + io_loop_wait(ctx, &lp, nr_wait); 61 + else 62 + nr_wait = 0; 63 + 64 + if (task_work_pending(current)) { 65 + mutex_unlock(&ctx->uring_lock); 66 + io_run_task_work(); 67 + mutex_lock(&ctx->uring_lock); 68 + } 69 + if (unlikely(task_sigpending(current))) 70 + return -EINTR; 71 + io_run_local_work_locked(ctx, nr_wait); 72 + 73 + if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 74 + 
io_cqring_overflow_flush_locked(ctx); 75 + } 76 + 77 + return 0; 78 + } 79 + 80 + int io_run_loop(struct io_ring_ctx *ctx) 81 + { 82 + int ret; 83 + 84 + if (!io_allowed_run_tw(ctx)) 85 + return -EEXIST; 86 + 87 + mutex_lock(&ctx->uring_lock); 88 + ret = __io_run_loop(ctx); 89 + mutex_unlock(&ctx->uring_lock); 90 + return ret; 91 + }
+27
io_uring/loop.h
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IOU_LOOP_H 3 + #define IOU_LOOP_H 4 + 5 + #include <linux/io_uring_types.h> 6 + 7 + struct iou_loop_params { 8 + /* 9 + * The CQE index to wait for. Only serves as a hint and can still be 10 + * woken up earlier. 11 + */ 12 + __u32 cq_wait_idx; 13 + }; 14 + 15 + enum { 16 + IOU_LOOP_CONTINUE = 0, 17 + IOU_LOOP_STOP, 18 + }; 19 + 20 + static inline bool io_has_loop_ops(struct io_ring_ctx *ctx) 21 + { 22 + return data_race(ctx->loop_step); 23 + } 24 + 25 + int io_run_loop(struct io_ring_ctx *ctx); 26 + 27 + #endif
+1 -1
io_uring/msg_ring.c
··· 67 67 68 68 static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) 69 69 { 70 - return target_ctx->task_complete; 70 + return target_ctx->int_flags & IO_RING_F_TASK_COMPLETE; 71 71 } 72 72 73 73 static void io_msg_tw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
+60 -102
io_uring/net.c
··· 375 375 kmsg->msg.msg_namelen = addr_len; 376 376 } 377 377 if (sr->flags & IORING_RECVSEND_FIXED_BUF) { 378 - if (sr->flags & IORING_SEND_VECTORIZED) 379 - return -EINVAL; 380 - req->flags |= REQ_F_IMPORT_BUFFER; 381 - return 0; 378 + if (!(sr->flags & IORING_SEND_VECTORIZED)) { 379 + req->flags |= REQ_F_IMPORT_BUFFER; 380 + return 0; 381 + } 382 + 383 + kmsg->msg.msg_iter.nr_segs = sr->len; 384 + return io_prep_reg_iovec(req, &kmsg->vec, sr->buf, sr->len); 382 385 } 383 386 if (req->flags & REQ_F_BUFFER_SELECT) 384 387 return 0; ··· 399 396 struct user_msghdr msg; 400 397 int ret; 401 398 399 + sr->flags |= IORING_SEND_VECTORIZED; 402 400 sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 403 401 ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); 404 402 if (unlikely(ret)) ··· 1337 1333 struct io_ring_ctx *ctx = req->ctx; 1338 1334 struct io_async_msghdr *iomsg; 1339 1335 struct io_kiocb *notif; 1336 + u64 user_data; 1340 1337 int ret; 1341 1338 1342 1339 zc->done_io = 0; 1343 1340 1344 - if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) 1341 + if (unlikely(READ_ONCE(sqe->__pad2[0]))) 1345 1342 return -EINVAL; 1346 1343 /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ 1347 1344 if (req->flags & REQ_F_CQE_SKIP) ··· 1351 1346 notif = zc->notif = io_alloc_notif(ctx); 1352 1347 if (!notif) 1353 1348 return -ENOMEM; 1354 - notif->cqe.user_data = req->cqe.user_data; 1349 + user_data = READ_ONCE(sqe->addr3); 1350 + if (!user_data) 1351 + user_data = req->cqe.user_data; 1352 + 1353 + notif->cqe.user_data = user_data; 1355 1354 notif->cqe.res = 0; 1356 1355 notif->cqe.flags = IORING_CQE_F_NOTIF; 1357 1356 req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY; ··· 1379 1370 if (zc->msg_flags & MSG_DONTWAIT) 1380 1371 req->flags |= REQ_F_NOWAIT; 1381 1372 1382 - if (io_is_compat(req->ctx)) 1373 + if (io_is_compat(ctx)) 1383 1374 zc->msg_flags |= MSG_CMSG_COMPAT; 1384 1375 1385 1376 iomsg = io_msg_alloc_async(req); ··· 1454 1445 return ret; 
1455 1446 } 1456 1447 1457 - static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) 1448 + static int io_send_zc_import(struct io_kiocb *req, 1449 + struct io_async_msghdr *kmsg, 1450 + unsigned int issue_flags) 1458 1451 { 1459 1452 struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1460 - struct io_async_msghdr *kmsg = req->async_data; 1453 + struct io_kiocb *notif = sr->notif; 1454 + int ret; 1461 1455 1462 1456 WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF)); 1463 1457 1464 - sr->notif->buf_index = req->buf_index; 1465 - return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, 1466 - (u64)(uintptr_t)sr->buf, sr->len, 1467 - ITER_SOURCE, issue_flags); 1458 + notif->buf_index = req->buf_index; 1459 + 1460 + if (!(sr->flags & IORING_SEND_VECTORIZED)) { 1461 + ret = io_import_reg_buf(notif, &kmsg->msg.msg_iter, 1462 + (u64)(uintptr_t)sr->buf, sr->len, 1463 + ITER_SOURCE, issue_flags); 1464 + } else { 1465 + unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; 1466 + 1467 + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, 1468 + notif, &kmsg->vec, uvec_segs, 1469 + issue_flags); 1470 + } 1471 + 1472 + if (unlikely(ret)) 1473 + return ret; 1474 + req->flags &= ~REQ_F_IMPORT_BUFFER; 1475 + return 0; 1468 1476 } 1469 1477 1470 - int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) 1478 + int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) 1471 1479 { 1472 - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); 1480 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1473 1481 struct io_async_msghdr *kmsg = req->async_data; 1474 1482 struct socket *sock; 1475 1483 unsigned msg_flags; ··· 1497 1471 return -ENOTSOCK; 1498 1472 if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) 1499 1473 return -EOPNOTSUPP; 1500 - 1501 - if (!(req->flags & REQ_F_POLLED) && 1502 - (zc->flags & IORING_RECVSEND_POLL_FIRST)) 1503 - return -EAGAIN; 1504 - 1505 - if (req->flags & REQ_F_IMPORT_BUFFER) { 1506 - 
req->flags &= ~REQ_F_IMPORT_BUFFER; 1507 - ret = io_send_zc_import(req, issue_flags); 1508 - if (unlikely(ret)) 1509 - return ret; 1510 - } 1511 - 1512 - msg_flags = zc->msg_flags; 1513 - if (issue_flags & IO_URING_F_NONBLOCK) 1514 - msg_flags |= MSG_DONTWAIT; 1515 - if (msg_flags & MSG_WAITALL) 1516 - min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1517 - msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; 1518 - 1519 - kmsg->msg.msg_flags = msg_flags; 1520 - kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; 1521 - ret = sock_sendmsg(sock, &kmsg->msg); 1522 - 1523 - if (unlikely(ret < min_ret)) { 1524 - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1525 - return -EAGAIN; 1526 - 1527 - if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { 1528 - zc->done_io += ret; 1529 - return -EAGAIN; 1530 - } 1531 - if (ret == -ERESTARTSYS) 1532 - ret = -EINTR; 1533 - req_set_fail(req); 1534 - } 1535 - 1536 - if (ret >= 0) 1537 - ret += zc->done_io; 1538 - else if (zc->done_io) 1539 - ret = zc->done_io; 1540 - 1541 - /* 1542 - * If we're in io-wq we can't rely on tw ordering guarantees, defer 1543 - * flushing notif to io_send_zc_cleanup() 1544 - */ 1545 - if (!(issue_flags & IO_URING_F_UNLOCKED)) { 1546 - io_notif_flush(zc->notif); 1547 - zc->notif = NULL; 1548 - io_req_msg_cleanup(req, 0); 1549 - } 1550 - io_req_set_res(req, ret, IORING_CQE_F_MORE); 1551 - return IOU_COMPLETE; 1552 - } 1553 - 1554 - int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) 1555 - { 1556 - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 1557 - struct io_async_msghdr *kmsg = req->async_data; 1558 - struct socket *sock; 1559 - unsigned flags; 1560 - int ret, min_ret = 0; 1561 - 1562 - if (req->flags & REQ_F_IMPORT_BUFFER) { 1563 - unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; 1564 - int ret; 1565 - 1566 - sr->notif->buf_index = req->buf_index; 1567 - ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, 1568 - sr->notif, &kmsg->vec, uvec_segs, 1569 - 
issue_flags); 1570 - if (unlikely(ret)) 1571 - return ret; 1572 - req->flags &= ~REQ_F_IMPORT_BUFFER; 1573 - } 1574 - 1575 - sock = sock_from_file(req->file); 1576 - if (unlikely(!sock)) 1577 - return -ENOTSOCK; 1578 - if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) 1579 - return -EOPNOTSUPP; 1580 - 1581 1474 if (!(req->flags & REQ_F_POLLED) && 1582 1475 (sr->flags & IORING_RECVSEND_POLL_FIRST)) 1583 1476 return -EAGAIN; 1584 1477 1585 - flags = sr->msg_flags; 1478 + if (req->flags & REQ_F_IMPORT_BUFFER) { 1479 + ret = io_send_zc_import(req, kmsg, issue_flags); 1480 + if (unlikely(ret)) 1481 + return ret; 1482 + } 1483 + 1484 + msg_flags = sr->msg_flags; 1586 1485 if (issue_flags & IO_URING_F_NONBLOCK) 1587 - flags |= MSG_DONTWAIT; 1588 - if (flags & MSG_WAITALL) 1486 + msg_flags |= MSG_DONTWAIT; 1487 + if (msg_flags & MSG_WAITALL) 1589 1488 min_ret = iov_iter_count(&kmsg->msg.msg_iter); 1590 1489 1591 - kmsg->msg.msg_control_user = sr->msg_control; 1592 1490 kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; 1593 - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); 1491 + 1492 + if (req->opcode == IORING_OP_SEND_ZC) { 1493 + msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; 1494 + kmsg->msg.msg_flags = msg_flags; 1495 + ret = sock_sendmsg(sock, &kmsg->msg); 1496 + } else { 1497 + kmsg->msg.msg_control_user = sr->msg_control; 1498 + ret = __sys_sendmsg_sock(sock, &kmsg->msg, msg_flags); 1499 + } 1594 1500 1595 1501 if (unlikely(ret < min_ret)) { 1596 1502 if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) 1597 1503 return -EAGAIN; 1598 1504 1599 - if (ret > 0 && io_net_retry(sock, flags)) { 1505 + if (ret > 0 && io_net_retry(sock, sr->msg_flags)) { 1600 1506 sr->done_io += ret; 1601 1507 return -EAGAIN; 1602 1508 }
-1
io_uring/net.h
··· 50 50 int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 51 51 int io_connect(struct io_kiocb *req, unsigned int issue_flags); 52 52 53 - int io_send_zc(struct io_kiocb *req, unsigned int issue_flags); 54 53 int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); 55 54 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); 56 55 void io_send_zc_cleanup(struct io_kiocb *req);
+1 -11
io_uring/opdef.c
··· 67 67 .audit_skip = 1, 68 68 .ioprio = 1, 69 69 .iopoll = 1, 70 - .iopoll_queue = 1, 71 70 .vectored = 1, 72 71 .async_size = sizeof(struct io_async_rw), 73 72 .prep = io_prep_readv, ··· 81 82 .audit_skip = 1, 82 83 .ioprio = 1, 83 84 .iopoll = 1, 84 - .iopoll_queue = 1, 85 85 .vectored = 1, 86 86 .async_size = sizeof(struct io_async_rw), 87 87 .prep = io_prep_writev, ··· 100 102 .audit_skip = 1, 101 103 .ioprio = 1, 102 104 .iopoll = 1, 103 - .iopoll_queue = 1, 104 105 .async_size = sizeof(struct io_async_rw), 105 106 .prep = io_prep_read_fixed, 106 107 .issue = io_read_fixed, ··· 113 116 .audit_skip = 1, 114 117 .ioprio = 1, 115 118 .iopoll = 1, 116 - .iopoll_queue = 1, 117 119 .async_size = sizeof(struct io_async_rw), 118 120 .prep = io_prep_write_fixed, 119 121 .issue = io_write_fixed, ··· 246 250 .audit_skip = 1, 247 251 .ioprio = 1, 248 252 .iopoll = 1, 249 - .iopoll_queue = 1, 250 253 .async_size = sizeof(struct io_async_rw), 251 254 .prep = io_prep_read, 252 255 .issue = io_read, ··· 259 264 .audit_skip = 1, 260 265 .ioprio = 1, 261 266 .iopoll = 1, 262 - .iopoll_queue = 1, 263 267 .async_size = sizeof(struct io_async_rw), 264 268 .prep = io_prep_write, 265 269 .issue = io_write, ··· 417 423 .needs_file = 1, 418 424 .plug = 1, 419 425 .iopoll = 1, 420 - .iopoll_queue = 1, 421 426 .async_size = sizeof(struct io_async_cmd), 422 427 .prep = io_uring_cmd_prep, 423 428 .issue = io_uring_cmd, ··· 430 437 #if defined(CONFIG_NET) 431 438 .async_size = sizeof(struct io_async_msghdr), 432 439 .prep = io_send_zc_prep, 433 - .issue = io_send_zc, 440 + .issue = io_sendmsg_zc, 434 441 #else 435 442 .prep = io_eopnotsupp_prep, 436 443 #endif ··· 549 556 .audit_skip = 1, 550 557 .ioprio = 1, 551 558 .iopoll = 1, 552 - .iopoll_queue = 1, 553 559 .vectored = 1, 554 560 .async_size = sizeof(struct io_async_rw), 555 561 .prep = io_prep_readv_fixed, ··· 563 571 .audit_skip = 1, 564 572 .ioprio = 1, 565 573 .iopoll = 1, 566 - .iopoll_queue = 1, 567 574 .vectored = 1, 568 575 
.async_size = sizeof(struct io_async_rw), 569 576 .prep = io_prep_writev_fixed, ··· 584 593 .needs_file = 1, 585 594 .plug = 1, 586 595 .iopoll = 1, 587 - .iopoll_queue = 1, 588 596 .is_128 = 1, 589 597 .async_size = sizeof(struct io_async_cmd), 590 598 .prep = io_uring_cmd_prep,
-2
io_uring/opdef.h
··· 25 25 unsigned poll_exclusive : 1; 26 26 /* skip auditing */ 27 27 unsigned audit_skip : 1; 28 - /* have to be put into the iopoll list */ 29 - unsigned iopoll_queue : 1; 30 28 /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ 31 29 unsigned vectored : 1; 32 30 /* set to 1 if this opcode uses 128b sqes in a mixed sq */
+5 -3
io_uring/poll.c
··· 277 277 278 278 /* the mask was stashed in __io_poll_execute */ 279 279 if (!req->cqe.res) { 280 - struct poll_table_struct pt = { ._key = req->apoll_events }; 281 - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; 280 + __poll_t events = req->apoll_events; 281 + struct poll_table_struct pt = { ._key = events }; 282 + 283 + req->cqe.res = vfs_poll(req->file, &pt) & events; 282 284 /* 283 285 * We got woken with a mask, but someone else got to 284 286 * it first. The above vfs_poll() doesn't add us back ··· 289 287 */ 290 288 if (unlikely(!req->cqe.res)) { 291 289 /* Multishot armed need not reissue */ 292 - if (!(req->apoll_events & EPOLLONESHOT)) 290 + if (!(events & EPOLLONESHOT)) 293 291 continue; 294 292 return IOU_POLL_REISSUE; 295 293 }
+2 -2
io_uring/query.c
··· 34 34 { 35 35 struct io_uring_query_zcrx *e = &data->zcrx; 36 36 37 - e->register_flags = ZCRX_REG_IMPORT; 37 + e->register_flags = ZCRX_SUPPORTED_REG_FLAGS; 38 38 e->area_flags = IORING_ZCRX_AREA_DMABUF; 39 39 e->nr_ctrl_opcodes = __ZCRX_CTRL_LAST; 40 40 e->rq_hdr_size = sizeof(struct io_uring); 41 41 e->rq_hdr_alignment = L1_CACHE_BYTES; 42 - e->features = ZCRX_FEATURE_RX_PAGE_SIZE; 42 + e->features = ZCRX_FEATURES; 43 43 e->__resv2 = 0; 44 44 return sizeof(*e); 45 45 }
+8 -41
io_uring/register.c
··· 192 192 return ret; 193 193 } 194 194 if (ctx->restrictions.op_registered) 195 - ctx->op_restricted = 1; 195 + ctx->int_flags |= IO_RING_F_OP_RESTRICTED; 196 196 if (ctx->restrictions.reg_registered) 197 - ctx->reg_restricted = 1; 197 + ctx->int_flags |= IO_RING_F_REG_RESTRICTED; 198 198 return 0; 199 199 } 200 200 ··· 392 392 for (i = 0; i < ARRAY_SIZE(new_count); i++) 393 393 if (new_count[i]) 394 394 ctx->iowq_limits[i] = new_count[i]; 395 - ctx->iowq_limits_set = true; 395 + ctx->int_flags |= IO_RING_F_IOWQ_LIMITS_SET; 396 396 397 397 if (tctx && tctx->io_wq) { 398 398 ret = io_wq_max_workers(tctx->io_wq, new_count); ··· 733 733 if (ctx->submitter_task && ctx->submitter_task != current) 734 734 return -EEXIST; 735 735 736 - if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { 736 + if ((ctx->int_flags & IO_RING_F_REG_RESTRICTED) && !(ctx->flags & IORING_SETUP_R_DISABLED)) { 737 737 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 738 738 if (!test_bit(opcode, ctx->restrictions.register_op)) 739 739 return -EACCES; ··· 908 908 ret = -EINVAL; 909 909 if (!arg || nr_args != 1) 910 910 break; 911 - ret = io_register_zcrx_ifq(ctx, arg); 911 + ret = io_register_zcrx(ctx, arg); 912 912 break; 913 913 case IORING_REGISTER_RESIZE_RINGS: 914 914 ret = -EINVAL; ··· 944 944 } 945 945 946 946 return ret; 947 - } 948 - 949 - /* 950 - * Given an 'fd' value, return the ctx associated with if. If 'registered' is 951 - * true, then the registered index is used. Otherwise, the normal fd table. 952 - * Caller must call fput() on the returned file, unless it's an ERR_PTR. 953 - */ 954 - struct file *io_uring_register_get_file(unsigned int fd, bool registered) 955 - { 956 - struct file *file; 957 - 958 - if (registered) { 959 - /* 960 - * Ring fd has been registered via IORING_REGISTER_RING_FDS, we 961 - * need only dereference our task private array to find it. 
962 - */ 963 - struct io_uring_task *tctx = current->io_uring; 964 - 965 - if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) 966 - return ERR_PTR(-EINVAL); 967 - fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); 968 - file = tctx->registered_rings[fd]; 969 - if (file) 970 - get_file(file); 971 - } else { 972 - file = fget(fd); 973 - } 974 - 975 - if (unlikely(!file)) 976 - return ERR_PTR(-EBADF); 977 - if (io_is_uring_fops(file)) 978 - return file; 979 - fput(file); 980 - return ERR_PTR(-EOPNOTSUPP); 981 947 } 982 948 983 949 static int io_uring_register_send_msg_ring(void __user *arg, unsigned int nr_args) ··· 1000 1034 if (fd == -1) 1001 1035 return io_uring_register_blind(opcode, arg, nr_args); 1002 1036 1003 - file = io_uring_register_get_file(fd, use_registered_ring); 1037 + file = io_uring_ctx_get_file(fd, use_registered_ring); 1004 1038 if (IS_ERR(file)) 1005 1039 return PTR_ERR(file); 1006 1040 ctx = file->private_data; ··· 1012 1046 ctx->buf_table.nr, ret); 1013 1047 mutex_unlock(&ctx->uring_lock); 1014 1048 1015 - fput(file); 1049 + if (!use_registered_ring) 1050 + fput(file); 1016 1051 return ret; 1017 1052 }
-1
io_uring/register.h
··· 4 4 5 5 int io_eventfd_unregister(struct io_ring_ctx *ctx); 6 6 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); 7 - struct file *io_uring_register_get_file(unsigned int fd, bool registered); 8 7 9 8 #endif
+8 -7
io_uring/rsrc.c
··· 295 295 u64 tag = 0; 296 296 297 297 uvec = u64_to_user_ptr(user_data); 298 - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 298 + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); 299 299 if (IS_ERR(iov)) { 300 300 err = PTR_ERR(iov); 301 301 break; ··· 319 319 i = array_index_nospec(up->offset + done, ctx->buf_table.nr); 320 320 io_reset_rsrc_node(ctx, &ctx->buf_table, i); 321 321 ctx->buf_table.nodes[i] = node; 322 - if (ctx->compat) 322 + if (io_is_compat(ctx)) 323 323 user_data += sizeof(struct compat_iovec); 324 324 else 325 325 user_data += sizeof(struct iovec); ··· 883 883 884 884 if (arg) { 885 885 uvec = (struct iovec __user *) arg; 886 - iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); 886 + iov = iovec_from_user(uvec, 1, 1, &fast_iov, io_is_compat(ctx)); 887 887 if (IS_ERR(iov)) { 888 888 ret = PTR_ERR(iov); 889 889 break; 890 890 } 891 - if (ctx->compat) 891 + if (io_is_compat(ctx)) 892 892 arg += sizeof(struct compat_iovec); 893 893 else 894 894 arg += sizeof(struct iovec); ··· 961 961 */ 962 962 imu = io_alloc_imu(ctx, blk_rq_nr_phys_segments(rq)); 963 963 if (!imu) { 964 - kfree(node); 964 + io_cache_free(&ctx->node_cache, node); 965 965 ret = -ENOMEM; 966 966 goto unlock; 967 967 } ··· 1273 1273 return -EINVAL; 1274 1274 1275 1275 registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0; 1276 - file = io_uring_register_get_file(buf.src_fd, registered_src); 1276 + file = io_uring_ctx_get_file(buf.src_fd, registered_src); 1277 1277 if (IS_ERR(file)) 1278 1278 return PTR_ERR(file); 1279 1279 ··· 1295 1295 if (src_ctx != ctx) 1296 1296 mutex_unlock(&src_ctx->uring_lock); 1297 1297 1298 - fput(file); 1298 + if (!registered_src) 1299 + fput(file); 1299 1300 return ret; 1300 1301 } 1301 1302
+12 -12
io_uring/rw.c
··· 504 504 if (!S_ISBLK(mode) && !S_ISREG(mode)) 505 505 return false; 506 506 if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 507 - !(ctx->flags & IORING_SETUP_IOPOLL))) 507 + !(req->flags & REQ_F_IOPOLL))) 508 508 return false; 509 509 /* 510 510 * If ref is dying, we might be running poll reap from the exit work. ··· 640 640 } 641 641 } 642 642 643 - if (req->ctx->flags & IORING_SETUP_IOPOLL) 643 + if (req->flags & REQ_F_IOPOLL) 644 644 io_complete_rw_iopoll(&rw->kiocb, ret); 645 645 else 646 646 io_complete_rw(&rw->kiocb, ret); ··· 654 654 655 655 if (ret >= 0 && req->flags & REQ_F_CUR_POS) 656 656 req->file->f_pos = rw->kiocb.ki_pos; 657 - if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { 657 + if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) { 658 658 u32 cflags = 0; 659 659 660 660 __io_complete_rw_common(req, ret); ··· 876 876 if (ctx->flags & IORING_SETUP_IOPOLL) { 877 877 if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) 878 878 return -EOPNOTSUPP; 879 + req->flags |= REQ_F_IOPOLL; 879 880 kiocb->private = NULL; 880 881 kiocb->ki_flags |= IOCB_HIPRI; 881 882 req->iopoll_completed = 0; ··· 900 899 * We have a union of meta fields with wpq used for buffered-io 901 900 * in io_async_rw, so fail it here. 902 901 */ 903 - if (!(req->file->f_flags & O_DIRECT)) 902 + if (!(file->f_flags & O_DIRECT)) 904 903 return -EOPNOTSUPP; 905 904 kiocb->ki_flags |= IOCB_HAS_METADATA; 906 905 kiocb->private = &io->meta; ··· 962 961 if (ret == -EAGAIN) { 963 962 /* If we can poll, just do that. 
*/ 964 963 if (io_file_can_poll(req)) 965 - return -EAGAIN; 964 + return ret; 966 965 /* IOPOLL retry should happen for io-wq threads */ 967 - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) 968 - goto done; 966 + if (!force_nonblock && !(req->flags & REQ_F_IOPOLL)) 967 + return ret; 969 968 /* no retry on NONBLOCK nor RWF_NOWAIT */ 970 969 if (req->flags & REQ_F_NOWAIT) 971 - goto done; 970 + return ret; 972 971 ret = 0; 973 972 } else if (ret == -EIOCBQUEUED) { 974 973 return IOU_ISSUE_SKIP_COMPLETE; ··· 976 975 (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || 977 976 (issue_flags & IO_URING_F_MULTISHOT)) { 978 977 /* read all, failed, already did sync or don't want to retry */ 979 - goto done; 978 + return ret; 980 979 } 981 980 982 981 /* ··· 1019 1018 kiocb->ki_flags &= ~IOCB_WAITQ; 1020 1019 iov_iter_restore(&io->iter, &io->iter_state); 1021 1020 } while (ret > 0); 1022 - done: 1023 - /* it's faster to check here than delegate to kfree */ 1021 + 1024 1022 return ret; 1025 1023 } 1026 1024 ··· 1188 1188 goto done; 1189 1189 if (!force_nonblock || ret2 != -EAGAIN) { 1190 1190 /* IOPOLL retry should happen for io-wq threads */ 1191 - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) 1191 + if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL)) 1192 1192 goto ret_eagain; 1193 1193 1194 1194 if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
+7 -1
io_uring/sqpoll.c
··· 458 458 return -EINVAL; 459 459 } 460 460 if (ctx->flags & IORING_SETUP_SQPOLL) { 461 + struct io_uring_task *tctx; 461 462 struct task_struct *tsk; 462 463 struct io_sq_data *sqd; 463 464 bool attached; ··· 525 524 rcu_assign_pointer(sqd->thread, tsk); 526 525 mutex_unlock(&sqd->lock); 527 526 527 + ret = 0; 528 528 get_task_struct(tsk); 529 - ret = io_uring_alloc_task_context(tsk, ctx); 529 + tctx = io_uring_alloc_task_context(tsk, ctx); 530 + if (!IS_ERR(tctx)) 531 + tsk->io_uring = tctx; 532 + else 533 + ret = PTR_ERR(tctx); 530 534 wake_up_new_task(tsk); 531 535 if (ret) 532 536 goto err;
+49 -30
io_uring/tctx.c
··· 74 74 } 75 75 } 76 76 77 - __cold int io_uring_alloc_task_context(struct task_struct *task, 78 - struct io_ring_ctx *ctx) 77 + __cold struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, 78 + struct io_ring_ctx *ctx) 79 79 { 80 80 struct io_uring_task *tctx; 81 81 int ret; 82 82 83 83 tctx = kzalloc_obj(*tctx); 84 84 if (unlikely(!tctx)) 85 - return -ENOMEM; 85 + return ERR_PTR(-ENOMEM); 86 86 87 87 ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); 88 88 if (unlikely(ret)) { 89 89 kfree(tctx); 90 - return ret; 90 + return ERR_PTR(ret); 91 91 } 92 92 93 93 tctx->io_wq = io_init_wq_offload(ctx, task); ··· 95 95 ret = PTR_ERR(tctx->io_wq); 96 96 percpu_counter_destroy(&tctx->inflight); 97 97 kfree(tctx); 98 - return ret; 98 + return ERR_PTR(ret); 99 99 } 100 100 101 101 tctx->task = task; ··· 103 103 init_waitqueue_head(&tctx->wait); 104 104 atomic_set(&tctx->in_cancel, 0); 105 105 atomic_set(&tctx->inflight_tracked, 0); 106 - task->io_uring = tctx; 107 106 init_llist_head(&tctx->task_list); 108 107 init_task_work(&tctx->task_work, tctx_task_work); 108 + return tctx; 109 + } 110 + 111 + static int io_tctx_install_node(struct io_ring_ctx *ctx, 112 + struct io_uring_task *tctx) 113 + { 114 + struct io_tctx_node *node; 115 + int ret; 116 + 117 + if (xa_load(&tctx->xa, (unsigned long)ctx)) 118 + return 0; 119 + 120 + node = kmalloc_obj(*node); 121 + if (!node) 122 + return -ENOMEM; 123 + node->ctx = ctx; 124 + node->task = current; 125 + 126 + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 127 + node, GFP_KERNEL)); 128 + if (ret) { 129 + kfree(node); 130 + return ret; 131 + } 132 + 133 + mutex_lock(&ctx->tctx_lock); 134 + list_add(&node->ctx_node, &ctx->tctx_list); 135 + mutex_unlock(&ctx->tctx_lock); 109 136 return 0; 110 137 } 111 138 112 139 int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) 113 140 { 114 141 struct io_uring_task *tctx = current->io_uring; 115 - struct io_tctx_node *node; 116 142 int ret; 117 143 118 144 if 
(unlikely(!tctx)) { 119 - ret = io_uring_alloc_task_context(current, ctx); 120 - if (unlikely(ret)) 121 - return ret; 145 + tctx = io_uring_alloc_task_context(current, ctx); 146 + if (IS_ERR(tctx)) 147 + return PTR_ERR(tctx); 122 148 123 - tctx = current->io_uring; 124 - if (ctx->iowq_limits_set) { 149 + if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) { 125 150 unsigned int limits[2] = { ctx->iowq_limits[0], 126 151 ctx->iowq_limits[1], }; 127 152 128 153 ret = io_wq_max_workers(tctx->io_wq, limits); 129 154 if (ret) 130 - return ret; 155 + goto err_free; 131 156 } 132 157 } 133 158 ··· 163 138 */ 164 139 if (tctx->io_wq) 165 140 io_wq_set_exit_on_idle(tctx->io_wq, false); 166 - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 167 - node = kmalloc_obj(*node); 168 - if (!node) 169 - return -ENOMEM; 170 - node->ctx = ctx; 171 - node->task = current; 172 141 173 - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, 174 - node, GFP_KERNEL)); 175 - if (ret) { 176 - kfree(node); 177 - return ret; 178 - } 179 - 180 - mutex_lock(&ctx->tctx_lock); 181 - list_add(&node->ctx_node, &ctx->tctx_list); 182 - mutex_unlock(&ctx->tctx_lock); 142 + ret = io_tctx_install_node(ctx, tctx); 143 + if (!ret) { 144 + current->io_uring = tctx; 145 + return 0; 183 146 } 184 - return 0; 147 + if (!current->io_uring) { 148 + err_free: 149 + io_wq_put_and_exit(tctx->io_wq); 150 + percpu_counter_destroy(&tctx->inflight); 151 + kfree(tctx); 152 + } 153 + return ret; 185 154 } 186 155 187 156 int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx)
+2 -2
io_uring/tctx.h
··· 6 6 struct io_ring_ctx *ctx; 7 7 }; 8 8 9 - int io_uring_alloc_task_context(struct task_struct *task, 10 - struct io_ring_ctx *ctx); 9 + struct io_uring_task *io_uring_alloc_task_context(struct task_struct *task, 10 + struct io_ring_ctx *ctx); 11 11 void io_uring_del_tctx_node(unsigned long index); 12 12 int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); 13 13 int __io_uring_add_tctx_node_from_submit(struct io_ring_ctx *ctx);
+51 -27
io_uring/timeout.c
··· 30 30 u64 addr; 31 31 32 32 /* timeout update */ 33 - struct timespec64 ts; 33 + ktime_t time; 34 34 u32 flags; 35 35 bool ltimeout; 36 36 }; 37 + 38 + static int io_parse_user_time(ktime_t *time, u64 arg, unsigned flags) 39 + { 40 + struct timespec64 ts; 41 + 42 + if (flags & IORING_TIMEOUT_IMMEDIATE_ARG) { 43 + *time = ns_to_ktime(arg); 44 + if (*time < 0) 45 + return -EINVAL; 46 + return 0; 47 + } 48 + 49 + if (get_timespec64(&ts, u64_to_user_ptr(arg))) 50 + return -EFAULT; 51 + if (ts.tv_sec < 0 || ts.tv_nsec < 0) 52 + return -EINVAL; 53 + *time = timespec64_to_ktime(ts); 54 + return 0; 55 + } 37 56 38 57 static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, 39 58 struct io_kiocb *link); ··· 99 80 /* re-arm timer */ 100 81 raw_spin_lock_irq(&ctx->timeout_lock); 101 82 list_add(&timeout->list, ctx->timeout_list.prev); 102 - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 83 + hrtimer_start(&data->timer, data->time, data->mode); 103 84 raw_spin_unlock_irq(&ctx->timeout_lock); 104 85 return; 105 86 } ··· 284 265 285 266 raw_spin_lock_irqsave(&ctx->timeout_lock, flags); 286 267 list_del_init(&timeout->list); 287 - atomic_set(&req->ctx->cq_timeouts, 288 - atomic_read(&req->ctx->cq_timeouts) + 1); 268 + atomic_set(&ctx->cq_timeouts, 269 + atomic_read(&ctx->cq_timeouts) + 1); 289 270 raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); 290 271 291 272 if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) ··· 414 395 } 415 396 416 397 static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 417 - struct timespec64 *ts, enum hrtimer_mode mode) 398 + ktime_t ts, enum hrtimer_mode mode) 418 399 __must_hold(&ctx->timeout_lock) 419 400 { 420 401 struct io_timeout_data *io; ··· 436 417 if (hrtimer_try_to_cancel(&io->timer) == -1) 437 418 return -EALREADY; 438 419 hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); 439 - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); 420 
+ hrtimer_start(&io->timer, ts, mode); 440 421 return 0; 441 422 } 442 423 443 424 static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, 444 - struct timespec64 *ts, enum hrtimer_mode mode) 425 + ktime_t time, enum hrtimer_mode mode) 445 426 __must_hold(&ctx->timeout_lock) 446 427 { 447 428 struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; ··· 454 435 455 436 timeout->off = 0; /* noseq */ 456 437 data = req->async_data; 457 - data->ts = *ts; 438 + data->time = time; 458 439 459 440 list_add_tail(&timeout->list, &ctx->timeout_list); 460 441 hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); 461 - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); 442 + hrtimer_start(&data->timer, data->time, mode); 462 443 return 0; 463 444 } 464 445 465 446 int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) 466 447 { 467 448 struct io_timeout_rem *tr = io_kiocb_to_cmd(req, struct io_timeout_rem); 449 + int ret; 468 450 469 451 if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) 452 + return -EINVAL; 453 + if (sqe->addr3 || sqe->__pad2[0]) 470 454 return -EINVAL; 471 455 if (sqe->buf_index || sqe->len || sqe->splice_fd_in) 472 456 return -EINVAL; ··· 482 460 return -EINVAL; 483 461 if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) 484 462 tr->ltimeout = true; 485 - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) 463 + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK | 464 + IORING_TIMEOUT_ABS | 465 + IORING_TIMEOUT_IMMEDIATE_ARG)) 486 466 return -EINVAL; 487 - if (get_timespec64(&tr->ts, u64_to_user_ptr(READ_ONCE(sqe->addr2)))) 488 - return -EFAULT; 489 - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) 490 - return -EINVAL; 467 + ret = io_parse_user_time(&tr->time, READ_ONCE(sqe->addr2), tr->flags); 468 + if (ret) 469 + return ret; 491 470 } else if (tr->flags) { 492 471 /* timeout removal doesn't support flags */ 493 472 return -EINVAL; ··· 523 500 524 501 
raw_spin_lock_irq(&ctx->timeout_lock); 525 502 if (tr->ltimeout) 526 - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); 503 + ret = io_linked_timeout_update(ctx, tr->addr, tr->time, mode); 527 504 else 528 - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); 505 + ret = io_timeout_update(ctx, tr->addr, tr->time, mode); 529 506 raw_spin_unlock_irq(&ctx->timeout_lock); 530 507 } 531 508 ··· 543 520 struct io_timeout_data *data; 544 521 unsigned flags; 545 522 u32 off = READ_ONCE(sqe->off); 523 + int ret; 546 524 525 + if (sqe->addr3 || sqe->__pad2[0]) 526 + return -EINVAL; 547 527 if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) 548 528 return -EINVAL; 549 529 if (off && is_timeout_link) ··· 554 528 flags = READ_ONCE(sqe->timeout_flags); 555 529 if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | 556 530 IORING_TIMEOUT_ETIME_SUCCESS | 557 - IORING_TIMEOUT_MULTISHOT)) 531 + IORING_TIMEOUT_MULTISHOT | 532 + IORING_TIMEOUT_IMMEDIATE_ARG)) 558 533 return -EINVAL; 559 534 /* more than one clock specified is invalid, obviously */ 560 535 if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) ··· 566 539 567 540 INIT_LIST_HEAD(&timeout->list); 568 541 timeout->off = off; 569 - if (unlikely(off && !req->ctx->off_timeout_used)) 570 - req->ctx->off_timeout_used = true; 542 + if (unlikely(off && !(req->ctx->int_flags & IO_RING_F_OFF_TIMEOUT_USED))) 543 + req->ctx->int_flags |= IO_RING_F_OFF_TIMEOUT_USED; 571 544 /* 572 545 * for multishot reqs w/ fixed nr of repeats, repeats tracks the 573 546 * remaining nr ··· 584 557 data->req = req; 585 558 data->flags = flags; 586 559 587 - if (get_timespec64(&data->ts, u64_to_user_ptr(READ_ONCE(sqe->addr)))) 588 - return -EFAULT; 589 - 590 - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) 591 - return -EINVAL; 560 + ret = io_parse_user_time(&data->time, READ_ONCE(sqe->addr), flags); 561 + if (ret) 562 + return ret; 592 563 593 564 data->mode = io_translate_timeout_mode(flags); 594 565 ··· 662 637 } 663 
638 add: 664 639 list_add(&timeout->list, entry); 665 - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); 640 + hrtimer_start(&data->timer, data->time, data->mode); 666 641 raw_spin_unlock_irq(&ctx->timeout_lock); 667 642 return IOU_ISSUE_SKIP_COMPLETE; 668 643 } ··· 680 655 if (timeout->head) { 681 656 struct io_timeout_data *data = req->async_data; 682 657 683 - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), 684 - data->mode); 658 + hrtimer_start(&data->timer, data->time, data->mode); 685 659 list_add_tail(&timeout->list, &ctx->ltimeout_list); 686 660 } 687 661 raw_spin_unlock_irq(&ctx->timeout_lock);
+1 -1
io_uring/timeout.h
··· 3 3 struct io_timeout_data { 4 4 struct io_kiocb *req; 5 5 struct hrtimer timer; 6 - struct timespec64 ts; 6 + ktime_t time; 7 7 enum hrtimer_mode mode; 8 8 u32 flags; 9 9 };
+1 -1
io_uring/tw.c
··· 222 222 223 223 if (!head) { 224 224 io_ctx_mark_taskrun(ctx); 225 - if (ctx->has_evfd) 225 + if (data_race(ctx->int_flags) & IO_RING_F_HAS_EVFD) 226 226 io_eventfd_signal(ctx, false); 227 227 } 228 228
+4 -5
io_uring/uring_cmd.c
··· 110 110 * because iopoll completion data overlaps with the hash_node used 111 111 * for tracking. 112 112 */ 113 - if (ctx->flags & IORING_SETUP_IOPOLL) 113 + if (req->flags & REQ_F_IOPOLL) 114 114 return; 115 115 116 116 if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { ··· 167 167 io_req_set_cqe32_extra(req, res2, 0); 168 168 } 169 169 io_req_uring_cleanup(req, issue_flags); 170 - if (req->ctx->flags & IORING_SETUP_IOPOLL) { 170 + if (req->flags & REQ_F_IOPOLL) { 171 171 /* order with io_iopoll_req_issued() checking ->iopoll_complete */ 172 172 smp_store_release(&req->iopoll_completed, 1); 173 173 } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { ··· 257 257 issue_flags |= IO_URING_F_CQE32; 258 258 if (io_is_compat(ctx)) 259 259 issue_flags |= IO_URING_F_COMPAT; 260 - if (ctx->flags & IORING_SETUP_IOPOLL) { 261 - if (!file->f_op->uring_cmd_iopoll) 262 - return -EOPNOTSUPP; 260 + if (ctx->flags & IORING_SETUP_IOPOLL && file->f_op->uring_cmd_iopoll) { 261 + req->flags |= REQ_F_IOPOLL; 263 262 issue_flags |= IO_URING_F_IOPOLL; 264 263 req->iopoll_completed = 0; 265 264 if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+1
io_uring/wait.h
··· 25 25 struct ext_arg *ext_arg); 26 26 int io_run_task_work_sig(struct io_ring_ctx *ctx); 27 27 void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); 28 + void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx); 28 29 29 30 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 30 31 {
+240 -152
io_uring/zcrx.c
··· 63 63 unsigned i; 64 64 65 65 for_each_sgtable_dma_sg(sgt, sg, i) 66 - shift = min(shift, __ffs(sg->length)); 66 + shift = min(shift, __ffs(sg_dma_len(sg))); 67 67 return shift; 68 68 } 69 69 ··· 127 127 int dmabuf_fd = area_reg->dmabuf_fd; 128 128 int i, ret; 129 129 130 + if (!ifq->dev) 131 + return -EINVAL; 130 132 if (off) 131 133 return -EINVAL; 132 - if (WARN_ON_ONCE(!ifq->dev)) 133 - return -EFAULT; 134 134 if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) 135 135 return -EINVAL; 136 136 ··· 194 194 { 195 195 struct page **pages; 196 196 int nr_pages, ret; 197 + bool mapped = false; 197 198 198 199 if (area_reg->dmabuf_fd) 199 200 return -EINVAL; ··· 208 207 ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, 209 208 0, (unsigned long)nr_pages << PAGE_SHIFT, 210 209 GFP_KERNEL_ACCOUNT); 211 - if (ret) { 212 - unpin_user_pages(pages, nr_pages); 213 - kvfree(pages); 214 - return ret; 210 + if (ret) 211 + goto out_err; 212 + 213 + if (ifq->dev) { 214 + ret = dma_map_sgtable(ifq->dev, &mem->page_sg_table, 215 + DMA_FROM_DEVICE, IO_DMA_ATTR); 216 + if (ret < 0) 217 + goto out_err; 218 + mapped = true; 215 219 } 216 220 217 221 mem->account_pages = io_count_account_pages(pages, nr_pages); 218 222 ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages); 219 - if (ret < 0) 223 + if (ret < 0) { 220 224 mem->account_pages = 0; 225 + goto out_err; 226 + } 221 227 222 228 mem->sgt = &mem->page_sg_table; 223 229 mem->pages = pages; 224 230 mem->nr_folios = nr_pages; 225 231 mem->size = area_reg->len; 232 + return ret; 233 + out_err: 234 + if (mapped) 235 + dma_unmap_sgtable(ifq->dev, &mem->page_sg_table, 236 + DMA_FROM_DEVICE, IO_DMA_ATTR); 237 + sg_free_table(&mem->page_sg_table); 238 + unpin_user_pages(pages, nr_pages); 239 + kvfree(pages); 226 240 return ret; 227 241 } 228 242 ··· 289 273 return; 290 274 area->is_mapped = false; 291 275 292 - for (i = 0; i < area->nia.num_niovs; i++) 293 - net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); 
276 + if (area->nia.niovs) { 277 + for (i = 0; i < area->nia.num_niovs; i++) 278 + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); 279 + } 294 280 295 281 if (area->mem.is_dmabuf) { 296 282 io_release_dmabuf(&area->mem); ··· 302 284 } 303 285 } 304 286 305 - static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) 306 - { 307 - int ret; 308 - 309 - guard(mutex)(&ifq->pp_lock); 310 - if (area->is_mapped) 311 - return 0; 312 - 313 - if (!area->mem.is_dmabuf) { 314 - ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, 315 - DMA_FROM_DEVICE, IO_DMA_ATTR); 316 - if (ret < 0) 317 - return ret; 318 - } 319 - 320 - ret = io_populate_area_dma(ifq, area); 321 - if (ret && !area->mem.is_dmabuf) 322 - dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, 323 - DMA_FROM_DEVICE, IO_DMA_ATTR); 324 - if (ret == 0) 325 - area->is_mapped = true; 326 - return ret; 327 - } 328 - 329 - static void io_zcrx_sync_for_device(struct page_pool *pool, 330 - struct net_iov *niov) 287 + static void zcrx_sync_for_device(struct page_pool *pp, struct io_zcrx_ifq *zcrx, 288 + netmem_ref *netmems, unsigned nr) 331 289 { 332 290 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) 291 + struct device *dev = pp->p.dev; 292 + unsigned i, niov_size; 333 293 dma_addr_t dma_addr; 334 294 335 - unsigned niov_size; 336 - 337 - if (!dma_dev_need_sync(pool->p.dev)) 295 + if (!dma_dev_need_sync(dev)) 338 296 return; 297 + niov_size = 1U << zcrx->niov_shift; 339 298 340 - niov_size = 1U << io_pp_to_ifq(pool)->niov_shift; 341 - dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); 342 - __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, 343 - niov_size, pool->p.dma_dir); 299 + for (i = 0; i < nr; i++) { 300 + dma_addr = page_pool_get_dma_addr_netmem(netmems[i]); 301 + __dma_sync_single_for_device(dev, dma_addr + pp->p.offset, 302 + niov_size, pp->p.dma_dir); 303 + } 344 304 #endif 345 305 } 346 306 ··· 386 390 return -EINVAL; 387 391 388 392 
mmap_offset = IORING_MAP_OFF_ZCRX_REGION; 389 - mmap_offset += id << IORING_OFF_PBUF_SHIFT; 393 + mmap_offset += (u64)id << IORING_OFF_ZCRX_SHIFT; 390 394 391 - ret = io_create_region(ctx, &ifq->region, rd, mmap_offset); 395 + ret = io_create_region(ctx, &ifq->rq_region, rd, mmap_offset); 392 396 if (ret < 0) 393 397 return ret; 394 398 395 - ptr = io_region_get_ptr(&ifq->region); 396 - ifq->rq_ring = (struct io_uring *)ptr; 397 - ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 399 + ptr = io_region_get_ptr(&ifq->rq_region); 400 + ifq->rq.ring = (struct io_uring *)ptr; 401 + ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off); 398 402 399 403 return 0; 400 404 } 401 405 402 406 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) 403 407 { 404 - io_free_region(ifq->user, &ifq->region); 405 - ifq->rq_ring = NULL; 406 - ifq->rqes = NULL; 408 + io_free_region(ifq->user, &ifq->rq_region); 409 + ifq->rq.ring = NULL; 410 + ifq->rq.rqes = NULL; 407 411 } 408 412 409 413 static void io_zcrx_free_area(struct io_zcrx_ifq *ifq, ··· 425 429 static int io_zcrx_append_area(struct io_zcrx_ifq *ifq, 426 430 struct io_zcrx_area *area) 427 431 { 428 - if (ifq->area) 432 + bool kern_readable = !area->mem.is_dmabuf; 433 + 434 + if (WARN_ON_ONCE(ifq->area)) 429 435 return -EINVAL; 436 + if (WARN_ON_ONCE(ifq->kern_readable != kern_readable)) 437 + return -EINVAL; 438 + 430 439 ifq->area = area; 431 440 return 0; 432 441 } ··· 451 450 return -EINVAL; 452 451 buf_size_shift = ilog2(reg->rx_buf_len); 453 452 } 453 + if (!ifq->dev && buf_size_shift != PAGE_SHIFT) 454 + return -EOPNOTSUPP; 454 455 455 456 ret = -ENOMEM; 456 457 area = kzalloc_obj(*area); ··· 463 460 ret = io_import_area(ifq, &area->mem, area_reg); 464 461 if (ret) 465 462 goto err; 463 + if (ifq->dev) 464 + area->is_mapped = true; 466 465 467 - if (buf_size_shift > io_area_max_shift(&area->mem)) { 466 + if (ifq->dev && buf_size_shift > io_area_max_shift(&area->mem)) { 468 467 ret = -ERANGE; 469 468 goto err; 470 
469 } ··· 500 495 niov->type = NET_IOV_IOURING; 501 496 } 502 497 498 + if (ifq->dev) { 499 + ret = io_populate_area_dma(ifq, area); 500 + if (ret) 501 + goto err; 502 + } 503 + 503 504 area->free_count = nr_iovs; 504 505 /* we're only supporting one area per ifq for now */ 505 506 area->area_id = 0; ··· 530 519 return NULL; 531 520 532 521 ifq->if_rxq = -1; 533 - spin_lock_init(&ifq->rq_lock); 522 + spin_lock_init(&ifq->rq.lock); 534 523 mutex_init(&ifq->pp_lock); 535 524 refcount_set(&ifq->refs, 1); 536 525 refcount_set(&ifq->user_refs, 1); ··· 597 586 { 598 587 struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); 599 588 600 - spin_lock_bh(&area->freelist_lock); 589 + guard(spinlock_bh)(&area->freelist_lock); 601 590 area->freelist[area->free_count++] = net_iov_idx(niov); 602 - spin_unlock_bh(&area->freelist_lock); 591 + } 592 + 593 + static struct net_iov *zcrx_get_free_niov(struct io_zcrx_area *area) 594 + { 595 + unsigned niov_idx; 596 + 597 + lockdep_assert_held(&area->freelist_lock); 598 + 599 + if (unlikely(!area->free_count)) 600 + return NULL; 601 + 602 + niov_idx = area->freelist[--area->free_count]; 603 + return &area->nia.niovs[niov_idx]; 603 604 } 604 605 605 606 static void io_zcrx_return_niov(struct net_iov *niov) ··· 647 624 } 648 625 } 649 626 650 - static void zcrx_unregister(struct io_zcrx_ifq *ifq) 627 + static void zcrx_unregister_user(struct io_zcrx_ifq *ifq) 651 628 { 652 629 if (refcount_dec_and_test(&ifq->user_refs)) { 653 630 io_close_queue(ifq); 654 631 io_zcrx_scrub(ifq); 655 632 } 633 + } 634 + 635 + static void zcrx_unregister(struct io_zcrx_ifq *ifq) 636 + { 637 + zcrx_unregister_user(ifq); 656 638 io_put_zcrx_ifq(ifq); 657 639 } 658 640 ··· 668 640 669 641 lockdep_assert_held(&ctx->mmap_lock); 670 642 671 - return ifq ? &ifq->region : NULL; 643 + return ifq ? 
&ifq->rq_region : NULL; 672 644 } 673 645 674 646 static int zcrx_box_release(struct inode *inode, struct file *file) ··· 779 751 return ret; 780 752 } 781 753 782 - int io_register_zcrx_ifq(struct io_ring_ctx *ctx, 783 - struct io_uring_zcrx_ifq_reg __user *arg) 754 + static int zcrx_register_netdev(struct io_zcrx_ifq *ifq, 755 + struct io_uring_zcrx_ifq_reg *reg, 756 + struct io_uring_zcrx_area_reg *area) 784 757 { 785 758 struct pp_memory_provider_params mp_param = {}; 759 + unsigned if_rxq = reg->if_rxq; 760 + int ret; 761 + 762 + ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, 763 + reg->if_idx); 764 + if (!ifq->netdev) 765 + return -ENODEV; 766 + 767 + netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); 768 + 769 + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, if_rxq); 770 + if (!ifq->dev) { 771 + ret = -EOPNOTSUPP; 772 + goto netdev_put_unlock; 773 + } 774 + get_device(ifq->dev); 775 + 776 + ret = io_zcrx_create_area(ifq, area, reg); 777 + if (ret) 778 + goto netdev_put_unlock; 779 + 780 + if (reg->rx_buf_len) 781 + mp_param.rx_page_size = 1U << ifq->niov_shift; 782 + mp_param.mp_ops = &io_uring_pp_zc_ops; 783 + mp_param.mp_priv = ifq; 784 + ret = __net_mp_open_rxq(ifq->netdev, if_rxq, &mp_param, NULL); 785 + if (ret) 786 + goto netdev_put_unlock; 787 + 788 + ifq->if_rxq = if_rxq; 789 + ret = 0; 790 + netdev_put_unlock: 791 + netdev_unlock(ifq->netdev); 792 + return ret; 793 + } 794 + 795 + int io_register_zcrx(struct io_ring_ctx *ctx, 796 + struct io_uring_zcrx_ifq_reg __user *arg) 797 + { 786 798 struct io_uring_zcrx_area_reg area; 787 799 struct io_uring_zcrx_ifq_reg reg; 788 800 struct io_uring_region_desc rd; ··· 846 778 return -EFAULT; 847 779 if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) || reg.zcrx_id) 848 780 return -EINVAL; 781 + if (reg.flags & ~ZCRX_SUPPORTED_REG_FLAGS) 782 + return -EINVAL; 849 783 if (reg.flags & ZCRX_REG_IMPORT) 850 784 return import_zcrx(ctx, arg, &reg); 851 785 if (copy_from_user(&rd, 
u64_to_user_ptr(reg.region_ptr), sizeof(rd))) 852 786 return -EFAULT; 853 - if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) 787 + if (reg.if_rxq == -1 || !reg.rq_entries) 788 + return -EINVAL; 789 + if ((reg.if_rxq || reg.if_idx) && (reg.flags & ZCRX_REG_NODEV)) 854 790 return -EINVAL; 855 791 if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { 856 792 if (!(ctx->flags & IORING_SETUP_CLAMP)) ··· 878 806 mmgrab(ctx->mm_account); 879 807 ifq->mm_account = ctx->mm_account; 880 808 } 881 - ifq->rq_entries = reg.rq_entries; 809 + ifq->rq.nr_entries = reg.rq_entries; 882 810 883 811 scoped_guard(mutex, &ctx->mmap_lock) { 884 812 /* preallocate id */ ··· 891 819 if (ret) 892 820 goto err; 893 821 894 - ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx); 895 - if (!ifq->netdev) { 896 - ret = -ENODEV; 897 - goto err; 822 + ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF); 823 + 824 + if (!(reg.flags & ZCRX_REG_NODEV)) { 825 + ret = zcrx_register_netdev(ifq, &reg, &area); 826 + if (ret) 827 + goto err; 828 + } else { 829 + ret = io_zcrx_create_area(ifq, &area, &reg); 830 + if (ret) 831 + goto err; 898 832 } 899 - netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); 900 - 901 - ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); 902 - if (!ifq->dev) { 903 - ret = -EOPNOTSUPP; 904 - goto netdev_put_unlock; 905 - } 906 - get_device(ifq->dev); 907 - 908 - ret = io_zcrx_create_area(ifq, &area, &reg); 909 - if (ret) 910 - goto netdev_put_unlock; 911 - 912 - if (reg.rx_buf_len) 913 - mp_param.rx_page_size = 1U << ifq->niov_shift; 914 - mp_param.mp_ops = &io_uring_pp_zc_ops; 915 - mp_param.mp_priv = ifq; 916 - ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); 917 - if (ret) 918 - goto netdev_put_unlock; 919 - netdev_unlock(ifq->netdev); 920 - ifq->if_rxq = reg.if_rxq; 921 833 922 834 reg.zcrx_id = id; 923 835 ··· 921 865 goto err; 922 866 } 923 867 return 0; 924 - netdev_put_unlock: 925 - netdev_unlock(ifq->netdev); 
926 868 err: 927 869 scoped_guard(mutex, &ctx->mmap_lock) 928 870 xa_erase(&ctx->zcrx_ctxs, id); ··· 929 875 return ret; 930 876 } 931 877 932 - static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) 878 + static inline bool is_zcrx_entry_marked(struct io_ring_ctx *ctx, unsigned long id) 933 879 { 934 - unsigned niov_idx; 935 - 936 - lockdep_assert_held(&area->freelist_lock); 937 - 938 - niov_idx = area->freelist[--area->free_count]; 939 - return &area->nia.niovs[niov_idx]; 880 + return xa_get_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); 940 881 } 941 882 942 - void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) 883 + static inline void set_zcrx_entry_mark(struct io_ring_ctx *ctx, unsigned long id) 884 + { 885 + xa_set_mark(&ctx->zcrx_ctxs, id, XA_MARK_1); 886 + } 887 + 888 + void io_terminate_zcrx(struct io_ring_ctx *ctx) 889 + { 890 + struct io_zcrx_ifq *ifq; 891 + unsigned long id = 0; 892 + 893 + lockdep_assert_held(&ctx->uring_lock); 894 + 895 + while (1) { 896 + scoped_guard(mutex, &ctx->mmap_lock) 897 + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); 898 + if (!ifq) 899 + break; 900 + if (WARN_ON_ONCE(is_zcrx_entry_marked(ctx, id))) 901 + break; 902 + set_zcrx_entry_mark(ctx, id); 903 + id++; 904 + zcrx_unregister_user(ifq); 905 + } 906 + } 907 + 908 + void io_unregister_zcrx(struct io_ring_ctx *ctx) 943 909 { 944 910 struct io_zcrx_ifq *ifq; 945 911 ··· 970 896 unsigned long id = 0; 971 897 972 898 ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); 973 - if (ifq) 899 + if (ifq) { 900 + if (WARN_ON_ONCE(!is_zcrx_entry_marked(ctx, id))) { 901 + ifq = NULL; 902 + break; 903 + } 974 904 xa_erase(&ctx->zcrx_ctxs, id); 905 + } 975 906 } 976 907 if (!ifq) 977 908 break; 978 - zcrx_unregister(ifq); 909 + io_put_zcrx_ifq(ifq); 979 910 } 980 911 981 912 xa_destroy(&ctx->zcrx_ctxs); 982 913 } 983 914 984 - static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) 915 + static inline u32 zcrx_rq_entries(struct zcrx_rq *rq) 985 
916 { 986 917 u32 entries; 987 918 988 - entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head; 989 - return min(entries, ifq->rq_entries); 919 + entries = smp_load_acquire(&rq->ring->tail) - rq->cached_head; 920 + return min(entries, rq->nr_entries); 990 921 } 991 922 992 - static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, 993 - unsigned mask) 923 + static struct io_uring_zcrx_rqe *zcrx_next_rqe(struct zcrx_rq *rq, unsigned mask) 994 924 { 995 - unsigned int idx = ifq->cached_rq_head++ & mask; 925 + unsigned int idx = rq->cached_head++ & mask; 996 926 997 - return &ifq->rqes[idx]; 927 + return &rq->rqes[idx]; 998 928 } 999 929 1000 930 static inline bool io_parse_rqe(struct io_uring_zcrx_rqe *rqe, ··· 1024 946 return true; 1025 947 } 1026 948 1027 - static void io_zcrx_ring_refill(struct page_pool *pp, 1028 - struct io_zcrx_ifq *ifq) 949 + static unsigned io_zcrx_ring_refill(struct page_pool *pp, 950 + struct io_zcrx_ifq *ifq, 951 + netmem_ref *netmems, unsigned to_alloc) 1029 952 { 1030 - unsigned int mask = ifq->rq_entries - 1; 953 + struct zcrx_rq *rq = &ifq->rq; 954 + unsigned int mask = rq->nr_entries - 1; 1031 955 unsigned int entries; 956 + unsigned allocated = 0; 1032 957 1033 - guard(spinlock_bh)(&ifq->rq_lock); 958 + guard(spinlock_bh)(&rq->lock); 1034 959 1035 - entries = io_zcrx_rqring_entries(ifq); 1036 - entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL); 960 + entries = zcrx_rq_entries(rq); 961 + entries = min_t(unsigned, entries, to_alloc); 1037 962 if (unlikely(!entries)) 1038 - return; 963 + return 0; 1039 964 1040 965 do { 1041 - struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); 966 + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); 1042 967 struct net_iov *niov; 1043 968 netmem_ref netmem; 1044 969 ··· 1059 978 continue; 1060 979 } 1061 980 1062 - io_zcrx_sync_for_device(pp, niov); 1063 - net_mp_netmem_place_in_cache(pp, netmem); 981 + netmems[allocated] = netmem; 982 + 
allocated++; 1064 983 } while (--entries); 1065 984 1066 - smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); 985 + smp_store_release(&rq->ring->head, rq->cached_head); 986 + return allocated; 1067 987 } 1068 988 1069 - static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) 989 + static unsigned io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq, 990 + netmem_ref *netmems, unsigned to_alloc) 1070 991 { 1071 992 struct io_zcrx_area *area = ifq->area; 993 + unsigned allocated = 0; 1072 994 1073 - spin_lock_bh(&area->freelist_lock); 1074 - while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { 1075 - struct net_iov *niov = __io_zcrx_get_free_niov(area); 1076 - netmem_ref netmem = net_iov_to_netmem(niov); 995 + guard(spinlock_bh)(&area->freelist_lock); 1077 996 997 + for (allocated = 0; allocated < to_alloc; allocated++) { 998 + struct net_iov *niov = zcrx_get_free_niov(area); 999 + 1000 + if (!niov) 1001 + break; 1078 1002 net_mp_niov_set_page_pool(pp, niov); 1079 - io_zcrx_sync_for_device(pp, niov); 1080 - net_mp_netmem_place_in_cache(pp, netmem); 1003 + netmems[allocated] = net_iov_to_netmem(niov); 1081 1004 } 1082 - spin_unlock_bh(&area->freelist_lock); 1005 + return allocated; 1083 1006 } 1084 1007 1085 1008 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) 1086 1009 { 1087 1010 struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1011 + netmem_ref *netmems = pp->alloc.cache; 1012 + unsigned to_alloc = PP_ALLOC_CACHE_REFILL; 1013 + unsigned allocated; 1088 1014 1089 1015 /* pp should already be ensuring that */ 1090 - if (unlikely(pp->alloc.count)) 1016 + if (WARN_ON_ONCE(pp->alloc.count)) 1017 + return 0; 1018 + 1019 + allocated = io_zcrx_ring_refill(pp, ifq, netmems, to_alloc); 1020 + if (likely(allocated)) 1091 1021 goto out_return; 1092 1022 1093 - io_zcrx_ring_refill(pp, ifq); 1094 - if (likely(pp->alloc.count)) 1095 - goto out_return; 1096 - 1097 - io_zcrx_refill_slow(pp, ifq); 
1098 - if (!pp->alloc.count) 1023 + allocated = io_zcrx_refill_slow(pp, ifq, netmems, to_alloc); 1024 + if (!allocated) 1099 1025 return 0; 1100 1026 out_return: 1101 - return pp->alloc.cache[--pp->alloc.count]; 1027 + zcrx_sync_for_device(pp, ifq, netmems, allocated); 1028 + allocated--; 1029 + pp->alloc.count += allocated; 1030 + return netmems[allocated]; 1102 1031 } 1103 1032 1104 1033 static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem) ··· 1127 1036 static int io_pp_zc_init(struct page_pool *pp) 1128 1037 { 1129 1038 struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); 1130 - int ret; 1131 1039 1132 1040 if (WARN_ON_ONCE(!ifq)) 1133 1041 return -EINVAL; ··· 1138 1048 return -EINVAL; 1139 1049 if (pp->p.dma_dir != DMA_FROM_DEVICE) 1140 1050 return -EOPNOTSUPP; 1141 - 1142 - ret = io_zcrx_map_area(ifq, ifq->area); 1143 - if (ret) 1144 - return ret; 1145 1051 1146 1052 refcount_inc(&ifq->refs); 1147 1053 return 0; ··· 1186 1100 }; 1187 1101 1188 1102 static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr, 1189 - struct io_zcrx_ifq *zcrx) 1103 + struct io_zcrx_ifq *zcrx, struct zcrx_rq *rq) 1190 1104 { 1191 - unsigned int mask = zcrx->rq_entries - 1; 1105 + unsigned int mask = rq->nr_entries - 1; 1192 1106 unsigned int i; 1193 1107 1194 - nr = min(nr, io_zcrx_rqring_entries(zcrx)); 1108 + nr = min(nr, zcrx_rq_entries(rq)); 1195 1109 for (i = 0; i < nr; i++) { 1196 - struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask); 1110 + struct io_uring_zcrx_rqe *rqe = zcrx_next_rqe(rq, mask); 1197 1111 struct net_iov *niov; 1198 1112 1199 1113 if (!io_parse_rqe(rqe, zcrx, &niov)) ··· 1201 1115 netmem_array[i] = net_iov_to_netmem(niov); 1202 1116 } 1203 1117 1204 - smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head); 1118 + smp_store_release(&rq->ring->head, rq->cached_head); 1205 1119 return i; 1206 1120 } 1207 1121 ··· 1235 1149 return -EINVAL; 1236 1150 1237 1151 do { 1238 - scoped_guard(spinlock_bh, &zcrx->rq_lock) { 1239 - nr 
= zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx); 1152 + struct zcrx_rq *rq = &zcrx->rq; 1153 + 1154 + scoped_guard(spinlock_bh, &rq->lock) { 1155 + nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx, rq); 1240 1156 zcrx_return_buffers(netmems, nr); 1241 1157 } 1242 1158 ··· 1247 1159 if (fatal_signal_pending(current)) 1248 1160 break; 1249 1161 cond_resched(); 1250 - } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries); 1162 + } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq.nr_entries); 1251 1163 1252 1164 return 0; 1253 1165 } ··· 1256 1168 { 1257 1169 struct zcrx_ctrl ctrl; 1258 1170 struct io_zcrx_ifq *zcrx; 1171 + 1172 + BUILD_BUG_ON(sizeof(ctrl.zc_export) != sizeof(ctrl.zc_flush)); 1259 1173 1260 1174 if (nr_args) 1261 1175 return -EINVAL; ··· 1311 1221 struct io_zcrx_area *area = ifq->area; 1312 1222 struct net_iov *niov = NULL; 1313 1223 1314 - if (area->mem.is_dmabuf) 1224 + if (!ifq->kern_readable) 1315 1225 return NULL; 1316 1226 1317 - spin_lock_bh(&area->freelist_lock); 1318 - if (area->free_count) 1319 - niov = __io_zcrx_get_free_niov(area); 1320 - spin_unlock_bh(&area->freelist_lock); 1227 + scoped_guard(spinlock_bh, &area->freelist_lock) 1228 + niov = zcrx_get_free_niov(area); 1321 1229 1322 1230 if (niov) 1323 1231 page_pool_fragment_netmem(net_iov_to_netmem(niov), 1);
+23 -11
io_uring/zcrx.h
··· 8 8 #include <net/page_pool/types.h> 9 9 #include <net/net_trackers.h> 10 10 11 + #define ZCRX_SUPPORTED_REG_FLAGS (ZCRX_REG_IMPORT | ZCRX_REG_NODEV) 12 + #define ZCRX_FEATURES (ZCRX_FEATURE_RX_PAGE_SIZE) 13 + 11 14 struct io_zcrx_mem { 12 15 unsigned long size; 13 16 bool is_dmabuf; ··· 41 38 struct io_zcrx_mem mem; 42 39 }; 43 40 41 + struct zcrx_rq { 42 + spinlock_t lock; 43 + struct io_uring *ring; 44 + struct io_uring_zcrx_rqe *rqes; 45 + u32 cached_head; 46 + u32 nr_entries; 47 + }; 48 + 44 49 struct io_zcrx_ifq { 45 50 struct io_zcrx_area *area; 46 51 unsigned niov_shift; 47 52 struct user_struct *user; 48 53 struct mm_struct *mm_account; 54 + bool kern_readable; 49 55 50 - spinlock_t rq_lock ____cacheline_aligned_in_smp; 51 - struct io_uring *rq_ring; 52 - struct io_uring_zcrx_rqe *rqes; 53 - u32 cached_rq_head; 54 - u32 rq_entries; 56 + struct zcrx_rq rq ____cacheline_aligned_in_smp; 55 57 56 58 u32 if_rxq; 57 59 struct device *dev; ··· 71 63 * net stack. 72 64 */ 73 65 struct mutex pp_lock; 74 - struct io_mapped_region region; 66 + struct io_mapped_region rq_region; 75 67 }; 76 68 77 69 #if defined(CONFIG_IO_URING_ZCRX) 78 70 int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_arg); 79 - int io_register_zcrx_ifq(struct io_ring_ctx *ctx, 71 + int io_register_zcrx(struct io_ring_ctx *ctx, 80 72 struct io_uring_zcrx_ifq_reg __user *arg); 81 - void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); 73 + void io_unregister_zcrx(struct io_ring_ctx *ctx); 74 + void io_terminate_zcrx(struct io_ring_ctx *ctx); 82 75 int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, 83 76 struct socket *sock, unsigned int flags, 84 77 unsigned issue_flags, unsigned int *len); 85 78 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, 86 79 unsigned int id); 87 80 #else 88 - static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, 89 - struct io_uring_zcrx_ifq_reg __user *arg) 81 + static inline int io_register_zcrx(struct 
io_ring_ctx *ctx, 82 + struct io_uring_zcrx_ifq_reg __user *arg) 90 83 { 91 84 return -EOPNOTSUPP; 92 85 } 93 - static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) 86 + static inline void io_unregister_zcrx(struct io_ring_ctx *ctx) 87 + { 88 + } 89 + static inline void io_terminate_zcrx(struct io_ring_ctx *ctx) 94 90 { 95 91 } 96 92 static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq,