Merge tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

+19 -10

include/linux/io_uring_types.h

··· 224 224 DECLARE_BITMAP(sqe_op, IORING_OP_LAST); 225 225 u8 sqe_flags_allowed; 226 226 u8 sqe_flags_required; 227 - bool registered; 227 + /* IORING_OP_* restrictions exist */ 228 + bool op_registered; 229 + /* IORING_REGISTER_* restrictions exist */ 230 + bool reg_registered; 228 231 }; 229 232 230 233 struct io_submit_link { ··· 262 259 struct { 263 260 unsigned int flags; 264 261 unsigned int drain_next: 1; 265 - unsigned int restricted: 1; 262 + unsigned int op_restricted: 1; 263 + unsigned int reg_restricted: 1; 266 264 unsigned int off_timeout_used: 1; 267 265 unsigned int drain_active: 1; 268 266 unsigned int has_evfd: 1; ··· 320 316 * manipulate the list, hence no extra locking is needed there. 321 317 */ 322 318 bool poll_multi_queue; 323 - struct io_wq_work_list iopoll_list; 319 + struct list_head iopoll_list; 324 320 325 321 struct io_file_table file_table; 326 322 struct io_rsrc_data buf_table; ··· 448 444 struct list_head defer_list; 449 445 unsigned nr_drained; 450 446 447 + /* protected by ->completion_lock */ 448 + unsigned nr_req_allocated; 449 + 451 450 #ifdef CONFIG_NET_RX_BUSY_POLL 452 451 struct list_head napi_list; /* track busy poll napi_id */ 453 452 spinlock_t napi_lock; /* napi_list lock */ ··· 462 455 463 456 DECLARE_HASHTABLE(napi_ht, 4); 464 457 #endif 465 - 466 - /* protected by ->completion_lock */ 467 - unsigned evfd_last_cq_tail; 468 - unsigned nr_req_allocated; 469 458 470 459 /* 471 460 * Protection for resize vs mmap races - both the mmap and resize ··· 717 714 718 715 atomic_t refs; 719 716 bool cancel_seq_set; 720 - struct io_task_work io_task_work; 717 + 718 + union { 719 + struct io_task_work io_task_work; 720 + /* For IOPOLL setup queues, with hybrid polling */ 721 + u64 iopoll_start; 722 + }; 723 + 721 724 union { 722 725 /* 723 726 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed 724 727 * poll 725 728 */ 726 729 struct hlist_node hash_node; 727 - /* For IOPOLL setup queues, with hybrid polling */ 728 - u64 iopoll_start; 730 + /* IOPOLL completion handling */ 731 + struct list_head iopoll_node; 729 732 /* for private io_kiocb freeing */ 730 733 struct rcu_head rcu_head; 731 734 };

+12

include/uapi/linux/io_uring.h

··· 237 237 */ 238 238 #define IORING_SETUP_SQE_MIXED (1U << 19) 239 239 240 + /* 241 + * When set, io_uring ignores SQ head and tail and fetches SQEs to submit 242 + * starting from index 0 instead from the index stored in the head pointer. 243 + * IOW, the user should place all SQE at the beginning of the SQ memory 244 + * before issuing a submission syscall. 245 + * 246 + * It requires IORING_SETUP_NO_SQARRAY and is incompatible with 247 + * IORING_SETUP_SQPOLL. The user must also never change the SQ head and tail 248 + * values and keep it set to 0. Any other value is undefined behaviour. 249 + */ 250 + #define IORING_SETUP_SQ_REWIND (1U << 20) 251 + 240 252 enum io_uring_op { 241 253 IORING_OP_NOP, 242 254 IORING_OP_READV,

+8 -6

io_uring/Makefile

··· 8 8 9 9 obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ 10 10 tctx.o filetable.o rw.o poll.o \ 11 - eventfd.o uring_cmd.o openclose.o \ 12 - sqpoll.o xattr.o nop.o fs.o splice.o \ 13 - sync.o msg_ring.o advise.o openclose.o \ 14 - statx.o timeout.o cancel.o \ 15 - waitid.o register.o truncate.o \ 16 - memmap.o alloc_cache.o query.o 11 + tw.o wait.o eventfd.o uring_cmd.o \ 12 + openclose.o sqpoll.o xattr.o nop.o \ 13 + fs.o splice.o sync.o msg_ring.o \ 14 + advise.o openclose.o statx.o timeout.o \ 15 + cancel.o waitid.o register.o \ 16 + truncate.o memmap.o alloc_cache.o \ 17 + query.o 18 + 17 19 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o 18 20 obj-$(CONFIG_IO_WQ) += io-wq.o 19 21 obj-$(CONFIG_FUTEX) += futex.o

+2

io_uring/alloc_cache.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef IOU_ALLOC_CACHE_H 2 3 #define IOU_ALLOC_CACHE_H 3 4 4 5 #include <linux/io_uring_types.h> 6 + #include <linux/kasan.h> 5 7 6 8 /* 7 9 * Don't allow the cache to grow beyond this size.

+2 -3

io_uring/cancel.c

··· 2 2 #include <linux/kernel.h> 3 3 #include <linux/errno.h> 4 4 #include <linux/fs.h> 5 - #include <linux/file.h> 6 5 #include <linux/mm.h> 7 6 #include <linux/slab.h> 8 - #include <linux/namei.h> 9 7 #include <linux/nospec.h> 10 8 #include <linux/io_uring.h> 11 9 ··· 19 21 #include "waitid.h" 20 22 #include "futex.h" 21 23 #include "cancel.h" 24 + #include "wait.h" 22 25 23 26 struct io_cancel { 24 27 struct file *file; ··· 538 539 /* SQPOLL thread does its own polling */ 539 540 if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || 540 541 is_sqpoll_thread) { 541 - while (!wq_list_empty(&ctx->iopoll_list)) { 542 + while (!list_empty(&ctx->iopoll_list)) { 542 543 io_iopoll_try_reap_events(ctx); 543 544 ret = true; 544 545 cond_resched();

+1

io_uring/cmd_net.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 1 2 #include <asm/ioctls.h> 2 3 #include <linux/io_uring/net.h> 3 4 #include <linux/errqueue.h>

+1

io_uring/eventfd.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 2 3 struct io_ring_ctx; 3 4 int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,

-1

io_uring/filetable.h

··· 2 2 #ifndef IOU_FILE_TABLE_H 3 3 #define IOU_FILE_TABLE_H 4 4 5 - #include <linux/file.h> 6 5 #include <linux/io_uring_types.h> 7 6 #include "rsrc.h" 8 7

+1 -1

io_uring/futex.c

··· 186 186 return -EINVAL; 187 187 188 188 ifd = kzalloc(struct_size_t(struct io_futexv_data, futexv, iof->futex_nr), 189 - GFP_KERNEL); 189 + GFP_KERNEL_ACCOUNT); 190 190 if (!ifd) 191 191 return -ENOMEM; 192 192

+48 -3

io_uring/io-wq.c

··· 17 17 #include <linux/task_work.h> 18 18 #include <linux/audit.h> 19 19 #include <linux/mmu_context.h> 20 + #include <linux/sched/sysctl.h> 20 21 #include <uapi/linux/io_uring.h> 21 22 22 23 #include "io-wq.h" ··· 35 34 36 35 enum { 37 36 IO_WQ_BIT_EXIT = 0, /* wq exiting */ 37 + IO_WQ_BIT_EXIT_ON_IDLE = 1, /* allow all workers to exit on idle */ 38 38 }; 39 39 40 40 enum { ··· 708 706 raw_spin_lock(&acct->workers_lock); 709 707 /* 710 708 * Last sleep timed out. Exit if we're not the last worker, 711 - * or if someone modified our affinity. 709 + * or if someone modified our affinity. If wq is marked 710 + * idle-exit, drop the worker as well. This is used to avoid 711 + * keeping io-wq workers around for tasks that no longer have 712 + * any active io_uring instances. 712 713 */ 713 - if (last_timeout && (exit_mask || acct->nr_workers > 1)) { 714 + if ((last_timeout && (exit_mask || acct->nr_workers > 1)) || 715 + test_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) { 714 716 acct->nr_workers--; 715 717 raw_spin_unlock(&acct->workers_lock); 716 718 __set_current_state(TASK_RUNNING); ··· 967 961 __set_notify_signal(worker->task); 968 962 wake_up_process(worker->task); 969 963 return false; 964 + } 965 + 966 + void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable) 967 + { 968 + if (!wq->task) 969 + return; 970 + 971 + if (!enable) { 972 + clear_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state); 973 + return; 974 + } 975 + 976 + if (test_and_set_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) 977 + return; 978 + 979 + rcu_read_lock(); 980 + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); 981 + rcu_read_unlock(); 970 982 } 971 983 972 984 static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) ··· 1337 1313 1338 1314 static void io_wq_exit_workers(struct io_wq *wq) 1339 1315 { 1316 + unsigned long timeout, warn_timeout; 1317 + 1340 1318 if (!wq->task) 1341 1319 return; 1342 1320 ··· 1348 1322 io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); 1349 1323 rcu_read_unlock(); 1350 1324 io_worker_ref_put(wq); 1351 - wait_for_completion(&wq->worker_done); 1325 + 1326 + /* 1327 + * Shut up hung task complaint, see for example 1328 + * 1329 + * https://lore.kernel.org/all/696fc9e7.a70a0220.111c58.0006.GAE@google.com/ 1330 + * 1331 + * where completely overloading the system with tons of long running 1332 + * io-wq items can easily trigger the hung task timeout. Only sleep 1333 + * uninterruptibly for half that time, and warn if we exceeded end 1334 + * up waiting more than IO_URING_EXIT_WAIT_MAX. 1335 + */ 1336 + timeout = sysctl_hung_task_timeout_secs * HZ / 2; 1337 + if (!timeout) 1338 + timeout = MAX_SCHEDULE_TIMEOUT; 1339 + warn_timeout = jiffies + IO_URING_EXIT_WAIT_MAX; 1340 + do { 1341 + if (wait_for_completion_timeout(&wq->worker_done, timeout)) 1342 + break; 1343 + WARN_ON_ONCE(time_after(jiffies, warn_timeout)); 1344 + } while (1); 1352 1345 1353 1346 spin_lock_irq(&wq->hash->wait.lock); 1354 1347 list_del_init(&wq->wait.entry);

+2

io_uring/io-wq.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef INTERNAL_IO_WQ_H 2 3 #define INTERNAL_IO_WQ_H 3 4 ··· 42 41 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); 43 42 void io_wq_exit_start(struct io_wq *wq); 44 43 void io_wq_put_and_exit(struct io_wq *wq); 44 + void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable); 45 45 46 46 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); 47 47 void io_wq_hash_work(struct io_wq_work *work, void *val);

+43 -739

io_uring/io_uring.c

··· 40 40 * Copyright (c) 2018-2019 Christoph Hellwig 41 41 */ 42 42 #include <linux/kernel.h> 43 - #include <linux/init.h> 44 43 #include <linux/errno.h> 45 44 #include <linux/syscalls.h> 46 - #include <net/compat.h> 47 45 #include <linux/refcount.h> 48 - #include <linux/uio.h> 49 46 #include <linux/bits.h> 50 47 51 48 #include <linux/sched/signal.h> 52 49 #include <linux/fs.h> 53 - #include <linux/file.h> 54 50 #include <linux/mm.h> 55 - #include <linux/mman.h> 56 51 #include <linux/percpu.h> 57 52 #include <linux/slab.h> 58 - #include <linux/bvec.h> 59 - #include <linux/net.h> 60 - #include <net/sock.h> 61 53 #include <linux/anon_inodes.h> 62 - #include <linux/sched/mm.h> 63 54 #include <linux/uaccess.h> 64 55 #include <linux/nospec.h> 65 - #include <linux/fsnotify.h> 66 - #include <linux/fadvise.h> 67 56 #include <linux/task_work.h> 68 57 #include <linux/io_uring.h> 69 58 #include <linux/io_uring/cmd.h> 70 59 #include <linux/audit.h> 71 60 #include <linux/security.h> 72 61 #include <linux/jump_label.h> 73 - #include <asm/shmparam.h> 74 62 75 63 #define CREATE_TRACE_POINTS 76 64 #include <trace/events/io_uring.h> ··· 93 105 #include "rw.h" 94 106 #include "alloc_cache.h" 95 107 #include "eventfd.h" 108 + #include "wait.h" 96 109 97 110 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 98 111 IOSQE_IO_HARDLINK | IOSQE_ASYNC) ··· 111 122 112 123 #define IO_COMPL_BATCH 32 113 124 #define IO_REQ_ALLOC_BATCH 8 114 - #define IO_LOCAL_TW_DEFAULT_MAX 20 115 125 116 126 /* requests with any of those set should undergo io_disarm_next() */ 117 127 #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) 118 - 119 - /* 120 - * No waiters. It's larger than any valid value of the tw counter 121 - * so that tests against ->cq_wait_nr would fail and skip wake_up(). 122 - */ 123 - #define IO_CQ_WAKE_INIT (-1U) 124 - /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 125 - #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 126 128 127 129 static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); 128 130 static void __io_req_caches_free(struct io_ring_ctx *ctx); ··· 167 187 req->link = IO_URING_PTR_POISON; 168 188 } 169 189 170 - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 171 - { 172 - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 173 - } 174 - 175 - static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) 176 - { 177 - return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); 178 - } 179 - 180 190 static inline void req_fail_link_node(struct io_kiocb *req, int res) 181 191 { 182 192 req_set_fail(req); ··· 185 215 struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); 186 216 187 217 complete(&ctx->ref_comp); 188 - } 189 - 190 - /* 191 - * Terminate the request if either of these conditions are true: 192 - * 193 - * 1) It's being executed by the original task, but that task is marked 194 - * with PF_EXITING as it's exiting. 195 - * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 196 - * our fallback task_work. 197 - * 3) The ring has been closed and is going away. 198 - */ 199 - static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 200 - { 201 - return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); 202 - } 203 - 204 - static __cold void io_fallback_req_func(struct work_struct *work) 205 - { 206 - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 207 - fallback_work.work); 208 - struct llist_node *node = llist_del_all(&ctx->fallback_llist); 209 - struct io_kiocb *req, *tmp; 210 - struct io_tw_state ts = {}; 211 - 212 - percpu_ref_get(&ctx->refs); 213 - mutex_lock(&ctx->uring_lock); 214 - ts.cancel = io_should_terminate_tw(ctx); 215 - llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 216 - req->io_task_work.func((struct io_tw_req){req}, ts); 217 - io_submit_flush_completions(ctx); 218 - mutex_unlock(&ctx->uring_lock); 219 - percpu_ref_put(&ctx->refs); 220 218 } 221 219 222 220 static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) ··· 272 334 init_waitqueue_head(&ctx->poll_wq); 273 335 spin_lock_init(&ctx->completion_lock); 274 336 raw_spin_lock_init(&ctx->timeout_lock); 275 - INIT_WQ_LIST(&ctx->iopoll_list); 337 + INIT_LIST_HEAD(&ctx->iopoll_list); 276 338 INIT_LIST_HEAD(&ctx->defer_list); 277 339 INIT_LIST_HEAD(&ctx->timeout_list); 278 340 INIT_LIST_HEAD(&ctx->ltimeout_list); ··· 581 643 __io_cqring_overflow_flush(ctx, true); 582 644 } 583 645 584 - static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) 646 + void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) 585 647 { 586 648 mutex_lock(&ctx->uring_lock); 587 649 __io_cqring_overflow_flush(ctx, false); ··· 1021 1083 return nxt; 1022 1084 } 1023 1085 1024 - static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) 1025 - { 1026 - if (!ctx) 1027 - return; 1028 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1029 - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1030 - 1031 - io_submit_flush_completions(ctx); 1032 - mutex_unlock(&ctx->uring_lock); 1033 - percpu_ref_put(&ctx->refs); 1034 - } 1035 - 1036 - /* 1037 - * Run queued task_work, returning the number of entries processed in *count. 1038 - * If more entries than max_entries are available, stop processing once this 1039 - * is reached and return the rest of the list. 1040 - */ 1041 - struct llist_node *io_handle_tw_list(struct llist_node *node, 1042 - unsigned int *count, 1043 - unsigned int max_entries) 1044 - { 1045 - struct io_ring_ctx *ctx = NULL; 1046 - struct io_tw_state ts = { }; 1047 - 1048 - do { 1049 - struct llist_node *next = node->next; 1050 - struct io_kiocb *req = container_of(node, struct io_kiocb, 1051 - io_task_work.node); 1052 - 1053 - if (req->ctx != ctx) { 1054 - ctx_flush_and_put(ctx, ts); 1055 - ctx = req->ctx; 1056 - mutex_lock(&ctx->uring_lock); 1057 - percpu_ref_get(&ctx->refs); 1058 - ts.cancel = io_should_terminate_tw(ctx); 1059 - } 1060 - INDIRECT_CALL_2(req->io_task_work.func, 1061 - io_poll_task_func, io_req_rw_complete, 1062 - (struct io_tw_req){req}, ts); 1063 - node = next; 1064 - (*count)++; 1065 - if (unlikely(need_resched())) { 1066 - ctx_flush_and_put(ctx, ts); 1067 - ctx = NULL; 1068 - cond_resched(); 1069 - } 1070 - } while (node && *count < max_entries); 1071 - 1072 - ctx_flush_and_put(ctx, ts); 1073 - return node; 1074 - } 1075 - 1076 - static __cold void __io_fallback_tw(struct llist_node *node, bool sync) 1077 - { 1078 - struct io_ring_ctx *last_ctx = NULL; 1079 - struct io_kiocb *req; 1080 - 1081 - while (node) { 1082 - req = container_of(node, struct io_kiocb, io_task_work.node); 1083 - node = node->next; 1084 - if (last_ctx != req->ctx) { 1085 - if (last_ctx) { 1086 - if (sync) 1087 - flush_delayed_work(&last_ctx->fallback_work); 1088 - percpu_ref_put(&last_ctx->refs); 1089 - } 1090 - last_ctx = req->ctx; 1091 - percpu_ref_get(&last_ctx->refs); 1092 - } 1093 - if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) 1094 - schedule_delayed_work(&last_ctx->fallback_work, 1); 1095 - } 1096 - 1097 - if (last_ctx) { 1098 - if (sync) 1099 - flush_delayed_work(&last_ctx->fallback_work); 1100 - percpu_ref_put(&last_ctx->refs); 1101 - } 1102 - } 1103 - 1104 - static void io_fallback_tw(struct io_uring_task *tctx, bool sync) 1105 - { 1106 - struct llist_node *node = llist_del_all(&tctx->task_list); 1107 - 1108 - __io_fallback_tw(node, sync); 1109 - } 1110 - 1111 - struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, 1112 - unsigned int max_entries, 1113 - unsigned int *count) 1114 - { 1115 - struct llist_node *node; 1116 - 1117 - node = llist_del_all(&tctx->task_list); 1118 - if (node) { 1119 - node = llist_reverse_order(node); 1120 - node = io_handle_tw_list(node, count, max_entries); 1121 - } 1122 - 1123 - /* relaxed read is enough as only the task itself sets ->in_cancel */ 1124 - if (unlikely(atomic_read(&tctx->in_cancel))) 1125 - io_uring_drop_tctx_refs(current); 1126 - 1127 - trace_io_uring_task_work_run(tctx, *count); 1128 - return node; 1129 - } 1130 - 1131 - void tctx_task_work(struct callback_head *cb) 1132 - { 1133 - struct io_uring_task *tctx; 1134 - struct llist_node *ret; 1135 - unsigned int count = 0; 1136 - 1137 - tctx = container_of(cb, struct io_uring_task, task_work); 1138 - ret = tctx_task_work_run(tctx, UINT_MAX, &count); 1139 - /* can't happen */ 1140 - WARN_ON_ONCE(ret); 1141 - } 1142 - 1143 - static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) 1144 - { 1145 - struct io_ring_ctx *ctx = req->ctx; 1146 - unsigned nr_wait, nr_tw, nr_tw_prev; 1147 - struct llist_node *head; 1148 - 1149 - /* See comment above IO_CQ_WAKE_INIT */ 1150 - BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); 1151 - 1152 - /* 1153 - * We don't know how many requests there are in the link and whether 1154 - * they can even be queued lazily, fall back to non-lazy. 1155 - */ 1156 - if (req->flags & IO_REQ_LINK_FLAGS) 1157 - flags &= ~IOU_F_TWQ_LAZY_WAKE; 1158 - 1159 - guard(rcu)(); 1160 - 1161 - head = READ_ONCE(ctx->work_llist.first); 1162 - do { 1163 - nr_tw_prev = 0; 1164 - if (head) { 1165 - struct io_kiocb *first_req = container_of(head, 1166 - struct io_kiocb, 1167 - io_task_work.node); 1168 - /* 1169 - * Might be executed at any moment, rely on 1170 - * SLAB_TYPESAFE_BY_RCU to keep it alive. 1171 - */ 1172 - nr_tw_prev = READ_ONCE(first_req->nr_tw); 1173 - } 1174 - 1175 - /* 1176 - * Theoretically, it can overflow, but that's fine as one of 1177 - * previous adds should've tried to wake the task. 1178 - */ 1179 - nr_tw = nr_tw_prev + 1; 1180 - if (!(flags & IOU_F_TWQ_LAZY_WAKE)) 1181 - nr_tw = IO_CQ_WAKE_FORCE; 1182 - 1183 - req->nr_tw = nr_tw; 1184 - req->io_task_work.node.next = head; 1185 - } while (!try_cmpxchg(&ctx->work_llist.first, &head, 1186 - &req->io_task_work.node)); 1187 - 1188 - /* 1189 - * cmpxchg implies a full barrier, which pairs with the barrier 1190 - * in set_current_state() on the io_cqring_wait() side. It's used 1191 - * to ensure that either we see updated ->cq_wait_nr, or waiters 1192 - * going to sleep will observe the work added to the list, which 1193 - * is similar to the wait/wawke task state sync. 1194 - */ 1195 - 1196 - if (!head) { 1197 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1198 - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1199 - if (ctx->has_evfd) 1200 - io_eventfd_signal(ctx, false); 1201 - } 1202 - 1203 - nr_wait = atomic_read(&ctx->cq_wait_nr); 1204 - /* not enough or no one is waiting */ 1205 - if (nr_tw < nr_wait) 1206 - return; 1207 - /* the previous add has already woken it up */ 1208 - if (nr_tw_prev >= nr_wait) 1209 - return; 1210 - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); 1211 - } 1212 - 1213 - static void io_req_normal_work_add(struct io_kiocb *req) 1214 - { 1215 - struct io_uring_task *tctx = req->tctx; 1216 - struct io_ring_ctx *ctx = req->ctx; 1217 - 1218 - /* task_work already pending, we're done */ 1219 - if (!llist_add(&req->io_task_work.node, &tctx->task_list)) 1220 - return; 1221 - 1222 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1223 - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1224 - 1225 - /* SQPOLL doesn't need the task_work added, it'll run it itself */ 1226 - if (ctx->flags & IORING_SETUP_SQPOLL) { 1227 - __set_notify_signal(tctx->task); 1228 - return; 1229 - } 1230 - 1231 - if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) 1232 - return; 1233 - 1234 - io_fallback_tw(tctx, false); 1235 - } 1236 - 1237 - void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) 1238 - { 1239 - if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) 1240 - io_req_local_work_add(req, flags); 1241 - else 1242 - io_req_normal_work_add(req); 1243 - } 1244 - 1245 - void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) 1246 - { 1247 - if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) 1248 - return; 1249 - __io_req_task_work_add(req, flags); 1250 - } 1251 - 1252 - static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) 1253 - { 1254 - struct llist_node *node = llist_del_all(&ctx->work_llist); 1255 - 1256 - __io_fallback_tw(node, false); 1257 - node = llist_del_all(&ctx->retry_llist); 1258 - __io_fallback_tw(node, false); 1259 - } 1260 - 1261 - static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, 1262 - int min_events) 1263 - { 1264 - if (!io_local_work_pending(ctx)) 1265 - return false; 1266 - if (events < min_events) 1267 - return true; 1268 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1269 - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1270 - return false; 1271 - } 1272 - 1273 - static int __io_run_local_work_loop(struct llist_node **node, 1274 - io_tw_token_t tw, 1275 - int events) 1276 - { 1277 - int ret = 0; 1278 - 1279 - while (*node) { 1280 - struct llist_node *next = (*node)->next; 1281 - struct io_kiocb *req = container_of(*node, struct io_kiocb, 1282 - io_task_work.node); 1283 - INDIRECT_CALL_2(req->io_task_work.func, 1284 - io_poll_task_func, io_req_rw_complete, 1285 - (struct io_tw_req){req}, tw); 1286 - *node = next; 1287 - if (++ret >= events) 1288 - break; 1289 - } 1290 - 1291 - return ret; 1292 - } 1293 - 1294 - static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, 1295 - int min_events, int max_events) 1296 - { 1297 - struct llist_node *node; 1298 - unsigned int loops = 0; 1299 - int ret = 0; 1300 - 1301 - if (WARN_ON_ONCE(ctx->submitter_task != current)) 1302 - return -EEXIST; 1303 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 1304 - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 1305 - again: 1306 - tw.cancel = io_should_terminate_tw(ctx); 1307 - min_events -= ret; 1308 - ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 1309 - if (ctx->retry_llist.first) 1310 - goto retry_done; 1311 - 1312 - /* 1313 - * llists are in reverse order, flip it back the right way before 1314 - * running the pending items. 1315 - */ 1316 - node = llist_reverse_order(llist_del_all(&ctx->work_llist)); 1317 - ret += __io_run_local_work_loop(&node, tw, max_events - ret); 1318 - ctx->retry_llist.first = node; 1319 - loops++; 1320 - 1321 - if (io_run_local_work_continue(ctx, ret, min_events)) 1322 - goto again; 1323 - retry_done: 1324 - io_submit_flush_completions(ctx); 1325 - if (io_run_local_work_continue(ctx, ret, min_events)) 1326 - goto again; 1327 - 1328 - trace_io_uring_local_work_run(ctx, ret, loops); 1329 - return ret; 1330 - } 1331 - 1332 - static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, 1333 - int min_events) 1334 - { 1335 - struct io_tw_state ts = {}; 1336 - 1337 - if (!io_local_work_pending(ctx)) 1338 - return 0; 1339 - return __io_run_local_work(ctx, ts, min_events, 1340 - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 1341 - } 1342 - 1343 - int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) 1344 - { 1345 - struct io_tw_state ts = {}; 1346 - int ret; 1347 - 1348 - mutex_lock(&ctx->uring_lock); 1349 - ret = __io_run_local_work(ctx, ts, min_events, max_events); 1350 - mutex_unlock(&ctx->uring_lock); 1351 - return ret; 1352 - } 1353 - 1354 1086 static void io_req_task_cancel(struct io_tw_req tw_req, io_tw_token_t tw) 1355 1087 { 1356 1088 struct io_kiocb *req = tw_req.req; ··· 1153 1545 ctx->submit_state.cq_flush = false; 1154 1546 } 1155 1547 1156 - static unsigned io_cqring_events(struct io_ring_ctx *ctx) 1157 - { 1158 - /* See comment at the top of this file */ 1159 - smp_rmb(); 1160 - return __io_cqring_events(ctx); 1161 - } 1162 - 1163 1548 /* 1164 1549 * We can't just wait for polled events to come to us, we have to actively 1165 1550 * find and complete them. ··· 1163 1562 return; 1164 1563 1165 1564 mutex_lock(&ctx->uring_lock); 1166 - while (!wq_list_empty(&ctx->iopoll_list)) { 1565 + while (!list_empty(&ctx->iopoll_list)) { 1167 1566 /* let it sleep and repeat later if can't complete a request */ 1168 1567 if (io_do_iopoll(ctx, true) == 0) 1169 1568 break; ··· 1228 1627 * forever, while the workqueue is stuck trying to acquire the 1229 1628 * very same mutex. 1230 1629 */ 1231 - if (wq_list_empty(&ctx->iopoll_list) || 1232 - io_task_work_pending(ctx)) { 1630 + if (list_empty(&ctx->iopoll_list) || io_task_work_pending(ctx)) { 1233 1631 u32 tail = ctx->cached_cq_tail; 1234 1632 1235 1633 (void) io_run_local_work_locked(ctx, min_events); 1236 1634 1237 - if (task_work_pending(current) || 1238 - wq_list_empty(&ctx->iopoll_list)) { 1635 + if (task_work_pending(current) || list_empty(&ctx->iopoll_list)) { 1239 1636 mutex_unlock(&ctx->uring_lock); 1240 1637 io_run_task_work(); 1241 1638 mutex_lock(&ctx->uring_lock); 1242 1639 } 1243 1640 /* some requests don't go through iopoll_list */ 1244 - if (tail != ctx->cached_cq_tail || 1245 - wq_list_empty(&ctx->iopoll_list)) 1641 + if (tail != ctx->cached_cq_tail || list_empty(&ctx->iopoll_list)) 1246 1642 break; 1247 1643 } 1248 1644 ret = io_do_iopoll(ctx, !min_events); ··· 1282 1684 * how we do polling eventually, not spinning if we're on potentially 1283 1685 * different devices. 1284 1686 */ 1285 - if (wq_list_empty(&ctx->iopoll_list)) { 1687 + if (list_empty(&ctx->iopoll_list)) { 1286 1688 ctx->poll_multi_queue = false; 1287 1689 } else if (!ctx->poll_multi_queue) { 1288 1690 struct io_kiocb *list_req; 1289 1691 1290 - list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, 1291 - comp_list); 1692 + list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, iopoll_node); 1292 1693 if (list_req->file != req->file) 1293 1694 ctx->poll_multi_queue = true; 1294 1695 } 1295 1696 1296 - /* 1297 - * For fast devices, IO may have already completed. If it has, add 1298 - * it to the front so we find it first. 1299 - */ 1300 - if (READ_ONCE(req->iopoll_completed)) 1301 - wq_list_add_head(&req->comp_list, &ctx->iopoll_list); 1302 - else 1303 - wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); 1697 + list_add_tail(&req->iopoll_node, &ctx->iopoll_list); 1304 1698 1305 1699 if (unlikely(needs_lock)) { 1306 1700 /* ··· 1670 2080 struct io_kiocb *req, 1671 2081 unsigned int sqe_flags) 1672 2082 { 2083 + if (!ctx->op_restricted) 2084 + return true; 1673 2085 if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) 1674 2086 return false; 1675 2087 ··· 1773 2181 io_init_drain(ctx); 1774 2182 } 1775 2183 } 1776 - if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { 1777 - if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) 2184 + if (unlikely(ctx->op_restricted || ctx->drain_active || ctx->drain_next)) { 2185 + if (!io_check_restriction(ctx, req, sqe_flags)) 1778 2186 return io_init_fail_req(req, -EACCES); 1779 2187 /* knock it to the slow queue path, will be drained there */ 1780 2188 if (ctx->drain_active) ··· 1946 2354 { 1947 2355 struct io_rings *rings = ctx->rings; 1948 2356 1949 - /* 1950 - * Ensure any loads from the SQEs are done at this point, 1951 - * since once we write the new head, the application could 1952 - * write new data to them. 1953 - */ 1954 - smp_store_release(&rings->sq.head, ctx->cached_sq_head); 2357 + if (ctx->flags & IORING_SETUP_SQ_REWIND) { 2358 + ctx->cached_sq_head = 0; 2359 + } else { 2360 + /* 2361 + * Ensure any loads from the SQEs are done at this point, 2362 + * since once we write the new head, the application could 2363 + * write new data to them. 2364 + */ 2365 + smp_store_release(&rings->sq.head, ctx->cached_sq_head); 2366 + } 1955 2367 } 1956 2368 1957 2369 /* ··· 2001 2405 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) 2002 2406 __must_hold(&ctx->uring_lock) 2003 2407 { 2004 - unsigned int entries = io_sqring_entries(ctx); 2408 + unsigned int entries; 2005 2409 unsigned int left; 2006 2410 int ret; 2411 + 2412 + if (ctx->flags & IORING_SETUP_SQ_REWIND) 2413 + entries = ctx->sq_entries; 2414 + else 2415 + entries = io_sqring_entries(ctx); 2007 2416 2008 2417 entries = min(nr, entries); 2009 2418 if (unlikely(!entries)) ··· 2052 2451 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 2053 2452 io_commit_sqring(ctx); 2054 2453 return ret; 2055 - } 2056 - 2057 - static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 2058 - int wake_flags, void *key) 2059 - { 2060 - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); 2061 - 2062 - /* 2063 - * Cannot safely flush overflowed CQEs from here, ensure we wake up 2064 - * the task, and the next invocation will do it. 2065 - */ 2066 - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) 2067 - return autoremove_wake_function(curr, mode, wake_flags, key); 2068 - return -1; 2069 - } 2070 - 2071 - int io_run_task_work_sig(struct io_ring_ctx *ctx) 2072 - { 2073 - if (io_local_work_pending(ctx)) { 2074 - __set_current_state(TASK_RUNNING); 2075 - if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) 2076 - return 0; 2077 - } 2078 - if (io_run_task_work() > 0) 2079 - return 0; 2080 - if (task_sigpending(current)) 2081 - return -EINTR; 2082 - return 0; 2083 - } 2084 - 2085 - static bool current_pending_io(void) 2086 - { 2087 - struct io_uring_task *tctx = current->io_uring; 2088 - 2089 - if (!tctx) 2090 - return false; 2091 - return percpu_counter_read_positive(&tctx->inflight); 2092 - } 2093 - 2094 - static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) 2095 - { 2096 - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 2097 - 2098 - WRITE_ONCE(iowq->hit_timeout, 1); 2099 - iowq->min_timeout = 0; 2100 - wake_up_process(iowq->wq.private); 2101 - return HRTIMER_NORESTART; 2102 - } 2103 - 2104 - /* 2105 - * Doing min_timeout portion. If we saw any timeouts, events, or have work, 2106 - * wake up. If not, and we have a normal timeout, switch to that and keep 2107 - * sleeping. 2108 - */ 2109 - static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) 2110 - { 2111 - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 2112 - struct io_ring_ctx *ctx = iowq->ctx; 2113 - 2114 - /* no general timeout, or shorter (or equal), we are done */ 2115 - if (iowq->timeout == KTIME_MAX || 2116 - ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) 2117 - goto out_wake; 2118 - /* work we may need to run, wake function will see if we need to wake */ 2119 - if (io_has_work(ctx)) 2120 - goto out_wake; 2121 - /* got events since we started waiting, min timeout is done */ 2122 - if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) 2123 - goto out_wake; 2124 - /* if we have any events and min timeout expired, we're done */ 2125 - if (io_cqring_events(ctx)) 2126 - goto out_wake; 2127 - 2128 - /* 2129 - * If using deferred task_work running and application is waiting on 2130 - * more than one request, ensure we reset it now where we are switching 2131 - * to normal sleeps. Any request completion post min_wait should wake 2132 - * the task and return. 2133 - */ 2134 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 2135 - atomic_set(&ctx->cq_wait_nr, 1); 2136 - smp_mb(); 2137 - if (!llist_empty(&ctx->work_llist)) 2138 - goto out_wake; 2139 - } 2140 - 2141 - /* any generated CQE posted past this time should wake us up */ 2142 - iowq->cq_tail = iowq->cq_min_tail; 2143 - 2144 - hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); 2145 - hrtimer_set_expires(timer, iowq->timeout); 2146 - return HRTIMER_RESTART; 2147 - out_wake: 2148 - return io_cqring_timer_wakeup(timer); 2149 - } 2150 - 2151 - static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, 2152 - clockid_t clock_id, ktime_t start_time) 2153 - { 2154 - ktime_t timeout; 2155 - 2156 - if (iowq->min_timeout) { 2157 - timeout = ktime_add_ns(iowq->min_timeout, start_time); 2158 - hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, 2159 - HRTIMER_MODE_ABS); 2160 - } else { 2161 - timeout = iowq->timeout; 2162 - hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, 2163 - HRTIMER_MODE_ABS); 2164 - } 2165 - 2166 - hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); 2167 - hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); 2168 - 2169 - if (!READ_ONCE(iowq->hit_timeout)) 2170 - schedule(); 2171 - 2172 - hrtimer_cancel(&iowq->t); 2173 - destroy_hrtimer_on_stack(&iowq->t); 2174 - __set_current_state(TASK_RUNNING); 2175 - 2176 - return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; 2177 - } 2178 - 2179 - struct ext_arg { 2180 - size_t argsz; 2181 - struct timespec64 ts; 2182 - const sigset_t __user *sig; 2183 - ktime_t min_time; 2184 - bool ts_set; 2185 - bool iowait; 2186 - }; 2187 - 2188 - static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2189 - struct io_wait_queue *iowq, 2190 - struct ext_arg *ext_arg, 2191 - ktime_t start_time) 2192 - { 2193 - int ret = 0; 2194 - 2195 - /* 2196 - * Mark us as being in io_wait if we have pending requests, so cpufreq 2197 - * can take into account that the task is waiting for IO - turns out 2198 - * to be important for low QD IO. 2199 - */ 2200 - if (ext_arg->iowait && current_pending_io()) 2201 - current->in_iowait = 1; 2202 - if (iowq->timeout != KTIME_MAX || iowq->min_timeout) 2203 - ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); 2204 - else 2205 - schedule(); 2206 - current->in_iowait = 0; 2207 - return ret; 2208 - } 2209 - 2210 - /* If this returns > 0, the caller should retry */ 2211 - static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2212 - struct io_wait_queue *iowq, 2213 - struct ext_arg *ext_arg, 2214 - ktime_t start_time) 2215 - { 2216 - if (unlikely(READ_ONCE(ctx->check_cq))) 2217 - return 1; 2218 - if (unlikely(io_local_work_pending(ctx))) 2219 - return 1; 2220 - if (unlikely(task_work_pending(current))) 2221 - return 1; 2222 - if (unlikely(task_sigpending(current))) 2223 - return -EINTR; 2224 - if (unlikely(io_should_wake(iowq))) 2225 - return 0; 2226 - 2227 - return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); 2228 - } 2229 - 2230 - /* 2231 - * Wait until events become available, if we don't already have some. The 2232 - * application must reap them itself, as they reside on the shared cq ring. 2233 - */ 2234 - static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, 2235 - struct ext_arg *ext_arg) 2236 - { 2237 - struct io_wait_queue iowq; 2238 - struct io_rings *rings = ctx->rings; 2239 - ktime_t start_time; 2240 - int ret; 2241 - 2242 - min_events = min_t(int, min_events, ctx->cq_entries); 2243 - 2244 - if (!io_allowed_run_tw(ctx)) 2245 - return -EEXIST; 2246 - if (io_local_work_pending(ctx)) 2247 - io_run_local_work(ctx, min_events, 2248 - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 2249 - io_run_task_work(); 2250 - 2251 - if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) 2252 - io_cqring_do_overflow_flush(ctx); 2253 - if (__io_cqring_events_user(ctx) >= min_events) 2254 - return 0; 2255 - 2256 - init_waitqueue_func_entry(&iowq.wq, io_wake_function); 2257 - iowq.wq.private = current; 2258 - INIT_LIST_HEAD(&iowq.wq.entry); 2259 - iowq.ctx = ctx; 2260 - iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 2261 - iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); 2262 - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 2263 - iowq.hit_timeout = 0; 2264 - iowq.min_timeout = ext_arg->min_time; 2265 - iowq.timeout = KTIME_MAX; 2266 - start_time = io_get_time(ctx); 2267 - 2268 - if (ext_arg->ts_set) { 2269 - iowq.timeout = timespec64_to_ktime(ext_arg->ts); 2270 - if (!(flags & IORING_ENTER_ABS_TIMER)) 2271 - iowq.timeout = ktime_add(iowq.timeout, start_time); 2272 - } 2273 - 2274 - if (ext_arg->sig) { 2275 - #ifdef CONFIG_COMPAT 2276 - if (in_compat_syscall()) 2277 - ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, 2278 - ext_arg->argsz); 2279 - else 2280 - #endif 2281 - ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); 2282 - 2283 - if (ret) 2284 - return ret; 2285 - } 2286 - 2287 - io_napi_busy_loop(ctx, &iowq); 2288 - 2289 - trace_io_uring_cqring_wait(ctx, min_events); 2290 - do { 2291 - unsigned long check_cq; 2292 - int nr_wait; 2293 - 2294 - /* if min timeout has been hit, don't reset wait count */ 2295 - if (!iowq.hit_timeout) 2296 - nr_wait = (int) iowq.cq_tail - 2297 - READ_ONCE(ctx->rings->cq.tail); 2298 - else 2299 - nr_wait = 1; 2300 - 2301 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 2302 - atomic_set(&ctx->cq_wait_nr, nr_wait); 2303 - set_current_state(TASK_INTERRUPTIBLE); 2304 - } else { 2305 - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 2306 - TASK_INTERRUPTIBLE); 2307 - } 2308 - 2309 - ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); 2310 - __set_current_state(TASK_RUNNING); 2311 - atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 2312 - 2313 - /* 2314 - * Run task_work after scheduling and before io_should_wake(). 2315 - * If we got woken because of task_work being processed, run it 2316 - * now rather than let the caller do another wait loop. 2317 - */ 2318 - if (io_local_work_pending(ctx)) 2319 - io_run_local_work(ctx, nr_wait, nr_wait); 2320 - io_run_task_work(); 2321 - 2322 - /* 2323 - * Non-local task_work will be run on exit to userspace, but 2324 - * if we're using DEFER_TASKRUN, then we could have waited 2325 - * with a timeout for a number of requests. If the timeout 2326 - * hits, we could have some requests ready to process. Ensure 2327 - * this break is _after_ we have run task_work, to avoid 2328 - * deferring running potentially pending requests until the 2329 - * next time we wait for events. 2330 - */ 2331 - if (ret < 0) 2332 - break; 2333 - 2334 - check_cq = READ_ONCE(ctx->check_cq); 2335 - if (unlikely(check_cq)) { 2336 - /* let the caller flush overflows, retry */ 2337 - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 2338 - io_cqring_do_overflow_flush(ctx); 2339 - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { 2340 - ret = -EBADR; 2341 - break; 2342 - } 2343 - } 2344 - 2345 - if (io_should_wake(&iowq)) { 2346 - ret = 0; 2347 - break; 2348 - } 2349 - cond_resched(); 2350 - } while (1); 2351 - 2352 - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 2353 - finish_wait(&ctx->cq_wait, &iowq.wq); 2354 - restore_saved_sigmask_unless(ret == -EINTR); 2355 - 2356 - return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; 2357 2454 } 2358 2455 2359 2456 static void io_rings_free(struct io_ring_ctx *ctx) ··· 2283 2984 static __cold void io_ring_exit_work(struct work_struct *work) 2284 2985 { 2285 2986 struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); 2286 - unsigned long timeout = jiffies + HZ * 60 * 5; 2987 + unsigned long timeout = jiffies + IO_URING_EXIT_WAIT_MAX; 2287 2988 unsigned long interval = HZ / 20; 2288 2989 struct io_tctx_exit exit; 2289 2990 struct io_tctx_node *node; ··· 2555 3256 2556 3257 ctx = file->private_data; 2557 3258 ret = -EBADFD; 2558 - if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) 3259 + /* 3260 + * Keep IORING_SETUP_R_DISABLED check before submitter_task load 3261 + * in io_uring_add_tctx_node() -> __io_uring_add_tctx_node_from_submit() 3262 + */ 3263 + if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED)) 2559 3264 goto out; 2560 3265 2561 3266 /* ··· 2741 3438 2742 3439 if (flags & ~IORING_SETUP_FLAGS) 2743 3440 return -EINVAL; 3441 + 3442 + if (flags & IORING_SETUP_SQ_REWIND) { 3443 + if ((flags & IORING_SETUP_SQPOLL) || 3444 + !(flags & IORING_SETUP_NO_SQARRAY)) 3445 + return -EINVAL; 3446 + } 2744 3447 2745 3448 /* There is no way to mmap rings without a real fd */ 2746 3449 if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && ··· 2970 3661 } 2971 3662 2972 3663 if (ctx->flags & IORING_SETUP_SINGLE_ISSUER 2973 - && !(ctx->flags & IORING_SETUP_R_DISABLED)) { 2974 - /* 2975 - * Unlike io_register_enable_rings(), don't need WRITE_ONCE() 2976 - * since ctx isn't yet accessible from other tasks 2977 - */ 3664 + && !(ctx->flags & IORING_SETUP_R_DISABLED)) 2978 3665 ctx->submitter_task = get_task_struct(current); 2979 - } 2980 3666 2981 3667 file = io_uring_get_file(ctx); 2982 3668 if (IS_ERR(file)) {

+12 -78

io_uring/io_uring.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef IOU_CORE_H 2 3 #define IOU_CORE_H 3 4 4 5 #include <linux/errno.h> 5 6 #include <linux/lockdep.h> 6 7 #include <linux/resume_user_mode.h> 7 - #include <linux/kasan.h> 8 8 #include <linux/poll.h> 9 9 #include <linux/io_uring_types.h> 10 10 #include <uapi/linux/eventpoll.h> 11 11 #include "alloc_cache.h" 12 12 #include "io-wq.h" 13 13 #include "slist.h" 14 + #include "tw.h" 14 15 #include "opdef.h" 15 16 16 17 #ifndef CREATE_TRACE_POINTS ··· 70 69 IORING_SETUP_NO_SQARRAY |\ 71 70 IORING_SETUP_HYBRID_IOPOLL |\ 72 71 IORING_SETUP_CQE_MIXED |\ 73 - IORING_SETUP_SQE_MIXED) 72 + IORING_SETUP_SQE_MIXED |\ 73 + IORING_SETUP_SQ_REWIND) 74 74 75 75 #define IORING_ENTER_FLAGS (IORING_ENTER_GETEVENTS |\ 76 76 IORING_ENTER_SQ_WAKEUP |\ ··· 90 88 IOSQE_ASYNC |\ 91 89 IOSQE_BUFFER_SELECT |\ 92 90 IOSQE_CQE_SKIP_SUCCESS) 91 + 92 + #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) 93 + 94 + /* 95 + * Complaint timeout for io_uring cancelation exits, and for io-wq exit 96 + * worker waiting. 97 + */ 98 + #define IO_URING_EXIT_WAIT_MAX (HZ * 60 * 5) 93 99 94 100 enum { 95 101 IOU_COMPLETE = 0, ··· 161 151 int io_prepare_config(struct io_ctx_config *config); 162 152 163 153 bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow, bool cqe32); 164 - int io_run_task_work_sig(struct io_ring_ctx *ctx); 165 - int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); 166 154 void io_req_defer_failed(struct io_kiocb *req, s32 res); 167 155 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); 168 156 void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); ··· 174 166 struct file *io_file_get_fixed(struct io_kiocb *req, int fd, 175 167 unsigned issue_flags); 176 168 177 - void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); 178 - void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); 179 169 void io_req_task_queue(struct io_kiocb *req); 180 170 void io_req_task_complete(struct io_tw_req tw_req, io_tw_token_t tw); 181 171 void io_req_task_queue_fail(struct io_kiocb *req, int ret); 182 172 void io_req_task_submit(struct io_tw_req tw_req, io_tw_token_t tw); 183 - struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 184 - struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 185 - void tctx_task_work(struct callback_head *cb); 186 173 __cold void io_uring_drop_tctx_refs(struct task_struct *task); 187 174 188 175 int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, ··· 228 225 static inline bool io_is_compat(struct io_ring_ctx *ctx) 229 226 { 230 227 return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); 231 - } 232 - 233 - static inline void io_req_task_work_add(struct io_kiocb *req) 234 - { 235 - __io_req_task_work_add(req, 0); 236 228 } 237 229 238 230 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) ··· 454 456 return min(entries, ctx->sq_entries); 455 457 } 456 458 457 - static inline int io_run_task_work(void) 458 - { 459 - bool ret = false; 460 - 461 - /* 462 - * Always check-and-clear the task_work notification signal. With how 463 - * signaling works for task_work, we can find it set with nothing to 464 - * run. We need to clear it for that case, like get_signal() does. 465 - */ 466 - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 467 - clear_notify_signal(); 468 - /* 469 - * PF_IO_WORKER never returns to userspace, so check here if we have 470 - * notify work that needs processing. 471 - */ 472 - if (current->flags & PF_IO_WORKER) { 473 - if (test_thread_flag(TIF_NOTIFY_RESUME)) { 474 - __set_current_state(TASK_RUNNING); 475 - resume_user_mode_work(NULL); 476 - } 477 - if (current->io_uring) { 478 - unsigned int count = 0; 479 - 480 - __set_current_state(TASK_RUNNING); 481 - tctx_task_work_run(current->io_uring, UINT_MAX, &count); 482 - if (count) 483 - ret = true; 484 - } 485 - } 486 - if (task_work_pending(current)) { 487 - __set_current_state(TASK_RUNNING); 488 - task_work_run(); 489 - ret = true; 490 - } 491 - 492 - return ret; 493 - } 494 - 495 - static inline bool io_local_work_pending(struct io_ring_ctx *ctx) 496 - { 497 - return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); 498 - } 499 - 500 - static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 501 - { 502 - return task_work_pending(current) || io_local_work_pending(ctx); 503 - } 504 - 505 - static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) 506 - { 507 - lockdep_assert_held(&ctx->uring_lock); 508 - } 509 - 510 459 /* 511 460 * Don't complete immediately but use deferred completion infrastructure. 512 461 * Protected by ->uring_lock and can only be used either with ··· 509 564 } 510 565 *req = io_extract_req(ctx); 511 566 return true; 512 - } 513 - 514 - static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) 515 - { 516 - return likely(ctx->submitter_task == current); 517 - } 518 - 519 - static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) 520 - { 521 - return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || 522 - ctx->submitter_task == current); 523 567 } 524 568 525 569 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)

+3 -2

io_uring/kbuf.c

··· 669 669 bl->buf_ring = br; 670 670 if (reg.flags & IOU_PBUF_RING_INC) 671 671 bl->flags |= IOBL_INC; 672 - io_buffer_add_list(ctx, bl, reg.bgid); 673 - return 0; 672 + ret = io_buffer_add_list(ctx, bl, reg.bgid); 673 + if (!ret) 674 + return 0; 674 675 fail: 675 676 io_free_region(ctx->user, &bl->region); 676 677 kfree(bl);

+1 -1

io_uring/memmap.c

··· 56 56 if (WARN_ON_ONCE(nr_pages > INT_MAX)) 57 57 return ERR_PTR(-EOVERFLOW); 58 58 59 - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); 59 + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); 60 60 if (!pages) 61 61 return ERR_PTR(-ENOMEM); 62 62

+1

io_uring/memmap.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef IO_URING_MEMMAP_H 2 3 #define IO_URING_MEMMAP_H 3 4

+1

io_uring/mock_file.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 1 2 #include <linux/device.h> 2 3 #include <linux/init.h> 3 4 #include <linux/kernel.h>

+14 -14

io_uring/msg_ring.c

··· 80 80 percpu_ref_put(&ctx->refs); 81 81 } 82 82 83 - static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, 83 + static void io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, 84 84 int res, u32 cflags, u64 user_data) 85 85 { 86 - if (!READ_ONCE(ctx->submitter_task)) { 87 - kfree_rcu(req, rcu_head); 88 - return -EOWNERDEAD; 89 - } 90 86 req->opcode = IORING_OP_NOP; 91 87 req->cqe.user_data = user_data; 92 88 io_req_set_res(req, res, cflags); ··· 91 95 req->tctx = NULL; 92 96 req->io_task_work.func = io_msg_tw_complete; 93 97 io_req_task_work_add_remote(req, IOU_F_TWQ_LAZY_WAKE); 94 - return 0; 95 98 } 96 99 97 100 static int io_msg_data_remote(struct io_ring_ctx *target_ctx, ··· 106 111 if (msg->flags & IORING_MSG_RING_FLAGS_PASS) 107 112 flags = msg->cqe_flags; 108 113 109 - return io_msg_remote_post(target_ctx, target, msg->len, flags, 110 - msg->user_data); 114 + io_msg_remote_post(target_ctx, target, msg->len, flags, msg->user_data); 115 + return 0; 111 116 } 112 117 113 118 static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, ··· 120 125 return -EINVAL; 121 126 if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) 122 127 return -EINVAL; 123 - if (target_ctx->flags & IORING_SETUP_R_DISABLED) 128 + /* 129 + * Keep IORING_SETUP_R_DISABLED check before submitter_task load 130 + * in io_msg_data_remote() -> io_req_task_work_add_remote() 131 + */ 132 + if (smp_load_acquire(&target_ctx->flags) & IORING_SETUP_R_DISABLED) 124 133 return -EBADFD; 125 134 126 135 if (io_msg_need_remote(target_ctx)) ··· 222 223 { 223 224 struct io_ring_ctx *ctx = req->file->private_data; 224 225 struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); 225 - struct task_struct *task = READ_ONCE(ctx->submitter_task); 226 - 227 - if (unlikely(!task)) 228 - return -EOWNERDEAD; 226 + struct task_struct *task = ctx->submitter_task; 229 227 230 228 init_task_work(&msg->tw, io_msg_tw_fd_complete); 231 229 if (task_work_add(task, &msg->tw, TWA_SIGNAL)) ··· 241 245 return -EINVAL; 242 246 if (target_ctx == ctx) 243 247 return -EINVAL; 244 - if (target_ctx->flags & IORING_SETUP_R_DISABLED) 248 + /* 249 + * Keep IORING_SETUP_R_DISABLED check before submitter_task load 250 + * in io_msg_fd_remote() 251 + */ 252 + if (smp_load_acquire(&target_ctx->flags) & IORING_SETUP_R_DISABLED) 245 253 return -EBADFD; 246 254 if (!msg->src_file) { 247 255 int ret = io_msg_grab_file(req, issue_flags);

+5 -1

io_uring/net.c

··· 515 515 516 516 cflags = io_put_kbufs(req, sel->val, sel->buf_list, io_bundle_nbufs(kmsg, sel->val)); 517 517 518 - if (bundle_finished || req->flags & REQ_F_BL_EMPTY) 518 + /* 519 + * Don't start new bundles if the buffer list is empty, or if the 520 + * current operation needed to go through polling to complete. 521 + */ 522 + if (bundle_finished || req->flags & (REQ_F_BL_EMPTY | REQ_F_POLLED)) 519 523 goto finish; 520 524 521 525 /*

+1

io_uring/notif.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 1 2 #include <linux/kernel.h> 2 3 #include <linux/errno.h> 3 4 #include <linux/file.h>

+1

io_uring/refs.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef IOU_REQ_REF_H 2 3 #define IOU_REQ_REF_H 3 4

+27 -15

io_uring/register.c

··· 103 103 return id; 104 104 } 105 105 106 + /* 107 + * Returns number of restrictions parsed and added on success, or < 0 for 108 + * an error. 109 + */ 106 110 static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, 107 111 struct io_restriction *restrictions) 108 112 { ··· 133 129 if (res[i].register_op >= IORING_REGISTER_LAST) 134 130 goto err; 135 131 __set_bit(res[i].register_op, restrictions->register_op); 132 + restrictions->reg_registered = true; 136 133 break; 137 134 case IORING_RESTRICTION_SQE_OP: 138 135 if (res[i].sqe_op >= IORING_OP_LAST) 139 136 goto err; 140 137 __set_bit(res[i].sqe_op, restrictions->sqe_op); 138 + restrictions->op_registered = true; 141 139 break; 142 140 case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: 143 141 restrictions->sqe_flags_allowed = res[i].sqe_flags; 142 + restrictions->op_registered = true; 144 143 break; 145 144 case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: 146 145 restrictions->sqe_flags_required = res[i].sqe_flags; 146 + restrictions->op_registered = true; 147 147 break; 148 148 default: 149 149 goto err; 150 150 } 151 151 } 152 - 153 - ret = 0; 154 - 152 + ret = nr_args; 153 + if (!nr_args) { 154 + restrictions->op_registered = true; 155 + restrictions->reg_registered = true; 156 + } 155 157 err: 156 158 kfree(res); 157 159 return ret; ··· 173 163 return -EBADFD; 174 164 175 165 /* We allow only a single restrictions registration */ 176 - if (ctx->restrictions.registered) 166 + if (ctx->restrictions.op_registered || ctx->restrictions.reg_registered) 177 167 return -EBUSY; 178 168 179 169 ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); 180 170 /* Reset all restrictions if an error happened */ 181 - if (ret != 0) 171 + if (ret < 0) { 182 172 memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); 183 - else 184 - ctx->restrictions.registered = true; 185 - return ret; 173 + return ret; 174 + } 175 + if (ctx->restrictions.op_registered) 176 + ctx->op_restricted = 1; 177 + if (ctx->restrictions.reg_registered) 178 + ctx->reg_restricted = 1; 179 + return 0; 186 180 } 187 181 188 182 static int io_register_enable_rings(struct io_ring_ctx *ctx) ··· 194 180 if (!(ctx->flags & IORING_SETUP_R_DISABLED)) 195 181 return -EBADFD; 196 182 197 - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { 198 - WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); 183 + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER) { 184 + ctx->submitter_task = get_task_struct(current); 199 185 /* 200 186 * Lazy activation attempts would fail if it was polled before 201 187 * submitter_task is set. ··· 204 190 io_activate_pollwq(ctx); 205 191 } 206 192 207 - if (ctx->restrictions.registered) 208 - ctx->restricted = 1; 209 - 210 - ctx->flags &= ~IORING_SETUP_R_DISABLED; 193 + /* Keep submitter_task store before clearing IORING_SETUP_R_DISABLED */ 194 + smp_store_release(&ctx->flags, ctx->flags & ~IORING_SETUP_R_DISABLED); 211 195 if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait)) 212 196 wake_up(&ctx->sq_data->wait); 213 197 return 0; ··· 637 625 if (ctx->submitter_task && ctx->submitter_task != current) 638 626 return -EEXIST; 639 627 640 - if (ctx->restricted) { 628 + if (ctx->reg_restricted && !(ctx->flags & IORING_SETUP_R_DISABLED)) { 641 629 opcode = array_index_nospec(opcode, IORING_REGISTER_LAST); 642 630 if (!test_bit(opcode, ctx->restrictions.register_op)) 643 631 return -EACCES;

+1 -1

io_uring/rsrc.c

··· 1329 1329 1330 1330 int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) 1331 1331 { 1332 - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 1332 + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_NOWARN; 1333 1333 struct iovec *iov; 1334 1334 1335 1335 iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp);

+1 -1

io_uring/rsrc.h

··· 90 90 struct io_imu_folio_data *data); 91 91 92 92 static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, 93 - int index) 93 + unsigned int index) 94 94 { 95 95 if (index < data->nr) 96 96 return data->nodes[array_index_nospec(index, data->nr)];

+11 -22

io_uring/rw.c

··· 1303 1303 struct io_comp_batch *iob, unsigned int poll_flags) 1304 1304 { 1305 1305 struct io_ring_ctx *ctx = req->ctx; 1306 - u64 runtime, sleep_time; 1306 + u64 runtime, sleep_time, iopoll_start; 1307 1307 int ret; 1308 1308 1309 + iopoll_start = READ_ONCE(req->iopoll_start); 1309 1310 sleep_time = io_hybrid_iopoll_delay(ctx, req); 1310 1311 ret = io_uring_classic_poll(req, iob, poll_flags); 1311 - runtime = ktime_get_ns() - req->iopoll_start - sleep_time; 1312 + runtime = ktime_get_ns() - iopoll_start - sleep_time; 1312 1313 1313 1314 /* 1314 1315 * Use minimum sleep time if we're polling devices with different ··· 1323 1322 1324 1323 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) 1325 1324 { 1326 - struct io_wq_work_node *pos, *start, *prev; 1327 1325 unsigned int poll_flags = 0; 1328 1326 DEFINE_IO_COMP_BATCH(iob); 1327 + struct io_kiocb *req, *tmp; 1329 1328 int nr_events = 0; 1330 1329 1331 1330 /* ··· 1335 1334 if (ctx->poll_multi_queue || force_nonspin) 1336 1335 poll_flags |= BLK_POLL_ONESHOT; 1337 1336 1338 - wq_list_for_each(pos, start, &ctx->iopoll_list) { 1339 - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 1337 + list_for_each_entry(req, &ctx->iopoll_list, iopoll_node) { 1340 1338 int ret; 1341 1339 1342 1340 /* ··· 1364 1364 1365 1365 if (!rq_list_empty(&iob.req_list)) 1366 1366 iob.complete(&iob); 1367 - else if (!pos) 1368 - return 0; 1369 1367 1370 - prev = start; 1371 - wq_list_for_each_resume(pos, prev) { 1372 - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); 1373 - 1368 + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_node) { 1374 1369 /* order with io_complete_rw_iopoll(), e.g. ->result updates */ 1375 1370 if (!smp_load_acquire(&req->iopoll_completed)) 1376 - break; 1371 + continue; 1372 + list_del(&req->iopoll_node); 1373 + wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs); 1377 1374 nr_events++; 1378 1375 req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL); 1379 1376 if (req->opcode != IORING_OP_URING_CMD) 1380 1377 io_req_rw_cleanup(req, 0); 1381 1378 } 1382 - if (unlikely(!nr_events)) 1383 - return 0; 1384 - 1385 - pos = start ? start->next : ctx->iopoll_list.first; 1386 - wq_list_cut(&ctx->iopoll_list, prev, start); 1387 - 1388 - if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) 1389 - return 0; 1390 - ctx->submit_state.compl_reqs.first = pos; 1391 - __io_submit_flush_completions(ctx); 1379 + if (nr_events) 1380 + __io_submit_flush_completions(ctx); 1392 1381 return nr_events; 1393 1382 } 1394 1383

+1 -12

io_uring/slist.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 1 2 #ifndef INTERNAL_IO_SLIST_H 2 3 #define INTERNAL_IO_SLIST_H 3 4 ··· 9 8 10 9 #define wq_list_for_each(pos, prv, head) \ 11 10 for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) 12 - 13 - #define wq_list_for_each_resume(pos, prv) \ 14 - for (; pos; prv = pos, pos = (pos)->next) 15 11 16 12 #define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) 17 13 ··· 39 41 list->last->next = node; 40 42 list->last = node; 41 43 } 42 - } 43 - 44 - static inline void wq_list_add_head(struct io_wq_work_node *node, 45 - struct io_wq_work_list *list) 46 - { 47 - node->next = list->first; 48 - if (!node->next) 49 - list->last = node; 50 - WRITE_ONCE(list->first, node); 51 44 } 52 45 53 46 static inline void wq_list_cut(struct io_wq_work_list *list,

+4 -4

io_uring/sqpoll.c

··· 212 212 if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) 213 213 to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; 214 214 215 - if (to_submit || !wq_list_empty(&ctx->iopoll_list)) { 215 + if (to_submit || !list_empty(&ctx->iopoll_list)) { 216 216 const struct cred *creds = NULL; 217 217 218 218 io_sq_start_worktime(ist); ··· 221 221 creds = override_creds(ctx->sq_creds); 222 222 223 223 mutex_lock(&ctx->uring_lock); 224 - if (!wq_list_empty(&ctx->iopoll_list)) 224 + if (!list_empty(&ctx->iopoll_list)) 225 225 io_do_iopoll(ctx, true); 226 226 227 227 /* ··· 344 344 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 345 345 int ret = __io_sq_thread(ctx, sqd, cap_entries, &ist); 346 346 347 - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) 347 + if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) 348 348 sqt_spin = true; 349 349 } 350 350 if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) ··· 379 379 atomic_or(IORING_SQ_NEED_WAKEUP, 380 380 &ctx->rings->sq_flags); 381 381 if ((ctx->flags & IORING_SETUP_IOPOLL) && 382 - !wq_list_empty(&ctx->iopoll_list)) { 382 + !list_empty(&ctx->iopoll_list)) { 383 383 needs_sched = false; 384 384 break; 385 385 }

+2

io_uring/sync.c

··· 62 62 return -EINVAL; 63 63 64 64 sync->off = READ_ONCE(sqe->off); 65 + if (sync->off < 0) 66 + return -EINVAL; 65 67 sync->len = READ_ONCE(sqe->len); 66 68 req->flags |= REQ_F_FORCE_ASYNC; 67 69 return 0;

+11

io_uring/tctx.c

··· 122 122 return ret; 123 123 } 124 124 } 125 + 126 + /* 127 + * Re-activate io-wq keepalive on any new io_uring usage. The wq may have 128 + * been marked for idle-exit when the task temporarily had no active 129 + * io_uring instances. 130 + */ 131 + if (tctx->io_wq) 132 + io_wq_set_exit_on_idle(tctx->io_wq, false); 125 133 if (!xa_load(&tctx->xa, (unsigned long)ctx)) { 126 134 node = kmalloc(sizeof(*node), GFP_KERNEL); 127 135 if (!node) ··· 191 183 if (tctx->last == node->ctx) 192 184 tctx->last = NULL; 193 185 kfree(node); 186 + 187 + if (xa_empty(&tctx->xa) && tctx->io_wq) 188 + io_wq_set_exit_on_idle(tctx->io_wq, true); 194 189 } 195 190 196 191 __cold void io_uring_clean_tctx(struct io_uring_task *tctx)

+1 -1

io_uring/timeout.c

··· 130 130 u32 seq; 131 131 132 132 raw_spin_lock_irq(&ctx->timeout_lock); 133 - seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); 133 + seq = READ_ONCE(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts); 134 134 135 135 list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { 136 136 struct io_kiocb *req = cmd_to_io_kiocb(timeout);

+355

io_uring/tw.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Task work handling for io_uring 4 + */ 5 + #include <linux/kernel.h> 6 + #include <linux/errno.h> 7 + #include <linux/sched/signal.h> 8 + #include <linux/io_uring.h> 9 + #include <linux/indirect_call_wrapper.h> 10 + 11 + #include "io_uring.h" 12 + #include "tctx.h" 13 + #include "poll.h" 14 + #include "rw.h" 15 + #include "eventfd.h" 16 + #include "wait.h" 17 + 18 + void io_fallback_req_func(struct work_struct *work) 19 + { 20 + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, 21 + fallback_work.work); 22 + struct llist_node *node = llist_del_all(&ctx->fallback_llist); 23 + struct io_kiocb *req, *tmp; 24 + struct io_tw_state ts = {}; 25 + 26 + percpu_ref_get(&ctx->refs); 27 + mutex_lock(&ctx->uring_lock); 28 + ts.cancel = io_should_terminate_tw(ctx); 29 + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) 30 + req->io_task_work.func((struct io_tw_req){req}, ts); 31 + io_submit_flush_completions(ctx); 32 + mutex_unlock(&ctx->uring_lock); 33 + percpu_ref_put(&ctx->refs); 34 + } 35 + 36 + static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) 37 + { 38 + if (!ctx) 39 + return; 40 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 41 + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 42 + 43 + io_submit_flush_completions(ctx); 44 + mutex_unlock(&ctx->uring_lock); 45 + percpu_ref_put(&ctx->refs); 46 + } 47 + 48 + /* 49 + * Run queued task_work, returning the number of entries processed in *count. 50 + * If more entries than max_entries are available, stop processing once this 51 + * is reached and return the rest of the list. 52 + */ 53 + struct llist_node *io_handle_tw_list(struct llist_node *node, 54 + unsigned int *count, 55 + unsigned int max_entries) 56 + { 57 + struct io_ring_ctx *ctx = NULL; 58 + struct io_tw_state ts = { }; 59 + 60 + do { 61 + struct llist_node *next = node->next; 62 + struct io_kiocb *req = container_of(node, struct io_kiocb, 63 + io_task_work.node); 64 + 65 + if (req->ctx != ctx) { 66 + ctx_flush_and_put(ctx, ts); 67 + ctx = req->ctx; 68 + mutex_lock(&ctx->uring_lock); 69 + percpu_ref_get(&ctx->refs); 70 + ts.cancel = io_should_terminate_tw(ctx); 71 + } 72 + INDIRECT_CALL_2(req->io_task_work.func, 73 + io_poll_task_func, io_req_rw_complete, 74 + (struct io_tw_req){req}, ts); 75 + node = next; 76 + (*count)++; 77 + if (unlikely(need_resched())) { 78 + ctx_flush_and_put(ctx, ts); 79 + ctx = NULL; 80 + cond_resched(); 81 + } 82 + } while (node && *count < max_entries); 83 + 84 + ctx_flush_and_put(ctx, ts); 85 + return node; 86 + } 87 + 88 + static __cold void __io_fallback_tw(struct llist_node *node, bool sync) 89 + { 90 + struct io_ring_ctx *last_ctx = NULL; 91 + struct io_kiocb *req; 92 + 93 + while (node) { 94 + req = container_of(node, struct io_kiocb, io_task_work.node); 95 + node = node->next; 96 + if (last_ctx != req->ctx) { 97 + if (last_ctx) { 98 + if (sync) 99 + flush_delayed_work(&last_ctx->fallback_work); 100 + percpu_ref_put(&last_ctx->refs); 101 + } 102 + last_ctx = req->ctx; 103 + percpu_ref_get(&last_ctx->refs); 104 + } 105 + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) 106 + schedule_delayed_work(&last_ctx->fallback_work, 1); 107 + } 108 + 109 + if (last_ctx) { 110 + if (sync) 111 + flush_delayed_work(&last_ctx->fallback_work); 112 + percpu_ref_put(&last_ctx->refs); 113 + } 114 + } 115 + 116 + static void io_fallback_tw(struct io_uring_task *tctx, bool sync) 117 + { 118 + struct llist_node *node = llist_del_all(&tctx->task_list); 119 + 120 + __io_fallback_tw(node, sync); 121 + } 122 + 123 + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, 124 + unsigned int max_entries, 125 + unsigned int *count) 126 + { 127 + struct llist_node *node; 128 + 129 + node = llist_del_all(&tctx->task_list); 130 + if (node) { 131 + node = llist_reverse_order(node); 132 + node = io_handle_tw_list(node, count, max_entries); 133 + } 134 + 135 + /* relaxed read is enough as only the task itself sets ->in_cancel */ 136 + if (unlikely(atomic_read(&tctx->in_cancel))) 137 + io_uring_drop_tctx_refs(current); 138 + 139 + trace_io_uring_task_work_run(tctx, *count); 140 + return node; 141 + } 142 + 143 + void tctx_task_work(struct callback_head *cb) 144 + { 145 + struct io_uring_task *tctx; 146 + struct llist_node *ret; 147 + unsigned int count = 0; 148 + 149 + tctx = container_of(cb, struct io_uring_task, task_work); 150 + ret = tctx_task_work_run(tctx, UINT_MAX, &count); 151 + /* can't happen */ 152 + WARN_ON_ONCE(ret); 153 + } 154 + 155 + void io_req_local_work_add(struct io_kiocb *req, unsigned flags) 156 + { 157 + struct io_ring_ctx *ctx = req->ctx; 158 + unsigned nr_wait, nr_tw, nr_tw_prev; 159 + struct llist_node *head; 160 + 161 + /* See comment above IO_CQ_WAKE_INIT */ 162 + BUILD_BUG_ON(IO_CQ_WAKE_FORCE <= IORING_MAX_CQ_ENTRIES); 163 + 164 + /* 165 + * We don't know how many requests there are in the link and whether 166 + * they can even be queued lazily, fall back to non-lazy. 167 + */ 168 + if (req->flags & IO_REQ_LINK_FLAGS) 169 + flags &= ~IOU_F_TWQ_LAZY_WAKE; 170 + 171 + guard(rcu)(); 172 + 173 + head = READ_ONCE(ctx->work_llist.first); 174 + do { 175 + nr_tw_prev = 0; 176 + if (head) { 177 + struct io_kiocb *first_req = container_of(head, 178 + struct io_kiocb, 179 + io_task_work.node); 180 + /* 181 + * Might be executed at any moment, rely on 182 + * SLAB_TYPESAFE_BY_RCU to keep it alive. 183 + */ 184 + nr_tw_prev = READ_ONCE(first_req->nr_tw); 185 + } 186 + 187 + /* 188 + * Theoretically, it can overflow, but that's fine as one of 189 + * previous adds should've tried to wake the task. 190 + */ 191 + nr_tw = nr_tw_prev + 1; 192 + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) 193 + nr_tw = IO_CQ_WAKE_FORCE; 194 + 195 + req->nr_tw = nr_tw; 196 + req->io_task_work.node.next = head; 197 + } while (!try_cmpxchg(&ctx->work_llist.first, &head, 198 + &req->io_task_work.node)); 199 + 200 + /* 201 + * cmpxchg implies a full barrier, which pairs with the barrier 202 + * in set_current_state() on the io_cqring_wait() side. It's used 203 + * to ensure that either we see updated ->cq_wait_nr, or waiters 204 + * going to sleep will observe the work added to the list, which 205 + * is similar to the wait/wawke task state sync. 206 + */ 207 + 208 + if (!head) { 209 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 210 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 211 + if (ctx->has_evfd) 212 + io_eventfd_signal(ctx, false); 213 + } 214 + 215 + nr_wait = atomic_read(&ctx->cq_wait_nr); 216 + /* not enough or no one is waiting */ 217 + if (nr_tw < nr_wait) 218 + return; 219 + /* the previous add has already woken it up */ 220 + if (nr_tw_prev >= nr_wait) 221 + return; 222 + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); 223 + } 224 + 225 + void io_req_normal_work_add(struct io_kiocb *req) 226 + { 227 + struct io_uring_task *tctx = req->tctx; 228 + struct io_ring_ctx *ctx = req->ctx; 229 + 230 + /* task_work already pending, we're done */ 231 + if (!llist_add(&req->io_task_work.node, &tctx->task_list)) 232 + return; 233 + 234 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 235 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 236 + 237 + /* SQPOLL doesn't need the task_work added, it'll run it itself */ 238 + if (ctx->flags & IORING_SETUP_SQPOLL) { 239 + __set_notify_signal(tctx->task); 240 + return; 241 + } 242 + 243 + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) 244 + return; 245 + 246 + io_fallback_tw(tctx, false); 247 + } 248 + 249 + void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) 250 + { 251 + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) 252 + return; 253 + __io_req_task_work_add(req, flags); 254 + } 255 + 256 + void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) 257 + { 258 + struct llist_node *node = llist_del_all(&ctx->work_llist); 259 + 260 + __io_fallback_tw(node, false); 261 + node = llist_del_all(&ctx->retry_llist); 262 + __io_fallback_tw(node, false); 263 + } 264 + 265 + static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, 266 + int min_events) 267 + { 268 + if (!io_local_work_pending(ctx)) 269 + return false; 270 + if (events < min_events) 271 + return true; 272 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 273 + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 274 + return false; 275 + } 276 + 277 + static int __io_run_local_work_loop(struct llist_node **node, 278 + io_tw_token_t tw, 279 + int events) 280 + { 281 + int ret = 0; 282 + 283 + while (*node) { 284 + struct llist_node *next = (*node)->next; 285 + struct io_kiocb *req = container_of(*node, struct io_kiocb, 286 + io_task_work.node); 287 + INDIRECT_CALL_2(req->io_task_work.func, 288 + io_poll_task_func, io_req_rw_complete, 289 + (struct io_tw_req){req}, tw); 290 + *node = next; 291 + if (++ret >= events) 292 + break; 293 + } 294 + 295 + return ret; 296 + } 297 + 298 + static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, 299 + int min_events, int max_events) 300 + { 301 + struct llist_node *node; 302 + unsigned int loops = 0; 303 + int ret = 0; 304 + 305 + if (WARN_ON_ONCE(ctx->submitter_task != current)) 306 + return -EEXIST; 307 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 308 + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 309 + again: 310 + tw.cancel = io_should_terminate_tw(ctx); 311 + min_events -= ret; 312 + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); 313 + if (ctx->retry_llist.first) 314 + goto retry_done; 315 + 316 + /* 317 + * llists are in reverse order, flip it back the right way before 318 + * running the pending items. 319 + */ 320 + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); 321 + ret += __io_run_local_work_loop(&node, tw, max_events - ret); 322 + ctx->retry_llist.first = node; 323 + loops++; 324 + 325 + if (io_run_local_work_continue(ctx, ret, min_events)) 326 + goto again; 327 + retry_done: 328 + io_submit_flush_completions(ctx); 329 + if (io_run_local_work_continue(ctx, ret, min_events)) 330 + goto again; 331 + 332 + trace_io_uring_local_work_run(ctx, ret, loops); 333 + return ret; 334 + } 335 + 336 + int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) 337 + { 338 + struct io_tw_state ts = {}; 339 + 340 + if (!io_local_work_pending(ctx)) 341 + return 0; 342 + return __io_run_local_work(ctx, ts, min_events, 343 + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 344 + } 345 + 346 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events) 347 + { 348 + struct io_tw_state ts = {}; 349 + int ret; 350 + 351 + mutex_lock(&ctx->uring_lock); 352 + ret = __io_run_local_work(ctx, ts, min_events, max_events); 353 + mutex_unlock(&ctx->uring_lock); 354 + return ret; 355 + }

+116

io_uring/tw.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IOU_TW_H 3 + #define IOU_TW_H 4 + 5 + #include <linux/sched.h> 6 + #include <linux/percpu-refcount.h> 7 + #include <linux/io_uring_types.h> 8 + 9 + #define IO_LOCAL_TW_DEFAULT_MAX 20 10 + 11 + /* 12 + * Terminate the request if either of these conditions are true: 13 + * 14 + * 1) It's being executed by the original task, but that task is marked 15 + * with PF_EXITING as it's exiting. 16 + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is 17 + * our fallback task_work. 18 + * 3) The ring has been closed and is going away. 19 + */ 20 + static inline bool io_should_terminate_tw(struct io_ring_ctx *ctx) 21 + { 22 + return (current->flags & (PF_EXITING | PF_KTHREAD)) || percpu_ref_is_dying(&ctx->refs); 23 + } 24 + 25 + void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); 26 + struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); 27 + void tctx_task_work(struct callback_head *cb); 28 + int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int max_events); 29 + int io_run_task_work_sig(struct io_ring_ctx *ctx); 30 + 31 + __cold void io_fallback_req_func(struct work_struct *work); 32 + __cold void io_move_task_work_from_local(struct io_ring_ctx *ctx); 33 + int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events); 34 + 35 + void io_req_local_work_add(struct io_kiocb *req, unsigned flags); 36 + void io_req_normal_work_add(struct io_kiocb *req); 37 + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); 38 + 39 + static inline void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) 40 + { 41 + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) 42 + io_req_local_work_add(req, flags); 43 + else 44 + io_req_normal_work_add(req); 45 + } 46 + 47 + static inline void io_req_task_work_add(struct io_kiocb *req) 48 + { 49 + __io_req_task_work_add(req, 0); 50 + } 51 + 52 + static inline int io_run_task_work(void) 53 + { 54 + bool ret = false; 55 + 56 + /* 57 + * Always check-and-clear the task_work notification signal. With how 58 + * signaling works for task_work, we can find it set with nothing to 59 + * run. We need to clear it for that case, like get_signal() does. 60 + */ 61 + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) 62 + clear_notify_signal(); 63 + /* 64 + * PF_IO_WORKER never returns to userspace, so check here if we have 65 + * notify work that needs processing. 66 + */ 67 + if (current->flags & PF_IO_WORKER) { 68 + if (test_thread_flag(TIF_NOTIFY_RESUME)) { 69 + __set_current_state(TASK_RUNNING); 70 + resume_user_mode_work(NULL); 71 + } 72 + if (current->io_uring) { 73 + unsigned int count = 0; 74 + 75 + __set_current_state(TASK_RUNNING); 76 + tctx_task_work_run(current->io_uring, UINT_MAX, &count); 77 + if (count) 78 + ret = true; 79 + } 80 + } 81 + if (task_work_pending(current)) { 82 + __set_current_state(TASK_RUNNING); 83 + task_work_run(); 84 + ret = true; 85 + } 86 + 87 + return ret; 88 + } 89 + 90 + static inline bool io_local_work_pending(struct io_ring_ctx *ctx) 91 + { 92 + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); 93 + } 94 + 95 + static inline bool io_task_work_pending(struct io_ring_ctx *ctx) 96 + { 97 + return task_work_pending(current) || io_local_work_pending(ctx); 98 + } 99 + 100 + static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) 101 + { 102 + lockdep_assert_held(&ctx->uring_lock); 103 + } 104 + 105 + static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) 106 + { 107 + return likely(ctx->submitter_task == current); 108 + } 109 + 110 + static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) 111 + { 112 + return likely(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN) || 113 + ctx->submitter_task == current); 114 + } 115 + 116 + #endif

+9

io_uring/uring_cmd.c

··· 104 104 struct io_kiocb *req = cmd_to_io_kiocb(cmd); 105 105 struct io_ring_ctx *ctx = req->ctx; 106 106 107 + /* 108 + * Doing cancelations on IOPOLL requests are not supported. Both 109 + * because they can't get canceled in the block stack, but also 110 + * because iopoll completion data overlaps with the hash_node used 111 + * for tracking. 112 + */ 113 + if (ctx->flags & IORING_SETUP_IOPOLL) 114 + return; 115 + 107 116 if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) { 108 117 cmd->flags |= IORING_URING_CMD_CANCELABLE; 109 118 io_ring_submit_lock(ctx, issue_flags);

+308

io_uring/wait.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Waiting for completion events 4 + */ 5 + #include <linux/kernel.h> 6 + #include <linux/sched/signal.h> 7 + #include <linux/io_uring.h> 8 + 9 + #include <trace/events/io_uring.h> 10 + 11 + #include <uapi/linux/io_uring.h> 12 + 13 + #include "io_uring.h" 14 + #include "napi.h" 15 + #include "wait.h" 16 + 17 + static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 18 + int wake_flags, void *key) 19 + { 20 + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); 21 + 22 + /* 23 + * Cannot safely flush overflowed CQEs from here, ensure we wake up 24 + * the task, and the next invocation will do it. 25 + */ 26 + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) 27 + return autoremove_wake_function(curr, mode, wake_flags, key); 28 + return -1; 29 + } 30 + 31 + int io_run_task_work_sig(struct io_ring_ctx *ctx) 32 + { 33 + if (io_local_work_pending(ctx)) { 34 + __set_current_state(TASK_RUNNING); 35 + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) 36 + return 0; 37 + } 38 + if (io_run_task_work() > 0) 39 + return 0; 40 + if (task_sigpending(current)) 41 + return -EINTR; 42 + return 0; 43 + } 44 + 45 + static bool current_pending_io(void) 46 + { 47 + struct io_uring_task *tctx = current->io_uring; 48 + 49 + if (!tctx) 50 + return false; 51 + return percpu_counter_read_positive(&tctx->inflight); 52 + } 53 + 54 + static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) 55 + { 56 + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 57 + 58 + WRITE_ONCE(iowq->hit_timeout, 1); 59 + iowq->min_timeout = 0; 60 + wake_up_process(iowq->wq.private); 61 + return HRTIMER_NORESTART; 62 + } 63 + 64 + /* 65 + * Doing min_timeout portion. If we saw any timeouts, events, or have work, 66 + * wake up. If not, and we have a normal timeout, switch to that and keep 67 + * sleeping. 68 + */ 69 + static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) 70 + { 71 + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 72 + struct io_ring_ctx *ctx = iowq->ctx; 73 + 74 + /* no general timeout, or shorter (or equal), we are done */ 75 + if (iowq->timeout == KTIME_MAX || 76 + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) 77 + goto out_wake; 78 + /* work we may need to run, wake function will see if we need to wake */ 79 + if (io_has_work(ctx)) 80 + goto out_wake; 81 + /* got events since we started waiting, min timeout is done */ 82 + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) 83 + goto out_wake; 84 + /* if we have any events and min timeout expired, we're done */ 85 + if (io_cqring_events(ctx)) 86 + goto out_wake; 87 + 88 + /* 89 + * If using deferred task_work running and application is waiting on 90 + * more than one request, ensure we reset it now where we are switching 91 + * to normal sleeps. Any request completion post min_wait should wake 92 + * the task and return. 93 + */ 94 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 95 + atomic_set(&ctx->cq_wait_nr, 1); 96 + smp_mb(); 97 + if (!llist_empty(&ctx->work_llist)) 98 + goto out_wake; 99 + } 100 + 101 + /* any generated CQE posted past this time should wake us up */ 102 + iowq->cq_tail = iowq->cq_min_tail; 103 + 104 + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); 105 + hrtimer_set_expires(timer, iowq->timeout); 106 + return HRTIMER_RESTART; 107 + out_wake: 108 + return io_cqring_timer_wakeup(timer); 109 + } 110 + 111 + static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, 112 + clockid_t clock_id, ktime_t start_time) 113 + { 114 + ktime_t timeout; 115 + 116 + if (iowq->min_timeout) { 117 + timeout = ktime_add_ns(iowq->min_timeout, start_time); 118 + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, 119 + HRTIMER_MODE_ABS); 120 + } else { 121 + timeout = iowq->timeout; 122 + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, 123 + HRTIMER_MODE_ABS); 124 + } 125 + 126 + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); 127 + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); 128 + 129 + if (!READ_ONCE(iowq->hit_timeout)) 130 + schedule(); 131 + 132 + hrtimer_cancel(&iowq->t); 133 + destroy_hrtimer_on_stack(&iowq->t); 134 + __set_current_state(TASK_RUNNING); 135 + 136 + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; 137 + } 138 + 139 + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, 140 + struct io_wait_queue *iowq, 141 + struct ext_arg *ext_arg, 142 + ktime_t start_time) 143 + { 144 + int ret = 0; 145 + 146 + /* 147 + * Mark us as being in io_wait if we have pending requests, so cpufreq 148 + * can take into account that the task is waiting for IO - turns out 149 + * to be important for low QD IO. 150 + */ 151 + if (ext_arg->iowait && current_pending_io()) 152 + current->in_iowait = 1; 153 + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) 154 + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); 155 + else 156 + schedule(); 157 + current->in_iowait = 0; 158 + return ret; 159 + } 160 + 161 + /* If this returns > 0, the caller should retry */ 162 + static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 163 + struct io_wait_queue *iowq, 164 + struct ext_arg *ext_arg, 165 + ktime_t start_time) 166 + { 167 + if (unlikely(READ_ONCE(ctx->check_cq))) 168 + return 1; 169 + if (unlikely(io_local_work_pending(ctx))) 170 + return 1; 171 + if (unlikely(task_work_pending(current))) 172 + return 1; 173 + if (unlikely(task_sigpending(current))) 174 + return -EINTR; 175 + if (unlikely(io_should_wake(iowq))) 176 + return 0; 177 + 178 + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); 179 + } 180 + 181 + /* 182 + * Wait until events become available, if we don't already have some. The 183 + * application must reap them itself, as they reside on the shared cq ring. 184 + */ 185 + int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, 186 + struct ext_arg *ext_arg) 187 + { 188 + struct io_wait_queue iowq; 189 + struct io_rings *rings = ctx->rings; 190 + ktime_t start_time; 191 + int ret; 192 + 193 + min_events = min_t(int, min_events, ctx->cq_entries); 194 + 195 + if (!io_allowed_run_tw(ctx)) 196 + return -EEXIST; 197 + if (io_local_work_pending(ctx)) 198 + io_run_local_work(ctx, min_events, 199 + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 200 + io_run_task_work(); 201 + 202 + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) 203 + io_cqring_do_overflow_flush(ctx); 204 + if (__io_cqring_events_user(ctx) >= min_events) 205 + return 0; 206 + 207 + init_waitqueue_func_entry(&iowq.wq, io_wake_function); 208 + iowq.wq.private = current; 209 + INIT_LIST_HEAD(&iowq.wq.entry); 210 + iowq.ctx = ctx; 211 + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 212 + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); 213 + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 214 + iowq.hit_timeout = 0; 215 + iowq.min_timeout = ext_arg->min_time; 216 + iowq.timeout = KTIME_MAX; 217 + start_time = io_get_time(ctx); 218 + 219 + if (ext_arg->ts_set) { 220 + iowq.timeout = timespec64_to_ktime(ext_arg->ts); 221 + if (!(flags & IORING_ENTER_ABS_TIMER)) 222 + iowq.timeout = ktime_add(iowq.timeout, start_time); 223 + } 224 + 225 + if (ext_arg->sig) { 226 + #ifdef CONFIG_COMPAT 227 + if (in_compat_syscall()) 228 + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, 229 + ext_arg->argsz); 230 + else 231 + #endif 232 + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); 233 + 234 + if (ret) 235 + return ret; 236 + } 237 + 238 + io_napi_busy_loop(ctx, &iowq); 239 + 240 + trace_io_uring_cqring_wait(ctx, min_events); 241 + do { 242 + unsigned long check_cq; 243 + int nr_wait; 244 + 245 + /* if min timeout has been hit, don't reset wait count */ 246 + if (!iowq.hit_timeout) 247 + nr_wait = (int) iowq.cq_tail - 248 + READ_ONCE(ctx->rings->cq.tail); 249 + else 250 + nr_wait = 1; 251 + 252 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 253 + atomic_set(&ctx->cq_wait_nr, nr_wait); 254 + set_current_state(TASK_INTERRUPTIBLE); 255 + } else { 256 + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 257 + TASK_INTERRUPTIBLE); 258 + } 259 + 260 + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); 261 + __set_current_state(TASK_RUNNING); 262 + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 263 + 264 + /* 265 + * Run task_work after scheduling and before io_should_wake(). 266 + * If we got woken because of task_work being processed, run it 267 + * now rather than let the caller do another wait loop. 268 + */ 269 + if (io_local_work_pending(ctx)) 270 + io_run_local_work(ctx, nr_wait, nr_wait); 271 + io_run_task_work(); 272 + 273 + /* 274 + * Non-local task_work will be run on exit to userspace, but 275 + * if we're using DEFER_TASKRUN, then we could have waited 276 + * with a timeout for a number of requests. If the timeout 277 + * hits, we could have some requests ready to process. Ensure 278 + * this break is _after_ we have run task_work, to avoid 279 + * deferring running potentially pending requests until the 280 + * next time we wait for events. 281 + */ 282 + if (ret < 0) 283 + break; 284 + 285 + check_cq = READ_ONCE(ctx->check_cq); 286 + if (unlikely(check_cq)) { 287 + /* let the caller flush overflows, retry */ 288 + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 289 + io_cqring_do_overflow_flush(ctx); 290 + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { 291 + ret = -EBADR; 292 + break; 293 + } 294 + } 295 + 296 + if (io_should_wake(&iowq)) { 297 + ret = 0; 298 + break; 299 + } 300 + cond_resched(); 301 + } while (1); 302 + 303 + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 304 + finish_wait(&ctx->cq_wait, &iowq.wq); 305 + restore_saved_sigmask_unless(ret == -EINTR); 306 + 307 + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; 308 + }

+49

io_uring/wait.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IOU_WAIT_H 3 + #define IOU_WAIT_H 4 + 5 + #include <linux/io_uring_types.h> 6 + 7 + /* 8 + * No waiters. It's larger than any valid value of the tw counter 9 + * so that tests against ->cq_wait_nr would fail and skip wake_up(). 10 + */ 11 + #define IO_CQ_WAKE_INIT (-1U) 12 + /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 13 + #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 14 + 15 + struct ext_arg { 16 + size_t argsz; 17 + struct timespec64 ts; 18 + const sigset_t __user *sig; 19 + ktime_t min_time; 20 + bool ts_set; 21 + bool iowait; 22 + }; 23 + 24 + int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, 25 + struct ext_arg *ext_arg); 26 + int io_run_task_work_sig(struct io_ring_ctx *ctx); 27 + void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); 28 + 29 + static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 30 + { 31 + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 32 + } 33 + 34 + static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) 35 + { 36 + return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); 37 + } 38 + 39 + /* 40 + * Reads the tail/head of the CQ ring while providing an acquire ordering, 41 + * see comment at top of io_uring.c. 42 + */ 43 + static inline unsigned io_cqring_events(struct io_ring_ctx *ctx) 44 + { 45 + smp_rmb(); 46 + return __io_cqring_events(ctx); 47 + } 48 + 49 + #endif

Configure Feed

Configure Feed