io_uring: split out CQ waiting code into wait.c

+7 -6

io_uring/Makefile

··· 8 8 9 9 obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ 10 10 tctx.o filetable.o rw.o poll.o \ 11 - tw.o eventfd.o uring_cmd.o openclose.o \ 12 - sqpoll.o xattr.o nop.o fs.o splice.o \ 13 - sync.o msg_ring.o advise.o openclose.o \ 14 - statx.o timeout.o cancel.o \ 15 - waitid.o register.o truncate.o \ 16 - memmap.o alloc_cache.o query.o 11 + tw.o wait.o eventfd.o uring_cmd.o \ 12 + openclose.o sqpoll.o xattr.o nop.o \ 13 + fs.o splice.o sync.o msg_ring.o \ 14 + advise.o openclose.o statx.o timeout.o \ 15 + cancel.o waitid.o register.o \ 16 + truncate.o memmap.o alloc_cache.o \ 17 + query.o 17 18 18 19 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o 19 20 obj-$(CONFIG_IO_WQ) += io-wq.o

+1

io_uring/cancel.c

··· 19 19 #include "waitid.h" 20 20 #include "futex.h" 21 21 #include "cancel.h" 22 + #include "wait.h" 22 23 23 24 struct io_cancel { 24 25 struct file *file;

+2 -320

io_uring/io_uring.c

··· 93 93 #include "rw.h" 94 94 #include "alloc_cache.h" 95 95 #include "eventfd.h" 96 + #include "wait.h" 96 97 97 98 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ 98 99 IOSQE_IO_HARDLINK | IOSQE_ASYNC) ··· 165 164 req->comp_list.next = IO_URING_PTR_POISON; 166 165 req->file_node = IO_URING_PTR_POISON; 167 166 req->link = IO_URING_PTR_POISON; 168 - } 169 - 170 - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 171 - { 172 - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 173 - } 174 - 175 - static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) 176 - { 177 - return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); 178 167 } 179 168 180 169 static inline void req_fail_link_node(struct io_kiocb *req, int res) ··· 580 589 __io_cqring_overflow_flush(ctx, true); 581 590 } 582 591 583 - static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) 592 + void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) 584 593 { 585 594 mutex_lock(&ctx->uring_lock); 586 595 __io_cqring_overflow_flush(ctx, false); ··· 1150 1159 io_queue_deferred(ctx); 1151 1160 1152 1161 ctx->submit_state.cq_flush = false; 1153 - } 1154 - 1155 - static unsigned io_cqring_events(struct io_ring_ctx *ctx) 1156 - { 1157 - /* See comment at the top of this file */ 1158 - smp_rmb(); 1159 - return __io_cqring_events(ctx); 1160 1162 } 1161 1163 1162 1164 /* ··· 2042 2058 /* Commit SQ ring head once we've consumed and submitted all SQEs */ 2043 2059 io_commit_sqring(ctx); 2044 2060 return ret; 2045 - } 2046 - 2047 - static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 2048 - int wake_flags, void *key) 2049 - { 2050 - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); 2051 - 2052 - /* 2053 - * Cannot safely flush overflowed CQEs from here, ensure we wake up 2054 - * the task, and the next invocation will do it. 2055 - */ 2056 - if (io_should_wake(iowq) || io_has_work(iowq->ctx)) 2057 - return autoremove_wake_function(curr, mode, wake_flags, key); 2058 - return -1; 2059 - } 2060 - 2061 - int io_run_task_work_sig(struct io_ring_ctx *ctx) 2062 - { 2063 - if (io_local_work_pending(ctx)) { 2064 - __set_current_state(TASK_RUNNING); 2065 - if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) 2066 - return 0; 2067 - } 2068 - if (io_run_task_work() > 0) 2069 - return 0; 2070 - if (task_sigpending(current)) 2071 - return -EINTR; 2072 - return 0; 2073 - } 2074 - 2075 - static bool current_pending_io(void) 2076 - { 2077 - struct io_uring_task *tctx = current->io_uring; 2078 - 2079 - if (!tctx) 2080 - return false; 2081 - return percpu_counter_read_positive(&tctx->inflight); 2082 - } 2083 - 2084 - static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) 2085 - { 2086 - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 2087 - 2088 - WRITE_ONCE(iowq->hit_timeout, 1); 2089 - iowq->min_timeout = 0; 2090 - wake_up_process(iowq->wq.private); 2091 - return HRTIMER_NORESTART; 2092 - } 2093 - 2094 - /* 2095 - * Doing min_timeout portion. If we saw any timeouts, events, or have work, 2096 - * wake up. If not, and we have a normal timeout, switch to that and keep 2097 - * sleeping. 2098 - */ 2099 - static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) 2100 - { 2101 - struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 2102 - struct io_ring_ctx *ctx = iowq->ctx; 2103 - 2104 - /* no general timeout, or shorter (or equal), we are done */ 2105 - if (iowq->timeout == KTIME_MAX || 2106 - ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) 2107 - goto out_wake; 2108 - /* work we may need to run, wake function will see if we need to wake */ 2109 - if (io_has_work(ctx)) 2110 - goto out_wake; 2111 - /* got events since we started waiting, min timeout is done */ 2112 - if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) 2113 - goto out_wake; 2114 - /* if we have any events and min timeout expired, we're done */ 2115 - if (io_cqring_events(ctx)) 2116 - goto out_wake; 2117 - 2118 - /* 2119 - * If using deferred task_work running and application is waiting on 2120 - * more than one request, ensure we reset it now where we are switching 2121 - * to normal sleeps. Any request completion post min_wait should wake 2122 - * the task and return. 2123 - */ 2124 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 2125 - atomic_set(&ctx->cq_wait_nr, 1); 2126 - smp_mb(); 2127 - if (!llist_empty(&ctx->work_llist)) 2128 - goto out_wake; 2129 - } 2130 - 2131 - /* any generated CQE posted past this time should wake us up */ 2132 - iowq->cq_tail = iowq->cq_min_tail; 2133 - 2134 - hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); 2135 - hrtimer_set_expires(timer, iowq->timeout); 2136 - return HRTIMER_RESTART; 2137 - out_wake: 2138 - return io_cqring_timer_wakeup(timer); 2139 - } 2140 - 2141 - static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, 2142 - clockid_t clock_id, ktime_t start_time) 2143 - { 2144 - ktime_t timeout; 2145 - 2146 - if (iowq->min_timeout) { 2147 - timeout = ktime_add_ns(iowq->min_timeout, start_time); 2148 - hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, 2149 - HRTIMER_MODE_ABS); 2150 - } else { 2151 - timeout = iowq->timeout; 2152 - hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, 2153 - HRTIMER_MODE_ABS); 2154 - } 2155 - 2156 - hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); 2157 - hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); 2158 - 2159 - if (!READ_ONCE(iowq->hit_timeout)) 2160 - schedule(); 2161 - 2162 - hrtimer_cancel(&iowq->t); 2163 - destroy_hrtimer_on_stack(&iowq->t); 2164 - __set_current_state(TASK_RUNNING); 2165 - 2166 - return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; 2167 - } 2168 - 2169 - struct ext_arg { 2170 - size_t argsz; 2171 - struct timespec64 ts; 2172 - const sigset_t __user *sig; 2173 - ktime_t min_time; 2174 - bool ts_set; 2175 - bool iowait; 2176 - }; 2177 - 2178 - static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2179 - struct io_wait_queue *iowq, 2180 - struct ext_arg *ext_arg, 2181 - ktime_t start_time) 2182 - { 2183 - int ret = 0; 2184 - 2185 - /* 2186 - * Mark us as being in io_wait if we have pending requests, so cpufreq 2187 - * can take into account that the task is waiting for IO - turns out 2188 - * to be important for low QD IO. 2189 - */ 2190 - if (ext_arg->iowait && current_pending_io()) 2191 - current->in_iowait = 1; 2192 - if (iowq->timeout != KTIME_MAX || iowq->min_timeout) 2193 - ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); 2194 - else 2195 - schedule(); 2196 - current->in_iowait = 0; 2197 - return ret; 2198 - } 2199 - 2200 - /* If this returns > 0, the caller should retry */ 2201 - static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2202 - struct io_wait_queue *iowq, 2203 - struct ext_arg *ext_arg, 2204 - ktime_t start_time) 2205 - { 2206 - if (unlikely(READ_ONCE(ctx->check_cq))) 2207 - return 1; 2208 - if (unlikely(io_local_work_pending(ctx))) 2209 - return 1; 2210 - if (unlikely(task_work_pending(current))) 2211 - return 1; 2212 - if (unlikely(task_sigpending(current))) 2213 - return -EINTR; 2214 - if (unlikely(io_should_wake(iowq))) 2215 - return 0; 2216 - 2217 - return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); 2218 - } 2219 - 2220 - /* 2221 - * Wait until events become available, if we don't already have some. The 2222 - * application must reap them itself, as they reside on the shared cq ring. 2223 - */ 2224 - static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, 2225 - struct ext_arg *ext_arg) 2226 - { 2227 - struct io_wait_queue iowq; 2228 - struct io_rings *rings = ctx->rings; 2229 - ktime_t start_time; 2230 - int ret; 2231 - 2232 - min_events = min_t(int, min_events, ctx->cq_entries); 2233 - 2234 - if (!io_allowed_run_tw(ctx)) 2235 - return -EEXIST; 2236 - if (io_local_work_pending(ctx)) 2237 - io_run_local_work(ctx, min_events, 2238 - max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 2239 - io_run_task_work(); 2240 - 2241 - if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) 2242 - io_cqring_do_overflow_flush(ctx); 2243 - if (__io_cqring_events_user(ctx) >= min_events) 2244 - return 0; 2245 - 2246 - init_waitqueue_func_entry(&iowq.wq, io_wake_function); 2247 - iowq.wq.private = current; 2248 - INIT_LIST_HEAD(&iowq.wq.entry); 2249 - iowq.ctx = ctx; 2250 - iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 2251 - iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); 2252 - iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 2253 - iowq.hit_timeout = 0; 2254 - iowq.min_timeout = ext_arg->min_time; 2255 - iowq.timeout = KTIME_MAX; 2256 - start_time = io_get_time(ctx); 2257 - 2258 - if (ext_arg->ts_set) { 2259 - iowq.timeout = timespec64_to_ktime(ext_arg->ts); 2260 - if (!(flags & IORING_ENTER_ABS_TIMER)) 2261 - iowq.timeout = ktime_add(iowq.timeout, start_time); 2262 - } 2263 - 2264 - if (ext_arg->sig) { 2265 - #ifdef CONFIG_COMPAT 2266 - if (in_compat_syscall()) 2267 - ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, 2268 - ext_arg->argsz); 2269 - else 2270 - #endif 2271 - ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); 2272 - 2273 - if (ret) 2274 - return ret; 2275 - } 2276 - 2277 - io_napi_busy_loop(ctx, &iowq); 2278 - 2279 - trace_io_uring_cqring_wait(ctx, min_events); 2280 - do { 2281 - unsigned long check_cq; 2282 - int nr_wait; 2283 - 2284 - /* if min timeout has been hit, don't reset wait count */ 2285 - if (!iowq.hit_timeout) 2286 - nr_wait = (int) iowq.cq_tail - 2287 - READ_ONCE(ctx->rings->cq.tail); 2288 - else 2289 - nr_wait = 1; 2290 - 2291 - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 2292 - atomic_set(&ctx->cq_wait_nr, nr_wait); 2293 - set_current_state(TASK_INTERRUPTIBLE); 2294 - } else { 2295 - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 2296 - TASK_INTERRUPTIBLE); 2297 - } 2298 - 2299 - ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); 2300 - __set_current_state(TASK_RUNNING); 2301 - atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 2302 - 2303 - /* 2304 - * Run task_work after scheduling and before io_should_wake(). 2305 - * If we got woken because of task_work being processed, run it 2306 - * now rather than let the caller do another wait loop. 2307 - */ 2308 - if (io_local_work_pending(ctx)) 2309 - io_run_local_work(ctx, nr_wait, nr_wait); 2310 - io_run_task_work(); 2311 - 2312 - /* 2313 - * Non-local task_work will be run on exit to userspace, but 2314 - * if we're using DEFER_TASKRUN, then we could have waited 2315 - * with a timeout for a number of requests. If the timeout 2316 - * hits, we could have some requests ready to process. Ensure 2317 - * this break is _after_ we have run task_work, to avoid 2318 - * deferring running potentially pending requests until the 2319 - * next time we wait for events. 2320 - */ 2321 - if (ret < 0) 2322 - break; 2323 - 2324 - check_cq = READ_ONCE(ctx->check_cq); 2325 - if (unlikely(check_cq)) { 2326 - /* let the caller flush overflows, retry */ 2327 - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 2328 - io_cqring_do_overflow_flush(ctx); 2329 - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { 2330 - ret = -EBADR; 2331 - break; 2332 - } 2333 - } 2334 - 2335 - if (io_should_wake(&iowq)) { 2336 - ret = 0; 2337 - break; 2338 - } 2339 - cond_resched(); 2340 - } while (1); 2341 - 2342 - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 2343 - finish_wait(&ctx->cq_wait, &iowq.wq); 2344 - restore_saved_sigmask_unless(ret == -EINTR); 2345 - 2346 - return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; 2347 2061 } 2348 2062 2349 2063 static void io_rings_free(struct io_ring_ctx *ctx)

+1

io_uring/tw.c

··· 13 13 #include "poll.h" 14 14 #include "rw.h" 15 15 #include "eventfd.h" 16 + #include "wait.h" 16 17 17 18 void io_fallback_req_func(struct work_struct *work) 18 19 {

-8

io_uring/tw.h

··· 9 9 #define IO_LOCAL_TW_DEFAULT_MAX 20 10 10 11 11 /* 12 - * No waiters. It's larger than any valid value of the tw counter 13 - * so that tests against ->cq_wait_nr would fail and skip wake_up(). 14 - */ 15 - #define IO_CQ_WAKE_INIT (-1U) 16 - /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 17 - #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 18 - 19 - /* 20 12 * Terminate the request if either of these conditions are true: 21 13 * 22 14 * 1) It's being executed by the original task, but that task is marked

+308

io_uring/wait.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Waiting for completion events 4 + */ 5 + #include <linux/kernel.h> 6 + #include <linux/sched/signal.h> 7 + #include <linux/io_uring.h> 8 + 9 + #include <trace/events/io_uring.h> 10 + 11 + #include <uapi/linux/io_uring.h> 12 + 13 + #include "io_uring.h" 14 + #include "napi.h" 15 + #include "wait.h" 16 + 17 + static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, 18 + int wake_flags, void *key) 19 + { 20 + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); 21 + 22 + /* 23 + * Cannot safely flush overflowed CQEs from here, ensure we wake up 24 + * the task, and the next invocation will do it. 25 + */ 26 + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) 27 + return autoremove_wake_function(curr, mode, wake_flags, key); 28 + return -1; 29 + } 30 + 31 + int io_run_task_work_sig(struct io_ring_ctx *ctx) 32 + { 33 + if (io_local_work_pending(ctx)) { 34 + __set_current_state(TASK_RUNNING); 35 + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) 36 + return 0; 37 + } 38 + if (io_run_task_work() > 0) 39 + return 0; 40 + if (task_sigpending(current)) 41 + return -EINTR; 42 + return 0; 43 + } 44 + 45 + static bool current_pending_io(void) 46 + { 47 + struct io_uring_task *tctx = current->io_uring; 48 + 49 + if (!tctx) 50 + return false; 51 + return percpu_counter_read_positive(&tctx->inflight); 52 + } 53 + 54 + static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer) 55 + { 56 + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 57 + 58 + WRITE_ONCE(iowq->hit_timeout, 1); 59 + iowq->min_timeout = 0; 60 + wake_up_process(iowq->wq.private); 61 + return HRTIMER_NORESTART; 62 + } 63 + 64 + /* 65 + * Doing min_timeout portion. If we saw any timeouts, events, or have work, 66 + * wake up. If not, and we have a normal timeout, switch to that and keep 67 + * sleeping. 68 + */ 69 + static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) 70 + { 71 + struct io_wait_queue *iowq = container_of(timer, struct io_wait_queue, t); 72 + struct io_ring_ctx *ctx = iowq->ctx; 73 + 74 + /* no general timeout, or shorter (or equal), we are done */ 75 + if (iowq->timeout == KTIME_MAX || 76 + ktime_compare(iowq->min_timeout, iowq->timeout) >= 0) 77 + goto out_wake; 78 + /* work we may need to run, wake function will see if we need to wake */ 79 + if (io_has_work(ctx)) 80 + goto out_wake; 81 + /* got events since we started waiting, min timeout is done */ 82 + if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail)) 83 + goto out_wake; 84 + /* if we have any events and min timeout expired, we're done */ 85 + if (io_cqring_events(ctx)) 86 + goto out_wake; 87 + 88 + /* 89 + * If using deferred task_work running and application is waiting on 90 + * more than one request, ensure we reset it now where we are switching 91 + * to normal sleeps. Any request completion post min_wait should wake 92 + * the task and return. 93 + */ 94 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 95 + atomic_set(&ctx->cq_wait_nr, 1); 96 + smp_mb(); 97 + if (!llist_empty(&ctx->work_llist)) 98 + goto out_wake; 99 + } 100 + 101 + /* any generated CQE posted past this time should wake us up */ 102 + iowq->cq_tail = iowq->cq_min_tail; 103 + 104 + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); 105 + hrtimer_set_expires(timer, iowq->timeout); 106 + return HRTIMER_RESTART; 107 + out_wake: 108 + return io_cqring_timer_wakeup(timer); 109 + } 110 + 111 + static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, 112 + clockid_t clock_id, ktime_t start_time) 113 + { 114 + ktime_t timeout; 115 + 116 + if (iowq->min_timeout) { 117 + timeout = ktime_add_ns(iowq->min_timeout, start_time); 118 + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, 119 + HRTIMER_MODE_ABS); 120 + } else { 121 + timeout = iowq->timeout; 122 + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, 123 + HRTIMER_MODE_ABS); 124 + } 125 + 126 + hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); 127 + hrtimer_start_expires(&iowq->t, HRTIMER_MODE_ABS); 128 + 129 + if (!READ_ONCE(iowq->hit_timeout)) 130 + schedule(); 131 + 132 + hrtimer_cancel(&iowq->t); 133 + destroy_hrtimer_on_stack(&iowq->t); 134 + __set_current_state(TASK_RUNNING); 135 + 136 + return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; 137 + } 138 + 139 + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, 140 + struct io_wait_queue *iowq, 141 + struct ext_arg *ext_arg, 142 + ktime_t start_time) 143 + { 144 + int ret = 0; 145 + 146 + /* 147 + * Mark us as being in io_wait if we have pending requests, so cpufreq 148 + * can take into account that the task is waiting for IO - turns out 149 + * to be important for low QD IO. 150 + */ 151 + if (ext_arg->iowait && current_pending_io()) 152 + current->in_iowait = 1; 153 + if (iowq->timeout != KTIME_MAX || iowq->min_timeout) 154 + ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); 155 + else 156 + schedule(); 157 + current->in_iowait = 0; 158 + return ret; 159 + } 160 + 161 + /* If this returns > 0, the caller should retry */ 162 + static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 163 + struct io_wait_queue *iowq, 164 + struct ext_arg *ext_arg, 165 + ktime_t start_time) 166 + { 167 + if (unlikely(READ_ONCE(ctx->check_cq))) 168 + return 1; 169 + if (unlikely(io_local_work_pending(ctx))) 170 + return 1; 171 + if (unlikely(task_work_pending(current))) 172 + return 1; 173 + if (unlikely(task_sigpending(current))) 174 + return -EINTR; 175 + if (unlikely(io_should_wake(iowq))) 176 + return 0; 177 + 178 + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); 179 + } 180 + 181 + /* 182 + * Wait until events become available, if we don't already have some. The 183 + * application must reap them itself, as they reside on the shared cq ring. 184 + */ 185 + int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, 186 + struct ext_arg *ext_arg) 187 + { 188 + struct io_wait_queue iowq; 189 + struct io_rings *rings = ctx->rings; 190 + ktime_t start_time; 191 + int ret; 192 + 193 + min_events = min_t(int, min_events, ctx->cq_entries); 194 + 195 + if (!io_allowed_run_tw(ctx)) 196 + return -EEXIST; 197 + if (io_local_work_pending(ctx)) 198 + io_run_local_work(ctx, min_events, 199 + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); 200 + io_run_task_work(); 201 + 202 + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) 203 + io_cqring_do_overflow_flush(ctx); 204 + if (__io_cqring_events_user(ctx) >= min_events) 205 + return 0; 206 + 207 + init_waitqueue_func_entry(&iowq.wq, io_wake_function); 208 + iowq.wq.private = current; 209 + INIT_LIST_HEAD(&iowq.wq.entry); 210 + iowq.ctx = ctx; 211 + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; 212 + iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail); 213 + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 214 + iowq.hit_timeout = 0; 215 + iowq.min_timeout = ext_arg->min_time; 216 + iowq.timeout = KTIME_MAX; 217 + start_time = io_get_time(ctx); 218 + 219 + if (ext_arg->ts_set) { 220 + iowq.timeout = timespec64_to_ktime(ext_arg->ts); 221 + if (!(flags & IORING_ENTER_ABS_TIMER)) 222 + iowq.timeout = ktime_add(iowq.timeout, start_time); 223 + } 224 + 225 + if (ext_arg->sig) { 226 + #ifdef CONFIG_COMPAT 227 + if (in_compat_syscall()) 228 + ret = set_compat_user_sigmask((const compat_sigset_t __user *)ext_arg->sig, 229 + ext_arg->argsz); 230 + else 231 + #endif 232 + ret = set_user_sigmask(ext_arg->sig, ext_arg->argsz); 233 + 234 + if (ret) 235 + return ret; 236 + } 237 + 238 + io_napi_busy_loop(ctx, &iowq); 239 + 240 + trace_io_uring_cqring_wait(ctx, min_events); 241 + do { 242 + unsigned long check_cq; 243 + int nr_wait; 244 + 245 + /* if min timeout has been hit, don't reset wait count */ 246 + if (!iowq.hit_timeout) 247 + nr_wait = (int) iowq.cq_tail - 248 + READ_ONCE(ctx->rings->cq.tail); 249 + else 250 + nr_wait = 1; 251 + 252 + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { 253 + atomic_set(&ctx->cq_wait_nr, nr_wait); 254 + set_current_state(TASK_INTERRUPTIBLE); 255 + } else { 256 + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, 257 + TASK_INTERRUPTIBLE); 258 + } 259 + 260 + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); 261 + __set_current_state(TASK_RUNNING); 262 + atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); 263 + 264 + /* 265 + * Run task_work after scheduling and before io_should_wake(). 266 + * If we got woken because of task_work being processed, run it 267 + * now rather than let the caller do another wait loop. 268 + */ 269 + if (io_local_work_pending(ctx)) 270 + io_run_local_work(ctx, nr_wait, nr_wait); 271 + io_run_task_work(); 272 + 273 + /* 274 + * Non-local task_work will be run on exit to userspace, but 275 + * if we're using DEFER_TASKRUN, then we could have waited 276 + * with a timeout for a number of requests. If the timeout 277 + * hits, we could have some requests ready to process. Ensure 278 + * this break is _after_ we have run task_work, to avoid 279 + * deferring running potentially pending requests until the 280 + * next time we wait for events. 281 + */ 282 + if (ret < 0) 283 + break; 284 + 285 + check_cq = READ_ONCE(ctx->check_cq); 286 + if (unlikely(check_cq)) { 287 + /* let the caller flush overflows, retry */ 288 + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) 289 + io_cqring_do_overflow_flush(ctx); 290 + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) { 291 + ret = -EBADR; 292 + break; 293 + } 294 + } 295 + 296 + if (io_should_wake(&iowq)) { 297 + ret = 0; 298 + break; 299 + } 300 + cond_resched(); 301 + } while (1); 302 + 303 + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) 304 + finish_wait(&ctx->cq_wait, &iowq.wq); 305 + restore_saved_sigmask_unless(ret == -EINTR); 306 + 307 + return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; 308 + }

+49

io_uring/wait.h

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #ifndef IOU_WAIT_H 3 + #define IOU_WAIT_H 4 + 5 + #include <linux/io_uring_types.h> 6 + 7 + /* 8 + * No waiters. It's larger than any valid value of the tw counter 9 + * so that tests against ->cq_wait_nr would fail and skip wake_up(). 10 + */ 11 + #define IO_CQ_WAKE_INIT (-1U) 12 + /* Forced wake up if there is a waiter regardless of ->cq_wait_nr */ 13 + #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) 14 + 15 + struct ext_arg { 16 + size_t argsz; 17 + struct timespec64 ts; 18 + const sigset_t __user *sig; 19 + ktime_t min_time; 20 + bool ts_set; 21 + bool iowait; 22 + }; 23 + 24 + int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, 25 + struct ext_arg *ext_arg); 26 + int io_run_task_work_sig(struct io_ring_ctx *ctx); 27 + void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx); 28 + 29 + static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) 30 + { 31 + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); 32 + } 33 + 34 + static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx) 35 + { 36 + return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head); 37 + } 38 + 39 + /* 40 + * Reads the tail/head of the CQ ring while providing an acquire ordering, 41 + * see comment at top of io_uring.c. 42 + */ 43 + static inline unsigned io_cqring_events(struct io_ring_ctx *ctx) 44 + { 45 + smp_rmb(); 46 + return __io_cqring_events(ctx); 47 + } 48 + 49 + #endif

Configure Feed

Configure Feed