Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'io_uring-7.0-20260403' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

- A previous fix in this release covered the case of the rings being
  RCU protected during resize, but it missed a few spots. This covers
  the rest (a generic sketch of the RCU pattern involved follows the
  commit list below)

- Fix handling of cBPF filters when they are COW'ed, introduced in this
  merge window

- Fix for an attempt to import a zero-sized buffer

- Fix for a missing clamp when importing bundle buffers

* tag 'io_uring-7.0-20260403' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
io_uring/bpf_filters: retain COW'ed settings on parse failures
io_uring: protect remaining lockless ctx->rings accesses with RCU
io_uring/rsrc: reject zero-length fixed buffer import
io_uring/net: fix slab-out-of-bounds read in io_bundle_nbufs()
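
For context, the rings-resize fixes in this pull follow the standard RCU pointer-publication pattern: the resize side publishes the new rings and defers freeing the old ones, while lockless readers may only dereference the pointer inside an RCU read-side critical section. Below is a generic, hypothetical sketch of that pattern (the 'foo' names are made up and this is not the actual io_uring resize code), illustrating why every lockless reader needs the read-side lock:

#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Hypothetical RCU-protected object standing in for struct io_rings. */
struct foo {
	unsigned int tail;
	struct rcu_head rcu;
};

static struct foo __rcu *foo_ptr;

/* Reader side: every lockless access must stay inside rcu_read_lock(). */
static unsigned int foo_read_tail(void)
{
	unsigned int tail;

	rcu_read_lock();
	tail = READ_ONCE(rcu_dereference(foo_ptr)->tail);
	rcu_read_unlock();
	return tail;
}

/* Resize side: publish the new object, free the old one after a grace period. */
static void foo_resize(struct foo *new_foo)
{
	struct foo *old = rcu_replace_pointer(foo_ptr, new_foo, true);

	kfree_rcu(old, rcu);
}

A reader that dereferences the pointer outside the read-side critical section can race with foo_resize() freeing the old object, which is the class of bug the remaining ctx->rings accesses are converted away from below.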

7 files changed: +87 -29
io_uring/io_uring.c (+5 -2)
···
 	if (ctx->flags & IORING_SETUP_SQ_REWIND)
 		entries = ctx->sq_entries;
 	else
-		entries = io_sqring_entries(ctx);
+		entries = __io_sqring_entries(ctx);
 
 	entries = min(nr, entries);
 	if (unlikely(!entries))
···
 	 */
 	poll_wait(file, &ctx->poll_wq, wait);
 
-	if (!io_sqring_full(ctx))
+	rcu_read_lock();
+
+	if (!__io_sqring_full(ctx))
 		mask |= EPOLLOUT | EPOLLWRNORM;
 
 	/*
···
 	if (__io_cqring_events_user(ctx) || io_has_work(ctx))
 		mask |= EPOLLIN | EPOLLRDNORM;
 
+	rcu_read_unlock();
 	return mask;
 }
 
io_uring/io_uring.h (+29 -5)
···
 #endif
 };
 
+static inline struct io_rings *io_get_rings(struct io_ring_ctx *ctx)
+{
+	return rcu_dereference_check(ctx->rings_rcu,
+				     lockdep_is_held(&ctx->uring_lock) ||
+				     lockdep_is_held(&ctx->completion_lock));
+}
+
 static inline bool io_should_wake(struct io_wait_queue *iowq)
 {
 	struct io_ring_ctx *ctx = iowq->ctx;
-	int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
+	struct io_rings *rings;
+	int dist;
+
+	guard(rcu)();
+	rings = io_get_rings(ctx);
 
 	/*
 	 * Wake up if we have enough events, or if a timeout occurred since we
 	 * started waiting. For timeouts, we always want to return to userspace,
 	 * regardless of event count.
 	 */
+	dist = READ_ONCE(rings->cq.tail) - (int) iowq->cq_tail;
 	return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
 }
 
···
 	__io_wq_wake(&ctx->cq_wait);
 }
 
-static inline bool io_sqring_full(struct io_ring_ctx *ctx)
+static inline bool __io_sqring_full(struct io_ring_ctx *ctx)
 {
-	struct io_rings *r = ctx->rings;
+	struct io_rings *r = io_get_rings(ctx);
 
 	/*
 	 * SQPOLL must use the actual sqring head, as using the cached_sq_head
···
 	return READ_ONCE(r->sq.tail) - READ_ONCE(r->sq.head) == ctx->sq_entries;
 }
 
-static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+static inline bool io_sqring_full(struct io_ring_ctx *ctx)
 {
-	struct io_rings *rings = ctx->rings;
+	guard(rcu)();
+	return __io_sqring_full(ctx);
+}
+
+static inline unsigned int __io_sqring_entries(struct io_ring_ctx *ctx)
+{
+	struct io_rings *rings = io_get_rings(ctx);
 	unsigned int entries;
 
 	/* make sure SQ entry isn't read before tail */
 	entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 	return min(entries, ctx->sq_entries);
+}
+
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+	guard(rcu)();
+	return __io_sqring_entries(ctx);
 }
 
 /*
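
The hunk above relies on the kernel's scope-based cleanup guards: guard(rcu)() takes rcu_read_lock() and drops it automatically when the enclosing scope exits, and scoped_guard(rcu) { ... } does the same for just the braced block (both are built on the guard infrastructure from <linux/cleanup.h>/<linux/rcupdate.h>). As a rough hand-written equivalent of the new io_sqring_entries() wrapper (illustrative only; the _open_coded name is mine):

/*
 * Open-coded sketch of what the guard(rcu)()-based io_sqring_entries()
 * wrapper above boils down to. The real guard also releases the lock on
 * any early return, which is the point of using it.
 */
static inline unsigned int io_sqring_entries_open_coded(struct io_ring_ctx *ctx)
{
	unsigned int entries;

	rcu_read_lock();
	entries = __io_sqring_entries(ctx);
	rcu_read_unlock();
	return entries;
}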
io_uring/net.c (+4)
···
 
 	sr->done_io = 0;
 	sr->len = READ_ONCE(sqe->len);
+	if (unlikely(sr->len < 0))
+		return -EINVAL;
 	sr->flags = READ_ONCE(sqe->ioprio);
 	if (sr->flags & ~SENDMSG_FLAGS)
 		return -EINVAL;
···
 
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
+	if (unlikely(sr->len < 0))
+		return -EINVAL;
 	sr->flags = READ_ONCE(sqe->ioprio);
 	if (sr->flags & ~RECVMSG_FLAGS)
 		return -EINVAL;
io_uring/register.c (+9 -1)
···
 		return -EBUSY;
 
 	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
-	/* Reset all restrictions if an error happened */
+	/*
+	 * Reset all restrictions if an error happened, but retain any COW'ed
+	 * settings.
+	 */
 	if (ret < 0) {
+		struct io_bpf_filters *bpf = ctx->restrictions.bpf_filters;
+		bool cowed = ctx->restrictions.bpf_filters_cow;
+
 		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
+		ctx->restrictions.bpf_filters = bpf;
+		ctx->restrictions.bpf_filters_cow = cowed;
 		return ret;
 	}
 	if (ctx->restrictions.op_registered)
io_uring/rsrc.c (+4)
···
 		return ret;
 	if (!(imu->dir & (1 << ddir)))
 		return -EFAULT;
+	if (unlikely(!len)) {
+		iov_iter_bvec(iter, ddir, NULL, 0, 0);
+		return 0;
+	}
 
 	offset = buf_addr - imu->ubuf;
 
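
For reference, a zero-length import of a registered buffer can be driven from userspace with an ordinary fixed-buffer read; with the check above the import now degenerates to an empty bvec iterator instead of passing len == 0 further down. A minimal liburing sketch of such a request (illustrative only; it assumes liburing is available and registers one buffer at index 0, and it is not the actual reproducer behind the commit):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <liburing.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	void *buf;
	int fd;

	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	iov.iov_base = buf;
	iov.iov_len = 4096;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	/* register one fixed buffer at index 0 */
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	fd = open("/dev/zero", O_RDONLY);
	if (fd < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* zero-length read into the fixed buffer at index 0 */
	io_uring_prep_read_fixed(sqe, fd, buf, 0, 0, 0);
	io_uring_submit(&ring);

	io_uring_wait_cqe(&ring, &cqe);
	printf("zero-length fixed read completed, res=%d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}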
io_uring/wait.c (+31 -19)
···
 	if (io_has_work(ctx))
 		goto out_wake;
 	/* got events since we started waiting, min timeout is done */
-	if (iowq->cq_min_tail != READ_ONCE(ctx->rings->cq.tail))
-		goto out_wake;
-	/* if we have any events and min timeout expired, we're done */
-	if (io_cqring_events(ctx))
-		goto out_wake;
+	scoped_guard(rcu) {
+		struct io_rings *rings = io_get_rings(ctx);
 
+		if (iowq->cq_min_tail != READ_ONCE(rings->cq.tail))
+			goto out_wake;
+		/* if we have any events and min timeout expired, we're done */
+		if (io_cqring_events(ctx))
+			goto out_wake;
+	}
 	/*
 	 * If using deferred task_work running and application is waiting on
 	 * more than one request, ensure we reset it now where we are switching
···
 				  struct ext_arg *ext_arg)
 {
 	struct io_wait_queue iowq;
-	struct io_rings *rings = ctx->rings;
+	struct io_rings *rings;
 	ktime_t start_time;
-	int ret;
+	int ret, nr_wait;
 
 	min_events = min_t(int, min_events, ctx->cq_entries);
 
···
 
 	if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
 		io_cqring_do_overflow_flush(ctx);
-	if (__io_cqring_events_user(ctx) >= min_events)
+
+	rcu_read_lock();
+	rings = io_get_rings(ctx);
+	if (__io_cqring_events_user(ctx) >= min_events) {
+		rcu_read_unlock();
 		return 0;
+	}
 
 	init_waitqueue_func_entry(&iowq.wq, io_wake_function);
 	iowq.wq.private = current;
 	INIT_LIST_HEAD(&iowq.wq.entry);
 	iowq.ctx = ctx;
-	iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events;
-	iowq.cq_min_tail = READ_ONCE(ctx->rings->cq.tail);
+	iowq.cq_tail = READ_ONCE(rings->cq.head) + min_events;
+	iowq.cq_min_tail = READ_ONCE(rings->cq.tail);
+	nr_wait = (int) iowq.cq_tail - READ_ONCE(rings->cq.tail);
+	rcu_read_unlock();
+	rings = NULL;
 	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
 	iowq.hit_timeout = 0;
 	iowq.min_timeout = ext_arg->min_time;
···
 	trace_io_uring_cqring_wait(ctx, min_events);
 	do {
 		unsigned long check_cq;
-		int nr_wait;
-
-		/* if min timeout has been hit, don't reset wait count */
-		if (!iowq.hit_timeout)
-			nr_wait = (int) iowq.cq_tail -
-					READ_ONCE(ctx->rings->cq.tail);
-		else
-			nr_wait = 1;
 
 		if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
 			atomic_set(&ctx->cq_wait_nr, nr_wait);
···
 			break;
 		}
 		cond_resched();
+
+		/* if min timeout has been hit, don't reset wait count */
+		if (!iowq.hit_timeout)
+			scoped_guard(rcu)
+				nr_wait = (int) iowq.cq_tail -
+						READ_ONCE(io_get_rings(ctx)->cq.tail);
+		else
+			nr_wait = 1;
 	} while (1);
 
 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
 		finish_wait(&ctx->cq_wait, &iowq.wq);
 	restore_saved_sigmask_unless(ret == -EINTR);
 
-	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
+	guard(rcu)();
+	return READ_ONCE(io_get_rings(ctx)->cq.head) == READ_ONCE(io_get_rings(ctx)->cq.tail) ? ret : 0;
 }
io_uring/wait.h (+5 -2)
···
 
 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 {
-	return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head);
+	struct io_rings *rings = io_get_rings(ctx);
+	return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
 }
 
 static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
 {
-	return READ_ONCE(ctx->rings->cq.tail) - READ_ONCE(ctx->rings->cq.head);
+	struct io_rings *rings = io_get_rings(ctx);
+
+	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
 }
 
 /*