Merge tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

- Fix an inverted true/false comment on task_no_new_privs, from the
BPF filtering changes merged in this release

- Run the BPF filters via the migration-disabling helper, as the
io_uring side doesn't already disable migration itself

- Fix an issue with ->rings stability under resize, both for local
task_work additions and for eventfd signaling

- Fix an issue with SQE mixed mode, where a bounds check wasn't correct
for having a 128b SQE

- Fix an issue where a legacy provided buffer group is changed to a
ring mapped one while legacy buffers from that group are in flight

* tag 'io_uring-7.0-20260312' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
io_uring/kbuf: check if target buffer list is still legacy on recycle
io_uring: fix physical SQE bounds check for SQE_MIXED 128-byte ops
io_uring/eventfd: use ctx->rings_rcu for flags checking
io_uring: ensure ctx->rings is stable for task work flags manipulation
io_uring/bpf_filter: use bpf_prog_run_pin_on_cpu() to prevent migration
io_uring/register: fix comment about task_no_new_privs

Linus Torvalds 3 months ago e67bf352 8174dafb

+56 -11

7 changed files

expand all

include

linux

io_uring_types.h

io_uring

bpf_filter.c

eventfd.c

io_uring.c

kbuf.c

tw.c

include/linux/io_uring_types.h

··· 388 388 * regularly bounce b/w CPUs. 389 389 */ 390 390 struct { 391 + struct io_rings __rcu *rings_rcu; 391 392 struct llist_head work_llist; 392 393 struct llist_head retry_llist; 393 394 unsigned long check_cq;

+1 -1

io_uring/bpf_filter.c

··· 85 85 do { 86 86 if (filter == &dummy_filter) 87 87 return -EACCES; 88 - ret = bpf_prog_run(filter->prog, &bpf_ctx); 88 + ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx); 89 89 if (!ret) 90 90 return -EACCES; 91 91 filter = filter->next;

+7 -3

io_uring/eventfd.c

··· 76 76 { 77 77 bool skip = false; 78 78 struct io_ev_fd *ev_fd; 79 - 80 - if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 81 - return; 79 + struct io_rings *rings; 82 80 83 81 guard(rcu)(); 82 + 83 + rings = rcu_dereference(ctx->rings_rcu); 84 + if (!rings) 85 + return; 86 + if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) 87 + return; 84 88 ev_fd = rcu_dereference(ctx->io_ev_fd); 85 89 /* 86 90 * Check again if ev_fd exists in case an io_eventfd_unregister call

+3 -1

io_uring/io_uring.c

··· 1745 1745 * well as 2 contiguous entries. 1746 1746 */ 1747 1747 if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 || 1748 - !(ctx->cached_sq_head & (ctx->sq_entries - 1))) 1748 + (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1) 1749 1749 return io_init_fail_req(req, -EINVAL); 1750 1750 /* 1751 1751 * A 128b operation on a mixed SQ uses two entries, so we have ··· 2066 2066 io_free_region(ctx->user, &ctx->sq_region); 2067 2067 io_free_region(ctx->user, &ctx->ring_region); 2068 2068 ctx->rings = NULL; 2069 + RCU_INIT_POINTER(ctx->rings_rcu, NULL); 2069 2070 ctx->sq_sqes = NULL; 2070 2071 } 2071 2072 ··· 2704 2703 if (ret) 2705 2704 return ret; 2706 2705 ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); 2706 + rcu_assign_pointer(ctx->rings_rcu, rings); 2707 2707 if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) 2708 2708 ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset); 2709 2709

+11 -2

io_uring/kbuf.c

··· 111 111 112 112 buf = req->kbuf; 113 113 bl = io_buffer_get_list(ctx, buf->bgid); 114 - list_add(&buf->list, &bl->buf_list); 115 - bl->nbufs++; 114 + /* 115 + * If the buffer list was upgraded to a ring-based one, or removed, 116 + * while the request was in-flight in io-wq, drop it. 117 + */ 118 + if (bl && !(bl->flags & IOBL_BUF_RING)) { 119 + list_add(&buf->list, &bl->buf_list); 120 + bl->nbufs++; 121 + } else { 122 + kfree(buf); 123 + } 116 124 req->flags &= ~REQ_F_BUFFER_SELECTED; 125 + req->kbuf = NULL; 117 126 118 127 io_ring_submit_unlock(ctx, issue_flags); 119 128 return true;

+13 -2

io_uring/register.c

··· 202 202 return -EPERM; 203 203 /* 204 204 * Similar to seccomp, disallow setting a filter if task_no_new_privs 205 - * is true and we're not CAP_SYS_ADMIN. 205 + * is false and we're not CAP_SYS_ADMIN. 206 206 */ 207 207 if (!task_no_new_privs(current) && 208 208 !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) ··· 238 238 239 239 /* 240 240 * Similar to seccomp, disallow setting a filter if task_no_new_privs 241 - * is true and we're not CAP_SYS_ADMIN. 241 + * is false and we're not CAP_SYS_ADMIN. 242 242 */ 243 243 if (!task_no_new_privs(current) && 244 244 !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) ··· 633 633 ctx->sq_entries = p->sq_entries; 634 634 ctx->cq_entries = p->cq_entries; 635 635 636 + /* 637 + * Just mark any flag we may have missed and that the application 638 + * should act on unconditionally. Worst case it'll be an extra 639 + * syscall. 640 + */ 641 + atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags); 636 642 ctx->rings = n.rings; 643 + rcu_assign_pointer(ctx->rings_rcu, n.rings); 644 + 637 645 ctx->sq_sqes = n.sq_sqes; 638 646 swap_old(ctx, o, n, ring_region); 639 647 swap_old(ctx, o, n, sq_region); ··· 650 642 out: 651 643 spin_unlock(&ctx->completion_lock); 652 644 mutex_unlock(&ctx->mmap_lock); 645 + /* Wait for concurrent io_ctx_mark_taskrun() */ 646 + if (to_free == &o) 647 + synchronize_rcu_expedited(); 653 648 io_register_free_rings(ctx, to_free); 654 649 655 650 if (ctx->sq_data)

+20 -2

io_uring/tw.c

··· 152 152 WARN_ON_ONCE(ret); 153 153 } 154 154 155 + /* 156 + * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the 157 + * RCU protected rings pointer to be safe against concurrent ring resizing. 158 + */ 159 + static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx) 160 + { 161 + lockdep_assert_in_rcu_read_lock(); 162 + 163 + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) { 164 + struct io_rings *rings = rcu_dereference(ctx->rings_rcu); 165 + 166 + atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags); 167 + } 168 + } 169 + 155 170 void io_req_local_work_add(struct io_kiocb *req, unsigned flags) 156 171 { 157 172 struct io_ring_ctx *ctx = req->ctx; ··· 221 206 */ 222 207 223 208 if (!head) { 224 - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 225 - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 209 + io_ctx_mark_taskrun(ctx); 226 210 if (ctx->has_evfd) 227 211 io_eventfd_signal(ctx, false); 228 212 } ··· 245 231 if (!llist_add(&req->io_task_work.node, &tctx->task_list)) 246 232 return; 247 233 234 + /* 235 + * Doesn't need to use ->rings_rcu, as resizing isn't supported for 236 + * !DEFER_TASKRUN. 237 + */ 248 238 if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) 249 239 atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); 250 240

Configure Feed

Configure Feed