Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring fixes from Jens Axboe:

- Remove dead struct io_buffer_list member

- Fix for incrementally consumed buffers with recvmsg multishot, which
requires a minimum amount of space left in a buffer for any receive to
hold the headers. If there's still some buffer left but less than that
minimum, userspace will see a spurious -EFAULT returned in the CQE

- Locking fix for the DEFER_TASKRUN retry list, which otherwise could
race with fallback cancelations. If the task is exiting with
task_work left in both the normal and retry list AND the exit cleanup
races with the task running task work, then entries could either be
doubly completed or lost

- Cap NAPI busy poll timeout to something sane, to avoid syzbot running
into excessive polling and triggering warnings around that

* tag 'io_uring-7.1-20260430' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux:
io_uring/tw: serialize ctx->retry_llist with ->uring_lock
io_uring/napi: cap busy_poll_to 10 msec
io_uring/kbuf: support min length left for incremental buffers
io_uring/kbuf: kill dead struct io_buffer_list 'nr_entries' member

5 files changed, +29 -5
include/uapi/linux/io_uring.h (+2 -1)

···
         __u32   ring_entries;
         __u16   bgid;
         __u16   flags;
-        __u64   resv[3];
+        __u32   min_left;
+        __u32   resv[5];
 };

 /* argument for IORING_REGISTER_PBUF_STATUS */
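As a rough illustration of how userspace might consume the new field, here is a minimal sketch that registers an incrementally consumed provided buffer ring with a minimum-left threshold via the raw io_uring_register() syscall. The min_left field is the one added in the diff above; the entry count, group ID, threshold, and the anonymous mmap for the ring memory are arbitrary choices for the example, not part of this pull.

/*
 * Sketch only: register a provided buffer ring with IOU_PBUF_RING_INC and
 * a non-zero min_left. Assumes a uapi header that already carries the new
 * io_uring_buf_reg::min_left field.
 */
#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_inc_buf_ring(int ring_fd, unsigned int entries,
                                 unsigned short bgid, unsigned int min_left)
{
        struct io_uring_buf_reg reg;
        void *ring;

        /* ring_entries must be a power of two; hand a page-aligned,
         * userspace-allocated ring to the kernel via ring_addr */
        ring = mmap(NULL, entries * sizeof(struct io_uring_buf),
                    PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (ring == MAP_FAILED)
                return -1;

        memset(&reg, 0, sizeof(reg));
        reg.ring_addr = (unsigned long long)(uintptr_t)ring;
        reg.ring_entries = entries;
        reg.bgid = bgid;
        reg.flags = IOU_PBUF_RING_INC;  /* min_left is only valid with INC */
        reg.min_left = min_left;        /* e.g. the protocol header size */

        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_PBUF_RING, &reg, 1);
}

Per the validation added in io_uring/kbuf.c below, passing a non-zero min_left without IOU_PBUF_RING_INC is rejected with -EINVAL.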
io_uring/kbuf.c (+7 -2)

···
                 this_len = min_t(u32, len, buf_len);
                 buf_len -= this_len;
                 /* Stop looping for invalid buffer length of 0 */
-                if (buf_len || !this_len) {
+                if (buf_len > bl->min_left_sub_one || !this_len) {
                         WRITE_ONCE(buf->addr, READ_ONCE(buf->addr) + this_len);
                         WRITE_ONCE(buf->len, buf_len);
                         return false;
···
         if (reg.ring_entries >= 65536)
                 return -EINVAL;

+        /* minimum left byte count is a property of incremental buffers */
+        if (!(reg.flags & IOU_PBUF_RING_INC) && reg.min_left)
+                return -EINVAL;
+
         bl = io_buffer_get_list(ctx, reg.bgid);
         if (bl) {
                 /* if mapped buffer ring OR classic exists, don't allow */
···
         }
 #endif

-        bl->nr_entries = reg.ring_entries;
         bl->mask = reg.ring_entries - 1;
         bl->flags |= IOBL_BUF_RING;
         bl->buf_ring = br;
+        if (reg.min_left)
+                bl->min_left_sub_one = reg.min_left - 1;
         if (reg.flags & IOU_PBUF_RING_INC)
                 bl->flags |= IOBL_INC;
         ret = io_buffer_add_list(ctx, bl, reg.bgid);
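One detail worth noting from the hunks above: the kernel stores reg.min_left - 1 in min_left_sub_one rather than min_left itself, so the default of zero preserves the old behaviour (any non-zero remainder keeps the buffer) without a separate enabled/disabled check. A small standalone illustration of the comparison, with a hypothetical helper name and outside any kernel context:

#include <assert.h>

/*
 * Illustration only: mirrors the changed check above. min_left_sub_one is 0
 * by default, or (min_left - 1) when the ring was registered with a non-zero
 * min_left. The !this_len escape for zero-length buffers is omitted here.
 */
static int buffer_still_usable(unsigned int buf_len,
                               unsigned int min_left_sub_one)
{
        /* buf_len > min_left - 1 is the same as buf_len >= min_left, and
         * with the default of 0 it degenerates to the old buf_len != 0 */
        return buf_len > min_left_sub_one;
}

int main(void)
{
        /* default: any remaining byte keeps the buffer, as before */
        assert(buffer_still_usable(1, 0));
        assert(!buffer_still_usable(0, 0));

        /* min_left = 16, stored as 15: with only 15 bytes left the buffer
         * is considered done and the head moves on to the next one */
        assert(buffer_still_usable(16, 15));
        assert(!buffer_still_usable(15, 15));
        return 0;
}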
io_uring/kbuf.h (+7 -1)

···
         __u16 bgid;

         /* below is for ring provided buffers */
-        __u16 nr_entries;
         __u16 head;
         __u16 mask;

         __u16 flags;
+
+        /*
+         * minimum required amount to be left to reuse an incrementally
+         * consumed buffer. If less than this is left at consumption time,
+         * buffer is done and head is incremented to the next buffer.
+         */
+        __u32 min_left_sub_one;

         struct io_mapped_region region;
 };
io_uring/napi.c (+2)

···
         /* clean the napi list for new settings */
         io_napi_free(ctx);
         WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+        /* cap NAPI at 10 msec of spin time */
+        napi->busy_poll_to = min(10000, napi->busy_poll_to);
         WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
         WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
         return 0;
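For context on what the cap means for an application, here is a hedged userspace sketch using liburing's io_uring_register_napi() helper (the helper itself predates this pull; the clamping is the new part). The 500 msec request is an arbitrary example value; with the change above the kernel limits the effective spin time to 10 msec instead of honouring it.

/* Sketch only: request a large NAPI busy poll timeout; the kernel now caps
 * the effective value at 10000 usec rather than spinning for the full ask. */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_napi napi = {
                .busy_poll_to = 500000,         /* ask for 500 msec ... */
                .prefer_busy_poll = 1,
        };
        int ret;

        ret = io_uring_queue_init(8, &ring, 0);
        if (ret)
                return 1;

        /* ... but anything over 10 msec is clamped by the kernel */
        ret = io_uring_register_napi(&ring, &napi);
        if (ret)
                fprintf(stderr, "register napi: %d\n", ret);

        io_uring_queue_exit(&ring);
        return 0;
}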
io_uring/tw.c (+11 -1)

···
 void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
 {
-        struct llist_node *node = llist_del_all(&ctx->work_llist);
+        struct llist_node *node;

+        /*
+         * Running the work items may utilize ->retry_llist as a means
+         * for capping the number of task_work entries run at the same
+         * time. But that list can potentially race with moving the work
+         * from here, if the task is exiting. As any normal task_work
+         * running holds ->uring_lock already, just guard this slow path
+         * with ->uring_lock to avoid racing on ->retry_llist.
+         */
+        guard(mutex)(&ctx->uring_lock);
+        node = llist_del_all(&ctx->work_llist);
         __io_fallback_tw(node, false);
         node = llist_del_all(&ctx->retry_llist);
         __io_fallback_tw(node, false);
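The guard(mutex)(...) line above is the kernel's scope-based lock guard from <linux/cleanup.h>: it acquires ->uring_lock at the point of declaration and drops it automatically on every exit from the function, so none of the return paths need an explicit mutex_unlock(). A minimal illustration of the same pattern, using hypothetical names rather than io_uring internals:

/* Illustration only: how a guard(mutex) scope lock behaves. 'my_ctx' and
 * consume_pending() are made up for the example; the pattern is the point. */
#include <linux/cleanup.h>
#include <linux/mutex.h>

struct my_ctx {
        struct mutex lock;
        int pending;
};

static int consume_pending(struct my_ctx *ctx)
{
        /* lock is taken here ... */
        guard(mutex)(&ctx->lock);

        if (!ctx->pending)
                return 0;       /* ... and released on this early return */

        ctx->pending = 0;
        return 1;               /* ... and on this one, no mutex_unlock() */
}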