Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: drain based on allocated reqs

Don't rely on CQ sequence numbers for draining, as that approach has
become messy and needs cq_extra adjustments. Instead, base it on the
number of allocated requests and only allow flushing when all allocated
requests are in the drain list.

As a result, cq_extra is gone: there is no overhead for its accounting in
aux CQE posting, less code bloat since that accounting was inlined before,
and it's in general simpler than trying to track where we should bump it
and where it should be put back, as in cases of overflow. Also, it'll
likely help with cleaning up and unifying some of the CQ posting helpers.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com
Link: https://lore.kernel.org/r/24497b04b004bceada496033d3c9d09ff8e81ae9.1746944903.git.asml.silence@gmail.com
[axboe: fold in fix from link2]
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Pavel Begunkov and committed by
Jens Axboe
8fb7aee0 63de899c

+34 -50
+1 -1
include/linux/io_uring_types.h
··· 341 341 unsigned cached_cq_tail; 342 342 unsigned cq_entries; 343 343 struct io_ev_fd __rcu *io_ev_fd; 344 - unsigned cq_extra; 345 344 346 345 void *cq_wait_arg; 347 346 size_t cq_wait_size; ··· 416 417 417 418 struct callback_head poll_wq_task_work; 418 419 struct list_head defer_list; 420 + unsigned nr_drained; 419 421 420 422 struct io_alloc_cache msg_cache; 421 423 spinlock_t msg_lock;
+32 -47
io_uring/io_uring.c
··· 129 129 struct io_defer_entry { 130 130 struct list_head list; 131 131 struct io_kiocb *req; 132 - u32 seq; 133 132 }; 134 133 135 134 /* requests with any of those set should undergo io_disarm_next() */ ··· 148 149 bool is_sqpoll_thread); 149 150 150 151 static void io_queue_sqe(struct io_kiocb *req); 152 + static void __io_req_caches_free(struct io_ring_ctx *ctx); 151 153 152 154 static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); 153 155 ··· 540 540 io_req_task_work_add(req); 541 541 } 542 542 543 - static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq) 543 + static unsigned io_linked_nr(struct io_kiocb *req) 544 544 { 545 - struct io_ring_ctx *ctx = req->ctx; 545 + struct io_kiocb *tmp; 546 + unsigned nr = 0; 546 547 547 - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; 548 + io_for_each_link(tmp, req) 549 + nr++; 550 + return nr; 548 551 } 549 552 550 - static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx) 553 + static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) 551 554 { 552 555 bool drain_seen = false, first = true; 556 + 557 + lockdep_assert_held(&ctx->uring_lock); 558 + __io_req_caches_free(ctx); 553 559 554 560 while (!list_empty(&ctx->defer_list)) { 555 561 struct io_defer_entry *de = list_first_entry(&ctx->defer_list, 556 562 struct io_defer_entry, list); 557 563 558 564 drain_seen |= de->req->flags & REQ_F_IO_DRAIN; 559 - if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq)) 560 - break; 565 + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) 566 + return; 561 567 562 568 list_del_init(&de->list); 569 + ctx->nr_drained -= io_linked_nr(de->req); 563 570 io_req_task_queue(de->req); 564 571 kfree(de); 565 572 first = false; 566 573 } 567 - } 568 - 569 - static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) 570 - { 571 - guard(spinlock)(&ctx->completion_lock); 572 - __io_queue_deferred(ctx); 573 574 } 574 575 575 576 void 
__io_commit_cqring_flush(struct io_ring_ctx *ctx) ··· 579 578 io_poll_wq_wake(ctx); 580 579 if (ctx->off_timeout_used) 581 580 io_flush_timeouts(ctx); 582 - if (ctx->drain_active) 583 - io_queue_deferred(ctx); 584 581 if (ctx->has_evfd) 585 582 io_eventfd_signal(ctx, true); 586 583 } ··· 741 742 * on the floor. 742 743 */ 743 744 WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); 744 - ctx->cq_extra--; 745 745 set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); 746 746 return false; 747 747 } ··· 809 811 u32 cflags) 810 812 { 811 813 struct io_uring_cqe *cqe; 812 - 813 - ctx->cq_extra++; 814 814 815 815 if (likely(io_get_cqe(ctx, &cqe))) { 816 816 WRITE_ONCE(cqe->user_data, user_data); ··· 1455 1459 io_free_batch_list(ctx, state->compl_reqs.first); 1456 1460 INIT_WQ_LIST(&state->compl_reqs); 1457 1461 } 1462 + 1463 + if (unlikely(ctx->drain_active)) 1464 + io_queue_deferred(ctx); 1465 + 1458 1466 ctx->submit_state.cq_flush = false; 1459 1467 } 1460 1468 ··· 1646 1646 return res; 1647 1647 } 1648 1648 1649 - static u32 io_get_sequence(struct io_kiocb *req) 1650 - { 1651 - u32 seq = req->ctx->cached_sq_head; 1652 - struct io_kiocb *cur; 1653 - 1654 - /* need original cached_sq_head, but it was increased for each req */ 1655 - io_for_each_link(cur, req) 1656 - seq--; 1657 - return seq; 1658 - } 1659 - 1660 1649 static __cold void io_drain_req(struct io_kiocb *req) 1661 1650 __must_hold(&ctx->uring_lock) 1662 1651 { ··· 1662 1673 io_prep_async_link(req); 1663 1674 trace_io_uring_defer(req); 1664 1675 de->req = req; 1665 - de->seq = io_get_sequence(req); 1666 1676 1667 - scoped_guard(spinlock, &ctx->completion_lock) { 1668 - list_add_tail(&de->list, &ctx->defer_list); 1669 - __io_queue_deferred(ctx); 1670 - if (!drain && list_empty(&ctx->defer_list)) 1671 - ctx->drain_active = false; 1672 - } 1677 + ctx->nr_drained += io_linked_nr(req); 1678 + list_add_tail(&de->list, &ctx->defer_list); 1679 + io_queue_deferred(ctx); 1680 + if (!drain && 
list_empty(&ctx->defer_list)) 1681 + ctx->drain_active = false; 1673 1682 } 1674 1683 1675 1684 static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, ··· 2250 2263 (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { 2251 2264 head = READ_ONCE(ctx->sq_array[head]); 2252 2265 if (unlikely(head >= ctx->sq_entries)) { 2253 - /* drop invalid entries */ 2254 - spin_lock(&ctx->completion_lock); 2255 - ctx->cq_extra--; 2256 - spin_unlock(&ctx->completion_lock); 2257 2266 WRITE_ONCE(ctx->rings->sq_dropped, 2258 2267 READ_ONCE(ctx->rings->sq_dropped) + 1); 2259 2268 return false; ··· 2667 2684 return off; 2668 2685 } 2669 2686 2670 - static void io_req_caches_free(struct io_ring_ctx *ctx) 2687 + static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) 2671 2688 { 2672 2689 struct io_kiocb *req; 2673 2690 int nr = 0; 2674 - 2675 - mutex_lock(&ctx->uring_lock); 2676 2691 2677 2692 while (!io_req_cache_empty(ctx)) { 2678 2693 req = io_extract_req(ctx); ··· 2681 2700 ctx->nr_req_allocated -= nr; 2682 2701 percpu_ref_put_many(&ctx->refs, nr); 2683 2702 } 2684 - mutex_unlock(&ctx->uring_lock); 2703 + } 2704 + 2705 + static __cold void io_req_caches_free(struct io_ring_ctx *ctx) 2706 + { 2707 + guard(mutex)(&ctx->uring_lock); 2708 + __io_req_caches_free(ctx); 2685 2709 } 2686 2710 2687 2711 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) ··· 2991 3005 struct io_defer_entry *de; 2992 3006 LIST_HEAD(list); 2993 3007 2994 - spin_lock(&ctx->completion_lock); 2995 3008 list_for_each_entry_reverse(de, &ctx->defer_list, list) { 2996 3009 if (io_match_task_safe(de->req, tctx, cancel_all)) { 2997 3010 list_cut_position(&list, &ctx->defer_list, &de->list); 2998 3011 break; 2999 3012 } 3000 3013 } 3001 - spin_unlock(&ctx->completion_lock); 3002 3014 if (list_empty(&list)) 3003 3015 return false; 3004 3016 3005 3017 while (!list_empty(&list)) { 3006 3018 de = list_first_entry(&list, struct io_defer_entry, list); 3007 3019 list_del_init(&de->list); 3020 
+ ctx->nr_drained -= io_linked_nr(de->req); 3008 3021 io_req_task_queue_fail(de->req, -ECANCELED); 3009 3022 kfree(de); 3010 3023 } ··· 3078 3093 if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && 3079 3094 io_allowed_defer_tw_run(ctx)) 3080 3095 ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; 3081 - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); 3082 3096 mutex_lock(&ctx->uring_lock); 3097 + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); 3083 3098 ret |= io_poll_remove_all(ctx, tctx, cancel_all); 3084 3099 ret |= io_waitid_remove_all(ctx, tctx, cancel_all); 3085 3100 ret |= io_futex_remove_all(ctx, tctx, cancel_all);
+1 -2
io_uring/io_uring.h
··· 196 196 { 197 197 io_lockdep_assert_cq_locked(ctx); 198 198 199 - ctx->cq_extra++; 200 199 ctx->submit_state.cq_flush = true; 201 200 return io_get_cqe(ctx, cqe_ret); 202 201 } ··· 413 414 414 415 static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) 415 416 { 416 - if (unlikely(ctx->off_timeout_used || ctx->drain_active || 417 + if (unlikely(ctx->off_timeout_used || 417 418 ctx->has_evfd || ctx->poll_activated)) 418 419 __io_commit_cqring_flush(ctx); 419 420 }