io_uring/io-wq: cache work->flags in variable

This eliminates several redundant atomic reads and therefore reduces
the duration the surrounding spinlocks are held.

In several io_uring benchmarks, this reduced the CPU time spent in
queued_spin_lock_slowpath() considerably:

io_uring benchmark with a flood of `IORING_OP_NOP` and `IOSQE_ASYNC`:

38.86% -1.49% [kernel.kallsyms] [k] queued_spin_lock_slowpath
6.75% +0.36% [kernel.kallsyms] [k] io_worker_handle_work
2.60% +0.19% [kernel.kallsyms] [k] io_nop
3.92% +0.18% [kernel.kallsyms] [k] io_req_task_complete
6.34% -0.18% [kernel.kallsyms] [k] io_wq_submit_work

HTTP server, static file:

42.79% -2.77% [kernel.kallsyms] [k] queued_spin_lock_slowpath
2.08% +0.23% [kernel.kallsyms] [k] io_wq_submit_work
1.19% +0.20% [kernel.kallsyms] [k] amd_iommu_iotlb_sync_map
1.46% +0.15% [kernel.kallsyms] [k] ep_poll_callback
1.80% +0.15% [kernel.kallsyms] [k] io_worker_handle_work

HTTP server, PHP:

35.03% -1.80% [kernel.kallsyms] [k] queued_spin_lock_slowpath
0.84% +0.21% [kernel.kallsyms] [k] amd_iommu_iotlb_sync_map
1.39% +0.12% [kernel.kallsyms] [k] _copy_to_iter
0.21% +0.10% [kernel.kallsyms] [k] update_sd_lb_stats

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Link: https://lore.kernel.org/r/20250128133927.3989681-5-max.kellermann@ionos.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Max Kellermann and committed by Jens Axboe 1 year ago · 6ee78354 751eedc4

+27 -13

2 changed files

expand all

io_uring

io-wq.c

io-wq.h

+21 -12

io_uring/io-wq.c

···
170 170   }
171 171
172 172   static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
173     - 						  struct io_wq_work *work)
    173 + 						  unsigned int work_flags)
174 174   {
175     - 	return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND));
    175 + 	return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND));
176 176   }
177 177
178 178   static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
···
457 457   	}
458 458   }
459 459
    460 + static inline unsigned int __io_get_work_hash(unsigned int work_flags)
    461 + {
    462 + 	return work_flags >> IO_WQ_HASH_SHIFT;
    463 + }
    464 +
460 465   static inline unsigned int io_get_work_hash(struct io_wq_work *work)
461 466   {
462     - 	return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT;
    467 + 	return __io_get_work_hash(atomic_read(&work->flags));
463 468   }
464 469
465 470   static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
···
494 489   	struct io_wq *wq = worker->wq;
495 490
496 491   	wq_list_for_each(node, prev, &acct->work_list) {
    492 + 		unsigned int work_flags;
497 493   		unsigned int hash;
498 494
499 495   		work = container_of(node, struct io_wq_work, list);
500 496
501 497   		/* not hashed, can run anytime */
502     - 		if (!io_wq_is_hashed(work)) {
    498 + 		work_flags = atomic_read(&work->flags);
    499 + 		if (!__io_wq_is_hashed(work_flags)) {
503 500   			wq_list_del(&acct->work_list, node, prev);
504 501   			return work;
505 502   		}
506 503
507     - 		hash = io_get_work_hash(work);
    504 + 		hash = __io_get_work_hash(work_flags);
508 505   		/* all items with this hash lie in [work, tail] */
509 506   		tail = wq->hash_tail[hash];
510 507
···
603 596   		/* handle a whole dependent link */
604 597   		do {
605 598   			struct io_wq_work *next_hashed, *linked;
606     - 			unsigned int hash = io_get_work_hash(work);
    599 + 			unsigned int work_flags = atomic_read(&work->flags);
    600 + 			unsigned int hash = __io_get_work_hash(work_flags);
607 601
608 602   			next_hashed = wq_next_work(work);
609 603
610 604   			if (do_kill &&
611     - 			    (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND))
    605 + 			    (work_flags & IO_WQ_WORK_UNBOUND))
612 606   				atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
613 607   			wq->do_work(work);
614 608   			io_assign_current_work(worker, NULL);
···
925 917   	} while (work);
926 918   }
927 919
928     - static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, struct io_wq_work *work)
    920 + static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct,
    921 + 			      struct io_wq_work *work, unsigned int work_flags)
929 922   {
930 923   	unsigned int hash;
931 924   	struct io_wq_work *tail;
932 925
933     - 	if (!io_wq_is_hashed(work)) {
    926 + 	if (!__io_wq_is_hashed(work_flags)) {
934 927   append:
935 928   		wq_list_add_tail(&work->list, &acct->work_list);
936 929   		return;
937 930   	}
938 931
939     - 	hash = io_get_work_hash(work);
    932 + 	hash = __io_get_work_hash(work_flags);
940 933   	tail = wq->hash_tail[hash];
941 934   	wq->hash_tail[hash] = work;
942 935   	if (!tail)
···
953 944
954 945   void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
955 946   {
956     - 	struct io_wq_acct *acct = io_work_get_acct(wq, work);
957 947   	unsigned int work_flags = atomic_read(&work->flags);
    948 + 	struct io_wq_acct *acct = io_work_get_acct(wq, work_flags);
958 949   	struct io_cb_cancel_data match = {
959 950   		.fn = io_wq_work_match_item,
960 951   		.data = work,
···
973 964   	}
974 965
975 966   	raw_spin_lock(&acct->lock);
976     - 	io_wq_insert_work(wq, acct, work);
    967 + 	io_wq_insert_work(wq, acct, work, work_flags);
977 968   	clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
978 969   	raw_spin_unlock(&acct->lock);
979 970

+6 -1

io_uring/io-wq.h

···
 54  54   int io_wq_max_workers(struct io_wq *wq, int *new_count);
 55  55   bool io_wq_worker_stopped(void);
 56  56
     57 + static inline bool __io_wq_is_hashed(unsigned int work_flags)
     58 + {
     59 + 	return work_flags & IO_WQ_WORK_HASHED;
     60 + }
     61 +
 57  62   static inline bool io_wq_is_hashed(struct io_wq_work *work)
 58  63   {
 59     - 	return atomic_read(&work->flags) & IO_WQ_WORK_HASHED;
     64 + 	return __io_wq_is_hashed(atomic_read(&work->flags));
 60  65   }
 61  66
 62  67   typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

Configure Feed

Configure Feed