Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring/io-wq: move worker lists to struct io_wq_acct

Have separate linked lists for bounded and unbounded workers. This
way, io_acct_activate_free_worker() sees only workers relevant to it
and doesn't need to skip irrelevant ones. This speeds up the
linked list traversal (under acct->lock).

The `io_wq.lock` field is moved to `io_wq_acct.workers_lock`. It did
not actually protect "access to elements below", that is, not all of
them; it only protected access to the worker lists. By having two
locks instead of one, contention on this lock is reduced.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Link: https://lore.kernel.org/r/20250128133927.3989681-4-max.kellermann@ionos.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Max Kellermann and committed by Jens Axboe
751eedc4 3d3bafd3

+96 -66
+96 -66
io_uring/io-wq.c
··· 76 76 #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) 77 77 78 78 struct io_wq_acct { 79 + /** 80 + * Protects access to the worker lists. 81 + */ 82 + raw_spinlock_t workers_lock; 83 + 79 84 unsigned nr_workers; 80 85 unsigned max_workers; 81 86 atomic_t nr_running; 87 + 88 + /** 89 + * The list of free workers. Protected by #workers_lock 90 + * (write) and RCU (read). 91 + */ 92 + struct hlist_nulls_head free_list; 93 + 94 + /** 95 + * The list of all workers. Protected by #workers_lock 96 + * (write) and RCU (read). 97 + */ 98 + struct list_head all_list; 99 + 82 100 raw_spinlock_t lock; 83 101 struct io_wq_work_list work_list; 84 102 unsigned long flags; ··· 127 109 struct task_struct *task; 128 110 129 111 struct io_wq_acct acct[IO_WQ_ACCT_NR]; 130 - 131 - /* lock protects access to elements below */ 132 - raw_spinlock_t lock; 133 - 134 - struct hlist_nulls_head free_list; 135 - struct list_head all_list; 136 112 137 113 struct wait_queue_entry wait; 138 114 ··· 202 190 struct io_wq *wq = worker->wq; 203 191 204 192 atomic_dec(&acct->nr_running); 205 - raw_spin_lock(&wq->lock); 193 + raw_spin_lock(&acct->workers_lock); 206 194 acct->nr_workers--; 207 - raw_spin_unlock(&wq->lock); 195 + raw_spin_unlock(&acct->workers_lock); 208 196 io_worker_ref_put(wq); 209 197 clear_bit_unlock(0, &worker->create_state); 210 198 io_worker_release(worker); ··· 223 211 static void io_worker_exit(struct io_worker *worker) 224 212 { 225 213 struct io_wq *wq = worker->wq; 214 + struct io_wq_acct *acct = io_wq_get_acct(worker); 226 215 227 216 while (1) { 228 217 struct callback_head *cb = task_work_cancel_match(wq->task, ··· 237 224 io_worker_release(worker); 238 225 wait_for_completion(&worker->ref_done); 239 226 240 - raw_spin_lock(&wq->lock); 227 + raw_spin_lock(&acct->workers_lock); 241 228 if (test_bit(IO_WORKER_F_FREE, &worker->flags)) 242 229 hlist_nulls_del_rcu(&worker->nulls_node); 243 230 list_del_rcu(&worker->all_list); 244 - raw_spin_unlock(&wq->lock); 231 + 
raw_spin_unlock(&acct->workers_lock); 245 232 io_wq_dec_running(worker); 246 233 /* 247 234 * this worker is a goner, clear ->worker_private to avoid any ··· 280 267 * Check head of free list for an available worker. If one isn't available, 281 268 * caller must create one. 282 269 */ 283 - static bool io_wq_activate_free_worker(struct io_wq *wq, 284 - struct io_wq_acct *acct) 270 + static bool io_acct_activate_free_worker(struct io_wq_acct *acct) 285 271 __must_hold(RCU) 286 272 { 287 273 struct hlist_nulls_node *n; ··· 291 279 * activate. If a given worker is on the free_list but in the process 292 280 * of exiting, keep trying. 293 281 */ 294 - hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { 282 + hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) { 295 283 if (!io_worker_get(worker)) 296 284 continue; 297 - if (io_wq_get_acct(worker) != acct) { 298 - io_worker_release(worker); 299 - continue; 300 - } 301 285 /* 302 286 * If the worker is already running, it's either already 303 287 * starting work or finishing work. 
In either case, if it does ··· 320 312 if (unlikely(!acct->max_workers)) 321 313 pr_warn_once("io-wq is not configured for unbound workers"); 322 314 323 - raw_spin_lock(&wq->lock); 315 + raw_spin_lock(&acct->workers_lock); 324 316 if (acct->nr_workers >= acct->max_workers) { 325 - raw_spin_unlock(&wq->lock); 317 + raw_spin_unlock(&acct->workers_lock); 326 318 return true; 327 319 } 328 320 acct->nr_workers++; 329 - raw_spin_unlock(&wq->lock); 321 + raw_spin_unlock(&acct->workers_lock); 330 322 atomic_inc(&acct->nr_running); 331 323 atomic_inc(&wq->worker_refs); 332 324 return create_io_worker(wq, acct); ··· 350 342 worker = container_of(cb, struct io_worker, create_work); 351 343 wq = worker->wq; 352 344 acct = worker->acct; 353 - raw_spin_lock(&wq->lock); 345 + raw_spin_lock(&acct->workers_lock); 354 346 355 347 if (acct->nr_workers < acct->max_workers) { 356 348 acct->nr_workers++; 357 349 do_create = true; 358 350 } 359 - raw_spin_unlock(&wq->lock); 351 + raw_spin_unlock(&acct->workers_lock); 360 352 if (do_create) { 361 353 create_io_worker(wq, acct); 362 354 } else { ··· 435 427 * Worker will start processing some work. Move it to the busy list, if 436 428 * it's currently on the freelist 437 429 */ 438 - static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) 430 + static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker) 439 431 { 440 432 if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { 441 433 clear_bit(IO_WORKER_F_FREE, &worker->flags); 442 - raw_spin_lock(&wq->lock); 434 + raw_spin_lock(&acct->workers_lock); 443 435 hlist_nulls_del_init_rcu(&worker->nulls_node); 444 - raw_spin_unlock(&wq->lock); 436 + raw_spin_unlock(&acct->workers_lock); 445 437 } 446 438 } 447 439 448 440 /* 449 441 * No work, worker going to sleep. Move to freelist. 
450 442 */ 451 - static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) 452 - __must_hold(wq->lock) 443 + static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) 444 + __must_hold(acct->workers_lock) 453 445 { 454 446 if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { 455 447 set_bit(IO_WORKER_F_FREE, &worker->flags); 456 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 448 + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); 457 449 } 458 450 } 459 451 ··· 588 580 if (!work) 589 581 break; 590 582 591 - __io_worker_busy(wq, worker); 583 + __io_worker_busy(acct, worker); 592 584 593 585 io_assign_current_work(worker, work); 594 586 __set_current_state(TASK_RUNNING); ··· 659 651 while (io_acct_run_queue(acct)) 660 652 io_worker_handle_work(acct, worker); 661 653 662 - raw_spin_lock(&wq->lock); 654 + raw_spin_lock(&acct->workers_lock); 663 655 /* 664 656 * Last sleep timed out. Exit if we're not the last worker, 665 657 * or if someone modified our affinity. 
666 658 */ 667 659 if (last_timeout && (exit_mask || acct->nr_workers > 1)) { 668 660 acct->nr_workers--; 669 - raw_spin_unlock(&wq->lock); 661 + raw_spin_unlock(&acct->workers_lock); 670 662 __set_current_state(TASK_RUNNING); 671 663 break; 672 664 } 673 665 last_timeout = false; 674 - __io_worker_idle(wq, worker); 675 - raw_spin_unlock(&wq->lock); 666 + __io_worker_idle(acct, worker); 667 + raw_spin_unlock(&acct->workers_lock); 676 668 if (io_run_task_work()) 677 669 continue; 678 670 ret = schedule_timeout(WORKER_IDLE_TIMEOUT); ··· 733 725 io_wq_dec_running(worker); 734 726 } 735 727 736 - static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, 728 + static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker, 737 729 struct task_struct *tsk) 738 730 { 739 731 tsk->worker_private = worker; 740 732 worker->task = tsk; 741 733 set_cpus_allowed_ptr(tsk, wq->cpu_mask); 742 734 743 - raw_spin_lock(&wq->lock); 744 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); 745 - list_add_tail_rcu(&worker->all_list, &wq->all_list); 735 + raw_spin_lock(&acct->workers_lock); 736 + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); 737 + list_add_tail_rcu(&worker->all_list, &acct->all_list); 746 738 set_bit(IO_WORKER_F_FREE, &worker->flags); 747 - raw_spin_unlock(&wq->lock); 739 + raw_spin_unlock(&acct->workers_lock); 748 740 wake_up_new_task(tsk); 749 741 } 750 742 ··· 780 772 struct io_worker *worker; 781 773 struct task_struct *tsk; 782 774 struct io_wq *wq; 775 + struct io_wq_acct *acct; 783 776 784 777 worker = container_of(cb, struct io_worker, create_work); 785 778 clear_bit_unlock(0, &worker->create_state); 786 779 wq = worker->wq; 780 + acct = io_wq_get_acct(worker); 787 781 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 788 782 if (!IS_ERR(tsk)) { 789 - io_init_new_worker(wq, worker, tsk); 783 + io_init_new_worker(wq, acct, worker, tsk); 790 784 io_worker_release(worker); 
791 785 return; 792 786 } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 793 - struct io_wq_acct *acct = io_wq_get_acct(worker); 794 - 795 787 atomic_dec(&acct->nr_running); 796 - raw_spin_lock(&wq->lock); 788 + raw_spin_lock(&acct->workers_lock); 797 789 acct->nr_workers--; 798 790 if (!acct->nr_workers) { 799 791 struct io_cb_cancel_data match = { ··· 801 793 .cancel_all = true, 802 794 }; 803 795 804 - raw_spin_unlock(&wq->lock); 796 + raw_spin_unlock(&acct->workers_lock); 805 797 while (io_acct_cancel_pending_work(wq, acct, &match)) 806 798 ; 807 799 } else { 808 - raw_spin_unlock(&wq->lock); 800 + raw_spin_unlock(&acct->workers_lock); 809 801 } 810 802 io_worker_ref_put(wq); 811 803 kfree(worker); ··· 837 829 if (!worker) { 838 830 fail: 839 831 atomic_dec(&acct->nr_running); 840 - raw_spin_lock(&wq->lock); 832 + raw_spin_lock(&acct->workers_lock); 841 833 acct->nr_workers--; 842 - raw_spin_unlock(&wq->lock); 834 + raw_spin_unlock(&acct->workers_lock); 843 835 io_worker_ref_put(wq); 844 836 return false; 845 837 } ··· 852 844 853 845 tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); 854 846 if (!IS_ERR(tsk)) { 855 - io_init_new_worker(wq, worker, tsk); 847 + io_init_new_worker(wq, acct, worker, tsk); 856 848 } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { 857 849 kfree(worker); 858 850 goto fail; ··· 868 860 * Iterate the passed in list and call the specific function for each 869 861 * worker that isn't exiting 870 862 */ 871 - static bool io_wq_for_each_worker(struct io_wq *wq, 872 - bool (*func)(struct io_worker *, void *), 873 - void *data) 863 + static bool io_acct_for_each_worker(struct io_wq_acct *acct, 864 + bool (*func)(struct io_worker *, void *), 865 + void *data) 874 866 { 875 867 struct io_worker *worker; 876 868 bool ret = false; 877 869 878 - list_for_each_entry_rcu(worker, &wq->all_list, all_list) { 870 + list_for_each_entry_rcu(worker, &acct->all_list, all_list) { 879 871 if (io_worker_get(worker)) { 880 872 /* 
no task if node is/was offline */ 881 873 if (worker->task) ··· 887 879 } 888 880 889 881 return ret; 882 + } 883 + 884 + static bool io_wq_for_each_worker(struct io_wq *wq, 885 + bool (*func)(struct io_worker *, void *), 886 + void *data) 887 + { 888 + for (int i = 0; i < IO_WQ_ACCT_NR; i++) { 889 + if (!io_acct_for_each_worker(&wq->acct[i], func, data)) 890 + return false; 891 + } 892 + 893 + return true; 890 894 } 891 895 892 896 static bool io_wq_worker_wake(struct io_worker *worker, void *data) ··· 969 949 raw_spin_unlock(&acct->lock); 970 950 971 951 rcu_read_lock(); 972 - do_create = !io_wq_activate_free_worker(wq, acct); 952 + do_create = !io_acct_activate_free_worker(acct); 973 953 rcu_read_unlock(); 974 954 975 955 if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || ··· 980 960 if (likely(did_create)) 981 961 return; 982 962 983 - raw_spin_lock(&wq->lock); 963 + raw_spin_lock(&acct->workers_lock); 984 964 if (acct->nr_workers) { 985 - raw_spin_unlock(&wq->lock); 965 + raw_spin_unlock(&acct->workers_lock); 986 966 return; 987 967 } 988 - raw_spin_unlock(&wq->lock); 968 + raw_spin_unlock(&acct->workers_lock); 989 969 990 970 /* fatal condition, failed to create the first worker */ 991 971 io_acct_cancel_pending_work(wq, acct, &match); ··· 1092 1072 } 1093 1073 } 1094 1074 1075 + static void io_acct_cancel_running_work(struct io_wq_acct *acct, 1076 + struct io_cb_cancel_data *match) 1077 + { 1078 + raw_spin_lock(&acct->workers_lock); 1079 + io_acct_for_each_worker(acct, io_wq_worker_cancel, match); 1080 + raw_spin_unlock(&acct->workers_lock); 1081 + } 1082 + 1095 1083 static void io_wq_cancel_running_work(struct io_wq *wq, 1096 1084 struct io_cb_cancel_data *match) 1097 1085 { 1098 1086 rcu_read_lock(); 1099 - io_wq_for_each_worker(wq, io_wq_worker_cancel, match); 1087 + 1088 + for (int i = 0; i < IO_WQ_ACCT_NR; i++) 1089 + io_acct_cancel_running_work(&wq->acct[i], match); 1090 + 1100 1091 rcu_read_unlock(); 1101 1092 } 1102 1093 ··· 1130 1099 * as an 
indication that we attempt to signal cancellation. The 1131 1100 * completion will run normally in this case. 1132 1101 * 1133 - * Do both of these while holding the wq->lock, to ensure that 1102 + * Do both of these while holding the acct->workers_lock, to ensure that 1134 1103 * we'll find a work item regardless of state. 1135 1104 */ 1136 1105 io_wq_cancel_pending_work(wq, &match); 1137 1106 if (match.nr_pending && !match.cancel_all) 1138 1107 return IO_WQ_CANCEL_OK; 1139 1108 1140 - raw_spin_lock(&wq->lock); 1141 1109 io_wq_cancel_running_work(wq, &match); 1142 - raw_spin_unlock(&wq->lock); 1143 1110 if (match.nr_running && !match.cancel_all) 1144 1111 return IO_WQ_CANCEL_RUNNING; 1145 1112 ··· 1161 1132 struct io_wq_acct *acct = &wq->acct[i]; 1162 1133 1163 1134 if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) 1164 - io_wq_activate_free_worker(wq, acct); 1135 + io_acct_activate_free_worker(acct); 1165 1136 } 1166 1137 rcu_read_unlock(); 1167 1138 return 1; ··· 1200 1171 struct io_wq_acct *acct = &wq->acct[i]; 1201 1172 1202 1173 atomic_set(&acct->nr_running, 0); 1174 + 1175 + raw_spin_lock_init(&acct->workers_lock); 1176 + INIT_HLIST_NULLS_HEAD(&acct->free_list, 0); 1177 + INIT_LIST_HEAD(&acct->all_list); 1178 + 1203 1179 INIT_WQ_LIST(&acct->work_list); 1204 1180 raw_spin_lock_init(&acct->lock); 1205 1181 } 1206 - 1207 - raw_spin_lock_init(&wq->lock); 1208 - INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); 1209 - INIT_LIST_HEAD(&wq->all_list); 1210 1182 1211 1183 wq->task = get_task_struct(data->task); 1212 1184 atomic_set(&wq->worker_refs, 1); ··· 1394 1364 1395 1365 rcu_read_lock(); 1396 1366 1397 - raw_spin_lock(&wq->lock); 1398 1367 for (i = 0; i < IO_WQ_ACCT_NR; i++) { 1399 1368 acct = &wq->acct[i]; 1369 + raw_spin_lock(&acct->workers_lock); 1400 1370 prev[i] = max_t(int, acct->max_workers, prev[i]); 1401 1371 if (new_count[i]) 1402 1372 acct->max_workers = new_count[i]; 1373 + raw_spin_unlock(&acct->workers_lock); 1403 1374 } 1404 - 
raw_spin_unlock(&wq->lock); 1405 1375 rcu_read_unlock(); 1406 1376 1407 1377 for (i = 0; i < IO_WQ_ACCT_NR; i++)