Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-5.12-2021-03-05' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
"A bit of a mix between fallout from the worker change, cleanups and
reductions now possible from that change, and fixes in general. In
detail:

- Fully serialize manager and worker creation, fixing races due to
that.

- Clean up some naming that had gone stale.

- SQPOLL fixes.

- Fix race condition around task_work rework that went into this
merge window.

- Implement unshare. Used for when the original task does unshare(2)
or setuid/seteuid and friends, drops the original workers and forks
new ones.

- Drop the only remaining piece of state shuffling we had left, which
was cred. Move it into issue instead, and we can drop all of that
code too.

- Kill f_op->flush() usage. That was such a nasty hack that we had
out of necessity, we no longer need it.

- Following from ->flush() removal, we can also drop various bits of
ctx state related to SQPOLL and cancelations.

- Fix an issue with IOPOLL retry, which originally was fallout from a
filemap change (removing iov_iter_revert()), but uncovered an issue
with iovec re-import too late.

- Fix an issue with system suspend.

- Use xchg() for fallback work, instead of cmpxchg().

- Properly destroy io-wq on exec.

- Add create_io_thread() core helper, and use that in io-wq and
io_uring. This allows us to remove various silly completion events
related to thread setup.

- A few error handling fixes.

This should be the grunt of fixes necessary for the new workers, next
week should be quieter. We've got a pending series from Pavel on
cancelations, and how tasks and rings are indexed. Outside of that,
should just be minor fixes. Even with these fixes, we're still killing
a net ~80 lines"

* tag 'io_uring-5.12-2021-03-05' of git://git.kernel.dk/linux-block: (41 commits)
io_uring: don't restrict issue_flags for io_openat
io_uring: make SQPOLL thread parking saner
io-wq: kill hashed waitqueue before manager exits
io_uring: clear IOCB_WAITQ for non -EIOCBQUEUED return
io_uring: don't keep looping for more events if we can't flush overflow
io_uring: move to using create_io_thread()
kernel: provide create_io_thread() helper
io_uring: reliably cancel linked timeouts
io_uring: cancel-match based on flags
io-wq: ensure all pending work is canceled on exit
io_uring: ensure that threads freeze on suspend
io_uring: remove extra in_idle wake up
io_uring: inline __io_queue_async_work()
io_uring: inline io_req_clean_work()
io_uring: choose right tctx->io_wq for try cancel
io_uring: fix -EAGAIN retry with IOPOLL
io-wq: fix error path leak of buffered write hash map
io_uring: remove sqo_task
io_uring: kill sqo_dead and sqo submission halting
io_uring: ignore double poll add on the same waitqueue head
...

+362 -440
+121 -140
fs/io-wq.c
··· 16 16 #include <linux/rculist_nulls.h> 17 17 #include <linux/cpu.h> 18 18 #include <linux/tracehook.h> 19 + #include <linux/freezer.h> 19 20 20 21 #include "../kernel/sched/sched.h" 21 22 #include "io-wq.h" ··· 52 51 53 52 struct io_wq_work *cur_work; 54 53 spinlock_t lock; 55 - 56 - const struct cred *cur_creds; 57 - const struct cred *saved_creds; 58 54 59 55 struct completion ref_done; 60 56 ··· 115 117 struct io_wq_hash *hash; 116 118 117 119 refcount_t refs; 118 - struct completion done; 120 + struct completion exited; 121 + 122 + atomic_t worker_refs; 123 + struct completion worker_done; 119 124 120 125 struct hlist_node cpuhp_node; 121 126 ··· 126 125 }; 127 126 128 127 static enum cpuhp_state io_wq_online; 128 + 129 + struct io_cb_cancel_data { 130 + work_cancel_fn *fn; 131 + void *data; 132 + int nr_running; 133 + int nr_pending; 134 + bool cancel_all; 135 + }; 136 + 137 + static void io_wqe_cancel_pending_work(struct io_wqe *wqe, 138 + struct io_cb_cancel_data *match); 129 139 130 140 static bool io_worker_get(struct io_worker *worker) 131 141 { ··· 187 175 worker->flags = 0; 188 176 preempt_enable(); 189 177 190 - if (worker->saved_creds) { 191 - revert_creds(worker->saved_creds); 192 - worker->cur_creds = worker->saved_creds = NULL; 193 - } 194 - 195 178 raw_spin_lock_irq(&wqe->lock); 196 179 if (flags & IO_WORKER_F_FREE) 197 180 hlist_nulls_del_rcu(&worker->nulls_node); ··· 195 188 raw_spin_unlock_irq(&wqe->lock); 196 189 197 190 kfree_rcu(worker, rcu); 198 - io_wq_put(wqe->wq); 191 + if (atomic_dec_and_test(&wqe->wq->worker_refs)) 192 + complete(&wqe->wq->worker_done); 193 + do_exit(0); 199 194 } 200 195 201 196 static inline bool io_wqe_run_queue(struct io_wqe *wqe) ··· 272 263 io_wqe_wake_worker(wqe, acct); 273 264 } 274 265 275 - static void io_worker_start(struct io_worker *worker) 276 - { 277 - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); 278 - io_wqe_inc_running(worker); 279 - } 280 - 281 266 /* 282 267 * Worker will start processing some work. Move it to the busy list, if 283 268 * it's currently on the freelist ··· 321 318 if (!(worker->flags & IO_WORKER_F_FREE)) { 322 319 worker->flags |= IO_WORKER_F_FREE; 323 320 hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); 324 - } 325 - if (worker->saved_creds) { 326 - revert_creds(worker->saved_creds); 327 - worker->cur_creds = worker->saved_creds = NULL; 328 321 } 329 322 } 330 323 ··· 396 397 } 397 398 } 398 399 399 - static void io_wq_switch_creds(struct io_worker *worker, 400 - struct io_wq_work *work) 401 - { 402 - const struct cred *old_creds = override_creds(work->creds); 403 - 404 - worker->cur_creds = work->creds; 405 - if (worker->saved_creds) 406 - put_cred(old_creds); /* creds set by previous switch */ 407 - else 408 - worker->saved_creds = old_creds; 409 - } 410 - 411 400 static void io_assign_current_work(struct io_worker *worker, 412 401 struct io_wq_work *work) 413 402 { ··· 445 458 unsigned int hash = io_get_work_hash(work); 446 459 447 460 next_hashed = wq_next_work(work); 448 - if (work->creds && worker->cur_creds != work->creds) 449 - io_wq_switch_creds(worker, work); 450 461 wq->do_work(work); 451 462 io_assign_current_work(worker, NULL); 452 463 ··· 480 495 struct io_worker *worker = data; 481 496 struct io_wqe *wqe = worker->wqe; 482 497 struct io_wq *wq = wqe->wq; 498 + char buf[TASK_COMM_LEN]; 483 499 484 - io_worker_start(worker); 500 + worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); 501 + io_wqe_inc_running(worker); 502 + 503 + sprintf(buf, "iou-wrk-%d", wq->task_pid); 504 + set_task_comm(current, buf); 485 505 486 506 while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { 487 507 set_current_state(TASK_INTERRUPTIBLE); ··· 561 571 raw_spin_unlock_irq(&worker->wqe->lock); 562 572 } 563 573 564 - static int task_thread(void *data, int index) 565 - { 566 - struct io_worker *worker = data; 567 - struct io_wqe *wqe = worker->wqe; 568 - struct io_wqe_acct *acct = &wqe->acct[index]; 569 - struct io_wq *wq = wqe->wq; 570 - char buf[TASK_COMM_LEN]; 571 - 572 - sprintf(buf, "iou-wrk-%d", wq->task_pid); 573 - set_task_comm(current, buf); 574 - 575 - current->pf_io_worker = worker; 576 - worker->task = current; 577 - 578 - set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node)); 579 - current->flags |= PF_NO_SETAFFINITY; 580 - 581 - raw_spin_lock_irq(&wqe->lock); 582 - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); 583 - list_add_tail_rcu(&worker->all_list, &wqe->all_list); 584 - worker->flags |= IO_WORKER_F_FREE; 585 - if (index == IO_WQ_ACCT_BOUND) 586 - worker->flags |= IO_WORKER_F_BOUND; 587 - if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) 588 - worker->flags |= IO_WORKER_F_FIXED; 589 - acct->nr_workers++; 590 - raw_spin_unlock_irq(&wqe->lock); 591 - 592 - io_wqe_worker(data); 593 - do_exit(0); 594 - } 595 - 596 - static int task_thread_bound(void *data) 597 - { 598 - return task_thread(data, IO_WQ_ACCT_BOUND); 599 - } 600 - 601 - static int task_thread_unbound(void *data) 602 - { 603 - return task_thread(data, IO_WQ_ACCT_UNBOUND); 604 - } 605 - 606 - pid_t io_wq_fork_thread(int (*fn)(void *), void *arg) 607 - { 608 - unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| 609 - CLONE_IO|SIGCHLD; 610 - struct kernel_clone_args args = { 611 - .flags = ((lower_32_bits(flags) | CLONE_VM | 612 - CLONE_UNTRACED) & ~CSIGNAL), 613 - .exit_signal = (lower_32_bits(flags) & CSIGNAL), 614 - .stack = (unsigned long)fn, 615 - .stack_size = (unsigned long)arg, 616 - }; 617 - 618 - return kernel_clone(&args); 619 - } 620 - 621 574 static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) 622 575 { 576 + struct io_wqe_acct *acct = &wqe->acct[index]; 623 577 struct io_worker *worker; 624 - pid_t pid; 578 + struct task_struct *tsk; 625 579 626 580 __set_current_state(TASK_RUNNING); 627 581 ··· 579 645 spin_lock_init(&worker->lock); 580 646 init_completion(&worker->ref_done); 581 647 582 - refcount_inc(&wq->refs); 648 + atomic_inc(&wq->worker_refs); 583 649 584 - if (index == IO_WQ_ACCT_BOUND) 585 - pid = io_wq_fork_thread(task_thread_bound, worker); 586 - else 587 - pid = io_wq_fork_thread(task_thread_unbound, worker); 588 - if (pid < 0) { 589 - io_wq_put(wq); 650 + tsk = create_io_thread(io_wqe_worker, worker, wqe->node); 651 + if (IS_ERR(tsk)) { 652 + if (atomic_dec_and_test(&wq->worker_refs)) 653 + complete(&wq->worker_done); 590 654 kfree(worker); 591 655 return false; 592 656 } 657 + 658 + tsk->pf_io_worker = worker; 659 + worker->task = tsk; 660 + set_cpus_allowed_ptr(tsk, cpumask_of_node(wqe->node)); 661 + tsk->flags |= PF_NOFREEZE | PF_NO_SETAFFINITY; 662 + 663 + raw_spin_lock_irq(&wqe->lock); 664 + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); 665 + list_add_tail_rcu(&worker->all_list, &wqe->all_list); 666 + worker->flags |= IO_WORKER_F_FREE; 667 + if (index == IO_WQ_ACCT_BOUND) 668 + worker->flags |= IO_WORKER_F_BOUND; 669 + if (!acct->nr_workers && (worker->flags & IO_WORKER_F_BOUND)) 670 + worker->flags |= IO_WORKER_F_FIXED; 671 + acct->nr_workers++; 672 + raw_spin_unlock_irq(&wqe->lock); 673 + wake_up_new_task(tsk); 593 674 return true; 594 675 } 595 676 ··· 613 664 { 614 665 struct io_wqe_acct *acct = &wqe->acct[index]; 615 666 667 + if (acct->nr_workers && test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) 668 + return false; 616 669 /* if we have available workers or no work, no need */ 617 670 if (!hlist_nulls_empty(&wqe->free_list) || !io_wqe_run_queue(wqe)) 618 671 return false; ··· 648 697 649 698 static bool io_wq_worker_wake(struct io_worker *worker, void *data) 650 699 { 700 + set_notify_signal(worker->task); 651 701 wake_up_process(worker->task); 652 702 return false; 653 703 } ··· 677 725 } 678 726 } 679 727 728 + static bool io_wq_work_match_all(struct io_wq_work *work, void *data) 729 + { 730 + return true; 731 + } 732 + 733 + static void io_wq_cancel_pending(struct io_wq *wq) 734 + { 735 + struct io_cb_cancel_data match = { 736 + .fn = io_wq_work_match_all, 737 + .cancel_all = true, 738 + }; 739 + int node; 740 + 741 + for_each_node(node) 742 + io_wqe_cancel_pending_work(wq->wqes[node], &match); 743 + } 744 + 680 745 /* 681 746 * Manager thread. Tasked with creating new workers, if we need them. 682 747 */ ··· 701 732 { 702 733 struct io_wq *wq = data; 703 734 char buf[TASK_COMM_LEN]; 735 + int node; 704 736 705 737 sprintf(buf, "iou-mgr-%d", wq->task_pid); 706 738 set_task_comm(current, buf); 707 - current->flags |= PF_IO_WORKER; 708 - wq->manager = current; 709 - 710 - complete(&wq->done); 711 739 712 740 do { 713 741 set_current_state(TASK_INTERRUPTIBLE); 714 742 io_wq_check_workers(wq); 715 743 schedule_timeout(HZ); 744 + try_to_freeze(); 716 745 if (fatal_signal_pending(current)) 717 746 set_bit(IO_WQ_BIT_EXIT, &wq->state); 718 747 } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); 719 748 720 749 io_wq_check_workers(wq); 721 - wq->manager = NULL; 722 - io_wq_put(wq); 750 + 751 + rcu_read_lock(); 752 + for_each_node(node) 753 + io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); 754 + rcu_read_unlock(); 755 + 756 + /* we might not ever have created any workers */ 757 + if (atomic_read(&wq->worker_refs)) 758 + wait_for_completion(&wq->worker_done); 759 + 760 + spin_lock_irq(&wq->hash->wait.lock); 761 + for_each_node(node) 762 + list_del_init(&wq->wqes[node]->wait.entry); 763 + spin_unlock_irq(&wq->hash->wait.lock); 764 + 765 + io_wq_cancel_pending(wq); 766 + complete(&wq->exited); 723 767 do_exit(0); 724 768 } 725 769 ··· 769 787 770 788 static int io_wq_fork_manager(struct io_wq *wq) 771 789 { 772 - int ret; 790 + struct task_struct *tsk; 773 791 774 792 if (wq->manager) 775 793 return 0; 776 794 777 - clear_bit(IO_WQ_BIT_EXIT, &wq->state); 778 - refcount_inc(&wq->refs); 779 - current->flags |= PF_IO_WORKER; 780 - ret = io_wq_fork_thread(io_wq_manager, wq); 781 - current->flags &= ~PF_IO_WORKER; 782 - if (ret >= 0) { 783 - wait_for_completion(&wq->done); 795 + reinit_completion(&wq->worker_done); 796 + tsk = create_io_thread(io_wq_manager, wq, NUMA_NO_NODE); 797 + if (!IS_ERR(tsk)) { 798 + wq->manager = get_task_struct(tsk); 799 + wake_up_new_task(tsk); 784 800 return 0; 785 801 } 786 802 787 - io_wq_put(wq); 788 - return ret; 803 + return PTR_ERR(tsk); 789 804 } 790 805 791 806 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) ··· 792 813 unsigned long flags; 793 814 794 815 /* Can only happen if manager creation fails after exec */ 795 - if (unlikely(io_wq_fork_manager(wqe->wq))) { 816 + if (io_wq_fork_manager(wqe->wq) || 817 + test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state)) { 796 818 work->flags |= IO_WQ_WORK_CANCEL; 797 819 wqe->wq->do_work(work); 798 820 return; ··· 828 848 bit = hash_ptr(val, IO_WQ_HASH_ORDER); 829 849 work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); 830 850 } 831 - 832 - struct io_cb_cancel_data { 833 - work_cancel_fn *fn; 834 - void *data; 835 - int nr_running; 836 - int nr_pending; 837 - bool cancel_all; 838 - }; 839 851 840 852 static bool io_wq_worker_cancel(struct io_worker *worker, void *data) 841 853 { ··· 1015 1043 } 1016 1044 1017 1045 wq->task_pid = current->pid; 1018 - init_completion(&wq->done); 1046 + init_completion(&wq->exited); 1019 1047 refcount_set(&wq->refs, 1); 1048 + 1049 + init_completion(&wq->worker_done); 1050 + atomic_set(&wq->worker_refs, 0); 1020 1051 1021 1052 ret = io_wq_fork_manager(wq); 1022 1053 if (!ret) 1023 1054 return wq; 1024 1055 1025 - io_wq_put(wq); 1026 - io_wq_put_hash(data->hash); 1027 1056 err: 1057 + io_wq_put_hash(data->hash); 1028 1058 cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); 1029 1059 for_each_node(node) 1030 1060 kfree(wq->wqes[node]); ··· 1037 1063 return ERR_PTR(ret); 1038 1064 } 1039 1065 1066 + static void io_wq_destroy_manager(struct io_wq *wq) 1067 + { 1068 + if (wq->manager) { 1069 + wake_up_process(wq->manager); 1070 + wait_for_completion(&wq->exited); 1071 + put_task_struct(wq->manager); 1072 + wq->manager = NULL; 1073 + } 1074 + } 1075 + 1040 1076 static void io_wq_destroy(struct io_wq *wq) 1041 1077 { 1042 1078 int node; ··· 1054 1070 cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); 1055 1071 1056 1072 set_bit(IO_WQ_BIT_EXIT, &wq->state); 1057 - if (wq->manager) 1058 - wake_up_process(wq->manager); 1073 + io_wq_destroy_manager(wq); 1059 1074 1060 - rcu_read_lock(); 1061 - for_each_node(node) 1062 - io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); 1063 - rcu_read_unlock(); 1064 - 1065 - spin_lock_irq(&wq->hash->wait.lock); 1066 1075 for_each_node(node) { 1067 1076 struct io_wqe *wqe = wq->wqes[node]; 1068 - 1069 - list_del_init(&wqe->wait.entry); 1077 + WARN_ON_ONCE(!wq_list_empty(&wqe->work_list)); 1070 1078 kfree(wqe); 1071 1079 } 1072 - spin_unlock_irq(&wq->hash->wait.lock); 1073 1080 io_wq_put_hash(wq->hash); 1074 1081 kfree(wq->wqes); 1075 1082 kfree(wq); 1076 - 1077 1083 } 1078 1084 1079 1085 void io_wq_put(struct io_wq *wq) 1080 1086 { 1081 1087 if (refcount_dec_and_test(&wq->refs)) 1082 1088 io_wq_destroy(wq); 1089 + } 1090 + 1091 + void io_wq_put_and_exit(struct io_wq *wq) 1092 + { 1093 + set_bit(IO_WQ_BIT_EXIT, &wq->state); 1094 + io_wq_destroy_manager(wq); 1095 + io_wq_put(wq); 1083 1096 } 1084 1097 1085 1098 static bool io_wq_worker_affinity(struct io_worker *worker, void *data)
+2 -3
fs/io-wq.h
··· 79 79 80 80 struct io_wq_work { 81 81 struct io_wq_work_node list; 82 - const struct cred *creds; 83 82 unsigned flags; 83 + unsigned short personality; 84 84 }; 85 85 86 86 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) ··· 114 114 115 115 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); 116 116 void io_wq_put(struct io_wq *wq); 117 + void io_wq_put_and_exit(struct io_wq *wq); 117 118 118 119 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); 119 120 void io_wq_hash_work(struct io_wq_work *work, void *val); 120 - 121 - pid_t io_wq_fork_thread(int (*fn)(void *), void *arg); 122 121 123 122 static inline bool io_wq_is_hashed(struct io_wq_work *work) 124 123 {
+206 -296
fs/io_uring.c
··· 74 74 #include <linux/fsnotify.h> 75 75 #include <linux/fadvise.h> 76 76 #include <linux/eventpoll.h> 77 - #include <linux/fs_struct.h> 78 77 #include <linux/splice.h> 79 78 #include <linux/task_work.h> 80 79 #include <linux/pagemap.h> 81 80 #include <linux/io_uring.h> 82 - #include <linux/blk-cgroup.h> 83 - #include <linux/audit.h> 81 + #include <linux/freezer.h> 84 82 85 83 #define CREATE_TRACE_POINTS 86 84 #include <trace/events/io_uring.h> ··· 274 276 275 277 unsigned long state; 276 278 struct completion startup; 277 - struct completion completion; 279 + struct completion parked; 278 280 struct completion exited; 279 281 }; 280 282 ··· 336 338 unsigned int drain_next: 1; 337 339 unsigned int eventfd_async: 1; 338 340 unsigned int restricted: 1; 339 - unsigned int sqo_dead: 1; 340 341 unsigned int sqo_exec: 1; 341 342 342 343 /* ··· 376 379 struct io_submit_state submit_state; 377 380 378 381 struct io_rings *rings; 379 - 380 - /* 381 - * For SQPOLL usage 382 - */ 383 - struct task_struct *sqo_task; 384 382 385 383 /* Only used for accounting purposes */ 386 384 struct mm_struct *mm_account; ··· 680 688 REQ_F_POLLED_BIT, 681 689 REQ_F_BUFFER_SELECTED_BIT, 682 690 REQ_F_NO_FILE_TABLE_BIT, 683 - REQ_F_WORK_INITIALIZED_BIT, 684 691 REQ_F_LTIMEOUT_ACTIVE_BIT, 685 692 REQ_F_COMPLETE_INLINE_BIT, 686 693 ··· 703 712 704 713 /* fail rest of links */ 705 714 REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT), 706 - /* on inflight list */ 715 + /* on inflight list, should be cancelled and waited on exit reliably */ 707 716 REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), 708 717 /* read/write uses file position */ 709 718 REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), ··· 721 730 REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), 722 731 /* doesn't need file table for this request */ 723 732 REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), 724 - /* io_wq_work is initialized */ 725 - REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), 726 733 /* linked timeout is active, i.e. prepared by link's head */ 727 734 REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), 728 735 /* completion is deferred through io_comp_state */ ··· 1069 1080 return true; 1070 1081 1071 1082 io_for_each_link(req, head) { 1072 - if (!(req->flags & REQ_F_WORK_INITIALIZED)) 1073 - continue; 1074 - if (req->file && req->file->f_op == &io_uring_fops) 1083 + if (req->flags & REQ_F_INFLIGHT) 1075 1084 return true; 1076 1085 if (req->task->files == files) 1077 1086 return true; ··· 1081 1094 { 1082 1095 if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) 1083 1096 req->flags |= REQ_F_FAIL_LINK; 1084 - } 1085 - 1086 - static inline void __io_req_init_async(struct io_kiocb *req) 1087 - { 1088 - memset(&req->work, 0, sizeof(req->work)); 1089 - req->flags |= REQ_F_WORK_INITIALIZED; 1090 - } 1091 - 1092 - /* 1093 - * Note: must call io_req_init_async() for the first time you 1094 - * touch any members of io_wq_work. 1095 - */ 1096 - static inline void io_req_init_async(struct io_kiocb *req) 1097 - { 1098 - if (req->flags & REQ_F_WORK_INITIALIZED) 1099 - return; 1100 - 1101 - __io_req_init_async(req); 1102 1097 } 1103 1098 1104 1099 static void io_ring_ctx_ref_free(struct percpu_ref *ref) ··· 1165 1196 return false; 1166 1197 } 1167 1198 1168 - static void io_req_clean_work(struct io_kiocb *req) 1169 - { 1170 - if (!(req->flags & REQ_F_WORK_INITIALIZED)) 1171 - return; 1172 - 1173 - if (req->work.creds) { 1174 - put_cred(req->work.creds); 1175 - req->work.creds = NULL; 1176 - } 1177 - if (req->flags & REQ_F_INFLIGHT) { 1178 - struct io_ring_ctx *ctx = req->ctx; 1179 - struct io_uring_task *tctx = req->task->io_uring; 1180 - unsigned long flags; 1181 - 1182 - spin_lock_irqsave(&ctx->inflight_lock, flags); 1183 - list_del(&req->inflight_entry); 1184 - spin_unlock_irqrestore(&ctx->inflight_lock, flags); 1185 - req->flags &= ~REQ_F_INFLIGHT; 1186 - if (atomic_read(&tctx->in_idle)) 1187 - wake_up(&tctx->wait); 1188 - } 1189 - 1190 - req->flags &= ~REQ_F_WORK_INITIALIZED; 1191 - } 1192 - 1193 1199 static void io_req_track_inflight(struct io_kiocb *req) 1194 1200 { 1195 1201 struct io_ring_ctx *ctx = req->ctx; 1196 1202 1197 1203 if (!(req->flags & REQ_F_INFLIGHT)) { 1198 - io_req_init_async(req); 1199 1204 req->flags |= REQ_F_INFLIGHT; 1200 1205 1201 1206 spin_lock_irq(&ctx->inflight_lock); ··· 1183 1240 const struct io_op_def *def = &io_op_defs[req->opcode]; 1184 1241 struct io_ring_ctx *ctx = req->ctx; 1185 1242 1186 - io_req_init_async(req); 1187 - 1188 1243 if (req->flags & REQ_F_FORCE_ASYNC) 1189 1244 req->work.flags |= IO_WQ_WORK_CONCURRENT; 1190 1245 ··· 1193 1252 if (def->unbound_nonreg_file) 1194 1253 req->work.flags |= IO_WQ_WORK_UNBOUND; 1195 1254 } 1196 - if (!req->work.creds) 1197 - req->work.creds = get_current_cred(); 1198 1255 } 1199 1256 1200 1257 static void io_prep_async_link(struct io_kiocb *req) ··· 1203 1264 io_prep_async_work(cur); 1204 1265 } 1205 1266 1206 - static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) 1267 + static void io_queue_async_work(struct io_kiocb *req) 1207 1268 { 1208 1269 struct io_ring_ctx *ctx = req->ctx; 1209 1270 struct io_kiocb *link = io_prep_linked_timeout(req); ··· 1214 1275 1215 1276 trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, 1216 1277 &req->work, req->flags); 1217 - io_wq_enqueue(tctx->io_wq, &req->work); 1218 - return link; 1219 - } 1220 - 1221 - static void io_queue_async_work(struct io_kiocb *req) 1222 - { 1223 - struct io_kiocb *link; 1224 - 1225 1278 /* init ->work of the whole link before punting */ 1226 1279 io_prep_async_link(req); 1227 - link = __io_queue_async_work(req); 1228 - 1280 + io_wq_enqueue(tctx->io_wq, &req->work); 1229 1281 if (link) 1230 1282 io_queue_linked_timeout(link); 1231 1283 } ··· 1451 1521 return all_flushed; 1452 1522 } 1453 1523 1454 - static void io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, 1524 + static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force, 1455 1525 struct task_struct *tsk, 1456 1526 struct files_struct *files) 1457 1527 { 1528 + bool ret = true; 1529 + 1458 1530 if (test_bit(0, &ctx->cq_check_overflow)) { 1459 1531 /* iopoll syncs against uring_lock, not completion_lock */ 1460 1532 if (ctx->flags & IORING_SETUP_IOPOLL) 1461 1533 mutex_lock(&ctx->uring_lock); 1462 - __io_cqring_overflow_flush(ctx, force, tsk, files); 1534 + ret = __io_cqring_overflow_flush(ctx, force, tsk, files); 1463 1535 if (ctx->flags & IORING_SETUP_IOPOLL) 1464 1536 mutex_unlock(&ctx->uring_lock); 1465 1537 } 1538 + 1539 + return ret; 1466 1540 } 1467 1541 1468 1542 static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) ··· 1648 1714 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1649 1715 if (req->fixed_rsrc_refs) 1650 1716 percpu_ref_put(req->fixed_rsrc_refs); 1651 - io_req_clean_work(req); 1717 + 1718 + if (req->flags & REQ_F_INFLIGHT) { 1719 + struct io_ring_ctx *ctx = req->ctx; 1720 + unsigned long flags; 1721 + 1722 + spin_lock_irqsave(&ctx->inflight_lock, flags); 1723 + list_del(&req->inflight_entry); 1724 + spin_unlock_irqrestore(&ctx->inflight_lock, flags); 1725 + req->flags &= ~REQ_F_INFLIGHT; 1726 + } 1652 1727 } 1653 1728 1729 + /* must to be called somewhat shortly after putting a request */ 1654 1730 static inline void io_put_task(struct task_struct *task, int nr) 1655 1731 { 1656 1732 struct io_uring_task *tctx = task->io_uring; ··· 1744 1800 trace_io_uring_fail_link(req, link); 1745 1801 io_cqring_fill_event(link, -ECANCELED); 1746 1802 1747 - /* 1748 - * It's ok to free under spinlock as they're not linked anymore, 1749 - * but avoid REQ_F_WORK_INITIALIZED because it may deadlock on 1750 - * work.fs->lock. 1751 - */ 1752 - if (link->flags & REQ_F_WORK_INITIALIZED) 1753 - io_put_req_deferred(link, 2); 1754 - else 1755 - io_double_put_req(link); 1803 + io_put_req_deferred(link, 2); 1756 1804 link = nxt; 1757 1805 } 1758 1806 io_commit_cqring(ctx); ··· 1781 1845 return __io_req_find_next(req); 1782 1846 } 1783 1847 1848 + static void ctx_flush_and_put(struct io_ring_ctx *ctx) 1849 + { 1850 + if (!ctx) 1851 + return; 1852 + if (ctx->submit_state.comp.nr) { 1853 + mutex_lock(&ctx->uring_lock); 1854 + io_submit_flush_completions(&ctx->submit_state.comp, ctx); 1855 + mutex_unlock(&ctx->uring_lock); 1856 + } 1857 + percpu_ref_put(&ctx->refs); 1858 + } 1859 + 1784 1860 static bool __tctx_task_work(struct io_uring_task *tctx) 1785 1861 { 1786 1862 struct io_ring_ctx *ctx = NULL; ··· 1810 1862 node = list.first; 1811 1863 while (node) { 1812 1864 struct io_wq_work_node *next = node->next; 1813 - struct io_ring_ctx *this_ctx; 1814 1865 struct io_kiocb *req; 1815 1866 1816 1867 req = container_of(node, struct io_kiocb, io_task_work.node); 1817 - this_ctx = req->ctx; 1868 + if (req->ctx != ctx) { 1869 + ctx_flush_and_put(ctx); 1870 + ctx = req->ctx; 1871 + percpu_ref_get(&ctx->refs); 1872 + } 1873 + 1818 1874 req->task_work.func(&req->task_work); 1819 1875 node = next; 1820 - 1821 - if (!ctx) { 1822 - ctx = this_ctx; 1823 - } else if (ctx != this_ctx) { 1824 - mutex_lock(&ctx->uring_lock); 1825 - io_submit_flush_completions(&ctx->submit_state.comp, ctx); 1826 - mutex_unlock(&ctx->uring_lock); 1827 - ctx = this_ctx; 1828 - } 1829 1876 } 1830 1877 1831 - if (ctx && ctx->submit_state.comp.nr) { 1832 - mutex_lock(&ctx->uring_lock); 1833 - io_submit_flush_completions(&ctx->submit_state.comp, ctx); 1834 - mutex_unlock(&ctx->uring_lock); 1835 - } 1836 - 1878 + ctx_flush_and_put(ctx); 1837 1879 return list.first != NULL; 1838 1880 } 1839 1881 ··· 1831 1893 { 1832 1894 struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); 1833 1895 1896 + clear_bit(0, &tctx->task_state); 1897 + 1834 1898 while (__tctx_task_work(tctx)) 1835 1899 cond_resched(); 1836 - 1837 - clear_bit(0, &tctx->task_state); 1838 1900 } 1839 1901 1840 1902 static int io_task_work_add(struct task_struct *tsk, struct io_kiocb *req, ··· 1948 2010 1949 2011 /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ 1950 2012 mutex_lock(&ctx->uring_lock); 1951 - if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve) 2013 + if (!(current->flags & PF_EXITING) && !current->in_execve) 1952 2014 __io_queue_sqe(req); 1953 2015 else 1954 2016 __io_req_task_cancel(req, -EFAULT); ··· 2410 2472 return false; 2411 2473 return !io_setup_async_rw(req, iovec, inline_vecs, &iter, false); 2412 2474 } 2413 - #endif 2414 2475 2415 - static bool io_rw_reissue(struct io_kiocb *req) 2476 + static bool io_rw_should_reissue(struct io_kiocb *req) 2416 2477 { 2417 - #ifdef CONFIG_BLOCK 2418 2478 umode_t mode = file_inode(req->file)->i_mode; 2479 + struct io_ring_ctx *ctx = req->ctx; 2419 2480 2420 2481 if (!S_ISBLK(mode) && !S_ISREG(mode)) 2421 2482 return false; 2422 - if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker()) 2483 + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && 2484 + !(ctx->flags & IORING_SETUP_IOPOLL))) 2423 2485 return false; 2424 2486 /* 2425 2487 * If ref is dying, we might be running poll reap from the exit work. 2426 2488 * Don't attempt to reissue from that path, just let it fail with 2427 2489 * -EAGAIN. 2428 2490 */ 2429 - if (percpu_ref_is_dying(&req->ctx->refs)) 2491 + if (percpu_ref_is_dying(&ctx->refs)) 2492 + return false; 2493 + return true; 2494 + } 2495 + #endif 2496 + 2497 + static bool io_rw_reissue(struct io_kiocb *req) 2498 + { 2499 + #ifdef CONFIG_BLOCK 2500 + if (!io_rw_should_reissue(req)) 2430 2501 return false; 2431 2502 2432 2503 lockdep_assert_held(&req->ctx->uring_lock); ··· 2477 2530 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) 2478 2531 { 2479 2532 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); 2533 + 2534 + #ifdef CONFIG_BLOCK 2535 + /* Rewind iter, if we have one. iopoll path resubmits as usual */ 2536 + if (res == -EAGAIN && io_rw_should_reissue(req)) { 2537 + struct io_async_rw *rw = req->async_data; 2538 + 2539 + if (rw) 2540 + iov_iter_revert(&rw->iter, 2541 + req->result - iov_iter_count(&rw->iter)); 2542 + else if (!io_resubmit_prep(req)) 2543 + res = -EIO; 2544 + } 2545 + #endif 2480 2546 2481 2547 if (kiocb->ki_flags & IOCB_WRITE) 2482 2548 kiocb_end_write(req); ··· 3239 3279 ret = io_iter_do_read(req, iter); 3240 3280 3241 3281 if (ret == -EIOCBQUEUED) { 3282 + if (req->async_data) 3283 + iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3242 3284 goto out_free; 3243 3285 } else if (ret == -EAGAIN) { 3244 3286 /* IOPOLL retry should happen for io-wq threads */ ··· 3286 3324 if (ret == -EIOCBQUEUED) 3287 3325 return 0; 3288 3326 /* we got some bytes, but not all. retry. */ 3327 + kiocb->ki_flags &= ~IOCB_WAITQ; 3289 3328 } while (ret > 0 && ret < io_size); 3290 3329 done: 3291 3330 kiocb_done(kiocb, ret, issue_flags); ··· 3373 3410 /* no retry on NONBLOCK nor RWF_NOWAIT */ 3374 3411 if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) 3375 3412 goto done; 3413 + if (ret2 == -EIOCBQUEUED && req->async_data) 3414 + iov_iter_revert(iter, io_size - iov_iter_count(iter)); 3376 3415 if (!force_nonblock || ret2 != -EAGAIN) { 3377 3416 /* IOPOLL retry should happen for io-wq threads */ 3378 3417 if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) ··· 3553 3588 * Splice operation will be punted aync, and here need to 3554 3589 * modify io_wq_work.flags, so initialize io_wq_work firstly. 3555 3590 */ 3556 - io_req_init_async(req); 3557 3591 req->work.flags |= IO_WQ_WORK_UNBOUND; 3558 3592 } 3559 3593 ··· 3828 3864 3829 3865 static int io_openat(struct io_kiocb *req, unsigned int issue_flags) 3830 3866 { 3831 - return io_openat2(req, issue_flags & IO_URING_F_NONBLOCK); 3867 + return io_openat2(req, issue_flags); 3832 3868 } 3833 3869 3834 3870 static int io_remove_buffers_prep(struct io_kiocb *req, ··· 4967 5003 pt->error = -EINVAL; 4968 5004 return; 4969 5005 } 5006 + /* double add on the same waitqueue head, ignore */ 5007 + if (poll->head == head) 5008 + return; 4970 5009 poll = kmalloc(sizeof(*poll), GFP_ATOMIC); 4971 5010 if (!poll) { 4972 5011 pt->error = -ENOMEM; ··· 5505 5538 5506 5539 data->mode = io_translate_timeout_mode(flags); 5507 5540 hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode); 5541 + io_req_track_inflight(req); 5508 5542 return 0; 5509 5543 } 5510 5544 ··· 5913 5945 static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) 5914 5946 { 5915 5947 struct io_ring_ctx *ctx = req->ctx; 5948 + const struct cred *creds = NULL; 5916 5949 int ret; 5950 + 5951 + if (req->work.personality) { 5952 + const struct cred *new_creds; 5953 + 5954 + if (!(issue_flags & IO_URING_F_NONBLOCK)) 5955 + mutex_lock(&ctx->uring_lock); 5956 + new_creds = idr_find(&ctx->personality_idr, req->work.personality); 5957 + if (!(issue_flags & IO_URING_F_NONBLOCK)) 5958 + mutex_unlock(&ctx->uring_lock); 5959 + if (!new_creds) 5960 + return -EINVAL; 5961 + creds = override_creds(new_creds); 5962 + } 5917 5963 5918 5964 switch (req->opcode) { 5919 5965 case IORING_OP_NOP: ··· 6034 6052 ret = -EINVAL; 6035 6053 break; 6036 6054 } 6055 + 6056 + if (creds) 6057 + revert_creds(creds); 6037 6058 6038 6059 if (ret) 6039 6060 return ret; ··· 6201 6216 static void __io_queue_sqe(struct io_kiocb *req) 6202 6217 { 6203 6218 struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); 6204 - const struct cred *old_creds = NULL; 6205 6219 int ret; 6206 6220 6207 - if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && 6208 - req->work.creds != current_cred()) 6209 - old_creds = override_creds(req->work.creds); 6210 - 6211 6221 ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); 6212 - 6213 - if (old_creds) 6214 - revert_creds(old_creds); 6215 6222 6216 6223 /* 6217 6224 * We async punt it if the file wasn't marked NOWAIT, or if the file ··· 6291 6314 { 6292 6315 struct io_submit_state *state; 6293 6316 unsigned int sqe_flags; 6294 - int id, ret = 0; 6317 + int ret = 0; 6295 6318 6296 6319 req->opcode = READ_ONCE(sqe->opcode); 6297 6320 /* same numerical values with corresponding REQ_F_*, safe to copy */ ··· 6323 6346 !io_op_defs[req->opcode].buffer_select) 6324 6347 return -EOPNOTSUPP; 6325 6348 6326 - id = READ_ONCE(sqe->personality); 6327 - if (id) { 6328 - __io_req_init_async(req); 6329 - req->work.creds = idr_find(&ctx->personality_idr, id); 6330 - if (unlikely(!req->work.creds)) 6331 - return -EINVAL; 6332 - get_cred(req->work.creds); 6333 - } 6334 - 6349 + req->work.list.next = NULL; 6350 + req->work.flags = 0; 6351 + req->work.personality = READ_ONCE(sqe->personality); 6335 6352 state = &ctx->submit_state; 6336 6353 6337 6354 /* ··· 6587 6616 if (!list_empty(&ctx->iopoll_list)) 6588 6617 io_do_iopoll(ctx, &nr_events, 0); 6589 6618 6590 - if (to_submit && !ctx->sqo_dead && 6591 - likely(!percpu_ref_is_dying(&ctx->refs))) 6619 + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs))) 6592 6620 ret = io_submit_sqes(ctx, to_submit); 6593 6621 mutex_unlock(&ctx->uring_lock); 6594 6622 } ··· 6656 6686 * wait_task_inactive(). 6657 6687 */ 6658 6688 preempt_disable(); 6659 - complete(&sqd->completion); 6689 + complete(&sqd->parked); 6660 6690 schedule_preempt_disabled(); 6661 6691 preempt_enable(); 6662 6692 } ··· 6673 6703 6674 6704 sprintf(buf, "iou-sqp-%d", sqd->task_pid); 6675 6705 set_task_comm(current, buf); 6676 - sqd->thread = current; 6677 6706 current->pf_io_worker = NULL; 6678 6707 6679 6708 if (sqd->sq_cpu != -1) ··· 6680 6711 else 6681 6712 set_cpus_allowed_ptr(current, cpu_online_mask); 6682 6713 current->flags |= PF_NO_SETAFFINITY; 6683 - 6684 - complete(&sqd->completion); 6685 6714 6686 6715 wait_for_completion(&sqd->startup); 6687 6716 ··· 6737 6770 io_ring_set_wakeup_flag(ctx); 6738 6771 6739 6772 schedule(); 6773 + try_to_freeze(); 6740 6774 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) 6741 6775 io_ring_clear_wakeup_flag(ctx); 6742 6776 } ··· 6752 6784 io_run_task_work(); 6753 6785 6754 6786 /* 6755 - * Clear thread under lock so that concurrent parks work correctly 6787 + * Ensure that we park properly if racing with someone trying to park 6788 + * while we're exiting. If we fail to grab the lock, check park and 6789 + * park if necessary. The ordering with the park bit and the lock 6790 + * ensures that we catch this reliably. 6756 6791 */ 6757 - complete_all(&sqd->completion); 6758 - mutex_lock(&sqd->lock); 6792 + if (!mutex_trylock(&sqd->lock)) { 6793 + if (io_sq_thread_should_park(sqd)) 6794 + io_sq_thread_parkme(sqd); 6795 + mutex_lock(&sqd->lock); 6796 + } 6797 + 6759 6798 sqd->thread = NULL; 6760 6799 list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { 6761 6800 ctx->sqo_exec = 1; 6762 6801 io_ring_set_wakeup_flag(ctx); 6763 6802 } 6764 - mutex_unlock(&sqd->lock); 6765 6803 6766 6804 complete(&sqd->exited); 6805 + mutex_unlock(&sqd->lock); 6767 6806 do_exit(0); 6768 6807 } 6769 6808 ··· 6892 6917 iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); 6893 6918 trace_io_uring_cqring_wait(ctx, min_events); 6894 6919 do { 6895 - io_cqring_overflow_flush(ctx, false, NULL, NULL); 6920 + /* if we can't even flush overflow, don't wait for more */ 6921 + if (!io_cqring_overflow_flush(ctx, false, NULL, NULL)) { 6922 + ret = -EBUSY; 6923 + break; 6924 + } 6896 6925 prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, 6897 6926 TASK_INTERRUPTIBLE); 6898 6927 ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); 6899 6928 finish_wait(&ctx->wait, &iowq.wq); 6929 + cond_resched(); 6900 6930 } while (ret > 0); 6901 6931 6902 6932 restore_saved_sigmask_unless(ret == -EINTR); ··· 7071 7091 static void io_sq_thread_unpark(struct io_sq_data *sqd) 7072 7092 __releases(&sqd->lock) 7073 7093 { 7074 - if (!sqd->thread) 7075 - return; 7076 7094 if (sqd->thread == current) 7077 7095 return; 7078 7096 clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7079 - wake_up_state(sqd->thread, TASK_PARKED); 7097 + if (sqd->thread) 7098 + wake_up_state(sqd->thread, TASK_PARKED); 7080 7099 mutex_unlock(&sqd->lock); 7081 7100 } 7082 7101 7083 - static bool io_sq_thread_park(struct io_sq_data *sqd) 7102 + static void io_sq_thread_park(struct io_sq_data *sqd) 7084 7103 __acquires(&sqd->lock) 7085 7104 { 7086 7105 if (sqd->thread == current) 7087 - return true; 7088 - mutex_lock(&sqd->lock); 7089 - if (!sqd->thread) { 7090 - mutex_unlock(&sqd->lock); 7091 - return false; 7092 - } 7106 + return; 7093 7107 set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); 7094 - wake_up_process(sqd->thread); 7095 - wait_for_completion(&sqd->completion); 7096 - return true; 7108 + mutex_lock(&sqd->lock); 7109 + if (sqd->thread) { 7110 + wake_up_process(sqd->thread); 7111 + wait_for_completion(&sqd->parked); 7112 + } 7097 7113 } 7098 7114 7099 7115 static void io_sq_thread_stop(struct io_sq_data *sqd) 7100 7116 { 7101 - if (!sqd->thread) 7117 + if (test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)) 7102 7118 return; 7103 - 7104 - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7105 - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); 7106 - wake_up_process(sqd->thread); 7107 - wait_for_completion(&sqd->exited); 7119 + mutex_lock(&sqd->lock); 7120 + if (sqd->thread) { 7121 + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7122 + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); 7123 + wake_up_process(sqd->thread); 7124 + mutex_unlock(&sqd->lock); 7125 + wait_for_completion(&sqd->exited); 7126 + WARN_ON_ONCE(sqd->thread); 7127 + } else { 7128 + mutex_unlock(&sqd->lock); 7129 + } 7108 7130 } 7109 7131 7110 7132 static void io_put_sq_data(struct io_sq_data *sqd) ··· 7185 7203 mutex_init(&sqd->lock); 7186 7204 init_waitqueue_head(&sqd->wait); 7187 7205 init_completion(&sqd->startup); 7188 - init_completion(&sqd->completion); 7206 + init_completion(&sqd->parked); 7189 7207 init_completion(&sqd->exited); 7190 7208 return sqd; 7191 7209 } ··· 7816 7834 struct io_uring_task *tctx = tsk->io_uring; 7817 7835 7818 7836 WARN_ON_ONCE(!xa_empty(&tctx->xa)); 7837 + WARN_ON_ONCE(tctx->io_wq); 7838 + 7819 7839 percpu_counter_destroy(&tctx->inflight); 7820 7840 kfree(tctx); 7821 7841 tsk->io_uring = NULL; ··· 7825 7841 7826 7842 static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) 7827 7843 { 7844 + struct task_struct *tsk; 7828 7845 int ret; 7829 7846 7830 7847 clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7831 - reinit_completion(&sqd->completion); 7832 - ctx->sqo_dead = ctx->sqo_exec = 0; 7848 + reinit_completion(&sqd->parked); 7849 + ctx->sqo_exec = 0; 7833 7850 sqd->task_pid = current->pid; 7834 - current->flags |= PF_IO_WORKER; 7835 - ret = io_wq_fork_thread(io_sq_thread, sqd); 7836 - current->flags &= ~PF_IO_WORKER; 7837 - if (ret < 0) { 7838 - sqd->thread = NULL; 7839 - return ret; 7840 - } 7841 - wait_for_completion(&sqd->completion); 7842 - return io_uring_alloc_task_context(sqd->thread, ctx); 7851 + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 7852 + if (IS_ERR(tsk)) 7853 + return PTR_ERR(tsk); 7854 + ret = io_uring_alloc_task_context(tsk, ctx); 7855 + if (ret) 7856 + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7857 + sqd->thread = tsk; 7858 + wake_up_new_task(tsk); 7859 + return ret; 7843 7860 } 7844 7861 7845 7862 static int io_sq_offload_create(struct io_ring_ctx *ctx, ··· 7863 7878 fdput(f); 7864 7879 } 7865 7880 if (ctx->flags & IORING_SETUP_SQPOLL) { 7881 + struct task_struct *tsk; 7866 7882 struct io_sq_data *sqd; 7867 7883 7868 7884 ret = -EPERM; ··· 7905 7919 } 7906 7920 7907 7921 sqd->task_pid = current->pid; 7908 - current->flags |= PF_IO_WORKER; 7909 - ret = io_wq_fork_thread(io_sq_thread, sqd); 7910 - current->flags &= ~PF_IO_WORKER; 7911 - if (ret < 0) { 7912 - sqd->thread = NULL; 7922 + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); 7923 + if (IS_ERR(tsk)) { 7924 + ret = PTR_ERR(tsk); 7913 7925 goto err; 7914 7926 } 7915 - wait_for_completion(&sqd->completion); 7916 - ret = io_uring_alloc_task_context(sqd->thread, ctx); 7927 + ret = io_uring_alloc_task_context(tsk, ctx); 7928 + if (ret) 7929 + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); 7930 + sqd->thread = tsk; 7931 + wake_up_new_task(tsk); 7917 7932 if (ret) 7918 7933 goto err; 7919 7934 } else if (p->flags & IORING_SETUP_SQ_AFF) { ··· 7933 7946 { 7934 7947 struct io_sq_data *sqd = ctx->sq_data; 7935 7948 7949 + ctx->flags &= ~IORING_SETUP_R_DISABLED; 7936 7950 if (ctx->flags & IORING_SETUP_SQPOLL) 7937 7951 complete(&sqd->startup); 7938 7952 } ··· 8372 8384 } 8373 8385 } 8374 8386 8375 - static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk) 8387 + static void io_req_caches_free(struct io_ring_ctx *ctx) 8376 8388 { 8377 8389 struct io_submit_state *submit_state = &ctx->submit_state; 8378 8390 struct io_comp_state *cs = &ctx->submit_state.comp; ··· 8432 8444 8433 8445 percpu_ref_exit(&ctx->refs); 8434 8446 free_uid(ctx->user); 8435 - io_req_caches_free(ctx, NULL); 8447 + io_req_caches_free(ctx); 8436 8448 if (ctx->hash_map) 8437 8449 io_wq_put_hash(ctx->hash_map); 8438 8450 kfree(ctx->cancel_hash); ··· 8500 8512 return 0; 8501 8513 } 8502 8514 8503 - static void io_run_ctx_fallback(struct io_ring_ctx *ctx) 8515 + static bool io_run_ctx_fallback(struct io_ring_ctx *ctx) 8504 8516 { 8505 - struct callback_head *work, *head, *next; 8517 + struct callback_head *work, *next; 8518 + bool executed = false; 8506 8519 8507 8520 do { 8508 - do { 8509 - head = NULL; 8510 - work = READ_ONCE(ctx->exit_task_work); 8511 - } while (cmpxchg(&ctx->exit_task_work, work, head) != work); 8512 - 8521 + work = xchg(&ctx->exit_task_work, NULL); 8513 8522 if (!work) 8514 8523 break; 8515 8524 ··· 8516 8531 work = next; 8517 8532 cond_resched(); 8518 8533 } while (work); 8534 + executed = true; 8519 8535 } while (1); 8536 + 8537 + return executed; 8520 8538 } 8521 8539 8522 8540 static void io_ring_exit_work(struct work_struct *work) ··· 8535 8547 */ 8536 8548 do { 8537 8549 io_uring_try_cancel_requests(ctx, NULL, NULL); 8538 - io_run_ctx_fallback(ctx); 8539 8550 } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); 8540 8551 io_ring_ctx_free(ctx); 8541 8552 } ··· 8543 8556 { 8544 8557 mutex_lock(&ctx->uring_lock); 8545 8558 percpu_ref_kill(&ctx->refs); 8546 - 8547 - if (WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && !ctx->sqo_dead)) 8548 - ctx->sqo_dead = 1; 8549 - 8550 8559 /* if force is set, the ring is going away. always drop after that */ 8551 8560 ctx->cq_overflow_flushed = 1; 8552 8561 if (ctx->rings) ··· 8631 8648 struct files_struct *files) 8632 8649 { 8633 8650 struct io_task_cancel cancel = { .task = task, .files = files, }; 8634 - struct io_uring_task *tctx = current->io_uring; 8651 + struct task_struct *tctx_task = task ?: current; 8652 + struct io_uring_task *tctx = tctx_task->io_uring; 8635 8653 8636 8654 while (1) { 8637 8655 enum io_wq_cancel cret; ··· 8655 8671 ret |= io_poll_remove_all(ctx, task, files); 8656 8672 ret |= io_kill_timeouts(ctx, task, files); 8657 8673 ret |= io_run_task_work(); 8674 + ret |= io_run_ctx_fallback(ctx); 8658 8675 io_cqring_overflow_flush(ctx, true, task, files); 8659 8676 if (!ret) 8660 8677 break; ··· 8703 8718 } 8704 8719 } 8705 8720 8706 - static void io_disable_sqo_submit(struct io_ring_ctx *ctx) 8707 - { 8708 - mutex_lock(&ctx->uring_lock); 8709 - ctx->sqo_dead = 1; 8710 - mutex_unlock(&ctx->uring_lock); 8711 - 8712 - /* make sure callers enter the ring to get error */ 8713 - if (ctx->rings) 8714 - io_ring_set_wakeup_flag(ctx); 8715 - } 8716 - 8717 8721 /* 8718 8722 * We need to iteratively cancel requests, in case a request has dependent 8719 8723 * hard links. These persist even for failure of cancelations, hence keep ··· 8712 8738 struct files_struct *files) 8713 8739 { 8714 8740 struct task_struct *task = current; 8715 - bool did_park = false; 8716 8741 8717 8742 if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { 8718 - io_disable_sqo_submit(ctx); 8719 - did_park = io_sq_thread_park(ctx->sq_data); 8720 - if (did_park) { 8721 - task = ctx->sq_data->thread; 8722 - atomic_inc(&task->io_uring->in_idle); 8743 + /* never started, nothing to cancel */ 8744 + if (ctx->flags & IORING_SETUP_R_DISABLED) { 8745 + io_sq_offload_start(ctx); 8746 + return; 8723 8747 } 8748 + io_sq_thread_park(ctx->sq_data); 8749 + task = ctx->sq_data->thread; 8750 + if (task) 8751 + atomic_inc(&task->io_uring->in_idle); 8724 8752 } 8725 8753 8726 8754 io_cancel_defer_files(ctx, task, files); ··· 8731 8755 if (!files) 8732 8756 io_uring_try_cancel_requests(ctx, task, NULL); 8733 8757 8734 - if (did_park) { 8758 + if (task) 8735 8759 atomic_dec(&task->io_uring->in_idle); 8760 + if (ctx->sq_data) 8736 8761 io_sq_thread_unpark(ctx->sq_data); 8737 - } 8738 8762 } 8739 8763 8740 8764 /* ··· 8762 8786 fput(file); 8763 8787 return ret; 8764 8788 } 8765 - 8766 - /* one and only SQPOLL file note, held by sqo_task */ 8767 - WARN_ON_ONCE((ctx->flags & IORING_SETUP_SQPOLL) && 8768 - current != ctx->sqo_task); 8769 8789 } 8770 8790 tctx->last = file; 8771 8791 } ··· 8791 8819 fput(file); 8792 8820 } 8793 8821 8794 - static void io_uring_remove_task_files(struct io_uring_task *tctx) 8822 + static void io_uring_clean_tctx(struct io_uring_task *tctx) 8795 8823 { 8796 8824 struct file *file; 8797 8825 unsigned long index; 8798 8826 8799 8827 xa_for_each(&tctx->xa, index, file) 8800 8828 io_uring_del_task_file(file); 8829 + if (tctx->io_wq) { 8830 + io_wq_put_and_exit(tctx->io_wq); 8831 + tctx->io_wq = NULL; 8832 + } 8801 8833 } 8802 8834 8803 8835 void __io_uring_files_cancel(struct files_struct *files) ··· 8816 8840 io_uring_cancel_task_requests(file->private_data, files); 8817 8841 atomic_dec(&tctx->in_idle); 8818 8842 8819 - if (files) { 8820 - io_uring_remove_task_files(tctx); 8821 - if (tctx->io_wq) { 8822 - io_wq_put(tctx->io_wq); 8823 - tctx->io_wq = NULL; 8824 - } 8825 - } 8843 + if (files) 8844 + io_uring_clean_tctx(tctx); 8826 8845 } 8827 8846 8828 8847 static s64 tctx_inflight(struct io_uring_task *tctx) ··· 8834 8863 8835 8864 if (!sqd) 8836 8865 return; 8837 - io_disable_sqo_submit(ctx); 8838 - if (!io_sq_thread_park(sqd)) 8866 + io_sq_thread_park(sqd); 8867 + if (!sqd->thread || !sqd->thread->io_uring) { 8868 + io_sq_thread_unpark(sqd); 8839 8869 return; 8870 + } 8840 8871 tctx = ctx->sq_data->thread->io_uring; 8841 - 8842 8872 atomic_inc(&tctx->in_idle); 8843 8873 do { 8844 8874 /* read completions before cancelations */ ··· 8875 8903 /* make sure overflow events are dropped */ 8876 8904 atomic_inc(&tctx->in_idle); 8877 8905 8878 - /* trigger io_disable_sqo_submit() */ 8879 8906 if (tctx->sqpoll) { 8880 8907 struct file *file; 8881 8908 unsigned long index; ··· 8904 8933 8905 8934 atomic_dec(&tctx->in_idle); 8906 8935 8907 - io_uring_remove_task_files(tctx); 8908 - } 8909 - 8910 - static int io_uring_flush(struct file *file, void *data) 8911 - { 8912 - struct io_uring_task *tctx = current->io_uring; 8913 - struct io_ring_ctx *ctx = file->private_data; 8914 - 8915 - /* Ignore helper thread files exit */ 8916 - if (current->flags & PF_IO_WORKER) 8917 - return 0; 8918 - 8919 - if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { 8920 - io_uring_cancel_task_requests(ctx, NULL); 8921 - io_req_caches_free(ctx, current); 8922 - } 8923 - 8924 - io_run_ctx_fallback(ctx); 8925 - 8926 - if (!tctx) 8927 - return 0; 8928 - 8929 - /* we should have cancelled and erased it before PF_EXITING */ 8930 - WARN_ON_ONCE((current->flags & PF_EXITING) && 8931 - xa_load(&tctx->xa, (unsigned long)file)); 8932 - 8933 - /* 8934 - * fput() is pending, will be 2 if the only other ref is our potential 8935 - * task file note. If the task is exiting, drop regardless of count. 8936 - */ 8937 - if (atomic_long_read(&file->f_count) != 2) 8938 - return 0; 8939 - 8940 - if (ctx->flags & IORING_SETUP_SQPOLL) { 8941 - /* there is only one file note, which is owned by sqo_task */ 8942 - WARN_ON_ONCE(ctx->sqo_task != current && 8943 - xa_load(&tctx->xa, (unsigned long)file)); 8944 - /* sqo_dead check is for when this happens after cancellation */ 8945 - WARN_ON_ONCE(ctx->sqo_task == current && !ctx->sqo_dead && 8946 - !xa_load(&tctx->xa, (unsigned long)file)); 8947 - 8948 - io_disable_sqo_submit(ctx); 8949 - } 8950 - 8951 - if (!(ctx->flags & IORING_SETUP_SQPOLL) || ctx->sqo_task == current) 8952 - io_uring_del_task_file(file); 8953 - return 0; 8936 + io_uring_clean_tctx(tctx); 8937 + /* all current's requests should be gone, we can kill tctx */ 8938 + __io_uring_free(current); 8954 8939 } 8955 8940 8956 8941 static void *io_uring_validate_mmap_request(struct file *file, ··· 8987 9060 do { 8988 9061 if (!io_sqring_full(ctx)) 8989 9062 break; 8990 - 8991 9063 prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); 8992 - 8993 - if (unlikely(ctx->sqo_dead)) { 8994 - ret = -EOWNERDEAD; 8995 - goto out; 8996 - } 8997 9064 8998 9065 if (!io_sqring_full(ctx)) 8999 9066 break; 9000 - 9001 9067 schedule(); 9002 9068 } while (!signal_pending(current)); 9003 9069 9004 9070 finish_wait(&ctx->sqo_sq_wait, &wait); 9005 - out: 9006 9071 return ret; 9007 9072 } 9008 9073 ··· 9076 9157 ctx->sqo_exec = 0; 9077 9158 } 9078 9159 ret = -EOWNERDEAD; 9079 - if (unlikely(ctx->sqo_dead)) 9080 - goto out; 9081 9160 if (flags & IORING_ENTER_SQ_WAKEUP) 9082 9161 wake_up(&ctx->sq_data->wait); 9083 9162 if (flags & IORING_ENTER_SQ_WAIT) { ··· 9230 9313 9231 9314 static const struct file_operations io_uring_fops = { 9232 9315 .release = io_uring_release, 9233 - .flush = io_uring_flush, 9234 9316 .mmap = io_uring_mmap, 9235 9317 #ifndef CONFIG_MMU 9236 9318 .get_unmapped_area = io_uring_nommu_get_unmapped_area, ··· 9384 9468 ctx->compat = in_compat_syscall(); 9385 9469 if (!capable(CAP_IPC_LOCK)) 9386 9470 ctx->user = get_uid(current_user()); 9387 - ctx->sqo_task = current; 9388 9471 9389 9472 /* 9390 9473 * This is just grabbed for accounting purposes. When a process exits, ··· 9446 9531 */ 9447 9532 ret = io_uring_install_fd(ctx, file); 9448 9533 if (ret < 0) { 9449 - io_disable_sqo_submit(ctx); 9450 9534 /* fput will clean it up */ 9451 9535 fput(file); 9452 9536 return ret; ··· 9454 9540 trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); 9455 9541 return ret; 9456 9542 err: 9457 - io_disable_sqo_submit(ctx); 9458 9543 io_ring_ctx_wait_and_kill(ctx); 9459 9544 return ret; 9460 9545 } ··· 9621 9708 if (ctx->restrictions.registered) 9622 9709 ctx->restricted = 1; 9623 9710 9624 - ctx->flags &= ~IORING_SETUP_R_DISABLED; 9625 - 9626 9711 io_sq_offload_start(ctx); 9627 - 9628 9712 return 0; 9629 9713 } 9630 9714
+1 -1
include/linux/io_uring.h
··· 38 38 39 39 static inline void io_uring_task_cancel(void) 40 40 { 41 - if (current->io_uring && !xa_empty(&current->io_uring->xa)) 41 + if (current->io_uring) 42 42 __io_uring_task_cancel(); 43 43 } 44 44 static inline void io_uring_files_cancel(struct files_struct *files)
+2
include/linux/sched/task.h
··· 31 31 /* Number of elements in *set_tid */ 32 32 size_t set_tid_size; 33 33 int cgroup; 34 + int io_thread; 34 35 struct cgroup *cgrp; 35 36 struct css_set *cset; 36 37 }; ··· 83 82 extern void exit_itimers(struct signal_struct *); 84 83 85 84 extern pid_t kernel_clone(struct kernel_clone_args *kargs); 85 + struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); 86 86 struct task_struct *fork_idle(int); 87 87 struct mm_struct *copy_init_mm(void); 88 88 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
+30
kernel/fork.c
··· 1940 1940 p = dup_task_struct(current, node); 1941 1941 if (!p) 1942 1942 goto fork_out; 1943 + if (args->io_thread) 1944 + p->flags |= PF_IO_WORKER; 1943 1945 1944 1946 /* 1945 1947 * This _must_ happen before we call free_task(), i.e. before we jump ··· 2410 2408 struct mm_struct *copy_init_mm(void) 2411 2409 { 2412 2410 return dup_mm(NULL, &init_mm); 2411 + } 2412 + 2413 + /* 2414 + * This is like kernel_clone(), but shaved down and tailored to just 2415 + * creating io_uring workers. It returns a created task, or an error pointer. 2416 + * The returned task is inactive, and the caller must fire it up through 2417 + * wake_up_new_task(p). All signals are blocked in the created task. 2418 + */ 2419 + struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) 2420 + { 2421 + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| 2422 + CLONE_IO; 2423 + struct kernel_clone_args args = { 2424 + .flags = ((lower_32_bits(flags) | CLONE_VM | 2425 + CLONE_UNTRACED) & ~CSIGNAL), 2426 + .exit_signal = (lower_32_bits(flags) & CSIGNAL), 2427 + .stack = (unsigned long)fn, 2428 + .stack_size = (unsigned long)arg, 2429 + .io_thread = 1, 2430 + }; 2431 + struct task_struct *tsk; 2432 + 2433 + tsk = copy_process(NULL, 0, node, &args); 2434 + if (!IS_ERR(tsk)) { 2435 + sigfillset(&tsk->blocked); 2436 + sigdelsetmask(&tsk->blocked, sigmask(SIGKILL)); 2437 + } 2438 + return tsk; 2413 2439 } 2414 2440 2415 2441 /*