Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-5.8-2020-06-19' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

- Catch a case where io_sq_thread() didn't properly acquire the mm

- Ensure poll completions are reaped on shutdown

- Async cancellation and run fixes (Pavel)

- io-poll race fixes (Xiaoguang)

- Request cleanup race fix (Xiaoguang)

* tag 'io_uring-5.8-2020-06-19' of git://git.kernel.dk/linux-block:
io_uring: fix possible race condition against REQ_F_NEED_CLEANUP
io_uring: reap poll completions while waiting for refs to drop on exit
io_uring: acquire 'mm' for task_work for SQPOLL
io_uring: add memory barrier to synchronize io_kiocb's result and iopoll_completed
io_uring: don't fail links for EAGAIN error in IOPOLL mode
io_uring: cancel by ->task not pid
io_uring: lazy get task
io_uring: batch cancel in io_uring_cancel_files()
io_uring: cancel all task's requests on exit
io-wq: add an option to cancel all matched reqs
io-wq: reorder cancellation pending -> running
io_uring: fix lazy work init

+180 -115
+59 -55
fs/io-wq.c
··· 903 903 struct io_cb_cancel_data { 904 904 work_cancel_fn *fn; 905 905 void *data; 906 + int nr_running; 907 + int nr_pending; 908 + bool cancel_all; 906 909 }; 907 910 908 911 static bool io_wq_worker_cancel(struct io_worker *worker, void *data) 909 912 { 910 913 struct io_cb_cancel_data *match = data; 911 914 unsigned long flags; 912 - bool ret = false; 913 915 914 916 /* 915 917 * Hold the lock to avoid ->cur_work going out of scope, caller ··· 922 920 !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) && 923 921 match->fn(worker->cur_work, match->data)) { 924 922 send_sig(SIGINT, worker->task, 1); 925 - ret = true; 923 + match->nr_running++; 926 924 } 927 925 spin_unlock_irqrestore(&worker->lock, flags); 928 926 929 - return ret; 927 + return match->nr_running && !match->cancel_all; 930 928 } 931 929 932 - static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, 933 - struct io_cb_cancel_data *match) 930 + static void io_wqe_cancel_pending_work(struct io_wqe *wqe, 931 + struct io_cb_cancel_data *match) 934 932 { 935 933 struct io_wq_work_node *node, *prev; 936 934 struct io_wq_work *work; 937 935 unsigned long flags; 938 - bool found = false; 936 + 937 + retry: 938 + spin_lock_irqsave(&wqe->lock, flags); 939 + wq_list_for_each(node, prev, &wqe->work_list) { 940 + work = container_of(node, struct io_wq_work, list); 941 + if (!match->fn(work, match->data)) 942 + continue; 943 + 944 + wq_list_del(&wqe->work_list, node, prev); 945 + spin_unlock_irqrestore(&wqe->lock, flags); 946 + io_run_cancel(work, wqe); 947 + match->nr_pending++; 948 + if (!match->cancel_all) 949 + return; 950 + 951 + /* not safe to continue after unlock */ 952 + goto retry; 953 + } 954 + spin_unlock_irqrestore(&wqe->lock, flags); 955 + } 956 + 957 + static void io_wqe_cancel_running_work(struct io_wqe *wqe, 958 + struct io_cb_cancel_data *match) 959 + { 960 + rcu_read_lock(); 961 + io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); 962 + rcu_read_unlock(); 963 + } 964 + 965 + 
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, 966 + void *data, bool cancel_all) 967 + { 968 + struct io_cb_cancel_data match = { 969 + .fn = cancel, 970 + .data = data, 971 + .cancel_all = cancel_all, 972 + }; 973 + int node; 939 974 940 975 /* 941 976 * First check pending list, if we're lucky we can just remove it 942 977 * from there. CANCEL_OK means that the work is returned as-new, 943 978 * no completion will be posted for it. 944 979 */ 945 - spin_lock_irqsave(&wqe->lock, flags); 946 - wq_list_for_each(node, prev, &wqe->work_list) { 947 - work = container_of(node, struct io_wq_work, list); 980 + for_each_node(node) { 981 + struct io_wqe *wqe = wq->wqes[node]; 948 982 949 - if (match->fn(work, match->data)) { 950 - wq_list_del(&wqe->work_list, node, prev); 951 - found = true; 952 - break; 953 - } 954 - } 955 - spin_unlock_irqrestore(&wqe->lock, flags); 956 - 957 - if (found) { 958 - io_run_cancel(work, wqe); 959 - return IO_WQ_CANCEL_OK; 983 + io_wqe_cancel_pending_work(wqe, &match); 984 + if (match.nr_pending && !match.cancel_all) 985 + return IO_WQ_CANCEL_OK; 960 986 } 961 987 962 988 /* ··· 993 963 * as an indication that we attempt to signal cancellation. The 994 964 * completion will run normally in this case. 995 965 */ 996 - rcu_read_lock(); 997 - found = io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); 998 - rcu_read_unlock(); 999 - return found ? 
IO_WQ_CANCEL_RUNNING : IO_WQ_CANCEL_NOTFOUND; 1000 - } 1001 - 1002 - enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, 1003 - void *data) 1004 - { 1005 - struct io_cb_cancel_data match = { 1006 - .fn = cancel, 1007 - .data = data, 1008 - }; 1009 - enum io_wq_cancel ret = IO_WQ_CANCEL_NOTFOUND; 1010 - int node; 1011 - 1012 966 for_each_node(node) { 1013 967 struct io_wqe *wqe = wq->wqes[node]; 1014 968 1015 - ret = io_wqe_cancel_work(wqe, &match); 1016 - if (ret != IO_WQ_CANCEL_NOTFOUND) 1017 - break; 969 + io_wqe_cancel_running_work(wqe, &match); 970 + if (match.nr_running && !match.cancel_all) 971 + return IO_WQ_CANCEL_RUNNING; 1018 972 } 1019 973 1020 - return ret; 974 + if (match.nr_running) 975 + return IO_WQ_CANCEL_RUNNING; 976 + if (match.nr_pending) 977 + return IO_WQ_CANCEL_OK; 978 + return IO_WQ_CANCEL_NOTFOUND; 1021 979 } 1022 980 1023 981 static bool io_wq_io_cb_cancel_data(struct io_wq_work *work, void *data) ··· 1015 997 1016 998 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork) 1017 999 { 1018 - return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork); 1019 - } 1020 - 1021 - static bool io_wq_pid_match(struct io_wq_work *work, void *data) 1022 - { 1023 - pid_t pid = (pid_t) (unsigned long) data; 1024 - 1025 - return work->task_pid == pid; 1026 - } 1027 - 1028 - enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid) 1029 - { 1030 - void *data = (void *) (unsigned long) pid; 1031 - 1032 - return io_wq_cancel_cb(wq, io_wq_pid_match, data); 1000 + return io_wq_cancel_cb(wq, io_wq_io_cb_cancel_data, (void *)cwork, false); 1033 1001 } 1034 1002 1035 1003 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
+1 -3
fs/io-wq.h
··· 90 90 const struct cred *creds; 91 91 struct fs_struct *fs; 92 92 unsigned flags; 93 - pid_t task_pid; 94 93 }; 95 94 96 95 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) ··· 124 125 125 126 void io_wq_cancel_all(struct io_wq *wq); 126 127 enum io_wq_cancel io_wq_cancel_work(struct io_wq *wq, struct io_wq_work *cwork); 127 - enum io_wq_cancel io_wq_cancel_pid(struct io_wq *wq, pid_t pid); 128 128 129 129 typedef bool (work_cancel_fn)(struct io_wq_work *, void *); 130 130 131 131 enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, 132 - void *data); 132 + void *data, bool cancel_all); 133 133 134 134 struct task_struct *io_wq_get_task(struct io_wq *wq); 135 135
+120 -57
fs/io_uring.c
··· 541 541 REQ_F_NO_FILE_TABLE_BIT, 542 542 REQ_F_QUEUE_TIMEOUT_BIT, 543 543 REQ_F_WORK_INITIALIZED_BIT, 544 + REQ_F_TASK_PINNED_BIT, 544 545 545 546 /* not a real bit, just to check we're not overflowing the space */ 546 547 __REQ_F_LAST_BIT, ··· 599 598 REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), 600 599 /* io_wq_work is initialized */ 601 600 REQ_F_WORK_INITIALIZED = BIT(REQ_F_WORK_INITIALIZED_BIT), 601 + /* req->task is refcounted */ 602 + REQ_F_TASK_PINNED = BIT(REQ_F_TASK_PINNED_BIT), 602 603 }; 603 604 604 605 struct async_poll { ··· 913 910 } 914 911 EXPORT_SYMBOL(io_uring_get_socket); 915 912 913 + static void io_get_req_task(struct io_kiocb *req) 914 + { 915 + if (req->flags & REQ_F_TASK_PINNED) 916 + return; 917 + get_task_struct(req->task); 918 + req->flags |= REQ_F_TASK_PINNED; 919 + } 920 + 921 + /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */ 922 + static void __io_put_req_task(struct io_kiocb *req) 923 + { 924 + if (req->flags & REQ_F_TASK_PINNED) 925 + put_task_struct(req->task); 926 + } 927 + 916 928 static void io_file_put_work(struct work_struct *work); 917 929 918 930 /* ··· 1063 1045 } 1064 1046 spin_unlock(&current->fs->lock); 1065 1047 } 1066 - if (!req->work.task_pid) 1067 - req->work.task_pid = task_pid_vnr(current); 1068 1048 } 1069 1049 1070 1050 static inline void io_req_work_drop_env(struct io_kiocb *req) ··· 1103 1087 req->work.flags |= IO_WQ_WORK_UNBOUND; 1104 1088 } 1105 1089 1090 + io_req_init_async(req); 1106 1091 io_req_work_grab_env(req, def); 1107 1092 1108 1093 *link = io_prep_linked_timeout(req); ··· 1415 1398 kfree(req->io); 1416 1399 if (req->file) 1417 1400 io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE)); 1418 - if (req->task) 1419 - put_task_struct(req->task); 1420 - 1401 + __io_put_req_task(req); 1421 1402 io_req_work_drop_env(req); 1422 1403 } 1423 1404 ··· 1742 1727 return cflags; 1743 1728 } 1744 1729 1730 + static void io_iopoll_queue(struct list_head *again) 1731 + { 1732 + struct 
io_kiocb *req; 1733 + 1734 + do { 1735 + req = list_first_entry(again, struct io_kiocb, list); 1736 + list_del(&req->list); 1737 + refcount_inc(&req->refs); 1738 + io_queue_async_work(req); 1739 + } while (!list_empty(again)); 1740 + } 1741 + 1745 1742 /* 1746 1743 * Find and free completed poll iocbs 1747 1744 */ ··· 1762 1735 { 1763 1736 struct req_batch rb; 1764 1737 struct io_kiocb *req; 1738 + LIST_HEAD(again); 1739 + 1740 + /* order with ->result store in io_complete_rw_iopoll() */ 1741 + smp_rmb(); 1765 1742 1766 1743 rb.to_free = rb.need_iter = 0; 1767 1744 while (!list_empty(done)) { 1768 1745 int cflags = 0; 1769 1746 1770 1747 req = list_first_entry(done, struct io_kiocb, list); 1748 + if (READ_ONCE(req->result) == -EAGAIN) { 1749 + req->iopoll_completed = 0; 1750 + list_move_tail(&req->list, &again); 1751 + continue; 1752 + } 1771 1753 list_del(&req->list); 1772 1754 1773 1755 if (req->flags & REQ_F_BUFFER_SELECTED) ··· 1794 1758 if (ctx->flags & IORING_SETUP_SQPOLL) 1795 1759 io_cqring_ev_posted(ctx); 1796 1760 io_free_req_many(ctx, &rb); 1797 - } 1798 1761 1799 - static void io_iopoll_queue(struct list_head *again) 1800 - { 1801 - struct io_kiocb *req; 1802 - 1803 - do { 1804 - req = list_first_entry(again, struct io_kiocb, list); 1805 - list_del(&req->list); 1806 - refcount_inc(&req->refs); 1807 - io_queue_async_work(req); 1808 - } while (!list_empty(again)); 1762 + if (!list_empty(&again)) 1763 + io_iopoll_queue(&again); 1809 1764 } 1810 1765 1811 1766 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, ··· 1804 1777 { 1805 1778 struct io_kiocb *req, *tmp; 1806 1779 LIST_HEAD(done); 1807 - LIST_HEAD(again); 1808 1780 bool spin; 1809 1781 int ret; 1810 1782 ··· 1829 1803 if (!list_empty(&done)) 1830 1804 break; 1831 1805 1832 - if (req->result == -EAGAIN) { 1833 - list_move_tail(&req->list, &again); 1834 - continue; 1835 - } 1836 - if (!list_empty(&again)) 1837 - break; 1838 - 1839 1806 ret = kiocb->ki_filp->f_op->iopoll(kiocb, 
spin); 1840 1807 if (ret < 0) 1841 1808 break; ··· 1840 1821 1841 1822 if (!list_empty(&done)) 1842 1823 io_iopoll_complete(ctx, nr_events, &done); 1843 - 1844 - if (!list_empty(&again)) 1845 - io_iopoll_queue(&again); 1846 1824 1847 1825 return ret; 1848 1826 } ··· 1989 1973 if (kiocb->ki_flags & IOCB_WRITE) 1990 1974 kiocb_end_write(req); 1991 1975 1992 - if (res != req->result) 1976 + if (res != -EAGAIN && res != req->result) 1993 1977 req_set_fail_links(req); 1994 - req->result = res; 1995 - if (res != -EAGAIN) 1978 + 1979 + WRITE_ONCE(req->result, res); 1980 + /* order with io_poll_complete() checking ->result */ 1981 + if (res != -EAGAIN) { 1982 + smp_wmb(); 1996 1983 WRITE_ONCE(req->iopoll_completed, 1); 1984 + } 1997 1985 } 1998 1986 1999 1987 /* ··· 2670 2650 } 2671 2651 } 2672 2652 out_free: 2673 - kfree(iovec); 2674 - req->flags &= ~REQ_F_NEED_CLEANUP; 2653 + if (!(req->flags & REQ_F_NEED_CLEANUP)) 2654 + kfree(iovec); 2675 2655 return ret; 2676 2656 } 2677 2657 ··· 2793 2773 } 2794 2774 } 2795 2775 out_free: 2796 - req->flags &= ~REQ_F_NEED_CLEANUP; 2797 - kfree(iovec); 2776 + if (!(req->flags & REQ_F_NEED_CLEANUP)) 2777 + kfree(iovec); 2798 2778 return ret; 2799 2779 } 2800 2780 ··· 4256 4236 __io_queue_proc(&pt->req->apoll->poll, pt, head); 4257 4237 } 4258 4238 4239 + static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) 4240 + { 4241 + struct mm_struct *mm = current->mm; 4242 + 4243 + if (mm) { 4244 + kthread_unuse_mm(mm); 4245 + mmput(mm); 4246 + } 4247 + } 4248 + 4249 + static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, 4250 + struct io_kiocb *req) 4251 + { 4252 + if (io_op_defs[req->opcode].needs_mm && !current->mm) { 4253 + if (unlikely(!mmget_not_zero(ctx->sqo_mm))) 4254 + return -EFAULT; 4255 + kthread_use_mm(ctx->sqo_mm); 4256 + } 4257 + 4258 + return 0; 4259 + } 4260 + 4259 4261 static void io_async_task_func(struct callback_head *cb) 4260 4262 { 4261 4263 struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); ··· 
4312 4270 4313 4271 if (!canceled) { 4314 4272 __set_current_state(TASK_RUNNING); 4273 + if (io_sq_thread_acquire_mm(ctx, req)) { 4274 + io_cqring_add_event(req, -EFAULT); 4275 + goto end_req; 4276 + } 4315 4277 mutex_lock(&ctx->uring_lock); 4316 4278 __io_queue_sqe(req, NULL); 4317 4279 mutex_unlock(&ctx->uring_lock); 4318 4280 } else { 4319 4281 io_cqring_ev_posted(ctx); 4282 + end_req: 4320 4283 req_set_fail_links(req); 4321 4284 io_double_put_req(req); 4322 4285 } ··· 4413 4366 memcpy(&apoll->work, &req->work, sizeof(req->work)); 4414 4367 had_io = req->io != NULL; 4415 4368 4416 - get_task_struct(current); 4417 - req->task = current; 4369 + io_get_req_task(req); 4418 4370 req->apoll = apoll; 4419 4371 INIT_HLIST_NODE(&req->hash_node); 4420 4372 ··· 4601 4555 events = READ_ONCE(sqe->poll_events); 4602 4556 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; 4603 4557 4604 - get_task_struct(current); 4605 - req->task = current; 4558 + io_get_req_task(req); 4606 4559 return 0; 4607 4560 } 4608 4561 ··· 4817 4772 enum io_wq_cancel cancel_ret; 4818 4773 int ret = 0; 4819 4774 4820 - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr); 4775 + cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); 4821 4776 switch (cancel_ret) { 4822 4777 case IO_WQ_CANCEL_OK: 4823 4778 ret = 0; ··· 5862 5817 req->flags = 0; 5863 5818 /* one is dropped after submission, the other at completion */ 5864 5819 refcount_set(&req->refs, 2); 5865 - req->task = NULL; 5820 + req->task = current; 5866 5821 req->result = 0; 5867 5822 5868 5823 if (unlikely(req->opcode >= IORING_OP_LAST)) 5869 5824 return -EINVAL; 5870 5825 5871 - if (io_op_defs[req->opcode].needs_mm && !current->mm) { 5872 - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) 5873 - return -EFAULT; 5874 - kthread_use_mm(ctx->sqo_mm); 5875 - } 5826 + if (unlikely(io_sq_thread_acquire_mm(ctx, req))) 5827 + return -EFAULT; 5876 5828 5877 5829 sqe_flags = READ_ONCE(sqe->flags); 5878 5830 /* enforce 
forwards compatibility on users */ ··· 5976 5934 io_commit_sqring(ctx); 5977 5935 5978 5936 return submitted; 5979 - } 5980 - 5981 - static inline void io_sq_thread_drop_mm(struct io_ring_ctx *ctx) 5982 - { 5983 - struct mm_struct *mm = current->mm; 5984 - 5985 - if (mm) { 5986 - kthread_unuse_mm(mm); 5987 - mmput(mm); 5988 - } 5989 5937 } 5990 5938 5991 5939 static int io_sq_thread(void *data) ··· 7363 7331 if (ctx->rings) 7364 7332 io_cqring_overflow_flush(ctx, true); 7365 7333 7366 - wait_for_completion(&ctx->ref_comp); 7334 + /* 7335 + * If we're doing polled IO and end up having requests being 7336 + * submitted async (out-of-line), then completions can come in while 7337 + * we're waiting for refs to drop. We need to reap these manually, 7338 + * as nobody else will be looking for them. 7339 + */ 7340 + while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) { 7341 + io_iopoll_reap_events(ctx); 7342 + if (ctx->rings) 7343 + io_cqring_overflow_flush(ctx, true); 7344 + } 7367 7345 io_ring_ctx_free(ctx); 7368 7346 } 7369 7347 ··· 7407 7365 return 0; 7408 7366 } 7409 7367 7368 + static bool io_wq_files_match(struct io_wq_work *work, void *data) 7369 + { 7370 + struct files_struct *files = data; 7371 + 7372 + return work->files == files; 7373 + } 7374 + 7410 7375 static void io_uring_cancel_files(struct io_ring_ctx *ctx, 7411 7376 struct files_struct *files) 7412 7377 { 7378 + if (list_empty_careful(&ctx->inflight_list)) 7379 + return; 7380 + 7381 + /* cancel all at once, should be faster than doing it one by one*/ 7382 + io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); 7383 + 7413 7384 while (!list_empty_careful(&ctx->inflight_list)) { 7414 7385 struct io_kiocb *cancel_req = NULL, *req; 7415 7386 DEFINE_WAIT(wait); ··· 7478 7423 } 7479 7424 } 7480 7425 7426 + static bool io_cancel_task_cb(struct io_wq_work *work, void *data) 7427 + { 7428 + struct io_kiocb *req = container_of(work, struct io_kiocb, work); 7429 + struct task_struct *task = data; 
7430 + 7431 + return req->task == task; 7432 + } 7433 + 7481 7434 static int io_uring_flush(struct file *file, void *data) 7482 7435 { 7483 7436 struct io_ring_ctx *ctx = file->private_data; ··· 7496 7433 * If the task is going away, cancel work it may have pending 7497 7434 */ 7498 7435 if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) 7499 - io_wq_cancel_pid(ctx->io_wq, task_pid_vnr(current)); 7436 + io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, current, true); 7500 7437 7501 7438 return 0; 7502 7439 }