Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-5.6-2020-02-28' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

- Fix for a race with IOPOLL used with SQPOLL (Xiaoguang)

- Only show ->fdinfo if procfs is enabled (Tobias)

- Fix for a chain with multiple personalities in the SQEs

- Fix for a missing free of personality idr on exit

- Removal of the spin-for-work optimization

- Fix for next work lookup on request completion

- Fix for non-vec read/write result propagation in case of links

- Fix for fileset references on switch

- Fix for recvmsg/sendmsg 32-bit compatibility mode

* tag 'io_uring-5.6-2020-02-28' of git://git.kernel.dk/linux-block:
io_uring: fix 32-bit compatability with sendmsg/recvmsg
io_uring: define and set show_fdinfo only if procfs is enabled
io_uring: drop file set ref put/get on switch
io_uring: import_single_range() returns 0/-ERROR
io_uring: pick up link work on submit reference drop
io-wq: ensure work->task_pid is cleared on init
io-wq: remove spin-for-work optimization
io_uring: fix poll_list race for SETUP_IOPOLL|SETUP_SQPOLL
io_uring: fix personality idr leak
io_uring: handle multiple personalities in link chains

+74 -91
-19
fs/io-wq.c
··· 535 535 } while (1); 536 536 } 537 537 538 - static inline void io_worker_spin_for_work(struct io_wqe *wqe) 539 - { 540 - int i = 0; 541 - 542 - while (++i < 1000) { 543 - if (io_wqe_run_queue(wqe)) 544 - break; 545 - if (need_resched()) 546 - break; 547 - cpu_relax(); 548 - } 549 - } 550 - 551 538 static int io_wqe_worker(void *data) 552 539 { 553 540 struct io_worker *worker = data; 554 541 struct io_wqe *wqe = worker->wqe; 555 542 struct io_wq *wq = wqe->wq; 556 - bool did_work; 557 543 558 544 io_worker_start(wqe, worker); 559 545 560 - did_work = false; 561 546 while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { 562 547 set_current_state(TASK_INTERRUPTIBLE); 563 548 loop: 564 - if (did_work) 565 - io_worker_spin_for_work(wqe); 566 549 spin_lock_irq(&wqe->lock); 567 550 if (io_wqe_run_queue(wqe)) { 568 551 __set_current_state(TASK_RUNNING); 569 552 io_worker_handle_work(worker); 570 - did_work = true; 571 553 goto loop; 572 554 } 573 - did_work = false; 574 555 /* drops the lock on success, retry */ 575 556 if (__io_worker_idle(wqe, worker)) { 576 557 __release(&wqe->lock);
+4 -10
fs/io-wq.h
··· 79 79 pid_t task_pid; 80 80 }; 81 81 82 - #define INIT_IO_WORK(work, _func) \ 83 - do { \ 84 - (work)->list.next = NULL; \ 85 - (work)->func = _func; \ 86 - (work)->files = NULL; \ 87 - (work)->mm = NULL; \ 88 - (work)->creds = NULL; \ 89 - (work)->fs = NULL; \ 90 - (work)->flags = 0; \ 91 - } while (0) \ 82 + #define INIT_IO_WORK(work, _func) \ 83 + do { \ 84 + *(work) = (struct io_wq_work){ .func = _func }; \ 85 + } while (0) \ 92 86 93 87 typedef void (get_work_fn)(struct io_wq_work *); 94 88 typedef void (put_work_fn)(struct io_wq_work *);
+70 -62
fs/io_uring.c
··· 183 183 struct file **files; 184 184 }; 185 185 186 - enum { 187 - FFD_F_ATOMIC, 188 - }; 189 - 190 186 struct fixed_file_data { 191 187 struct fixed_file_table *table; 192 188 struct io_ring_ctx *ctx; 193 189 194 190 struct percpu_ref refs; 195 191 struct llist_head put_llist; 196 - unsigned long state; 197 192 struct work_struct ref_work; 198 193 struct completion done; 199 194 }; ··· 1478 1483 __attribute__((nonnull)) 1479 1484 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr) 1480 1485 { 1481 - io_req_find_next(req, nxtptr); 1482 - 1483 - if (refcount_dec_and_test(&req->refs)) 1486 + if (refcount_dec_and_test(&req->refs)) { 1487 + io_req_find_next(req, nxtptr); 1484 1488 __io_free_req(req); 1489 + } 1485 1490 } 1486 1491 1487 1492 static void io_put_req(struct io_kiocb *req) ··· 1816 1821 list_add(&req->list, &ctx->poll_list); 1817 1822 else 1818 1823 list_add_tail(&req->list, &ctx->poll_list); 1824 + 1825 + if ((ctx->flags & IORING_SETUP_SQPOLL) && 1826 + wq_has_sleeper(&ctx->sqo_wait)) 1827 + wake_up(&ctx->sqo_wait); 1819 1828 } 1820 1829 1821 1830 static void io_file_put(struct io_submit_state *state) ··· 2070 2071 ssize_t ret; 2071 2072 ret = import_single_range(rw, buf, sqe_len, *iovec, iter); 2072 2073 *iovec = NULL; 2073 - return ret; 2074 + return ret < 0 ? 
ret : sqe_len; 2074 2075 } 2075 2076 2076 2077 if (req->io) { ··· 3001 3002 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3002 3003 sr->len = READ_ONCE(sqe->len); 3003 3004 3005 + #ifdef CONFIG_COMPAT 3006 + if (req->ctx->compat) 3007 + sr->msg_flags |= MSG_CMSG_COMPAT; 3008 + #endif 3009 + 3004 3010 if (!io || req->opcode == IORING_OP_SEND) 3005 3011 return 0; 3006 3012 /* iovec is already imported */ ··· 3157 3153 sr->msg_flags = READ_ONCE(sqe->msg_flags); 3158 3154 sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); 3159 3155 sr->len = READ_ONCE(sqe->len); 3156 + 3157 + #ifdef CONFIG_COMPAT 3158 + if (req->ctx->compat) 3159 + sr->msg_flags |= MSG_CMSG_COMPAT; 3160 + #endif 3160 3161 3161 3162 if (!io || req->opcode == IORING_OP_RECV) 3162 3163 return 0; ··· 4714 4705 { 4715 4706 struct io_kiocb *linked_timeout; 4716 4707 struct io_kiocb *nxt = NULL; 4708 + const struct cred *old_creds = NULL; 4717 4709 int ret; 4718 4710 4719 4711 again: 4720 4712 linked_timeout = io_prep_linked_timeout(req); 4713 + 4714 + if (req->work.creds && req->work.creds != current_cred()) { 4715 + if (old_creds) 4716 + revert_creds(old_creds); 4717 + if (old_creds == req->work.creds) 4718 + old_creds = NULL; /* restored original creds */ 4719 + else 4720 + old_creds = override_creds(req->work.creds); 4721 + } 4721 4722 4722 4723 ret = io_issue_sqe(req, sqe, &nxt, true); 4723 4724 ··· 4754 4735 4755 4736 err: 4756 4737 /* drop submission reference */ 4757 - io_put_req(req); 4738 + io_put_req_find_next(req, &nxt); 4758 4739 4759 4740 if (linked_timeout) { 4760 4741 if (!ret) ··· 4778 4759 goto punt; 4779 4760 goto again; 4780 4761 } 4762 + if (old_creds) 4763 + revert_creds(old_creds); 4781 4764 } 4782 4765 4783 4766 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) ··· 4824 4803 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, 4825 4804 struct io_submit_state *state, struct io_kiocb **link) 4826 4805 { 4827 - const struct cred 
*old_creds = NULL; 4828 4806 struct io_ring_ctx *ctx = req->ctx; 4829 4807 unsigned int sqe_flags; 4830 4808 int ret, id; ··· 4838 4818 4839 4819 id = READ_ONCE(sqe->personality); 4840 4820 if (id) { 4841 - const struct cred *personality_creds; 4842 - 4843 - personality_creds = idr_find(&ctx->personality_idr, id); 4844 - if (unlikely(!personality_creds)) { 4821 + req->work.creds = idr_find(&ctx->personality_idr, id); 4822 + if (unlikely(!req->work.creds)) { 4845 4823 ret = -EINVAL; 4846 4824 goto err_req; 4847 4825 } 4848 - old_creds = override_creds(personality_creds); 4826 + get_cred(req->work.creds); 4849 4827 } 4850 4828 4851 4829 /* same numerical values with corresponding REQ_F_*, safe to copy */ ··· 4855 4837 err_req: 4856 4838 io_cqring_add_event(req, ret); 4857 4839 io_double_put_req(req); 4858 - if (old_creds) 4859 - revert_creds(old_creds); 4860 4840 return false; 4861 4841 } 4862 4842 ··· 4915 4899 } 4916 4900 } 4917 4901 4918 - if (old_creds) 4919 - revert_creds(old_creds); 4920 4902 return true; 4921 4903 } 4922 4904 ··· 5095 5081 const struct cred *old_cred; 5096 5082 mm_segment_t old_fs; 5097 5083 DEFINE_WAIT(wait); 5098 - unsigned inflight; 5099 5084 unsigned long timeout; 5100 - int ret; 5085 + int ret = 0; 5101 5086 5102 5087 complete(&ctx->completions[1]); 5103 5088 ··· 5104 5091 set_fs(USER_DS); 5105 5092 old_cred = override_creds(ctx->creds); 5106 5093 5107 - ret = timeout = inflight = 0; 5094 + timeout = jiffies + ctx->sq_thread_idle; 5108 5095 while (!kthread_should_park()) { 5109 5096 unsigned int to_submit; 5110 5097 5111 - if (inflight) { 5098 + if (!list_empty(&ctx->poll_list)) { 5112 5099 unsigned nr_events = 0; 5113 5100 5114 - if (ctx->flags & IORING_SETUP_IOPOLL) { 5115 - /* 5116 - * inflight is the count of the maximum possible 5117 - * entries we submitted, but it can be smaller 5118 - * if we dropped some of them. If we don't have 5119 - * poll entries available, then we know that we 5120 - * have nothing left to poll for. 
Reset the 5121 - * inflight count to zero in that case. 5122 - */ 5123 - mutex_lock(&ctx->uring_lock); 5124 - if (!list_empty(&ctx->poll_list)) 5125 - io_iopoll_getevents(ctx, &nr_events, 0); 5126 - else 5127 - inflight = 0; 5128 - mutex_unlock(&ctx->uring_lock); 5129 - } else { 5130 - /* 5131 - * Normal IO, just pretend everything completed. 5132 - * We don't have to poll completions for that. 5133 - */ 5134 - nr_events = inflight; 5135 - } 5136 - 5137 - inflight -= nr_events; 5138 - if (!inflight) 5101 + mutex_lock(&ctx->uring_lock); 5102 + if (!list_empty(&ctx->poll_list)) 5103 + io_iopoll_getevents(ctx, &nr_events, 0); 5104 + else 5139 5105 timeout = jiffies + ctx->sq_thread_idle; 5106 + mutex_unlock(&ctx->uring_lock); 5140 5107 } 5141 5108 5142 5109 to_submit = io_sqring_entries(ctx); ··· 5145 5152 * more IO, we should wait for the application to 5146 5153 * reap events and wake us up. 5147 5154 */ 5148 - if (inflight || 5155 + if (!list_empty(&ctx->poll_list) || 5149 5156 (!time_after(jiffies, timeout) && ret != -EBUSY && 5150 5157 !percpu_ref_is_dying(&ctx->refs))) { 5151 5158 cond_resched(); ··· 5154 5161 5155 5162 prepare_to_wait(&ctx->sqo_wait, &wait, 5156 5163 TASK_INTERRUPTIBLE); 5164 + 5165 + /* 5166 + * While doing polled IO, before going to sleep, we need 5167 + * to check if there are new reqs added to poll_list, it 5168 + * is because reqs may have been punted to io worker and 5169 + * will be added to poll_list later, hence check the 5170 + * poll_list again. 
5171 + */ 5172 + if ((ctx->flags & IORING_SETUP_IOPOLL) && 5173 + !list_empty_careful(&ctx->poll_list)) { 5174 + finish_wait(&ctx->sqo_wait, &wait); 5175 + continue; 5176 + } 5157 5177 5158 5178 /* Tell userspace we may need a wakeup call */ 5159 5179 ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; ··· 5195 5189 mutex_lock(&ctx->uring_lock); 5196 5190 ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); 5197 5191 mutex_unlock(&ctx->uring_lock); 5198 - if (ret > 0) 5199 - inflight += ret; 5192 + timeout = jiffies + ctx->sq_thread_idle; 5200 5193 } 5201 5194 5202 5195 set_fs(old_fs); ··· 5600 5595 5601 5596 data = container_of(work, struct fixed_file_data, ref_work); 5602 5597 io_ring_file_ref_flush(data); 5603 - percpu_ref_get(&data->refs); 5604 5598 percpu_ref_switch_to_percpu(&data->refs); 5605 5599 } 5606 5600 ··· 5775 5771 { 5776 5772 struct fixed_file_data *data; 5777 5773 5774 + /* 5775 + * Juggle reference to ensure we hit zero, if needed, so we can 5776 + * switch back to percpu mode 5777 + */ 5778 5778 data = container_of(ref, struct fixed_file_data, refs); 5779 - clear_bit(FFD_F_ATOMIC, &data->state); 5779 + percpu_ref_put(&data->refs); 5780 + percpu_ref_get(&data->refs); 5780 5781 } 5781 5782 5782 5783 static bool io_queue_file_removal(struct fixed_file_data *data, ··· 5804 5795 llist_add(&pfile->llist, &data->put_llist); 5805 5796 5806 5797 if (pfile == &pfile_stack) { 5807 - if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) { 5808 - percpu_ref_put(&data->refs); 5809 - percpu_ref_switch_to_atomic(&data->refs, 5810 - io_atomic_switch); 5811 - } 5798 + percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch); 5812 5799 wait_for_completion(&done); 5813 5800 flush_work(&data->ref_work); 5814 5801 return false; ··· 5878 5873 up->offset++; 5879 5874 } 5880 5875 5881 - if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) { 5882 - percpu_ref_put(&data->refs); 5876 + if (ref_switch) 5883 5877 percpu_ref_switch_to_atomic(&data->refs, 
io_atomic_switch); 5884 - } 5885 5878 5886 5879 return done ? done : err; 5887 5880 } ··· 6337 6334 io_sqe_buffer_unregister(ctx); 6338 6335 io_sqe_files_unregister(ctx); 6339 6336 io_eventfd_unregister(ctx); 6337 + idr_destroy(&ctx->personality_idr); 6340 6338 6341 6339 #if defined(CONFIG_UNIX) 6342 6340 if (ctx->ring_sock) { ··· 6651 6647 return submitted ? submitted : ret; 6652 6648 } 6653 6649 6650 + #ifdef CONFIG_PROC_FS 6654 6651 static int io_uring_show_cred(int id, void *p, void *data) 6655 6652 { 6656 6653 const struct cred *cred = p; ··· 6725 6720 percpu_ref_put(&ctx->refs); 6726 6721 } 6727 6722 } 6723 + #endif 6728 6724 6729 6725 static const struct file_operations io_uring_fops = { 6730 6726 .release = io_uring_release, ··· 6737 6731 #endif 6738 6732 .poll = io_uring_poll, 6739 6733 .fasync = io_uring_fasync, 6734 + #ifdef CONFIG_PROC_FS 6740 6735 .show_fdinfo = io_uring_show_fdinfo, 6736 + #endif 6741 6737 }; 6742 6738 6743 6739 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,