Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'io_uring-6.9-20240322' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
"One patch just missed the initial pull, the rest are either fixes or
small cleanups that make our life easier for the next kernel:

- Fix a potential leak in error handling of pinned pages, and clean
it up (Gabriel, Pavel)

- Fix an issue with how read multishot returns retry (me)

- Fix a problem with waitid/futex removals, if we hit the case of
needing to remove all of them at exit time (me)

- Fix for a regression introduced in this merge window, where we
don't always have sr->done_io initialized if the ->prep_async()
path is used (me)

- Fix for SQPOLL setup error handling (me)

- Fix for a poll removal request being delayed (Pavel)

- Rename of a struct member which had a confusing name (Pavel)"

* tag 'io_uring-6.9-20240322' of git://git.kernel.dk/linux:
io_uring/sqpoll: early exit thread if task_context wasn't allocated
io_uring: clear opcode specific data for an early failure
io_uring/net: ensure async prep handlers always initialize ->done_io
io_uring/waitid: always remove waitid entry for cancel all
io_uring/futex: always remove futex entry for cancel all
io_uring: fix poll_remove stalled req completion
io_uring: Fix release of pinned pages when __io_uaddr_map fails
io_uring/kbuf: rename is_mapped
io_uring: simplify io_pages_free
io_uring: clean rings on NO_MMAP alloc fail
io_uring/rw: return IOU_ISSUE_SKIP_COMPLETE for multishot retry
io_uring: don't save/restore iowait state

+65 -49
+1
io_uring/futex.c
··· 159 159 hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { 160 160 if (!io_match_task_safe(req, task, cancel_all)) 161 161 continue; 162 + hlist_del_init(&req->hash_node); 162 163 __io_futex_cancel(ctx, req); 163 164 found = true; 164 165 }
+35 -28
io_uring/io_uring.c
··· 2181 2181 } 2182 2182 } 2183 2183 2184 + static __cold int io_init_fail_req(struct io_kiocb *req, int err) 2185 + { 2186 + /* ensure per-opcode data is cleared if we fail before prep */ 2187 + memset(&req->cmd.data, 0, sizeof(req->cmd.data)); 2188 + return err; 2189 + } 2190 + 2184 2191 static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, 2185 2192 const struct io_uring_sqe *sqe) 2186 2193 __must_hold(&ctx->uring_lock) ··· 2209 2202 2210 2203 if (unlikely(opcode >= IORING_OP_LAST)) { 2211 2204 req->opcode = 0; 2212 - return -EINVAL; 2205 + return io_init_fail_req(req, -EINVAL); 2213 2206 } 2214 2207 def = &io_issue_defs[opcode]; 2215 2208 if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { 2216 2209 /* enforce forwards compatibility on users */ 2217 2210 if (sqe_flags & ~SQE_VALID_FLAGS) 2218 - return -EINVAL; 2211 + return io_init_fail_req(req, -EINVAL); 2219 2212 if (sqe_flags & IOSQE_BUFFER_SELECT) { 2220 2213 if (!def->buffer_select) 2221 - return -EOPNOTSUPP; 2214 + return io_init_fail_req(req, -EOPNOTSUPP); 2222 2215 req->buf_index = READ_ONCE(sqe->buf_group); 2223 2216 } 2224 2217 if (sqe_flags & IOSQE_CQE_SKIP_SUCCESS) 2225 2218 ctx->drain_disabled = true; 2226 2219 if (sqe_flags & IOSQE_IO_DRAIN) { 2227 2220 if (ctx->drain_disabled) 2228 - return -EOPNOTSUPP; 2221 + return io_init_fail_req(req, -EOPNOTSUPP); 2229 2222 io_init_req_drain(req); 2230 2223 } 2231 2224 } 2232 2225 if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { 2233 2226 if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) 2234 - return -EACCES; 2227 + return io_init_fail_req(req, -EACCES); 2235 2228 /* knock it to the slow queue path, will be drained there */ 2236 2229 if (ctx->drain_active) 2237 2230 req->flags |= REQ_F_FORCE_ASYNC; ··· 2244 2237 } 2245 2238 2246 2239 if (!def->ioprio && sqe->ioprio) 2247 - return -EINVAL; 2240 + return io_init_fail_req(req, -EINVAL); 2248 2241 if (!def->iopoll && (ctx->flags & IORING_SETUP_IOPOLL)) 2249 - return -EINVAL; 2242 + return io_init_fail_req(req, -EINVAL); 2250 2243 2251 2244 if (def->needs_file) { 2252 2245 struct io_submit_state *state = &ctx->submit_state; ··· 2270 2263 2271 2264 req->creds = xa_load(&ctx->personalities, personality); 2272 2265 if (!req->creds) 2273 - return -EINVAL; 2266 + return io_init_fail_req(req, -EINVAL); 2274 2267 get_cred(req->creds); 2275 2268 ret = security_uring_override_creds(req->creds); 2276 2269 if (ret) { 2277 2270 put_cred(req->creds); 2278 - return ret; 2271 + return io_init_fail_req(req, ret); 2279 2272 } 2280 2273 req->flags |= REQ_F_CREDS; 2281 2274 } ··· 2546 2539 static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, 2547 2540 struct io_wait_queue *iowq) 2548 2541 { 2549 - int io_wait, ret; 2542 + int ret; 2550 2543 2551 2544 if (unlikely(READ_ONCE(ctx->check_cq))) 2552 2545 return 1; ··· 2564 2557 * can take into account that the task is waiting for IO - turns out 2565 2558 * to be important for low QD IO. 2566 2559 */ 2567 - io_wait = current->in_iowait; 2568 2560 if (current_pending_io()) 2569 2561 current->in_iowait = 1; 2570 2562 ret = 0; ··· 2571 2565 schedule(); 2572 2566 else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) 2573 2567 ret = -ETIME; 2574 - current->in_iowait = io_wait; 2568 + current->in_iowait = 0; 2575 2569 return ret; 2576 2570 } 2577 2571 ··· 2703 2697 2704 2698 static void io_pages_free(struct page ***pages, int npages) 2705 2699 { 2706 - struct page **page_array; 2700 + struct page **page_array = *pages; 2707 2701 int i; 2708 2702 2709 - if (!pages) 2710 - return; 2711 - 2712 - page_array = *pages; 2713 2703 if (!page_array) 2714 2704 return; 2715 2705 ··· 2721 2719 struct page **page_array; 2722 2720 unsigned int nr_pages; 2723 2721 void *page_addr; 2724 - int ret, i; 2722 + int ret, i, pinned; 2725 2723 2726 2724 *npages = 0; 2727 2725 ··· 2735 2733 if (!page_array) 2736 2734 return ERR_PTR(-ENOMEM); 2737 2735 2738 - ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 2739 - page_array); 2740 - if (ret != nr_pages) { 2741 - err: 2742 - io_pages_free(&page_array, ret > 0 ? ret : 0); 2743 - return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); 2736 + 2737 + pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, 2738 + page_array); 2739 + if (pinned != nr_pages) { 2740 + ret = (pinned < 0) ? pinned : -EFAULT; 2741 + goto free_pages; 2744 2742 } 2745 2743 2746 2744 page_addr = page_address(page_array[0]); ··· 2754 2752 * didn't support this feature. 2755 2753 */ 2756 2754 if (PageHighMem(page_array[i])) 2757 - goto err; 2755 + goto free_pages; 2758 2756 2759 2757 /* 2760 2758 * No support for discontig pages for now, should either be a ··· 2763 2761 * just fail them with EINVAL. 2764 2762 */ 2765 2763 if (page_address(page_array[i]) != page_addr) 2766 - goto err; 2764 + goto free_pages; 2767 2765 page_addr += PAGE_SIZE; 2768 2766 } 2769 2767 2770 2768 *pages = page_array; 2771 2769 *npages = nr_pages; 2772 2770 return page_to_virt(page_array[0]); 2771 + 2772 + free_pages: 2773 + io_pages_free(&page_array, pinned > 0 ? pinned : 0); 2774 + return ERR_PTR(ret); 2773 2775 } 2774 2776 2775 2777 static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, ··· 2795 2789 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { 2796 2790 io_mem_free(ctx->rings); 2797 2791 io_mem_free(ctx->sq_sqes); 2798 - ctx->rings = NULL; 2799 - ctx->sq_sqes = NULL; 2800 2792 } else { 2801 2793 io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); 2802 2794 ctx->n_ring_pages = 0; 2803 2795 io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); 2804 2796 ctx->n_sqe_pages = 0; 2805 2797 } 2798 + 2799 + ctx->rings = NULL; 2800 + ctx->sq_sqes = NULL; 2806 2801 } 2807 2802 2808 2803 void *io_mem_alloc(size_t size)
+10 -10
io_uring/kbuf.c
··· 199 199 200 200 bl = io_buffer_get_list(ctx, req->buf_index); 201 201 if (likely(bl)) { 202 - if (bl->is_mapped) 202 + if (bl->is_buf_ring) 203 203 ret = io_ring_buffer_select(req, len, bl, issue_flags); 204 204 else 205 205 ret = io_provided_buffer_select(req, len, bl); ··· 253 253 if (!nbufs) 254 254 return 0; 255 255 256 - if (bl->is_mapped) { 256 + if (bl->is_buf_ring) { 257 257 i = bl->buf_ring->tail - bl->head; 258 258 if (bl->is_mmap) { 259 259 /* ··· 274 274 } 275 275 /* make sure it's seen as empty */ 276 276 INIT_LIST_HEAD(&bl->buf_list); 277 - bl->is_mapped = 0; 277 + bl->is_buf_ring = 0; 278 278 return i; 279 279 } 280 280 ··· 361 361 if (bl) { 362 362 ret = -EINVAL; 363 363 /* can't use provide/remove buffers command on mapped buffers */ 364 - if (!bl->is_mapped) 364 + if (!bl->is_buf_ring) 365 365 ret = __io_remove_buffers(ctx, bl, p->nbufs); 366 366 } 367 367 io_ring_submit_unlock(ctx, issue_flags); ··· 519 519 } 520 520 } 521 521 /* can't add buffers via this command for a mapped buffer ring */ 522 - if (bl->is_mapped) { 522 + if (bl->is_buf_ring) { 523 523 ret = -EINVAL; 524 524 goto err; 525 525 } ··· 575 575 bl->buf_pages = pages; 576 576 bl->buf_nr_pages = nr_pages; 577 577 bl->buf_ring = br; 578 - bl->is_mapped = 1; 578 + bl->is_buf_ring = 1; 579 579 bl->is_mmap = 0; 580 580 return 0; 581 581 error_unpin: ··· 642 642 } 643 643 ibf->inuse = 1; 644 644 bl->buf_ring = ibf->mem; 645 - bl->is_mapped = 1; 645 + bl->is_buf_ring = 1; 646 646 bl->is_mmap = 1; 647 647 return 0; 648 648 } ··· 688 688 bl = io_buffer_get_list(ctx, reg.bgid); 689 689 if (bl) { 690 690 /* if mapped buffer ring OR classic exists, don't allow */ 691 - if (bl->is_mapped || !list_empty(&bl->buf_list)) 691 + if (bl->is_buf_ring || !list_empty(&bl->buf_list)) 692 692 return -EEXIST; 693 693 } else { 694 694 free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); ··· 730 730 bl = io_buffer_get_list(ctx, reg.bgid); 731 731 if (!bl) 732 732 return -ENOENT; 733 - if (!bl->is_mapped) 733 + if (!bl->is_buf_ring) 734 734 return -EINVAL; 735 735 736 736 __io_remove_buffers(ctx, bl, -1U); ··· 757 757 bl = io_buffer_get_list(ctx, buf_status.buf_group); 758 758 if (!bl) 759 759 return -ENOENT; 760 - if (!bl->is_mapped) 760 + if (!bl->is_buf_ring) 761 761 return -EINVAL; 762 762 763 763 buf_status.head = bl->head;
+1 -1
io_uring/kbuf.h
··· 26 26 __u16 mask; 27 27 28 28 /* ring mapped provided buffers */ 29 - __u8 is_mapped; 29 + __u8 is_buf_ring; 30 30 /* ring mapped provided buffers, but mmap'ed by application */ 31 31 __u8 is_mmap; 32 32 /* bl is visible from an RCU point of view for lookup */
+8 -1
io_uring/net.c
··· 326 326 struct io_async_msghdr *io; 327 327 int ret; 328 328 329 - if (!zc->addr || req_has_async_data(req)) 329 + if (req_has_async_data(req)) 330 + return 0; 331 + zc->done_io = 0; 332 + if (!zc->addr) 330 333 return 0; 331 334 io = io_msg_alloc_async_prep(req); 332 335 if (!io) ··· 356 353 357 354 int io_sendmsg_prep_async(struct io_kiocb *req) 358 355 { 356 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 359 357 int ret; 360 358 359 + sr->done_io = 0; 361 360 if (!io_msg_alloc_async_prep(req)) 362 361 return -ENOMEM; 363 362 ret = io_sendmsg_copy_hdr(req, req->async_data); ··· 613 608 614 609 int io_recvmsg_prep_async(struct io_kiocb *req) 615 610 { 611 + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); 616 612 struct io_async_msghdr *iomsg; 617 613 int ret; 618 614 615 + sr->done_io = 0; 619 616 if (!io_msg_alloc_async_prep(req)) 620 617 return -ENOMEM; 621 618 iomsg = req->async_data;
+2 -2
io_uring/poll.c
··· 996 996 struct io_hash_bucket *bucket; 997 997 struct io_kiocb *preq; 998 998 int ret2, ret = 0; 999 - struct io_tw_state ts = { .locked = true }; 1000 999 1001 1000 io_ring_submit_lock(ctx, issue_flags); 1002 1001 preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ··· 1044 1045 1045 1046 req_set_fail(preq); 1046 1047 io_req_set_res(preq, -ECANCELED, 0); 1047 - io_req_task_complete(preq, &ts); 1048 + preq->io_task_work.func = io_req_task_complete; 1049 + io_req_task_work_add(preq); 1048 1050 out: 1049 1051 io_ring_submit_unlock(ctx, issue_flags); 1050 1052 if (ret < 0) {
+2
io_uring/rw.c
··· 947 947 */ 948 948 if (io_kbuf_recycle(req, issue_flags)) 949 949 rw->len = 0; 950 + if (issue_flags & IO_URING_F_MULTISHOT) 951 + return IOU_ISSUE_SKIP_COMPLETE; 950 952 return -EAGAIN; 951 953 } 952 954
+5 -1
io_uring/sqpoll.c
··· 274 274 char buf[TASK_COMM_LEN]; 275 275 DEFINE_WAIT(wait); 276 276 277 + /* offload context creation failed, just exit */ 278 + if (!current->io_uring) 279 + goto err_out; 280 + 277 281 snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); 278 282 set_task_comm(current, buf); 279 283 ··· 375 371 atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); 376 372 io_run_task_work(); 377 373 mutex_unlock(&sqd->lock); 378 - 374 + err_out: 379 375 complete(&sqd->exited); 380 376 do_exit(0); 381 377 }
+1 -6
io_uring/waitid.c
··· 125 125 126 126 lockdep_assert_held(&req->ctx->uring_lock); 127 127 128 - /* 129 - * Did cancel find it meanwhile? 130 - */ 131 - if (hlist_unhashed(&req->hash_node)) 132 - return; 133 - 134 128 hlist_del_init(&req->hash_node); 135 129 136 130 ret = io_waitid_finish(req, ret); ··· 196 202 hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { 197 203 if (!io_match_task_safe(req, task, cancel_all)) 198 204 continue; 205 + hlist_del_init(&req->hash_node); 199 206 __io_waitid_cancel(ctx, req); 200 207 found = true; 201 208 }