Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring

Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

This requires a bit of effort on the mmap lookup side, as the ctx
uring_lock isn't held, which otherwise protects buffer_lists from being
torn down, and it's not safe to grab from mmap context, as that would
introduce an ABBA deadlock between the mmap lock and the ctx uring_lock.
Instead, look up the buffer_list under RCU, as the list is RCU freed
already. Use the existing reference count to determine whether it's
possible to safely grab a reference to it (e.g. if it's not zero already),
and drop that reference when done with the mapping. If the mmap
reference is the last one, the buffer_list and the associated memory can
go away, since the vma insertion has references to the inserted pages at
that point.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+46 -156
-3
include/linux/io_uring_types.h
··· 372 372 373 373 struct list_head io_buffers_cache; 374 374 375 - /* deferred free list, protected by ->uring_lock */ 376 - struct hlist_head io_buf_list; 377 - 378 375 /* Keep this last, we don't need it for the fast path */ 379 376 struct wait_queue_head poll_wq; 380 377 struct io_restriction restrictions;
+16 -42
io_uring/io_uring.c
··· 303 303 INIT_LIST_HEAD(&ctx->sqd_list); 304 304 INIT_LIST_HEAD(&ctx->cq_overflow_list); 305 305 INIT_LIST_HEAD(&ctx->io_buffers_cache); 306 - INIT_HLIST_HEAD(&ctx->io_buf_list); 307 306 ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, 308 307 sizeof(struct io_rsrc_node)); 309 308 ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, ··· 2597 2598 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; 2598 2599 } 2599 2600 2600 - static void io_pages_unmap(void *ptr, struct page ***pages, 2601 - unsigned short *npages) 2601 + void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, 2602 + bool put_pages) 2602 2603 { 2603 2604 bool do_vunmap = false; 2604 2605 2605 2606 if (!ptr) 2606 2607 return; 2607 2608 2608 - if (*npages) { 2609 + if (put_pages && *npages) { 2609 2610 struct page **to_free = *pages; 2610 2611 int i; 2611 2612 ··· 2625 2626 kvfree(*pages); 2626 2627 *pages = NULL; 2627 2628 *npages = 0; 2628 - } 2629 - 2630 - void io_mem_free(void *ptr) 2631 - { 2632 - if (!ptr) 2633 - return; 2634 - 2635 - folio_put(virt_to_folio(ptr)); 2636 2629 } 2637 2630 2638 2631 static void io_pages_free(struct page ***pages, int npages) ··· 2721 2730 static void io_rings_free(struct io_ring_ctx *ctx) 2722 2731 { 2723 2732 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { 2724 - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); 2725 - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); 2733 + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, 2734 + true); 2735 + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, 2736 + true); 2726 2737 } else { 2727 2738 io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); 2728 2739 ctx->n_ring_pages = 0; ··· 2781 2788 return ERR_PTR(-ENOMEM); 2782 2789 } 2783 2790 2784 - static void *io_pages_map(struct page ***out_pages, unsigned short *npages, 2785 - size_t size) 2791 + void *io_pages_map(struct page 
***out_pages, unsigned short *npages, 2792 + size_t size) 2786 2793 { 2787 2794 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; 2788 2795 struct page **pages; ··· 2810 2817 *out_pages = NULL; 2811 2818 *npages = 0; 2812 2819 return ret; 2813 - } 2814 - 2815 - void *io_mem_alloc(size_t size) 2816 - { 2817 - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; 2818 - void *ret; 2819 - 2820 - ret = (void *) __get_free_pages(gfp, get_order(size)); 2821 - if (ret) 2822 - return ret; 2823 - return ERR_PTR(-ENOMEM); 2824 2820 } 2825 2821 2826 2822 static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, ··· 2908 2926 ctx->mm_account = NULL; 2909 2927 } 2910 2928 io_rings_free(ctx); 2911 - io_kbuf_mmap_list_free(ctx); 2912 2929 2913 2930 percpu_ref_exit(&ctx->refs); 2914 2931 free_uid(ctx->user); ··· 3377 3396 { 3378 3397 struct io_ring_ctx *ctx = file->private_data; 3379 3398 loff_t offset = pgoff << PAGE_SHIFT; 3380 - struct page *page; 3381 - void *ptr; 3382 3399 3383 - switch (offset & IORING_OFF_MMAP_MASK) { 3400 + switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { 3384 3401 case IORING_OFF_SQ_RING: 3385 3402 case IORING_OFF_CQ_RING: 3386 3403 /* Don't allow mmap if the ring was setup without it */ ··· 3393 3414 case IORING_OFF_PBUF_RING: { 3394 3415 struct io_buffer_list *bl; 3395 3416 unsigned int bgid; 3417 + void *ptr; 3396 3418 3397 3419 bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 3398 3420 bl = io_pbuf_get_bl(ctx, bgid); ··· 3401 3421 return bl; 3402 3422 ptr = bl->buf_ring; 3403 3423 io_put_bl(ctx, bl); 3404 - break; 3424 + return ptr; 3405 3425 } 3406 - default: 3407 - return ERR_PTR(-EINVAL); 3408 3426 } 3409 3427 3410 - page = virt_to_head_page(ptr); 3411 - if (sz > page_size(page)) 3412 - return ERR_PTR(-EINVAL); 3413 - 3414 - return ptr; 3428 + return ERR_PTR(-EINVAL); 3415 3429 } 3416 3430 3417 3431 int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct 
*vma, ··· 3424 3450 struct io_ring_ctx *ctx = file->private_data; 3425 3451 size_t sz = vma->vm_end - vma->vm_start; 3426 3452 long offset = vma->vm_pgoff << PAGE_SHIFT; 3427 - unsigned long pfn; 3428 3453 void *ptr; 3429 3454 3430 3455 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); ··· 3438 3465 case IORING_OFF_SQES: 3439 3466 return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, 3440 3467 ctx->n_sqe_pages); 3468 + case IORING_OFF_PBUF_RING: 3469 + return io_pbuf_mmap(file, vma); 3441 3470 } 3442 3471 3443 - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 3444 - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); 3472 + return -EINVAL; 3445 3473 } 3446 3474 3447 3475 static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
+4 -2
io_uring/io_uring.h
··· 109 109 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, 110 110 bool cancel_all); 111 111 112 - void *io_mem_alloc(size_t size); 113 - void io_mem_free(void *ptr); 112 + void *io_pages_map(struct page ***out_pages, unsigned short *npages, 113 + size_t size); 114 + void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, 115 + bool put_pages); 114 116 115 117 enum { 116 118 IO_EVENTFD_OP_SIGNAL_BIT,
+25 -107
io_uring/kbuf.c
··· 32 32 __u16 bid; 33 33 }; 34 34 35 - struct io_buf_free { 36 - struct hlist_node list; 37 - void *mem; 38 - size_t size; 39 - int inuse; 40 - }; 41 - 42 - static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, 43 - unsigned int bgid) 44 - { 45 - return xa_load(&ctx->io_bl_xa, bgid); 46 - } 47 - 48 35 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, 49 36 unsigned int bgid) 50 37 { 51 38 lockdep_assert_held(&ctx->uring_lock); 52 39 53 - return __io_buffer_get_list(ctx, bgid); 40 + return xa_load(&ctx->io_bl_xa, bgid); 54 41 } 55 42 56 43 static int io_buffer_add_list(struct io_ring_ctx *ctx, ··· 178 191 return ret; 179 192 } 180 193 181 - /* 182 - * Mark the given mapped range as free for reuse 183 - */ 184 - static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) 185 - { 186 - struct io_buf_free *ibf; 187 - 188 - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { 189 - if (bl->buf_ring == ibf->mem) { 190 - ibf->inuse = 0; 191 - return; 192 - } 193 - } 194 - 195 - /* can't happen... */ 196 - WARN_ON_ONCE(1); 197 - } 198 - 199 194 static int __io_remove_buffers(struct io_ring_ctx *ctx, 200 195 struct io_buffer_list *bl, unsigned nbufs) 201 196 { ··· 189 220 190 221 if (bl->is_buf_ring) { 191 222 i = bl->buf_ring->tail - bl->head; 192 - if (bl->is_mmap) { 193 - /* 194 - * io_kbuf_list_free() will free the page(s) at 195 - * ->release() time. 
196 - */ 197 - io_kbuf_mark_free(ctx, bl); 198 - bl->buf_ring = NULL; 199 - bl->is_mmap = 0; 200 - } else if (bl->buf_nr_pages) { 223 + if (bl->buf_nr_pages) { 201 224 int j; 202 225 203 - for (j = 0; j < bl->buf_nr_pages; j++) 204 - unpin_user_page(bl->buf_pages[j]); 205 - kvfree(bl->buf_pages); 206 - vunmap(bl->buf_ring); 207 - bl->buf_pages = NULL; 208 - bl->buf_nr_pages = 0; 226 + if (!bl->is_mmap) { 227 + for (j = 0; j < bl->buf_nr_pages; j++) 228 + unpin_user_page(bl->buf_pages[j]); 229 + } 230 + io_pages_unmap(bl->buf_ring, &bl->buf_pages, 231 + &bl->buf_nr_pages, bl->is_mmap); 232 + bl->is_mmap = 0; 209 233 } 210 234 /* make sure it's seen as empty */ 211 235 INIT_LIST_HEAD(&bl->buf_list); ··· 499 537 return ret; 500 538 } 501 539 502 - /* 503 - * See if we have a suitable region that we can reuse, rather than allocate 504 - * both a new io_buf_free and mem region again. We leave it on the list as 505 - * even a reused entry will need freeing at ring release. 506 - */ 507 - static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, 508 - size_t ring_size) 509 - { 510 - struct io_buf_free *ibf, *best = NULL; 511 - size_t best_dist; 512 - 513 - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { 514 - size_t dist; 515 - 516 - if (ibf->inuse || ibf->size < ring_size) 517 - continue; 518 - dist = ibf->size - ring_size; 519 - if (!best || dist < best_dist) { 520 - best = ibf; 521 - if (!dist) 522 - break; 523 - best_dist = dist; 524 - } 525 - } 526 - 527 - return best; 528 - } 529 - 530 540 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, 531 541 struct io_uring_buf_reg *reg, 532 542 struct io_buffer_list *bl) 533 543 { 534 - struct io_buf_free *ibf; 535 544 size_t ring_size; 536 - void *ptr; 537 545 538 546 ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); 539 547 540 - /* Reuse existing entry, if we can */ 541 - ibf = io_lookup_buf_free_entry(ctx, ring_size); 542 - if (!ibf) { 543 - ptr = io_mem_alloc(ring_size); 544 - 
if (IS_ERR(ptr)) 545 - return PTR_ERR(ptr); 548 + bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); 549 + if (!bl->buf_ring) 550 + return -ENOMEM; 546 551 547 - /* Allocate and store deferred free entry */ 548 - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); 549 - if (!ibf) { 550 - io_mem_free(ptr); 551 - return -ENOMEM; 552 - } 553 - ibf->mem = ptr; 554 - ibf->size = ring_size; 555 - hlist_add_head(&ibf->list, &ctx->io_buf_list); 556 - } 557 - ibf->inuse = 1; 558 - bl->buf_ring = ibf->mem; 559 552 bl->is_buf_ring = 1; 560 553 bl->is_mmap = 1; 561 554 return 0; ··· 658 741 return ERR_PTR(-EINVAL); 659 742 } 660 743 661 - /* 662 - * Called at or after ->release(), free the mmap'ed buffers that we used 663 - * for memory mapped provided buffer rings. 664 - */ 665 - void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) 744 + int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) 666 745 { 667 - struct io_buf_free *ibf; 668 - struct hlist_node *tmp; 746 + struct io_ring_ctx *ctx = file->private_data; 747 + loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; 748 + struct io_buffer_list *bl; 749 + int bgid, ret; 669 750 670 - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { 671 - hlist_del(&ibf->list); 672 - io_mem_free(ibf->mem); 673 - kfree(ibf); 674 - } 751 + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; 752 + bl = io_pbuf_get_bl(ctx, bgid); 753 + if (IS_ERR(bl)) 754 + return PTR_ERR(bl); 755 + 756 + ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); 757 + io_put_bl(ctx, bl); 758 + return ret; 675 759 }
+1 -2
io_uring/kbuf.h
··· 55 55 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); 56 56 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); 57 57 58 - void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); 59 - 60 58 void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); 61 59 62 60 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); ··· 62 64 void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); 63 65 struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, 64 66 unsigned long bgid); 67 + int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); 65 68 66 69 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) 67 70 {