Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

io_uring: get rid of remap_pfn_range() for mapping rings/sqes

Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

If possible, allocate a single compound page that covers the range that
is needed. If that works, then we can just use page_address() on that
page. If we fail to get a compound page, allocate single pages and use
vmap() to map them into the kernel virtual address space.

This just covers the rings/sqes; the other remaining mmap user of
remap_pfn_range() will be converted separately. Once that is done,
we can kill the old alloc/free code.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

+133 -8
+131 -8
io_uring/io_uring.c
··· 2599 2599 return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; 2600 2600 } 2601 2601 2602 + static void io_pages_unmap(void *ptr, struct page ***pages, 2603 + unsigned short *npages) 2604 + { 2605 + bool do_vunmap = false; 2606 + 2607 + if (!ptr) 2608 + return; 2609 + 2610 + if (*npages) { 2611 + struct page **to_free = *pages; 2612 + int i; 2613 + 2614 + /* 2615 + * Only did vmap for the non-compound multiple page case. 2616 + * For the compound page, we just need to put the head. 2617 + */ 2618 + if (PageCompound(to_free[0])) 2619 + *npages = 1; 2620 + else if (*npages > 1) 2621 + do_vunmap = true; 2622 + for (i = 0; i < *npages; i++) 2623 + put_page(to_free[i]); 2624 + } 2625 + if (do_vunmap) 2626 + vunmap(ptr); 2627 + kvfree(*pages); 2628 + *pages = NULL; 2629 + *npages = 0; 2630 + } 2631 + 2602 2632 void io_mem_free(void *ptr) 2603 2633 { 2604 2634 if (!ptr) ··· 2729 2699 static void io_rings_free(struct io_ring_ctx *ctx) 2730 2700 { 2731 2701 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { 2732 - io_mem_free(ctx->rings); 2733 - io_mem_free(ctx->sq_sqes); 2702 + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); 2703 + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); 2734 2704 } else { 2735 2705 io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); 2736 2706 ctx->n_ring_pages = 0; ··· 2740 2710 2741 2711 ctx->rings = NULL; 2742 2712 ctx->sq_sqes = NULL; 2713 + } 2714 + 2715 + static void *io_mem_alloc_compound(struct page **pages, int nr_pages, 2716 + size_t size, gfp_t gfp) 2717 + { 2718 + struct page *page; 2719 + int i, order; 2720 + 2721 + order = get_order(size); 2722 + if (order > MAX_PAGE_ORDER) 2723 + return ERR_PTR(-ENOMEM); 2724 + else if (order) 2725 + gfp |= __GFP_COMP; 2726 + 2727 + page = alloc_pages(gfp, order); 2728 + if (!page) 2729 + return ERR_PTR(-ENOMEM); 2730 + 2731 + for (i = 0; i < nr_pages; i++) 2732 + pages[i] = page + i; 2733 + 2734 + return page_address(page); 2735 + } 2736 + 2737 + 
static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, 2738 + gfp_t gfp) 2739 + { 2740 + void *ret; 2741 + int i; 2742 + 2743 + for (i = 0; i < nr_pages; i++) { 2744 + pages[i] = alloc_page(gfp); 2745 + if (!pages[i]) 2746 + goto err; 2747 + } 2748 + 2749 + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); 2750 + if (ret) 2751 + return ret; 2752 + err: 2753 + while (i--) 2754 + put_page(pages[i]); 2755 + return ERR_PTR(-ENOMEM); 2756 + } 2757 + 2758 + static void *io_pages_map(struct page ***out_pages, unsigned short *npages, 2759 + size_t size) 2760 + { 2761 + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; 2762 + struct page **pages; 2763 + int nr_pages; 2764 + void *ret; 2765 + 2766 + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2767 + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); 2768 + if (!pages) 2769 + return ERR_PTR(-ENOMEM); 2770 + 2771 + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); 2772 + if (!IS_ERR(ret)) 2773 + goto done; 2774 + 2775 + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); 2776 + if (!IS_ERR(ret)) { 2777 + done: 2778 + *out_pages = pages; 2779 + *npages = nr_pages; 2780 + return ret; 2781 + } 2782 + 2783 + kvfree(pages); 2784 + *out_pages = NULL; 2785 + *npages = 0; 2786 + return ret; 2743 2787 } 2744 2788 2745 2789 void *io_mem_alloc(size_t size) ··· 3402 3298 /* Don't allow mmap if the ring was setup without it */ 3403 3299 if (ctx->flags & IORING_SETUP_NO_MMAP) 3404 3300 return ERR_PTR(-EINVAL); 3405 - ptr = ctx->rings; 3406 - break; 3301 + return ctx->rings; 3407 3302 case IORING_OFF_SQES: 3408 3303 /* Don't allow mmap if the ring was setup without it */ 3409 3304 if (ctx->flags & IORING_SETUP_NO_MMAP) 3410 3305 return ERR_PTR(-EINVAL); 3411 - ptr = ctx->sq_sqes; 3412 - break; 3306 + return ctx->sq_sqes; 3413 3307 case IORING_OFF_PBUF_RING: { 3414 3308 struct io_buffer_list *bl; 3415 3309 unsigned int bgid; ··· 3431 3329 return ptr; 3432 3330 } 3433 3331 3332 + 
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, 3333 + struct page **pages, int npages) 3334 + { 3335 + unsigned long nr_pages = npages; 3336 + 3337 + vm_flags_set(vma, VM_DONTEXPAND); 3338 + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); 3339 + } 3340 + 3434 3341 #ifdef CONFIG_MMU 3435 3342 3436 3343 static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) 3437 3344 { 3345 + struct io_ring_ctx *ctx = file->private_data; 3438 3346 size_t sz = vma->vm_end - vma->vm_start; 3347 + long offset = vma->vm_pgoff << PAGE_SHIFT; 3439 3348 unsigned long pfn; 3440 3349 void *ptr; 3441 3350 3442 3351 ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); 3443 3352 if (IS_ERR(ptr)) 3444 3353 return PTR_ERR(ptr); 3354 + 3355 + switch (offset & IORING_OFF_MMAP_MASK) { 3356 + case IORING_OFF_SQ_RING: 3357 + case IORING_OFF_CQ_RING: 3358 + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, 3359 + ctx->n_ring_pages); 3360 + case IORING_OFF_SQES: 3361 + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, 3362 + ctx->n_sqe_pages); 3363 + } 3445 3364 3446 3365 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 3447 3366 return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); ··· 3753 3630 return -EOVERFLOW; 3754 3631 3755 3632 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3756 - rings = io_mem_alloc(size); 3633 + rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); 3757 3634 else 3758 3635 rings = io_rings_map(ctx, p->cq_off.user_addr, size); 3759 3636 ··· 3778 3655 } 3779 3656 3780 3657 if (!(ctx->flags & IORING_SETUP_NO_MMAP)) 3781 - ptr = io_mem_alloc(size); 3658 + ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); 3782 3659 else 3783 3660 ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); 3784 3661
+2
io_uring/io_uring.h
··· 70 70 void __io_commit_cqring_flush(struct io_ring_ctx *ctx); 71 71 72 72 struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); 73 + int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, 74 + struct page **pages, int npages); 73 75 74 76 struct file *io_file_get_normal(struct io_kiocb *req, int fd); 75 77 struct file *io_file_get_fixed(struct io_kiocb *req, int fd,