iommupt: Avoid rewalking during map · tjh.dev/kernel@d6c65b0

+82 -51

drivers/iommu/generic_pt/iommu_pt.h

··· 477 477 pt_oaddr_t oa; 478 478 unsigned int leaf_pgsize_lg2; 479 479 unsigned int leaf_level; 480 + pt_vaddr_t num_leaves; 480 481 }; 481 482 482 483 /* ··· 530 529 static int __map_range_leaf(struct pt_range *range, void *arg, 531 530 unsigned int level, struct pt_table_p *table) 532 531 { 532 + struct pt_iommu *iommu_table = iommu_from_common(range->common); 533 533 struct pt_state pts = pt_init(range, level, table); 534 534 struct pt_iommu_map_args *map = arg; 535 535 unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; 536 536 unsigned int start_index; 537 537 pt_oaddr_t oa = map->oa; 538 + unsigned int num_leaves; 539 + unsigned int orig_end; 540 + pt_vaddr_t last_va; 538 541 unsigned int step; 539 542 bool need_contig; 540 543 int ret = 0; ··· 552 547 553 548 _pt_iter_first(&pts); 554 549 start_index = pts.index; 550 + orig_end = pts.end_index; 551 + if (pts.index + map->num_leaves < pts.end_index) { 552 + /* Need to stop in the middle of the table to change sizes */ 553 + pts.end_index = pts.index + map->num_leaves; 554 + num_leaves = 0; 555 + } else { 556 + num_leaves = map->num_leaves - (pts.end_index - pts.index); 557 + } 558 + 555 559 do { 556 560 pts.type = pt_load_entry_raw(&pts); 557 561 if (pts.type != PT_ENTRY_EMPTY || need_contig) { ··· 586 572 flush_writes_range(&pts, start_index, pts.index); 587 573 588 574 map->oa = oa; 589 - return ret; 575 + map->num_leaves = num_leaves; 576 + if (ret || num_leaves) 577 + return ret; 578 + 579 + /* range->va is not valid if we reached the end of the table */ 580 + pts.index -= step; 581 + pt_index_to_va(&pts); 582 + pts.index += step; 583 + last_va = range->va + log2_to_int(leaf_pgsize_lg2); 584 + 585 + if (last_va - 1 == range->last_va) { 586 + PT_WARN_ON(pts.index != orig_end); 587 + return 0; 588 + } 589 + 590 + /* 591 + * Reached a point where the page size changed, compute the new 592 + * parameters. 
593 + */ 594 + map->leaf_pgsize_lg2 = pt_compute_best_pgsize( 595 + iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa); 596 + map->leaf_level = 597 + pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2); 598 + map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap, 599 + last_va, range->last_va, oa, 600 + map->leaf_pgsize_lg2); 601 + 602 + /* Didn't finish this table level, caller will repeat it */ 603 + if (pts.index != orig_end) { 604 + if (pts.index != start_index) 605 + pt_index_to_va(&pts); 606 + return -EAGAIN; 607 + } 608 + return 0; 590 609 } 591 610 592 611 static int __map_range(struct pt_range *range, void *arg, unsigned int level, ··· 642 595 if (pts.type != PT_ENTRY_EMPTY) 643 596 return -EADDRINUSE; 644 597 ret = pt_iommu_new_table(&pts, &map->attrs); 645 - if (ret) { 646 - /* 647 - * Racing with another thread installing a table 648 - */ 649 - if (ret == -EAGAIN) 650 - continue; 598 + /* EAGAIN on a race will loop again */ 599 + if (ret) 651 600 return ret; 652 - } 653 601 } else { 654 602 pts.table_lower = pt_table_ptr(&pts); 655 603 /* ··· 668 626 * The already present table can possibly be shared with another 669 627 * concurrent map. 670 628 */ 671 - if (map->leaf_level == level - 1) 672 - ret = pt_descend(&pts, arg, __map_range_leaf); 673 - else 674 - ret = pt_descend(&pts, arg, __map_range); 629 + do { 630 + if (map->leaf_level == level - 1) 631 + ret = pt_descend(&pts, arg, __map_range_leaf); 632 + else 633 + ret = pt_descend(&pts, arg, __map_range); 634 + } while (ret == -EAGAIN); 675 635 if (ret) 676 636 return ret; 677 637 ··· 681 637 pt_index_to_va(&pts); 682 638 if (pts.index >= pts.end_index) 683 639 break; 640 + 641 + /* 642 + * This level is currently running __map_range() which is 643 + * not correct if the target level has been updated to this 644 + * level. Have the caller invoke __map_range_leaf() instead. 
645 + */ 646 + if (map->leaf_level == level) 647 + return -EAGAIN; 684 648 } while (true); 685 649 return 0; 686 650 } ··· 860 808 static int do_map(struct pt_range *range, struct pt_common *common, 861 809 bool single_page, struct pt_iommu_map_args *map) 862 810 { 811 + int ret; 812 + 863 813 /* 864 814 * The __map_single_page() fast path does not support DMA_INCOHERENT 865 815 * flushing to keep its .text small. 866 816 */ 867 817 if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { 868 - int ret; 869 818 870 819 ret = pt_walk_range(range, __map_single_page, map); 871 820 if (ret != -EAGAIN) ··· 874 821 /* EAGAIN falls through to the full path */ 875 822 } 876 823 877 - if (map->leaf_level == range->top_level) 878 - return pt_walk_range(range, __map_range_leaf, map); 879 - return pt_walk_range(range, __map_range, map); 824 + do { 825 + if (map->leaf_level == range->top_level) 826 + ret = pt_walk_range(range, __map_range_leaf, map); 827 + else 828 + ret = pt_walk_range(range, __map_range, map); 829 + } while (ret == -EAGAIN); 830 + return ret; 880 831 } 881 832 882 - /** 883 - * map_pages() - Install translation for an IOVA range 884 - * @domain: Domain to manipulate 885 - * @iova: IO virtual address to start 886 - * @paddr: Physical/Output address to start 887 - * @pgsize: Length of each page 888 - * @pgcount: Length of the range in pgsize units starting from @iova 889 - * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO 890 - * @gfp: GFP flags for any memory allocations 891 - * @mapped: Total bytes successfully mapped 892 - * 893 - * The range starting at IOVA will have paddr installed into it. The caller 894 - * must specify a valid pgsize and pgcount to segment the range into compatible 895 - * blocks. 896 - * 897 - * On error the caller will probably want to invoke unmap on the range from iova 898 - * up to the amount indicated by @mapped to return the table back to an 899 - * unchanged state. 
900 - * 901 - * Context: The caller must hold a write range lock that includes the whole 902 - * range. 903 - * 904 - * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were 905 - * mapped are added to @mapped, @mapped is not zerod first. 906 - */ 907 - int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, 908 - phys_addr_t paddr, size_t pgsize, size_t pgcount, 909 - int prot, gfp_t gfp, size_t *mapped) 833 + static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 834 + phys_addr_t paddr, dma_addr_t len, unsigned int prot, 835 + gfp_t gfp, size_t *mapped) 910 836 { 911 - struct pt_iommu *iommu_table = 912 - container_of(domain, struct pt_iommu, domain); 913 837 pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; 914 838 struct pt_common *common = common_from_iommu(iommu_table); 915 839 struct iommu_iotlb_gather iotlb_gather; 916 - pt_vaddr_t len = pgsize * pgcount; 917 840 struct pt_iommu_map_args map = { 918 841 .iotlb_gather = &iotlb_gather, 919 842 .oa = paddr, 920 - .leaf_pgsize_lg2 = vaffs(pgsize), 921 843 }; 922 844 bool single_page = false; 923 845 struct pt_range range; ··· 920 892 return ret; 921 893 922 894 /* Calculate target page size and level for the leaves */ 923 - if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && 924 - pgcount == 1) { 895 + if (pt_has_system_page_size(common) && len == PAGE_SIZE) { 925 896 PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); 926 897 if (log2_mod(iova | paddr, PAGE_SHIFT)) 927 898 return -ENXIO; 928 899 map.leaf_pgsize_lg2 = PAGE_SHIFT; 929 900 map.leaf_level = 0; 901 + map.num_leaves = 1; 930 902 single_page = true; 931 903 } else { 932 904 map.leaf_pgsize_lg2 = pt_compute_best_pgsize( ··· 935 907 return -ENXIO; 936 908 map.leaf_level = 937 909 pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); 910 + map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va, 911 + range.last_va, paddr, 912 + map.leaf_pgsize_lg2); 938 913 } 939 914 940 915 ret = 
check_map_range(iommu_table, &range, &map); ··· 960 929 *mapped += map.oa - paddr; 961 930 return ret; 962 931 } 963 - EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); 964 932 965 933 struct pt_unmap_args { 966 934 struct iommu_pages_list free_list; ··· 1128 1098 } 1129 1099 1130 1100 static const struct pt_iommu_ops NS(ops) = { 1101 + .map_range = NS(map_range), 1131 1102 .unmap_range = NS(unmap_range), 1132 1103 #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ 1133 1104 IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)

+12

drivers/iommu/generic_pt/kunit_generic_pt.h

··· 312 312 } 313 313 } 314 314 315 + static void test_pgsz_count(struct kunit *test) 316 + { 317 + KUNIT_EXPECT_EQ(test, 318 + pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K)), 319 + SZ_1G / SZ_4K); 320 + KUNIT_EXPECT_EQ(test, 321 + pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K, 322 + ilog2(SZ_4K)), 323 + (SZ_2M - SZ_4K) / SZ_4K); 324 + } 325 + 315 326 /* 316 327 * Check that pt_install_table() and pt_table_pa() match 317 328 */ ··· 781 770 KUNIT_CASE_FMT(test_init), 782 771 KUNIT_CASE_FMT(test_bitops), 783 772 KUNIT_CASE_FMT(test_best_pgsize), 773 + KUNIT_CASE_FMT(test_pgsz_count), 784 774 KUNIT_CASE_FMT(test_table_ptr), 785 775 KUNIT_CASE_FMT(test_max_va), 786 776 KUNIT_CASE_FMT(test_table_radix),

+22

drivers/iommu/generic_pt/pt_iter.h

··· 569 569 return pgsz_lg2; 570 570 } 571 571 572 + /* 573 + * Return the number of pgsize_lg2 leaf entries that can be mapped for 574 + * va to oa. This accounts for any requirement to reduce or increase the page 575 + * size across the VA range. 576 + */ 577 + static inline pt_vaddr_t pt_pgsz_count(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, 578 + pt_vaddr_t last_va, pt_oaddr_t oa, 579 + unsigned int pgsize_lg2) 580 + { 581 + pt_vaddr_t len = last_va - va + 1; 582 + pt_vaddr_t next_pgsizes = log2_set_mod(pgsz_bitmap, 0, pgsize_lg2 + 1); 583 + 584 + if (next_pgsizes) { 585 + unsigned int next_pgsize_lg2 = vaffs(next_pgsizes); 586 + 587 + if (log2_mod(va ^ oa, next_pgsize_lg2) == 0) 588 + len = min(len, log2_set_mod_max(va, next_pgsize_lg2) - 589 + va + 1); 590 + } 591 + return log2_div(len, pgsize_lg2); 592 + } 593 + 572 594 #define _PT_MAKE_CALL_LEVEL(fn) \ 573 595 static __always_inline int fn(struct pt_range *range, void *arg, \ 574 596 unsigned int level, \

+31 -8

drivers/iommu/iommu.c

··· 2569 2569 return pgsize; 2570 2570 } 2571 2571 2572 - int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, 2573 - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) 2572 + static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, 2573 + unsigned long iova, phys_addr_t paddr, 2574 + size_t size, int prot, gfp_t gfp) 2574 2575 { 2575 2576 const struct iommu_domain_ops *ops = domain->ops; 2576 2577 unsigned long orig_iova = iova; 2577 2578 unsigned int min_pagesz; 2578 2579 size_t orig_size = size; 2579 - phys_addr_t orig_paddr = paddr; 2580 2580 int ret = 0; 2581 2581 2582 2582 might_sleep_if(gfpflags_allow_blocking(gfp)); ··· 2633 2633 /* unroll mapping in case something went wrong */ 2634 2634 if (ret) { 2635 2635 iommu_unmap(domain, orig_iova, orig_size - size); 2636 - } else { 2637 - trace_map(orig_iova, orig_paddr, orig_size); 2638 - iommu_debug_map(domain, orig_paddr, orig_size); 2636 + return ret; 2639 2637 } 2640 - 2641 - return ret; 2638 + return 0; 2642 2639 } 2643 2640 2644 2641 int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) ··· 2645 2648 if (!ops->iotlb_sync_map) 2646 2649 return 0; 2647 2650 return ops->iotlb_sync_map(domain, iova, size); 2651 + } 2652 + 2653 + int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, 2654 + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) 2655 + { 2656 + struct pt_iommu *pt = iommupt_from_domain(domain); 2657 + int ret; 2658 + 2659 + if (pt) { 2660 + size_t mapped = 0; 2661 + 2662 + ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, 2663 + &mapped); 2664 + if (ret) { 2665 + iommu_unmap(domain, iova, mapped); 2666 + return ret; 2667 + } 2668 + return 0; 2669 + } 2670 + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); 2671 + if (!ret) 2672 + return ret; 2673 + 2674 + trace_map(iova, paddr, size); 2675 + iommu_debug_map(domain, paddr, size); 2676 + return 0; 2648 2677 } 2649 2678 2650 2679 int iommu_map(struct 
iommu_domain *domain, unsigned long iova,

+28 -6

include/linux/generic_pt/iommu.h

··· 88 88 89 89 struct pt_iommu_ops { 90 90 /** 91 + * @map_range: Install translation for an IOVA range 92 + * @iommu_table: Table to manipulate 93 + * @iova: IO virtual address to start 94 + * @paddr: Physical/Output address to start 95 + * @len: Length of the range starting from @iova 96 + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO 97 + * @gfp: GFP flags for any memory allocations 98 + * 99 + * The range starting at IOVA will have paddr installed into it. The 100 + * range is automatically segmented into optimally sized table entries, 101 + * and can have any valid alignment. 102 + * 103 + * On error the caller will probably want to invoke unmap on the range 104 + * from iova up to the amount indicated by @mapped to return the table 105 + * back to an unchanged state. 106 + * 107 + * Context: The caller must hold a write range lock that includes 108 + * the whole range. 109 + * 110 + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA 111 + * that were mapped is added to @mapped, @mapped is not zeroed first. 
112 + */ 113 + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 114 + phys_addr_t paddr, dma_addr_t len, unsigned int prot, 115 + gfp_t gfp, size_t *mapped); 116 + 117 + /** 91 118 * @unmap_range: Make a range of IOVA empty/not present 92 119 * @iommu_table: Table to manipulate 93 120 * @iova: IO virtual address to start ··· 251 224 #define IOMMU_PROTOTYPES(fmt) \ 252 225 phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ 253 226 dma_addr_t iova); \ 254 - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ 255 - unsigned long iova, phys_addr_t paddr, \ 256 - size_t pgsize, size_t pgcount, \ 257 - int prot, gfp_t gfp, size_t *mapped); \ 258 227 int pt_iommu_##fmt##_read_and_clear_dirty( \ 259 228 struct iommu_domain *domain, unsigned long iova, size_t size, \ 260 229 unsigned long flags, struct iommu_dirty_bitmap *dirty); \ ··· 271 248 * iommu_pt 272 249 */ 273 250 #define IOMMU_PT_DOMAIN_OPS(fmt) \ 274 - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ 275 - .map_pages = &pt_iommu_##fmt##_map_pages 251 + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys 276 252 #define IOMMU_PT_DIRTY_OPS(fmt) \ 277 253 .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty 278 254

Configure Feed

Configure Feed