Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

iommupt: Avoid rewalking during map

Currently the core code provides a simplified interface to drivers where
it fragments a requested multi-page map into single page size steps after
doing all the calculations to figure out what page size is
appropriate. Each step rewalks the page tables from the start.

Since iommupt has a single implementation of the mapping algorithm it can
internally compute each step as it goes while retaining its current
position in the walk.

Add a new function pt_pgsz_count() which computes the same-page-size
fragment of a large mapping operation.

Compute the next fragment when all the leaf entries of the current
fragment have been written, then continue walking from the current
point.

The function pointer is run through pt_iommu_ops instead of
iommu_domain_ops to discourage using it outside iommupt. All drivers with
their own page tables should continue to use the simplified map_pages()
style interfaces.

Reviewed-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

authored by

Jason Gunthorpe and committed by
Joerg Roedel
d6c65b0f 99fb8afa

+175 -65
+82 -51
drivers/iommu/generic_pt/iommu_pt.h
··· 477 477 pt_oaddr_t oa; 478 478 unsigned int leaf_pgsize_lg2; 479 479 unsigned int leaf_level; 480 + pt_vaddr_t num_leaves; 480 481 }; 481 482 482 483 /* ··· 530 529 static int __map_range_leaf(struct pt_range *range, void *arg, 531 530 unsigned int level, struct pt_table_p *table) 532 531 { 532 + struct pt_iommu *iommu_table = iommu_from_common(range->common); 533 533 struct pt_state pts = pt_init(range, level, table); 534 534 struct pt_iommu_map_args *map = arg; 535 535 unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; 536 536 unsigned int start_index; 537 537 pt_oaddr_t oa = map->oa; 538 + unsigned int num_leaves; 539 + unsigned int orig_end; 540 + pt_vaddr_t last_va; 538 541 unsigned int step; 539 542 bool need_contig; 540 543 int ret = 0; ··· 552 547 553 548 _pt_iter_first(&pts); 554 549 start_index = pts.index; 550 + orig_end = pts.end_index; 551 + if (pts.index + map->num_leaves < pts.end_index) { 552 + /* Need to stop in the middle of the table to change sizes */ 553 + pts.end_index = pts.index + map->num_leaves; 554 + num_leaves = 0; 555 + } else { 556 + num_leaves = map->num_leaves - (pts.end_index - pts.index); 557 + } 558 + 555 559 do { 556 560 pts.type = pt_load_entry_raw(&pts); 557 561 if (pts.type != PT_ENTRY_EMPTY || need_contig) { ··· 586 572 flush_writes_range(&pts, start_index, pts.index); 587 573 588 574 map->oa = oa; 589 - return ret; 575 + map->num_leaves = num_leaves; 576 + if (ret || num_leaves) 577 + return ret; 578 + 579 + /* range->va is not valid if we reached the end of the table */ 580 + pts.index -= step; 581 + pt_index_to_va(&pts); 582 + pts.index += step; 583 + last_va = range->va + log2_to_int(leaf_pgsize_lg2); 584 + 585 + if (last_va - 1 == range->last_va) { 586 + PT_WARN_ON(pts.index != orig_end); 587 + return 0; 588 + } 589 + 590 + /* 591 + * Reached a point where the page size changed, compute the new 592 + * parameters. 
593 + */ 594 + map->leaf_pgsize_lg2 = pt_compute_best_pgsize( 595 + iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa); 596 + map->leaf_level = 597 + pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2); 598 + map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap, 599 + last_va, range->last_va, oa, 600 + map->leaf_pgsize_lg2); 601 + 602 + /* Didn't finish this table level, caller will repeat it */ 603 + if (pts.index != orig_end) { 604 + if (pts.index != start_index) 605 + pt_index_to_va(&pts); 606 + return -EAGAIN; 607 + } 608 + return 0; 590 609 } 591 610 592 611 static int __map_range(struct pt_range *range, void *arg, unsigned int level, ··· 642 595 if (pts.type != PT_ENTRY_EMPTY) 643 596 return -EADDRINUSE; 644 597 ret = pt_iommu_new_table(&pts, &map->attrs); 645 - if (ret) { 646 - /* 647 - * Racing with another thread installing a table 648 - */ 649 - if (ret == -EAGAIN) 650 - continue; 598 + /* EAGAIN on a race will loop again */ 599 + if (ret) 651 600 return ret; 652 - } 653 601 } else { 654 602 pts.table_lower = pt_table_ptr(&pts); 655 603 /* ··· 668 626 * The already present table can possibly be shared with another 669 627 * concurrent map. 670 628 */ 671 - if (map->leaf_level == level - 1) 672 - ret = pt_descend(&pts, arg, __map_range_leaf); 673 - else 674 - ret = pt_descend(&pts, arg, __map_range); 629 + do { 630 + if (map->leaf_level == level - 1) 631 + ret = pt_descend(&pts, arg, __map_range_leaf); 632 + else 633 + ret = pt_descend(&pts, arg, __map_range); 634 + } while (ret == -EAGAIN); 675 635 if (ret) 676 636 return ret; 677 637 ··· 681 637 pt_index_to_va(&pts); 682 638 if (pts.index >= pts.end_index) 683 639 break; 640 + 641 + /* 642 + * This level is currently running __map_range_leaf() which is 643 + * not correct if the target level has been updated to this 644 + * level. Have the caller invoke __map_range_leaf. 
645 + */ 646 + if (map->leaf_level == level) 647 + return -EAGAIN; 684 648 } while (true); 685 649 return 0; 686 650 } ··· 860 808 static int do_map(struct pt_range *range, struct pt_common *common, 861 809 bool single_page, struct pt_iommu_map_args *map) 862 810 { 811 + int ret; 812 + 863 813 /* 864 814 * The __map_single_page() fast path does not support DMA_INCOHERENT 865 815 * flushing to keep its .text small. 866 816 */ 867 817 if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { 868 - int ret; 869 818 870 819 ret = pt_walk_range(range, __map_single_page, map); 871 820 if (ret != -EAGAIN) ··· 874 821 /* EAGAIN falls through to the full path */ 875 822 } 876 823 877 - if (map->leaf_level == range->top_level) 878 - return pt_walk_range(range, __map_range_leaf, map); 879 - return pt_walk_range(range, __map_range, map); 824 + do { 825 + if (map->leaf_level == range->top_level) 826 + ret = pt_walk_range(range, __map_range_leaf, map); 827 + else 828 + ret = pt_walk_range(range, __map_range, map); 829 + } while (ret == -EAGAIN); 830 + return ret; 880 831 } 881 832 882 - /** 883 - * map_pages() - Install translation for an IOVA range 884 - * @domain: Domain to manipulate 885 - * @iova: IO virtual address to start 886 - * @paddr: Physical/Output address to start 887 - * @pgsize: Length of each page 888 - * @pgcount: Length of the range in pgsize units starting from @iova 889 - * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO 890 - * @gfp: GFP flags for any memory allocations 891 - * @mapped: Total bytes successfully mapped 892 - * 893 - * The range starting at IOVA will have paddr installed into it. The caller 894 - * must specify a valid pgsize and pgcount to segment the range into compatible 895 - * blocks. 896 - * 897 - * On error the caller will probably want to invoke unmap on the range from iova 898 - * up to the amount indicated by @mapped to return the table back to an 899 - * unchanged state. 
900 - * 901 - * Context: The caller must hold a write range lock that includes the whole 902 - * range. 903 - * 904 - * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were 905 - * mapped are added to @mapped, @mapped is not zerod first. 906 - */ 907 - int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, 908 - phys_addr_t paddr, size_t pgsize, size_t pgcount, 909 - int prot, gfp_t gfp, size_t *mapped) 833 + static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 834 + phys_addr_t paddr, dma_addr_t len, unsigned int prot, 835 + gfp_t gfp, size_t *mapped) 910 836 { 911 - struct pt_iommu *iommu_table = 912 - container_of(domain, struct pt_iommu, domain); 913 837 pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; 914 838 struct pt_common *common = common_from_iommu(iommu_table); 915 839 struct iommu_iotlb_gather iotlb_gather; 916 - pt_vaddr_t len = pgsize * pgcount; 917 840 struct pt_iommu_map_args map = { 918 841 .iotlb_gather = &iotlb_gather, 919 842 .oa = paddr, 920 - .leaf_pgsize_lg2 = vaffs(pgsize), 921 843 }; 922 844 bool single_page = false; 923 845 struct pt_range range; ··· 920 892 return ret; 921 893 922 894 /* Calculate target page size and level for the leaves */ 923 - if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && 924 - pgcount == 1) { 895 + if (pt_has_system_page_size(common) && len == PAGE_SIZE) { 925 896 PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); 926 897 if (log2_mod(iova | paddr, PAGE_SHIFT)) 927 898 return -ENXIO; 928 899 map.leaf_pgsize_lg2 = PAGE_SHIFT; 929 900 map.leaf_level = 0; 901 + map.num_leaves = 1; 930 902 single_page = true; 931 903 } else { 932 904 map.leaf_pgsize_lg2 = pt_compute_best_pgsize( ··· 935 907 return -ENXIO; 936 908 map.leaf_level = 937 909 pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); 910 + map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va, 911 + range.last_va, paddr, 912 + map.leaf_pgsize_lg2); 938 913 } 939 914 940 915 ret = 
check_map_range(iommu_table, &range, &map); ··· 960 929 *mapped += map.oa - paddr; 961 930 return ret; 962 931 } 963 - EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); 964 932 965 933 struct pt_unmap_args { 966 934 struct iommu_pages_list free_list; ··· 1128 1098 } 1129 1099 1130 1100 static const struct pt_iommu_ops NS(ops) = { 1101 + .map_range = NS(map_range), 1131 1102 .unmap_range = NS(unmap_range), 1132 1103 #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ 1133 1104 IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
+12
drivers/iommu/generic_pt/kunit_generic_pt.h
··· 312 312 } 313 313 } 314 314 315 + static void test_pgsz_count(struct kunit *test) 316 + { 317 + KUNIT_EXPECT_EQ(test, 318 + pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K)), 319 + SZ_1G / SZ_4K); 320 + KUNIT_EXPECT_EQ(test, 321 + pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K, 322 + ilog2(SZ_4K)), 323 + (SZ_2M - SZ_4K) / SZ_4K); 324 + } 325 + 315 326 /* 316 327 * Check that pt_install_table() and pt_table_pa() match 317 328 */ ··· 781 770 KUNIT_CASE_FMT(test_init), 782 771 KUNIT_CASE_FMT(test_bitops), 783 772 KUNIT_CASE_FMT(test_best_pgsize), 773 + KUNIT_CASE_FMT(test_pgsz_count), 784 774 KUNIT_CASE_FMT(test_table_ptr), 785 775 KUNIT_CASE_FMT(test_max_va), 786 776 KUNIT_CASE_FMT(test_table_radix),
+22
drivers/iommu/generic_pt/pt_iter.h
··· 569 569 return pgsz_lg2; 570 570 } 571 571 572 + /* 573 + * Return the number of pgsize_lg2 leaf entries that can be mapped for 574 + * va to oa. This accounts for any requirement to reduce or increase the page 575 + * size across the VA range. 576 + */ 577 + static inline pt_vaddr_t pt_pgsz_count(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, 578 + pt_vaddr_t last_va, pt_oaddr_t oa, 579 + unsigned int pgsize_lg2) 580 + { 581 + pt_vaddr_t len = last_va - va + 1; 582 + pt_vaddr_t next_pgsizes = log2_set_mod(pgsz_bitmap, 0, pgsize_lg2 + 1); 583 + 584 + if (next_pgsizes) { 585 + unsigned int next_pgsize_lg2 = vaffs(next_pgsizes); 586 + 587 + if (log2_mod(va ^ oa, next_pgsize_lg2) == 0) 588 + len = min(len, log2_set_mod_max(va, next_pgsize_lg2) - 589 + va + 1); 590 + } 591 + return log2_div(len, pgsize_lg2); 592 + } 593 + 572 594 #define _PT_MAKE_CALL_LEVEL(fn) \ 573 595 static __always_inline int fn(struct pt_range *range, void *arg, \ 574 596 unsigned int level, \
+31 -8
drivers/iommu/iommu.c
··· 2569 2569 return pgsize; 2570 2570 } 2571 2571 2572 - int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, 2573 - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) 2572 + static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, 2573 + unsigned long iova, phys_addr_t paddr, 2574 + size_t size, int prot, gfp_t gfp) 2574 2575 { 2575 2576 const struct iommu_domain_ops *ops = domain->ops; 2576 2577 unsigned long orig_iova = iova; 2577 2578 unsigned int min_pagesz; 2578 2579 size_t orig_size = size; 2579 - phys_addr_t orig_paddr = paddr; 2580 2580 int ret = 0; 2581 2581 2582 2582 might_sleep_if(gfpflags_allow_blocking(gfp)); ··· 2633 2633 /* unroll mapping in case something went wrong */ 2634 2634 if (ret) { 2635 2635 iommu_unmap(domain, orig_iova, orig_size - size); 2636 - } else { 2637 - trace_map(orig_iova, orig_paddr, orig_size); 2638 - iommu_debug_map(domain, orig_paddr, orig_size); 2636 + return ret; 2639 2637 } 2640 - 2641 - return ret; 2638 + return 0; 2642 2639 } 2643 2640 2644 2641 int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) ··· 2645 2648 if (!ops->iotlb_sync_map) 2646 2649 return 0; 2647 2650 return ops->iotlb_sync_map(domain, iova, size); 2651 + } 2652 + 2653 + int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, 2654 + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) 2655 + { 2656 + struct pt_iommu *pt = iommupt_from_domain(domain); 2657 + int ret; 2658 + 2659 + if (pt) { 2660 + size_t mapped = 0; 2661 + 2662 + ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, 2663 + &mapped); 2664 + if (ret) { 2665 + iommu_unmap(domain, iova, mapped); 2666 + return ret; 2667 + } 2668 + return 0; 2669 + } 2670 + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); 2671 + if (!ret) 2672 + return ret; 2673 + 2674 + trace_map(iova, paddr, size); 2675 + iommu_debug_map(domain, paddr, size); 2676 + return 0; 2648 2677 } 2649 2678 2650 2679 int iommu_map(struct 
iommu_domain *domain, unsigned long iova,
+28 -6
include/linux/generic_pt/iommu.h
··· 88 88 89 89 struct pt_iommu_ops { 90 90 /** 91 + * @map_range: Install translation for an IOVA range 92 + * @iommu_table: Table to manipulate 93 + * @iova: IO virtual address to start 94 + * @paddr: Physical/Output address to start 95 + * @len: Length of the range starting from @iova 96 + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO 97 + * @gfp: GFP flags for any memory allocations 98 + * 99 + * The range starting at IOVA will have paddr installed into it. The 100 + * rage is automatically segmented into optimally sized table entries, 101 + * and can have any valid alignment. 102 + * 103 + * On error the caller will probably want to invoke unmap on the range 104 + * from iova up to the amount indicated by @mapped to return the table 105 + * back to an unchanged state. 106 + * 107 + * Context: The caller must hold a write range lock that includes 108 + * the whole range. 109 + * 110 + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA 111 + * that were mapped are added to @mapped, @mapped is not zerod first. 
112 + */ 113 + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 114 + phys_addr_t paddr, dma_addr_t len, unsigned int prot, 115 + gfp_t gfp, size_t *mapped); 116 + 117 + /** 91 118 * @unmap_range: Make a range of IOVA empty/not present 92 119 * @iommu_table: Table to manipulate 93 120 * @iova: IO virtual address to start ··· 251 224 #define IOMMU_PROTOTYPES(fmt) \ 252 225 phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ 253 226 dma_addr_t iova); \ 254 - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ 255 - unsigned long iova, phys_addr_t paddr, \ 256 - size_t pgsize, size_t pgcount, \ 257 - int prot, gfp_t gfp, size_t *mapped); \ 258 227 int pt_iommu_##fmt##_read_and_clear_dirty( \ 259 228 struct iommu_domain *domain, unsigned long iova, size_t size, \ 260 229 unsigned long flags, struct iommu_dirty_bitmap *dirty); \ ··· 271 248 * iommu_pt 272 249 */ 273 250 #define IOMMU_PT_DOMAIN_OPS(fmt) \ 274 - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ 275 - .map_pages = &pt_iommu_##fmt##_map_pages 251 + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys 276 252 #define IOMMU_PT_DIRTY_OPS(fmt) \ 277 253 .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty 278 254