Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

iommu/amd: Use the generic iommu page table

Replace the io_pgtable implementations of the page tables with their
pt_iommu equivalents. The v2 page table uses the x86 implementation that
will eventually be shared with VT-d.

This supports the same special features as the original code:
- increase_top for the v1 format, to allow scaling from 3 to 6 levels
- non-present cache flushing
- dirty tracking, for v1 only
- __sme_set() to adjust the PTEs for CC (confidential computing)
- optimization of flushing under virtualization to minimize the range
- amd_iommu_pgsize_bitmap override of the native page sizes
- allocation of page tables from the device's NUMA node

Rework the domain ops so that v1 and v2 each get their own ops. Make
dedicated allocation functions for v1 and v2. Hook up invalidation for a
top change through the change_top callback of struct pt_iommu_driver_ops.
Delete some of the io_pgtable-related code that becomes unused in this
patch. The next patch will delete the rest of it.

This fixes a race bug in AMD's increase_address_space() implementation. It
stores the top level and the top pointer in separate memory locations, which
prevents other threads from reading a coherent pair of values:

   increase_address_space()          alloc_pte()
                                     level = pgtable->mode - 1;
   pgtable->root  = pte;
   pgtable->mode += 1;
                                     pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];

The iommupt version is careful to put mode and root under a single
READ_ONCE, and to perform that READ_ONCE only once per walk, so every walk
uses a level and a top pointer that belong together.

Signed-off-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

authored by

Alejandro Jimenez and committed by
Joerg Roedel
789a5913 aef5de75

+282 -276
+4 -1
drivers/iommu/amd/Kconfig
··· 11 11 select MMU_NOTIFIER 12 12 select IOMMU_API 13 13 select IOMMU_IOVA 14 - select IOMMU_IO_PGTABLE 15 14 select IOMMU_SVA 16 15 select IOMMU_IOPF 17 16 select IOMMUFD_DRIVER if IOMMUFD 17 + select GENERIC_PT 18 + select IOMMU_PT 19 + select IOMMU_PT_AMDV1 20 + select IOMMU_PT_X86_64 18 21 depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE 19 22 help 20 23 With this option you can enable support for AMD IOMMU hardware in
-1
drivers/iommu/amd/amd_iommu.h
··· 88 88 * the IOMMU used by this driver. 89 89 */ 90 90 void amd_iommu_flush_all_caches(struct amd_iommu *iommu); 91 - void amd_iommu_update_and_flush_device_table(struct protection_domain *domain); 92 91 void amd_iommu_domain_flush_pages(struct protection_domain *domain, 93 92 u64 address, size_t size); 94 93 void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
+10 -2
drivers/iommu/amd/amd_iommu_types.h
··· 19 19 #include <linux/pci.h> 20 20 #include <linux/irqreturn.h> 21 21 #include <linux/io-pgtable.h> 22 + #include <linux/generic_pt/iommu.h> 22 23 23 24 /* 24 25 * Maximum number of IOMMUs supported ··· 590 589 * independent of their use. 591 590 */ 592 591 struct protection_domain { 592 + union { 593 + struct iommu_domain domain; 594 + struct pt_iommu iommu; 595 + struct pt_iommu_amdv1 amdv1; 596 + struct pt_iommu_x86_64 amdv2; 597 + }; 593 598 struct list_head dev_list; /* List of all devices in this domain */ 594 - struct iommu_domain domain; /* generic domain handle used by 595 - iommu core code */ 596 599 struct amd_io_pgtable iop; 597 600 spinlock_t lock; /* mostly used to lock the page table*/ 598 601 u16 id; /* the domain id written to the device table */ ··· 607 602 struct mmu_notifier mn; /* mmu notifier for the SVA domain */ 608 603 struct list_head dev_data_list; /* List of pdom_dev_data */ 609 604 }; 605 + PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain); 606 + PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain); 607 + PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain); 610 608 611 609 /* 612 610 * This structure contains information about one PCI segment in the system.
-2
drivers/iommu/amd/io_pgtable.c
··· 136 136 pgtable->mode += 1; 137 137 write_seqcount_end(&pgtable->seqcount); 138 138 139 - amd_iommu_update_and_flush_device_table(domain); 140 - 141 139 pte = NULL; 142 140 ret = true; 143 141
+268 -270
drivers/iommu/amd/iommu.c
··· 30 30 #include <linux/msi.h> 31 31 #include <linux/irqdomain.h> 32 32 #include <linux/percpu.h> 33 - #include <linux/io-pgtable.h> 34 33 #include <linux/cc_platform.h> 35 34 #include <asm/irq_remapping.h> 36 35 #include <asm/io_apic.h> ··· 40 41 #include <asm/gart.h> 41 42 #include <asm/dma.h> 42 43 #include <uapi/linux/iommufd.h> 44 + #include <linux/generic_pt/iommu.h> 43 45 44 46 #include "amd_iommu.h" 45 - #include "../dma-iommu.h" 46 47 #include "../irq_remapping.h" 47 48 #include "../iommu-pages.h" 48 49 ··· 59 60 LIST_HEAD(acpihid_map); 60 61 61 62 const struct iommu_ops amd_iommu_ops; 62 - static const struct iommu_dirty_ops amd_dirty_ops; 63 63 64 64 int amd_iommu_max_glx_val = -1; 65 65 ··· 72 74 struct iommu_domain *old); 73 75 74 76 static void set_dte_entry(struct amd_iommu *iommu, 75 - struct iommu_dev_data *dev_data); 77 + struct iommu_dev_data *dev_data, 78 + phys_addr_t top_paddr, unsigned int top_level); 79 + 80 + static void amd_iommu_change_top(struct pt_iommu *iommu_table, 81 + phys_addr_t top_paddr, unsigned int top_level); 76 82 77 83 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); 78 84 79 85 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); 86 + static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); 87 + static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 88 + bool enable); 80 89 81 90 /**************************************************************************** 82 91 * ··· 1761 1756 CMD_INV_IOMMU_ALL_PAGES_ADDRESS); 1762 1757 } 1763 1758 1764 - /* Flush the not present cache if it exists */ 1765 - static void domain_flush_np_cache(struct protection_domain *domain, 1766 - dma_addr_t iova, size_t size) 1767 - { 1768 - if (unlikely(amd_iommu_np_cache)) { 1769 - unsigned long flags; 1770 - 1771 - spin_lock_irqsave(&domain->lock, flags); 1772 - amd_iommu_domain_flush_pages(domain, iova, size); 1773 - spin_unlock_irqrestore(&domain->lock, flags); 1774 
- } 1775 - } 1776 - 1777 - 1778 - /* 1779 - * This function flushes the DTEs for all devices in domain 1780 - */ 1781 - void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) 1782 - { 1783 - struct iommu_dev_data *dev_data; 1784 - 1785 - lockdep_assert_held(&domain->lock); 1786 - 1787 - list_for_each_entry(dev_data, &domain->dev_list, list) { 1788 - struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); 1789 - 1790 - set_dte_entry(iommu, dev_data); 1791 - clone_aliases(iommu, dev_data->dev); 1792 - } 1793 - 1794 - list_for_each_entry(dev_data, &domain->dev_list, list) 1795 - device_flush_dte(dev_data); 1796 - 1797 - domain_flush_complete(domain); 1798 - } 1799 - 1800 1759 int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) 1801 1760 { 1802 1761 struct iommu_dev_data *dev_data; ··· 2020 2051 } 2021 2052 2022 2053 static void set_dte_entry(struct amd_iommu *iommu, 2023 - struct iommu_dev_data *dev_data) 2054 + struct iommu_dev_data *dev_data, 2055 + phys_addr_t top_paddr, unsigned int top_level) 2024 2056 { 2025 2057 u16 domid; 2026 2058 u32 old_domid; ··· 2030 2060 struct protection_domain *domain = dev_data->domain; 2031 2061 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2032 2062 struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2033 - 2034 - if (gcr3_info && gcr3_info->gcr3_tbl) 2035 - domid = dev_data->gcr3_info.domid; 2036 - else 2037 - domid = domain->id; 2063 + struct pt_iommu_amdv1_hw_info pt_info; 2038 2064 2039 2065 make_clear_dte(dev_data, dte, &new); 2040 2066 2041 - if (domain->iop.mode != PAGE_MODE_NONE) 2042 - new.data[0] |= iommu_virt_to_phys(domain->iop.root); 2067 + if (gcr3_info && gcr3_info->gcr3_tbl) 2068 + domid = dev_data->gcr3_info.domid; 2069 + else { 2070 + domid = domain->id; 2043 2071 2044 - new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) 2045 - << DEV_ENTRY_MODE_SHIFT; 2072 + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { 2073 + /* 2074 + * 
When updating the IO pagetable, the new top and level 2075 + * are provided as parameters. For other operations i.e. 2076 + * device attach, retrieve the current pagetable info 2077 + * via the IOMMU PT API. 2078 + */ 2079 + if (top_paddr) { 2080 + pt_info.host_pt_root = top_paddr; 2081 + pt_info.mode = top_level + 1; 2082 + } else { 2083 + WARN_ON(top_paddr || top_level); 2084 + pt_iommu_amdv1_hw_info(&domain->amdv1, 2085 + &pt_info); 2086 + } 2087 + 2088 + new.data[0] |= __sme_set(pt_info.host_pt_root) | 2089 + (pt_info.mode & DEV_ENTRY_MODE_MASK) 2090 + << DEV_ENTRY_MODE_SHIFT; 2091 + } 2092 + } 2046 2093 2047 2094 new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; 2048 2095 ··· 2125 2138 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 2126 2139 2127 2140 if (set) 2128 - set_dte_entry(iommu, dev_data); 2141 + set_dte_entry(iommu, dev_data, 0, 0); 2129 2142 else 2130 2143 clear_dte_entry(iommu, dev_data); 2131 2144 ··· 2143 2156 { 2144 2157 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2145 2158 int max_pasids = dev_data->max_pasids; 2159 + struct pt_iommu_x86_64_hw_info pt_info; 2146 2160 int ret = 0; 2147 2161 2148 2162 /* ··· 2166 2178 if (!pdom_is_v2_pgtbl_mode(pdom)) 2167 2179 return ret; 2168 2180 2169 - ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true); 2181 + pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info); 2182 + ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true); 2170 2183 if (ret) 2171 2184 free_gcr3_table(&dev_data->gcr3_info); 2172 2185 ··· 2489 2500 return domain; 2490 2501 } 2491 2502 2492 - static int pdom_setup_pgtable(struct protection_domain *domain, 2493 - struct device *dev) 2494 - { 2495 - struct io_pgtable_ops *pgtbl_ops; 2496 - enum io_pgtable_fmt fmt; 2497 - 2498 - switch (domain->pd_mode) { 2499 - case PD_MODE_V1: 2500 - fmt = AMD_IOMMU_V1; 2501 - break; 2502 - case PD_MODE_V2: 2503 - fmt = AMD_IOMMU_V2; 2504 - break; 2505 - case PD_MODE_NONE: 2506 - WARN_ON_ONCE(1); 2507 - 
return -EPERM; 2508 - } 2509 - 2510 - domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev); 2511 - pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain); 2512 - if (!pgtbl_ops) 2513 - return -ENOMEM; 2514 - 2515 - return 0; 2516 - } 2517 - 2518 - static inline u64 dma_max_address(enum protection_domain_mode pgtable) 2519 - { 2520 - if (pgtable == PD_MODE_V1) 2521 - return PM_LEVEL_SIZE(amd_iommu_hpt_level); 2522 - 2523 - /* 2524 - * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page 2525 - * Translation" shows that the V2 table sign extends the top of the 2526 - * address space creating a reserved region in the middle of the 2527 - * translation, just like the CPU does. Further Vasant says the docs are 2528 - * incomplete and this only applies to non-zero PASIDs. If the AMDv2 2529 - * page table is assigned to the 0 PASID then there is no sign extension 2530 - * check. 2531 - * 2532 - * Since the IOMMU must have a fixed geometry, and the core code does 2533 - * not understand sign extended addressing, we have to chop off the high 2534 - * bit to get consistent behavior with attachments of the domain to any 2535 - * PASID. 
2536 - */ 2537 - return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1); 2538 - } 2539 - 2540 2503 static bool amd_iommu_hd_support(struct amd_iommu *iommu) 2541 2504 { 2542 2505 if (amd_iommu_hatdis) ··· 2497 2556 return iommu && (iommu->features & FEATURE_HDSUP); 2498 2557 } 2499 2558 2500 - static struct iommu_domain * 2501 - do_iommu_domain_alloc(struct device *dev, u32 flags, 2502 - enum protection_domain_mode pgtable) 2559 + static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) 2503 2560 { 2504 - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 2505 - struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2561 + struct protection_domain *pdom = 2562 + container_of(iommupt, struct protection_domain, iommu); 2563 + 2564 + return &pdom->lock; 2565 + } 2566 + 2567 + /* 2568 + * Update all HW references to the domain with a new pgtable configuration. 2569 + */ 2570 + static void amd_iommu_change_top(struct pt_iommu *iommu_table, 2571 + phys_addr_t top_paddr, unsigned int top_level) 2572 + { 2573 + struct protection_domain *pdom = 2574 + container_of(iommu_table, struct protection_domain, iommu); 2575 + struct iommu_dev_data *dev_data; 2576 + 2577 + lockdep_assert_held(&pdom->lock); 2578 + 2579 + /* Update the DTE for all devices attached to this domain */ 2580 + list_for_each_entry(dev_data, &pdom->dev_list, list) { 2581 + struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); 2582 + 2583 + /* Update the HW references with the new level and top ptr */ 2584 + set_dte_entry(iommu, dev_data, top_paddr, top_level); 2585 + clone_aliases(iommu, dev_data->dev); 2586 + } 2587 + 2588 + list_for_each_entry(dev_data, &pdom->dev_list, list) 2589 + device_flush_dte(dev_data); 2590 + 2591 + domain_flush_complete(pdom); 2592 + } 2593 + 2594 + /* 2595 + * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to 2596 + * present (ie mapping) operations. 
It is a NOP if the IOMMU doesn't have non 2597 + * present caching (like hypervisor shadowing). 2598 + */ 2599 + static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2600 + unsigned long iova, size_t size) 2601 + { 2602 + struct protection_domain *domain = to_pdomain(dom); 2603 + unsigned long flags; 2604 + 2605 + if (likely(!amd_iommu_np_cache)) 2606 + return 0; 2607 + 2608 + spin_lock_irqsave(&domain->lock, flags); 2609 + amd_iommu_domain_flush_pages(domain, iova, size); 2610 + spin_unlock_irqrestore(&domain->lock, flags); 2611 + return 0; 2612 + } 2613 + 2614 + static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 2615 + { 2616 + struct protection_domain *dom = to_pdomain(domain); 2617 + unsigned long flags; 2618 + 2619 + spin_lock_irqsave(&dom->lock, flags); 2620 + amd_iommu_domain_flush_all(dom); 2621 + spin_unlock_irqrestore(&dom->lock, flags); 2622 + } 2623 + 2624 + static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 2625 + struct iommu_iotlb_gather *gather) 2626 + { 2627 + struct protection_domain *dom = to_pdomain(domain); 2628 + unsigned long flags; 2629 + 2630 + spin_lock_irqsave(&dom->lock, flags); 2631 + amd_iommu_domain_flush_pages(dom, gather->start, 2632 + gather->end - gather->start + 1); 2633 + spin_unlock_irqrestore(&dom->lock, flags); 2634 + iommu_put_pages_list(&gather->freelist); 2635 + } 2636 + 2637 + static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { 2638 + .get_top_lock = amd_iommu_get_top_lock, 2639 + .change_top = amd_iommu_change_top, 2640 + }; 2641 + 2642 + static const struct iommu_domain_ops amdv1_ops = { 2643 + IOMMU_PT_DOMAIN_OPS(amdv1), 2644 + .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2645 + .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2646 + .iotlb_sync = amd_iommu_iotlb_sync, 2647 + .attach_dev = amd_iommu_attach_device, 2648 + .free = amd_iommu_domain_free, 2649 + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2650 + }; 2651 + 2652 + static const struct 
iommu_dirty_ops amdv1_dirty_ops = { 2653 + IOMMU_PT_DIRTY_OPS(amdv1), 2654 + .set_dirty_tracking = amd_iommu_set_dirty_tracking, 2655 + }; 2656 + 2657 + static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, 2658 + u32 flags) 2659 + { 2660 + struct pt_iommu_amdv1_cfg cfg = {}; 2506 2661 struct protection_domain *domain; 2507 2662 int ret; 2663 + 2664 + if (amd_iommu_hatdis) 2665 + return ERR_PTR(-EOPNOTSUPP); 2508 2666 2509 2667 domain = protection_domain_alloc(); 2510 2668 if (!domain) 2511 2669 return ERR_PTR(-ENOMEM); 2512 2670 2513 - domain->pd_mode = pgtable; 2514 - ret = pdom_setup_pgtable(domain, dev); 2671 + domain->pd_mode = PD_MODE_V1; 2672 + domain->iommu.driver_ops = &amd_hw_driver_ops_v1; 2673 + domain->iommu.nid = dev_to_node(dev); 2674 + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) 2675 + domain->domain.dirty_ops = &amdv1_dirty_ops; 2676 + 2677 + /* 2678 + * Someday FORCE_COHERENCE should be set by 2679 + * amd_iommu_enforce_cache_coherency() like VT-d does. 2680 + */ 2681 + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | 2682 + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | 2683 + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); 2684 + 2685 + /* 2686 + * AMD's IOMMU can flush as many pages as necessary in a single flush. 2687 + * Unless we run in a virtual machine, which can be inferred according 2688 + * to whether "non-present cache" is on, it is probably best to prefer 2689 + * (potentially) too extensive TLB flushing (i.e., more misses) over 2690 + * multiple TLB flushes (i.e., more flushes). For virtual machines the 2691 + * hypervisor needs to synchronize the host IOMMU PTEs with those of 2692 + * the guest, and the trade-off is different: unnecessary TLB flushes 2693 + * should be avoided. 
2694 + */ 2695 + if (amd_iommu_np_cache) 2696 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2697 + else 2698 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2699 + 2700 + cfg.common.hw_max_vasz_lg2 = 2701 + min(64, (amd_iommu_hpt_level - 1) * 9 + 21); 2702 + cfg.common.hw_max_oasz_lg2 = 52; 2703 + cfg.starting_level = 2; 2704 + domain->domain.ops = &amdv1_ops; 2705 + 2706 + ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); 2515 2707 if (ret) { 2516 - pdom_id_free(domain->id); 2517 - kfree(domain); 2708 + amd_iommu_domain_free(&domain->domain); 2518 2709 return ERR_PTR(ret); 2519 2710 } 2520 2711 2521 - domain->domain.geometry.aperture_start = 0; 2522 - domain->domain.geometry.aperture_end = dma_max_address(pgtable); 2523 - domain->domain.geometry.force_aperture = true; 2524 - domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; 2712 + /* 2713 + * Narrow the supported page sizes to those selected by the kernel 2714 + * command line. 2715 + */ 2716 + domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; 2717 + return &domain->domain; 2718 + } 2525 2719 2526 - domain->domain.type = IOMMU_DOMAIN_UNMANAGED; 2527 - domain->domain.ops = iommu->iommu.ops->default_domain_ops; 2720 + static const struct iommu_domain_ops amdv2_ops = { 2721 + IOMMU_PT_DOMAIN_OPS(x86_64), 2722 + .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2723 + .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2724 + .iotlb_sync = amd_iommu_iotlb_sync, 2725 + .attach_dev = amd_iommu_attach_device, 2726 + .free = amd_iommu_domain_free, 2727 + /* 2728 + * Note the AMDv2 page table format does not support a Force Coherency 2729 + * bit, so enforce_cache_coherency should not be set. However VFIO is 2730 + * not prepared to handle a case where some domains will support 2731 + * enforcement and others do not. VFIO and iommufd will have to be fixed 2732 + * before it can fully use the V2 page table. See the comment in 2733 + * iommufd_hwpt_paging_alloc(). 
For now leave things as they have 2734 + * historically been and lie about enforce_cache_coherencey. 2735 + */ 2736 + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2737 + }; 2528 2738 2529 - if (dirty_tracking) 2530 - domain->domain.dirty_ops = &amd_dirty_ops; 2739 + static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, 2740 + u32 flags) 2741 + { 2742 + struct pt_iommu_x86_64_cfg cfg = {}; 2743 + struct protection_domain *domain; 2744 + int ret; 2531 2745 2746 + if (!amd_iommu_v2_pgtbl_supported()) 2747 + return ERR_PTR(-EOPNOTSUPP); 2748 + 2749 + domain = protection_domain_alloc(); 2750 + if (!domain) 2751 + return ERR_PTR(-ENOMEM); 2752 + 2753 + domain->pd_mode = PD_MODE_V2; 2754 + domain->iommu.nid = dev_to_node(dev); 2755 + 2756 + cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); 2757 + if (amd_iommu_np_cache) 2758 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2759 + else 2760 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2761 + 2762 + /* 2763 + * The v2 table behaves differently if it is attached to PASID 0 vs a 2764 + * non-zero PASID. On PASID 0 it has no sign extension and the full 2765 + * 57/48 bits decode the lower addresses. Otherwise it behaves like a 2766 + * normal sign extended x86 page table. Since we want the domain to work 2767 + * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not 2768 + * set which creates a table that is compatible in both modes. 
2769 + */ 2770 + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) 2771 + cfg.common.hw_max_vasz_lg2 = 56; 2772 + else 2773 + cfg.common.hw_max_vasz_lg2 = 47; 2774 + cfg.common.hw_max_oasz_lg2 = 52; 2775 + domain->domain.ops = &amdv2_ops; 2776 + 2777 + ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); 2778 + if (ret) { 2779 + amd_iommu_domain_free(&domain->domain); 2780 + return ERR_PTR(ret); 2781 + } 2532 2782 return &domain->domain; 2533 2783 } 2534 2784 ··· 2740 2608 /* Allocate domain with v1 page table for dirty tracking */ 2741 2609 if (!amd_iommu_hd_support(iommu)) 2742 2610 break; 2743 - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); 2611 + return amd_iommu_domain_alloc_paging_v1(dev, flags); 2744 2612 case IOMMU_HWPT_ALLOC_PASID: 2745 2613 /* Allocate domain with v2 page table if IOMMU supports PASID. */ 2746 2614 if (!amd_iommu_pasid_supported()) 2747 2615 break; 2748 - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); 2749 - case 0: 2616 + return amd_iommu_domain_alloc_paging_v2(dev, flags); 2617 + case 0: { 2618 + struct iommu_domain *ret; 2619 + 2750 2620 /* If nothing specific is required use the kernel commandline default */ 2751 - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); 2621 + if (amd_iommu_pgtable == PD_MODE_V1) { 2622 + ret = amd_iommu_domain_alloc_paging_v1(dev, flags); 2623 + if (ret != ERR_PTR(-EOPNOTSUPP)) 2624 + return ret; 2625 + return amd_iommu_domain_alloc_paging_v2(dev, flags); 2626 + } 2627 + ret = amd_iommu_domain_alloc_paging_v2(dev, flags); 2628 + if (ret != ERR_PTR(-EOPNOTSUPP)) 2629 + return ret; 2630 + return amd_iommu_domain_alloc_paging_v1(dev, flags); 2631 + } 2752 2632 default: 2753 2633 break; 2754 2634 } ··· 2772 2628 struct protection_domain *domain = to_pdomain(dom); 2773 2629 2774 2630 WARN_ON(!list_empty(&domain->dev_list)); 2775 - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) 2776 - free_io_pgtable_ops(&domain->iop.pgtbl.ops); 2631 + pt_iommu_deinit(&domain->iommu); 2777 2632 
pdom_id_free(domain->id); 2778 2633 kfree(domain); 2779 2634 } ··· 2870 2727 return ret; 2871 2728 } 2872 2729 2873 - static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2874 - unsigned long iova, size_t size) 2875 - { 2876 - struct protection_domain *domain = to_pdomain(dom); 2877 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2878 - 2879 - if (ops->map_pages) 2880 - domain_flush_np_cache(domain, iova, size); 2881 - return 0; 2882 - } 2883 - 2884 - static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, 2885 - phys_addr_t paddr, size_t pgsize, size_t pgcount, 2886 - int iommu_prot, gfp_t gfp, size_t *mapped) 2887 - { 2888 - struct protection_domain *domain = to_pdomain(dom); 2889 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2890 - int prot = 0; 2891 - int ret = -EINVAL; 2892 - 2893 - if ((domain->pd_mode == PD_MODE_V1) && 2894 - (domain->iop.mode == PAGE_MODE_NONE)) 2895 - return -EINVAL; 2896 - 2897 - if (iommu_prot & IOMMU_READ) 2898 - prot |= IOMMU_PROT_IR; 2899 - if (iommu_prot & IOMMU_WRITE) 2900 - prot |= IOMMU_PROT_IW; 2901 - 2902 - if (ops->map_pages) { 2903 - ret = ops->map_pages(ops, iova, paddr, pgsize, 2904 - pgcount, prot, gfp, mapped); 2905 - } 2906 - 2907 - return ret; 2908 - } 2909 - 2910 - static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, 2911 - struct iommu_iotlb_gather *gather, 2912 - unsigned long iova, size_t size) 2913 - { 2914 - /* 2915 - * AMD's IOMMU can flush as many pages as necessary in a single flush. 2916 - * Unless we run in a virtual machine, which can be inferred according 2917 - * to whether "non-present cache" is on, it is probably best to prefer 2918 - * (potentially) too extensive TLB flushing (i.e., more misses) over 2919 - * mutliple TLB flushes (i.e., more flushes). 
For virtual machines the 2920 - * hypervisor needs to synchronize the host IOMMU PTEs with those of 2921 - * the guest, and the trade-off is different: unnecessary TLB flushes 2922 - * should be avoided. 2923 - */ 2924 - if (amd_iommu_np_cache && 2925 - iommu_iotlb_gather_is_disjoint(gather, iova, size)) 2926 - iommu_iotlb_sync(domain, gather); 2927 - 2928 - iommu_iotlb_gather_add_range(gather, iova, size); 2929 - } 2930 - 2931 - static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova, 2932 - size_t pgsize, size_t pgcount, 2933 - struct iommu_iotlb_gather *gather) 2934 - { 2935 - struct protection_domain *domain = to_pdomain(dom); 2936 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2937 - size_t r; 2938 - 2939 - if ((domain->pd_mode == PD_MODE_V1) && 2940 - (domain->iop.mode == PAGE_MODE_NONE)) 2941 - return 0; 2942 - 2943 - r = (ops->unmap_pages) ? ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0; 2944 - 2945 - if (r) 2946 - amd_iommu_iotlb_gather_add_page(dom, gather, iova, r); 2947 - 2948 - return r; 2949 - } 2950 - 2951 - static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2952 - dma_addr_t iova) 2953 - { 2954 - struct protection_domain *domain = to_pdomain(dom); 2955 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2956 - 2957 - return ops->iova_to_phys(ops, iova); 2958 - } 2959 - 2960 2730 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) 2961 2731 { 2962 2732 switch (cap) { ··· 2934 2878 spin_unlock_irqrestore(&pdomain->lock, flags); 2935 2879 2936 2880 return 0; 2937 - } 2938 - 2939 - static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, 2940 - unsigned long iova, size_t size, 2941 - unsigned long flags, 2942 - struct iommu_dirty_bitmap *dirty) 2943 - { 2944 - struct protection_domain *pdomain = to_pdomain(domain); 2945 - struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops; 2946 - unsigned long lflags; 2947 - 2948 - if (!ops || !ops->read_and_clear_dirty) 2949 
- return -EOPNOTSUPP; 2950 - 2951 - spin_lock_irqsave(&pdomain->lock, lflags); 2952 - if (!pdomain->dirty_tracking && dirty->bitmap) { 2953 - spin_unlock_irqrestore(&pdomain->lock, lflags); 2954 - return -EINVAL; 2955 - } 2956 - spin_unlock_irqrestore(&pdomain->lock, lflags); 2957 - 2958 - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); 2959 2881 } 2960 2882 2961 2883 static void amd_iommu_get_resv_regions(struct device *dev, ··· 3005 2971 return dev_data->defer_attach; 3006 2972 } 3007 2973 3008 - static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 3009 - { 3010 - struct protection_domain *dom = to_pdomain(domain); 3011 - unsigned long flags; 3012 - 3013 - spin_lock_irqsave(&dom->lock, flags); 3014 - amd_iommu_domain_flush_all(dom); 3015 - spin_unlock_irqrestore(&dom->lock, flags); 3016 - } 3017 - 3018 - static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 3019 - struct iommu_iotlb_gather *gather) 3020 - { 3021 - struct protection_domain *dom = to_pdomain(domain); 3022 - unsigned long flags; 3023 - 3024 - spin_lock_irqsave(&dom->lock, flags); 3025 - amd_iommu_domain_flush_pages(dom, gather->start, 3026 - gather->end - gather->start + 1); 3027 - spin_unlock_irqrestore(&dom->lock, flags); 3028 - } 3029 - 3030 2974 static int amd_iommu_def_domain_type(struct device *dev) 3031 2975 { 3032 2976 struct iommu_dev_data *dev_data; ··· 3039 3027 return true; 3040 3028 } 3041 3029 3042 - static const struct iommu_dirty_ops amd_dirty_ops = { 3043 - .set_dirty_tracking = amd_iommu_set_dirty_tracking, 3044 - .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, 3045 - }; 3046 - 3047 3030 const struct iommu_ops amd_iommu_ops = { 3048 3031 .capable = amd_iommu_capable, 3049 3032 .blocked_domain = &blocked_domain, ··· 3053 3046 .is_attach_deferred = amd_iommu_is_attach_deferred, 3054 3047 .def_domain_type = amd_iommu_def_domain_type, 3055 3048 .page_response = amd_iommu_page_response, 3056 - .default_domain_ops = &(const struct 
iommu_domain_ops) { 3057 - .attach_dev = amd_iommu_attach_device, 3058 - .map_pages = amd_iommu_map_pages, 3059 - .unmap_pages = amd_iommu_unmap_pages, 3060 - .iotlb_sync_map = amd_iommu_iotlb_sync_map, 3061 - .iova_to_phys = amd_iommu_iova_to_phys, 3062 - .flush_iotlb_all = amd_iommu_flush_iotlb_all, 3063 - .iotlb_sync = amd_iommu_iotlb_sync, 3064 - .free = amd_iommu_domain_free, 3065 - .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 3066 - } 3067 3049 }; 3068 3050 3069 3051 #ifdef CONFIG_IRQ_REMAP ··· 4061 4065 return 0; 4062 4066 } 4063 4067 #endif 4068 + 4069 + MODULE_IMPORT_NS("GENERIC_PT_IOMMU");