Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'iommu-updates-v7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux

Pull iommu updates from Joerg Roedel:
"Core changes:
- Rust bindings for IO-pgtable code
- IOMMU page allocation debugging support
- Disable ATS during PCI resets

Intel VT-d changes:
- Skip dev-iotlb flush for inaccessible PCIe device
- Flush cache for PASID table before using it
- Use right invalidation method for SVA and NESTED domains
- Ensure atomicity in context and PASID entry updates

AMD-Vi changes:
- Support for nested translations
- Other minor improvements

ARM-SMMU-v2 changes:
- Configure SoC-specific prefetcher settings for Qualcomm's "MDSS"

ARM-SMMU-v3 changes:
- Improve CMDQ locking fairness for pathetically small queue sizes
- Remove tracking of the IAS as this is only relevant for AArch32 and
was causing C_BAD_STE errors
- Add device-tree support for NVIDIA's CMDQV extension
- Allow some hitless transitions for the 'MEV' and 'EATS' STE fields
- Don't disable ATS for S1-bypass nested domains
- Additions to the kunit selftests"

* tag 'iommu-updates-v7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux: (54 commits)
iommupt: Always add IOVA range to iotlb_gather in gather_range_pages()
iommu/amd: serialize sequence allocation under concurrent TLB invalidations
iommu/amd: Fix type of type parameter to amd_iommufd_hw_info()
iommu/arm-smmu-v3: Do not set disable_ats unless vSTE is Translate
iommu/arm-smmu-v3-test: Add nested s1bypass/s1dssbypass coverage
iommu/arm-smmu-v3: Mark EATS_TRANS safe when computing the update sequence
iommu/arm-smmu-v3: Mark STE MEV safe when computing the update sequence
iommu/arm-smmu-v3: Add update_safe bits to fix STE update sequence
iommu/arm-smmu-v3: Add device-tree support for CMDQV driver
iommu/tegra241-cmdqv: Decouple driver from ACPI
iommu/arm-smmu-qcom: Restore ACTLR settings for MDSS on sa8775p
iommu/vt-d: Fix race condition during PASID entry replacement
iommu/vt-d: Clear Present bit before tearing down context entry
iommu/vt-d: Clear Present bit before tearing down PASID entry
iommu/vt-d: Flush piotlb for SVM and Nested domain
iommu/vt-d: Flush cache for PASID table before using it
iommu/vt-d: Flush dev-IOTLB only when PCIe device is accessible in scalable mode
iommu/vt-d: Skip dev-iotlb flush for inaccessible PCIe device without scalable mode
rust: iommu: fix `srctree` link warning
rust: iommu: fix Rust formatting
...

+2000 -502
+9
Documentation/admin-guide/kernel-parameters.txt
··· 2680 2680 1 - Bypass the IOMMU for DMA. 2681 2681 unset - Use value of CONFIG_IOMMU_DEFAULT_PASSTHROUGH. 2682 2682 2683 + iommu.debug_pagealloc= 2684 + [KNL,EARLY] When CONFIG_IOMMU_DEBUG_PAGEALLOC is set, this 2685 + parameter enables the feature at boot time. By default, it 2686 + is disabled and the system behaves the same way as a kernel 2687 + built without CONFIG_IOMMU_DEBUG_PAGEALLOC. 2688 + Format: { "0" | "1" } 2689 + 0 - Sanitizer disabled. 2690 + 1 - Sanitizer enabled, expect runtime overhead. 2691 + 2683 2692 io7= [HW] IO7 for Marvel-based Alpha systems 2684 2693 See comment before marvel_specify_io7 in 2685 2694 arch/alpha/kernel/core_marvel.c.
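The new option is a pure boot-time switch: a kernel built with CONFIG_IOMMU_DEBUG_PAGEALLOC still behaves like a normal kernel until the parameter is passed. A minimal example of a command line that turns the checker on (the other parameters are placeholders, not part of this change):

    root=/dev/sda1 ro iommu.debug_pagealloc=1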
+1
MAINTAINERS
··· 13346 13346 F: include/linux/iommu.h 13347 13347 F: include/linux/iova.h 13348 13348 F: include/linux/of_iommu.h 13349 + F: rust/kernel/iommu/ 13349 13350 13350 13351 IOMMUFD 13351 13352 M: Jason Gunthorpe <jgg@nvidia.com>
+19
drivers/iommu/Kconfig
··· 384 384 385 385 Say Y here if you want to use the multimedia devices listed above. 386 386 387 + config IOMMU_DEBUG_PAGEALLOC 388 + bool "Debug IOMMU mappings against page allocations" 389 + depends on DEBUG_PAGEALLOC && IOMMU_API && PAGE_EXTENSION 390 + help 391 + This enables a consistency check between the kernel page allocator and 392 + the IOMMU subsystem. It verifies that pages being allocated or freed 393 + are not currently mapped in any IOMMU domain. 394 + 395 + This helps detect DMA use-after-free bugs where a driver frees a page 396 + but forgets to unmap it from the IOMMU, potentially allowing a device 397 + to overwrite memory that the kernel has repurposed. 398 + 399 + These checks are best-effort and may not detect all problems. 400 + 401 + Due to performance overhead, this feature is disabled by default. 402 + You must enable "iommu.debug_pagealloc" from the kernel command 403 + line to activate the runtime checks. 404 + 405 + If unsure, say N. 387 406 endif # IOMMU_SUPPORT 388 407 389 408 source "drivers/iommu/generic_pt/Kconfig"
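Because the symbol depends on DEBUG_PAGEALLOC, IOMMU_API and PAGE_EXTENSION, all three must already be enabled before it becomes selectable. A sketch of the relevant .config fragment, assuming an architecture that supports DEBUG_PAGEALLOC and an IOMMU driver that already selects IOMMU_API:

    CONFIG_PAGE_EXTENSION=y
    CONFIG_DEBUG_PAGEALLOC=y
    CONFIG_IOMMU_DEBUG_PAGEALLOC=y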
+1
drivers/iommu/Makefile
··· 36 36 obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o 37 37 obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o 38 38 obj-$(CONFIG_APPLE_DART) += apple-dart.o 39 + obj-$(CONFIG_IOMMU_DEBUG_PAGEALLOC) += iommu-debug-pagealloc.o
+10
drivers/iommu/amd/Kconfig
··· 30 30 your BIOS for an option to enable it or if you have an IVRS ACPI 31 31 table. 32 32 33 + config AMD_IOMMU_IOMMUFD 34 + bool "Enable IOMMUFD features for AMD IOMMU (EXPERIMENTAL)" 35 + depends on IOMMUFD 36 + depends on AMD_IOMMU 37 + help 38 + Support for IOMMUFD features intended to support virtual machines 39 + with accelerated virtual IOMMUs. 40 + 41 + Say Y here if you are doing development and testing on this feature. 42 + 33 43 config AMD_IOMMU_DEBUGFS 34 44 bool "Enable AMD IOMMU internals in DebugFS" 35 45 depends on AMD_IOMMU && IOMMU_DEBUGFS
+1
drivers/iommu/amd/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 obj-y += iommu.o init.o quirks.o ppr.o pasid.o 3 + obj-$(CONFIG_AMD_IOMMU_IOMMUFD) += iommufd.o nested.o 3 4 obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
+33
drivers/iommu/amd/amd_iommu.h
··· 189 189 struct dev_table_entry *get_dev_table(struct amd_iommu *iommu); 190 190 struct iommu_dev_data *search_dev_data(struct amd_iommu *iommu, u16 devid); 191 191 192 + void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data, 193 + struct protection_domain *domain, u16 domid, 194 + struct pt_iommu_amdv1_hw_info *pt_info, 195 + struct dev_table_entry *new); 196 + void amd_iommu_update_dte(struct amd_iommu *iommu, 197 + struct iommu_dev_data *dev_data, 198 + struct dev_table_entry *new); 199 + 200 + static inline void 201 + amd_iommu_make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *new) 202 + { 203 + struct dev_table_entry *initial_dte; 204 + struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 205 + 206 + /* All existing DTE must have V bit set */ 207 + new->data128[0] = DTE_FLAG_V; 208 + new->data128[1] = 0; 209 + 210 + /* 211 + * Restore cached persistent DTE bits, which can be set by information 212 + * in IVRS table. See set_dev_entry_from_acpi(). 213 + */ 214 + initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid); 215 + if (initial_dte) { 216 + new->data128[0] |= initial_dte->data128[0]; 217 + new->data128[1] |= initial_dte->data128[1]; 218 + } 219 + } 220 + 221 + /* NESTED */ 222 + struct iommu_domain * 223 + amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, 224 + const struct iommu_user_data *user_data); 192 225 #endif /* AMD_IOMMU_H */
+48 -2
drivers/iommu/amd/amd_iommu_types.h
··· 17 17 #include <linux/list.h> 18 18 #include <linux/spinlock.h> 19 19 #include <linux/pci.h> 20 + #include <linux/iommufd.h> 20 21 #include <linux/irqreturn.h> 21 22 #include <linux/generic_pt/iommu.h> 23 + 24 + #include <uapi/linux/iommufd.h> 22 25 23 26 /* 24 27 * Maximum number of IOMMUs supported ··· 111 108 112 109 /* Extended Feature 2 Bits */ 113 110 #define FEATURE_SEVSNPIO_SUP BIT_ULL(1) 111 + #define FEATURE_GCR3TRPMODE BIT_ULL(3) 114 112 #define FEATURE_SNPAVICSUP GENMASK_ULL(7, 5) 115 113 #define FEATURE_SNPAVICSUP_GAM(x) \ 116 114 (FIELD_GET(FEATURE_SNPAVICSUP, x) == 0x1) ··· 190 186 #define CONTROL_EPH_EN 45 191 187 #define CONTROL_XT_EN 50 192 188 #define CONTROL_INTCAPXT_EN 51 189 + #define CONTROL_GCR3TRPMODE 58 193 190 #define CONTROL_IRTCACHEDIS 59 194 191 #define CONTROL_SNPAVIC_EN 61 195 192 ··· 355 350 #define DTE_FLAG_V BIT_ULL(0) 356 351 #define DTE_FLAG_TV BIT_ULL(1) 357 352 #define DTE_FLAG_HAD (3ULL << 7) 353 + #define DTE_MODE_MASK GENMASK_ULL(11, 9) 354 + #define DTE_HOST_TRP GENMASK_ULL(51, 12) 355 + #define DTE_FLAG_PPR BIT_ULL(52) 358 356 #define DTE_FLAG_GIOV BIT_ULL(54) 359 357 #define DTE_FLAG_GV BIT_ULL(55) 360 358 #define DTE_GLX GENMASK_ULL(57, 56) ··· 366 358 367 359 #define DTE_FLAG_IOTLB BIT_ULL(32) 368 360 #define DTE_FLAG_MASK (0x3ffULL << 32) 369 - #define DEV_DOMID_MASK 0xffffULL 361 + #define DTE_DOMID_MASK GENMASK_ULL(15, 0) 370 362 371 363 #define DTE_GCR3_14_12 GENMASK_ULL(60, 58) 372 364 #define DTE_GCR3_30_15 GENMASK_ULL(31, 16) ··· 501 493 u32 refcnt; /* Count of attached dev/pasid per domain/IOMMU */ 502 494 }; 503 495 496 + struct amd_iommu_viommu { 497 + struct iommufd_viommu core; 498 + struct protection_domain *parent; /* nest parent domain for this viommu */ 499 + struct list_head pdom_list; /* For protection_domain->viommu_list */ 500 + 501 + /* 502 + * Per-vIOMMU guest domain ID to host domain ID mapping. 503 + * Indexed by guest domain ID. 504 + */ 505 + struct xarray gdomid_array; 506 + }; 507 + 508 + /* 509 + * Contains guest domain ID mapping info, 510 + * which is stored in the struct xarray gdomid_array. 511 + */ 512 + struct guest_domain_mapping_info { 513 + refcount_t users; 514 + u32 hdom_id; /* Host domain ID */ 515 + }; 516 + 517 + /* 518 + * Nested domain is specifically used for nested translation 519 + */ 520 + struct nested_domain { 521 + struct iommu_domain domain; /* generic domain handle used by iommu core code */ 522 + u16 gdom_id; /* domain ID from gDTE */ 523 + struct guest_domain_mapping_info *gdom_info; 524 + struct iommu_hwpt_amd_guest gdte; /* Guest vIOMMU DTE */ 525 + struct amd_iommu_viommu *viommu; /* AMD hw-viommu this nested domain belong to */ 526 + }; 527 + 504 528 /* 505 529 * This structure contains generic data for IOMMU protection domains 506 530 * independent of their use. ··· 553 513 554 514 struct mmu_notifier mn; /* mmu notifier for the SVA domain */ 555 515 struct list_head dev_data_list; /* List of pdom_dev_data */ 516 + 517 + /* 518 + * Store reference to list of vIOMMUs, which use this protection domain. 519 + * This will be used to look up host domain ID when flushing this domain. 
520 + */ 521 + struct list_head viommu_list; 556 522 }; 557 523 PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain); 558 524 PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain); ··· 752 706 753 707 u32 flags; 754 708 volatile u64 *cmd_sem; 755 - atomic64_t cmd_sem_val; 709 + u64 cmd_sem_val; 756 710 /* 757 711 * Track physical address to directly use it in build_completion_wait() 758 712 * and avoid adding any special checks and handling for kdump.
+10 -2
drivers/iommu/amd/init.c
··· 1122 1122 return; 1123 1123 1124 1124 iommu_feature_enable(iommu, CONTROL_GT_EN); 1125 + 1126 + /* 1127 + * This feature needs to be enabled prior to a call 1128 + * to iommu_snp_enable(). Since this function is called 1129 + * in early_enable_iommu(), it is safe to enable here. 1130 + */ 1131 + if (check_feature2(FEATURE_GCR3TRPMODE)) 1132 + iommu_feature_enable(iommu, CONTROL_GCR3TRPMODE); 1125 1133 } 1126 1134 1127 1135 /* sets a specific bit in the device table entry. */ ··· 1187 1179 for (devid = 0; devid <= pci_seg->last_bdf; devid++) { 1188 1180 old_dev_tbl_entry = &pci_seg->old_dev_tbl_cpy[devid]; 1189 1181 dte_v = FIELD_GET(DTE_FLAG_V, old_dev_tbl_entry->data[0]); 1190 - dom_id = FIELD_GET(DEV_DOMID_MASK, old_dev_tbl_entry->data[1]); 1182 + dom_id = FIELD_GET(DTE_DOMID_MASK, old_dev_tbl_entry->data[1]); 1191 1183 1192 1184 if (!dte_v || !dom_id) 1193 1185 continue; ··· 1885 1877 iommu->pci_seg = pci_seg; 1886 1878 1887 1879 raw_spin_lock_init(&iommu->lock); 1888 - atomic64_set(&iommu->cmd_sem_val, 0); 1880 + iommu->cmd_sem_val = 0; 1889 1881 1890 1882 /* Add IOMMU to internal data structures */ 1891 1883 list_add_tail(&iommu->list, &amd_iommu_list);
+192 -119
drivers/iommu/amd/iommu.c
··· 43 43 #include <linux/generic_pt/iommu.h> 44 44 45 45 #include "amd_iommu.h" 46 + #include "iommufd.h" 46 47 #include "../irq_remapping.h" 47 48 #include "../iommu-pages.h" 48 49 ··· 76 75 struct iommu_dev_data *dev_data, 77 76 phys_addr_t top_paddr, unsigned int top_level); 78 77 78 + static int device_flush_dte(struct iommu_dev_data *dev_data); 79 + 79 80 static void amd_iommu_change_top(struct pt_iommu *iommu_table, 80 81 phys_addr_t top_paddr, unsigned int top_level); 81 82 ··· 87 84 static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); 88 85 static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 89 86 bool enable); 87 + 88 + static void clone_aliases(struct amd_iommu *iommu, struct device *dev); 89 + 90 + static int iommu_completion_wait(struct amd_iommu *iommu); 90 91 91 92 /**************************************************************************** 92 93 * ··· 207 200 } 208 201 209 202 spin_unlock_irqrestore(&dev_data->dte_lock, flags); 203 + } 204 + 205 + void amd_iommu_update_dte(struct amd_iommu *iommu, 206 + struct iommu_dev_data *dev_data, 207 + struct dev_table_entry *new) 208 + { 209 + update_dte256(iommu, dev_data, new); 210 + clone_aliases(iommu, dev_data->dev); 211 + device_flush_dte(dev_data); 212 + iommu_completion_wait(iommu); 210 213 } 211 214 212 215 static void get_dte256(struct amd_iommu *iommu, struct iommu_dev_data *dev_data, ··· 1197 1180 { 1198 1181 int i = 0; 1199 1182 1200 - while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) { 1183 + /* 1184 + * cmd_sem holds a monotonically non-decreasing completion sequence 1185 + * number. 1186 + */ 1187 + while ((__s64)(READ_ONCE(*iommu->cmd_sem) - data) < 0 && 1188 + i < LOOP_TIMEOUT) { 1201 1189 udelay(1); 1202 1190 i += 1; 1203 1191 } ··· 1434 1412 return iommu_queue_command_sync(iommu, cmd, true); 1435 1413 } 1436 1414 1415 + static u64 get_cmdsem_val(struct amd_iommu *iommu) 1416 + { 1417 + lockdep_assert_held(&iommu->lock); 1418 + return ++iommu->cmd_sem_val; 1419 + } 1420 + 1437 1421 /* 1438 1422 * This function queues a completion wait command into the command 1439 1423 * buffer of an IOMMU ··· 1454 1426 if (!iommu->need_sync) 1455 1427 return 0; 1456 1428 1457 - data = atomic64_inc_return(&iommu->cmd_sem_val); 1458 - build_completion_wait(&cmd, iommu, data); 1459 - 1460 1429 raw_spin_lock_irqsave(&iommu->lock, flags); 1461 1430 1431 + data = get_cmdsem_val(iommu); 1432 + build_completion_wait(&cmd, iommu, data); 1433 + 1462 1434 ret = __iommu_queue_command_sync(iommu, &cmd, false); 1435 + raw_spin_unlock_irqrestore(&iommu->lock, flags); 1436 + 1463 1437 if (ret) 1464 - goto out_unlock; 1438 + return ret; 1465 1439 1466 1440 ret = wait_on_sem(iommu, data); 1467 - 1468 - out_unlock: 1469 - raw_spin_unlock_irqrestore(&iommu->lock, flags); 1470 1441 1471 1442 return ret; 1472 1443 } ··· 1542 1515 iommu_queue_command(iommu, &cmd); 1543 1516 1544 1517 iommu_completion_wait(iommu); 1518 + } 1519 + 1520 + static int iommu_flush_pages_v1_hdom_ids(struct protection_domain *pdom, u64 address, size_t size) 1521 + { 1522 + int ret = 0; 1523 + struct amd_iommu_viommu *aviommu; 1524 + 1525 + list_for_each_entry(aviommu, &pdom->viommu_list, pdom_list) { 1526 + unsigned long i; 1527 + struct guest_domain_mapping_info *gdom_info; 1528 + struct amd_iommu *iommu = container_of(aviommu->core.iommu_dev, 1529 + struct amd_iommu, iommu); 1530 + 1531 + xa_lock(&aviommu->gdomid_array); 1532 + xa_for_each(&aviommu->gdomid_array, i, gdom_info) { 1533 + struct iommu_cmd cmd; 1534 + 1535 + 
pr_debug("%s: iommu=%#x, hdom_id=%#x\n", __func__, 1536 + iommu->devid, gdom_info->hdom_id); 1537 + build_inv_iommu_pages(&cmd, address, size, gdom_info->hdom_id, 1538 + IOMMU_NO_PASID, false); 1539 + ret |= iommu_queue_command(iommu, &cmd); 1540 + } 1541 + xa_unlock(&aviommu->gdomid_array); 1542 + } 1543 + return ret; 1545 1544 } 1546 1545 1547 1546 static void amd_iommu_flush_all(struct amd_iommu *iommu) ··· 1717 1664 */ 1718 1665 ret |= iommu_queue_command(pdom_iommu_info->iommu, &cmd); 1719 1666 } 1667 + 1668 + /* 1669 + * A domain w/ v1 table can be a nest parent, which can have 1670 + * multiple nested domains. Each nested domain has 1:1 mapping 1671 + * between gDomID and hDomID. Therefore, flush every hDomID 1672 + * associated to this nest parent domain. 1673 + * 1674 + * See drivers/iommu/amd/nested.c: amd_iommu_alloc_domain_nested() 1675 + */ 1676 + if (!list_empty(&pdom->viommu_list)) 1677 + ret |= iommu_flush_pages_v1_hdom_ids(pdom, address, size); 1720 1678 1721 1679 return ret; 1722 1680 } ··· 2069 2005 return ret; 2070 2006 } 2071 2007 2072 - static void make_clear_dte(struct iommu_dev_data *dev_data, struct dev_table_entry *ptr, 2073 - struct dev_table_entry *new) 2074 - { 2075 - /* All existing DTE must have V bit set */ 2076 - new->data128[0] = DTE_FLAG_V; 2077 - new->data128[1] = 0; 2078 - } 2079 - 2080 2008 /* 2081 2009 * Note: 2082 2010 * The old value for GCR3 table and GPT have been cleared from caller. 2083 2011 */ 2084 - static void set_dte_gcr3_table(struct amd_iommu *iommu, 2085 - struct iommu_dev_data *dev_data, 2086 - struct dev_table_entry *target) 2012 + static void set_dte_gcr3_table(struct iommu_dev_data *dev_data, 2013 + struct dev_table_entry *new) 2087 2014 { 2088 2015 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2089 - u64 gcr3; 2016 + u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); 2090 2017 2091 - if (!gcr3_info->gcr3_tbl) 2092 - return; 2018 + new->data[0] |= DTE_FLAG_TV | 2019 + (dev_data->ppr ? DTE_FLAG_PPR : 0) | 2020 + (pdom_is_v2_pgtbl_mode(dev_data->domain) ? DTE_FLAG_GIOV : 0) | 2021 + DTE_FLAG_GV | 2022 + FIELD_PREP(DTE_GLX, gcr3_info->glx) | 2023 + FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12) | 2024 + DTE_FLAG_IR | DTE_FLAG_IW; 2093 2025 2094 - pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n", 2095 - __func__, dev_data->devid, gcr3_info->glx, 2096 - (unsigned long long)gcr3_info->gcr3_tbl); 2097 - 2098 - gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl); 2099 - 2100 - target->data[0] |= DTE_FLAG_GV | 2101 - FIELD_PREP(DTE_GLX, gcr3_info->glx) | 2102 - FIELD_PREP(DTE_GCR3_14_12, gcr3 >> 12); 2103 - if (pdom_is_v2_pgtbl_mode(dev_data->domain)) 2104 - target->data[0] |= DTE_FLAG_GIOV; 2105 - 2106 - target->data[1] |= FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | 2107 - FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); 2026 + new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, dev_data->gcr3_info.domid) | 2027 + FIELD_PREP(DTE_GCR3_30_15, gcr3 >> 15) | 2028 + (dev_data->ats_enabled ? 
DTE_FLAG_IOTLB : 0) | 2029 + FIELD_PREP(DTE_GCR3_51_31, gcr3 >> 31); 2108 2030 2109 2031 /* Guest page table can only support 4 and 5 levels */ 2110 2032 if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) 2111 - target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); 2033 + new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_5_LEVEL); 2112 2034 else 2113 - target->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); 2035 + new->data[2] |= FIELD_PREP(DTE_GPT_LEVEL_MASK, GUEST_PGTABLE_4_LEVEL); 2036 + } 2037 + 2038 + void amd_iommu_set_dte_v1(struct iommu_dev_data *dev_data, 2039 + struct protection_domain *domain, u16 domid, 2040 + struct pt_iommu_amdv1_hw_info *pt_info, 2041 + struct dev_table_entry *new) 2042 + { 2043 + u64 host_pt_root = __sme_set(pt_info->host_pt_root); 2044 + 2045 + /* Note Dirty tracking is used for v1 table only for now */ 2046 + new->data[0] |= DTE_FLAG_TV | 2047 + FIELD_PREP(DTE_MODE_MASK, pt_info->mode) | 2048 + (domain->dirty_tracking ? DTE_FLAG_HAD : 0) | 2049 + FIELD_PREP(DTE_HOST_TRP, host_pt_root >> 12) | 2050 + DTE_FLAG_IR | DTE_FLAG_IW; 2051 + 2052 + new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domid) | 2053 + (dev_data->ats_enabled ? DTE_FLAG_IOTLB : 0); 2054 + } 2055 + 2056 + static void set_dte_v1(struct iommu_dev_data *dev_data, 2057 + struct protection_domain *domain, u16 domid, 2058 + phys_addr_t top_paddr, unsigned int top_level, 2059 + struct dev_table_entry *new) 2060 + { 2061 + struct pt_iommu_amdv1_hw_info pt_info; 2062 + 2063 + /* 2064 + * When updating the IO pagetable, the new top and level 2065 + * are provided as parameters. For other operations i.e. 2066 + * device attach, retrieve the current pagetable info 2067 + * via the IOMMU PT API. 2068 + */ 2069 + if (top_paddr) { 2070 + pt_info.host_pt_root = top_paddr; 2071 + pt_info.mode = top_level + 1; 2072 + } else { 2073 + WARN_ON(top_paddr || top_level); 2074 + pt_iommu_amdv1_hw_info(&domain->amdv1, &pt_info); 2075 + } 2076 + 2077 + amd_iommu_set_dte_v1(dev_data, domain, domid, &pt_info, new); 2078 + } 2079 + 2080 + static void set_dte_passthrough(struct iommu_dev_data *dev_data, 2081 + struct protection_domain *domain, 2082 + struct dev_table_entry *new) 2083 + { 2084 + new->data[0] |= DTE_FLAG_TV | DTE_FLAG_IR | DTE_FLAG_IW; 2085 + 2086 + new->data[1] |= FIELD_PREP(DTE_DOMID_MASK, domain->id) | 2087 + (dev_data->ats_enabled) ? 
DTE_FLAG_IOTLB : 0; 2114 2088 } 2115 2089 2116 2090 static void set_dte_entry(struct amd_iommu *iommu, 2117 2091 struct iommu_dev_data *dev_data, 2118 2092 phys_addr_t top_paddr, unsigned int top_level) 2119 2093 { 2120 - u16 domid; 2121 2094 u32 old_domid; 2122 - struct dev_table_entry *initial_dte; 2123 2095 struct dev_table_entry new = {}; 2124 2096 struct protection_domain *domain = dev_data->domain; 2125 2097 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2126 2098 struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2127 - struct pt_iommu_amdv1_hw_info pt_info; 2128 2099 2129 - make_clear_dte(dev_data, dte, &new); 2100 + amd_iommu_make_clear_dte(dev_data, &new); 2130 2101 2131 - if (gcr3_info && gcr3_info->gcr3_tbl) 2132 - domid = dev_data->gcr3_info.domid; 2133 - else { 2134 - domid = domain->id; 2102 + old_domid = READ_ONCE(dte->data[1]) & DTE_DOMID_MASK; 2103 + if (gcr3_info->gcr3_tbl) 2104 + set_dte_gcr3_table(dev_data, &new); 2105 + else if (domain->domain.type == IOMMU_DOMAIN_IDENTITY) 2106 + set_dte_passthrough(dev_data, domain, &new); 2107 + else if ((domain->domain.type & __IOMMU_DOMAIN_PAGING) && 2108 + domain->pd_mode == PD_MODE_V1) 2109 + set_dte_v1(dev_data, domain, domain->id, top_paddr, top_level, &new); 2110 + else 2111 + WARN_ON(true); 2135 2112 2136 - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { 2137 - /* 2138 - * When updating the IO pagetable, the new top and level 2139 - * are provided as parameters. For other operations i.e. 2140 - * device attach, retrieve the current pagetable info 2141 - * via the IOMMU PT API. 2142 - */ 2143 - if (top_paddr) { 2144 - pt_info.host_pt_root = top_paddr; 2145 - pt_info.mode = top_level + 1; 2146 - } else { 2147 - WARN_ON(top_paddr || top_level); 2148 - pt_iommu_amdv1_hw_info(&domain->amdv1, 2149 - &pt_info); 2150 - } 2151 - 2152 - new.data[0] |= __sme_set(pt_info.host_pt_root) | 2153 - (pt_info.mode & DEV_ENTRY_MODE_MASK) 2154 - << DEV_ENTRY_MODE_SHIFT; 2155 - } 2156 - } 2157 - 2158 - new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; 2159 - 2160 - /* 2161 - * When SNP is enabled, we can only support TV=1 with non-zero domain ID. 2162 - * This is prevented by the SNP-enable and IOMMU_DOMAIN_IDENTITY check in 2163 - * do_iommu_domain_alloc(). 2164 - */ 2165 - WARN_ON(amd_iommu_snp_en && (domid == 0)); 2166 - new.data[0] |= DTE_FLAG_TV; 2167 - 2168 - if (dev_data->ppr) 2169 - new.data[0] |= 1ULL << DEV_ENTRY_PPR; 2170 - 2171 - if (domain->dirty_tracking) 2172 - new.data[0] |= DTE_FLAG_HAD; 2173 - 2174 - if (dev_data->ats_enabled) 2175 - new.data[1] |= DTE_FLAG_IOTLB; 2176 - 2177 - old_domid = READ_ONCE(dte->data[1]) & DEV_DOMID_MASK; 2178 - new.data[1] |= domid; 2179 - 2180 - /* 2181 - * Restore cached persistent DTE bits, which can be set by information 2182 - * in IVRS table. See set_dev_entry_from_acpi(). 
2183 - */ 2184 - initial_dte = amd_iommu_get_ivhd_dte_flags(iommu->pci_seg->id, dev_data->devid); 2185 - if (initial_dte) { 2186 - new.data128[0] |= initial_dte->data128[0]; 2187 - new.data128[1] |= initial_dte->data128[1]; 2188 - } 2189 - 2190 - set_dte_gcr3_table(iommu, dev_data, &new); 2191 - 2192 - update_dte256(iommu, dev_data, &new); 2113 + amd_iommu_update_dte(iommu, dev_data, &new); 2193 2114 2194 2115 /* 2195 2116 * A kdump kernel might be replacing a domain ID that was copied from ··· 2192 2143 static void clear_dte_entry(struct amd_iommu *iommu, struct iommu_dev_data *dev_data) 2193 2144 { 2194 2145 struct dev_table_entry new = {}; 2195 - struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2196 2146 2197 - make_clear_dte(dev_data, dte, &new); 2198 - update_dte256(iommu, dev_data, &new); 2147 + amd_iommu_make_clear_dte(dev_data, &new); 2148 + amd_iommu_update_dte(iommu, dev_data, &new); 2199 2149 } 2200 2150 2201 2151 /* Update and flush DTE for the given device */ ··· 2206 2158 set_dte_entry(iommu, dev_data, 0, 0); 2207 2159 else 2208 2160 clear_dte_entry(iommu, dev_data); 2209 - 2210 - clone_aliases(iommu, dev_data->dev); 2211 - device_flush_dte(dev_data); 2212 - iommu_completion_wait(iommu); 2213 2161 } 2214 2162 2215 2163 /* ··· 2538 2494 spin_lock_init(&domain->lock); 2539 2495 INIT_LIST_HEAD(&domain->dev_list); 2540 2496 INIT_LIST_HEAD(&domain->dev_data_list); 2497 + INIT_LIST_HEAD(&domain->viommu_list); 2541 2498 xa_init(&domain->iommu_array); 2542 2499 } 2543 2500 ··· 2800 2755 return &domain->domain; 2801 2756 } 2802 2757 2758 + static inline bool is_nest_parent_supported(u32 flags) 2759 + { 2760 + /* Only allow nest parent when these features are supported */ 2761 + return check_feature(FEATURE_GT) && 2762 + check_feature(FEATURE_GIOSUP) && 2763 + check_feature2(FEATURE_GCR3TRPMODE); 2764 + } 2765 + 2803 2766 static struct iommu_domain * 2804 2767 amd_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, 2805 2768 const struct iommu_user_data *user_data) ··· 2815 2762 { 2816 2763 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2817 2764 const u32 supported_flags = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | 2818 - IOMMU_HWPT_ALLOC_PASID; 2765 + IOMMU_HWPT_ALLOC_PASID | 2766 + IOMMU_HWPT_ALLOC_NEST_PARENT; 2819 2767 2820 2768 if ((flags & ~supported_flags) || user_data) 2821 2769 return ERR_PTR(-EOPNOTSUPP); 2822 2770 2823 2771 switch (flags & supported_flags) { 2824 2772 case IOMMU_HWPT_ALLOC_DIRTY_TRACKING: 2825 - /* Allocate domain with v1 page table for dirty tracking */ 2826 - if (!amd_iommu_hd_support(iommu)) 2773 + case IOMMU_HWPT_ALLOC_NEST_PARENT: 2774 + case IOMMU_HWPT_ALLOC_DIRTY_TRACKING | IOMMU_HWPT_ALLOC_NEST_PARENT: 2775 + /* 2776 + * Allocate domain with v1 page table for dirty tracking 2777 + * and/or Nest parent. 2778 + */ 2779 + if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && 2780 + !amd_iommu_hd_support(iommu)) 2827 2781 break; 2782 + 2783 + if ((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) && 2784 + !is_nest_parent_supported(flags)) 2785 + break; 2786 + 2828 2787 return amd_iommu_domain_alloc_paging_v1(dev, flags); 2829 2788 case IOMMU_HWPT_ALLOC_PASID: 2830 2789 /* Allocate domain with v2 page table if IOMMU supports PASID. 
*/ ··· 3138 3073 3139 3074 const struct iommu_ops amd_iommu_ops = { 3140 3075 .capable = amd_iommu_capable, 3076 + .hw_info = amd_iommufd_hw_info, 3141 3077 .blocked_domain = &blocked_domain, 3142 3078 .release_domain = &blocked_domain, 3143 3079 .identity_domain = &identity_domain.domain, ··· 3151 3085 .is_attach_deferred = amd_iommu_is_attach_deferred, 3152 3086 .def_domain_type = amd_iommu_def_domain_type, 3153 3087 .page_response = amd_iommu_page_response, 3088 + .get_viommu_size = amd_iommufd_get_viommu_size, 3089 + .viommu_init = amd_iommufd_viommu_init, 3154 3090 }; 3155 3091 3156 3092 #ifdef CONFIG_IRQ_REMAP ··· 3177 3109 return; 3178 3110 3179 3111 build_inv_irt(&cmd, devid); 3180 - data = atomic64_inc_return(&iommu->cmd_sem_val); 3181 - build_completion_wait(&cmd2, iommu, data); 3182 3112 3183 3113 raw_spin_lock_irqsave(&iommu->lock, flags); 3114 + data = get_cmdsem_val(iommu); 3115 + build_completion_wait(&cmd2, iommu, data); 3116 + 3184 3117 ret = __iommu_queue_command_sync(iommu, &cmd, true); 3185 3118 if (ret) 3186 - goto out; 3119 + goto out_err; 3187 3120 ret = __iommu_queue_command_sync(iommu, &cmd2, false); 3188 3121 if (ret) 3189 - goto out; 3122 + goto out_err; 3123 + raw_spin_unlock_irqrestore(&iommu->lock, flags); 3124 + 3190 3125 wait_on_sem(iommu, data); 3191 - out: 3126 + return; 3127 + 3128 + out_err: 3192 3129 raw_spin_unlock_irqrestore(&iommu->lock, flags); 3193 3130 } 3194 3131 ··· 3307 3234 struct irq_remap_table *new_table = NULL; 3308 3235 struct amd_iommu_pci_seg *pci_seg; 3309 3236 unsigned long flags; 3310 - int nid = iommu && iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 3237 + int nid = iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE; 3311 3238 u16 alias; 3312 3239 3313 3240 spin_lock_irqsave(&iommu_table_lock, flags);
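The reworked completion-wait path no longer requires an exact match on the semaphore value: sequence numbers are handed out in increasing order under iommu->lock, so observing a value at or beyond the one we queued is sufficient. A standalone sketch of the wrap-safe comparison idiom used by the new wait_on_sem() (illustrative only, not the driver code):

    #include <stdint.h>
    #include <stdbool.h>

    /* True once the observed sequence number has reached 'want'.
     * The signed difference stays correct across u64 wrap-around as
     * long as the two values are less than 2^63 apart. */
    static bool seq_reached(uint64_t observed, uint64_t want)
    {
        return (int64_t)(observed - want) >= 0;
    }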
+77
drivers/iommu/amd/iommufd.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2025 Advanced Micro Devices, Inc. 4 + */ 5 + 6 + #include <linux/iommu.h> 7 + 8 + #include "iommufd.h" 9 + #include "amd_iommu.h" 10 + #include "amd_iommu_types.h" 11 + 12 + static const struct iommufd_viommu_ops amd_viommu_ops; 13 + 14 + void *amd_iommufd_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type) 15 + { 16 + struct iommu_hw_info_amd *hwinfo; 17 + 18 + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && 19 + *type != IOMMU_HW_INFO_TYPE_AMD) 20 + return ERR_PTR(-EOPNOTSUPP); 21 + 22 + hwinfo = kzalloc(sizeof(*hwinfo), GFP_KERNEL); 23 + if (!hwinfo) 24 + return ERR_PTR(-ENOMEM); 25 + 26 + *length = sizeof(*hwinfo); 27 + *type = IOMMU_HW_INFO_TYPE_AMD; 28 + 29 + hwinfo->efr = amd_iommu_efr; 30 + hwinfo->efr2 = amd_iommu_efr2; 31 + 32 + return hwinfo; 33 + } 34 + 35 + size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type) 36 + { 37 + return VIOMMU_STRUCT_SIZE(struct amd_iommu_viommu, core); 38 + } 39 + 40 + int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent, 41 + const struct iommu_user_data *user_data) 42 + { 43 + unsigned long flags; 44 + struct protection_domain *pdom = to_pdomain(parent); 45 + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); 46 + 47 + xa_init_flags(&aviommu->gdomid_array, XA_FLAGS_ALLOC1); 48 + aviommu->parent = pdom; 49 + 50 + viommu->ops = &amd_viommu_ops; 51 + 52 + spin_lock_irqsave(&pdom->lock, flags); 53 + list_add(&aviommu->pdom_list, &pdom->viommu_list); 54 + spin_unlock_irqrestore(&pdom->lock, flags); 55 + 56 + return 0; 57 + } 58 + 59 + static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu) 60 + { 61 + unsigned long flags; 62 + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); 63 + struct protection_domain *pdom = aviommu->parent; 64 + 65 + spin_lock_irqsave(&pdom->lock, flags); 66 + list_del(&aviommu->pdom_list); 67 + spin_unlock_irqrestore(&pdom->lock, flags); 68 + xa_destroy(&aviommu->gdomid_array); 69 + } 70 + 71 + /* 72 + * See include/linux/iommufd.h 73 + * struct iommufd_viommu_ops - vIOMMU specific operations 74 + */ 75 + static const struct iommufd_viommu_ops amd_viommu_ops = { 76 + .destroy = amd_iommufd_viommu_destroy, 77 + };
+20
drivers/iommu/amd/iommufd.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (C) 2025 Advanced Micro Devices, Inc. 4 + */ 5 + 6 + #ifndef AMD_IOMMUFD_H 7 + #define AMD_IOMMUFD_H 8 + 9 + #if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD) 10 + void *amd_iommufd_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type); 11 + size_t amd_iommufd_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type); 12 + int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *parent, 13 + const struct iommu_user_data *user_data); 14 + #else 15 + #define amd_iommufd_hw_info NULL 16 + #define amd_iommufd_viommu_init NULL 17 + #define amd_iommufd_get_viommu_size NULL 18 + #endif /* CONFIG_AMD_IOMMU_IOMMUFD */ 19 + 20 + #endif /* AMD_IOMMUFD_H */
+294
drivers/iommu/amd/nested.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2025 Advanced Micro Devices, Inc. 4 + */ 5 + 6 + #define dev_fmt(fmt) "AMD-Vi: " fmt 7 + 8 + #include <linux/iommu.h> 9 + #include <linux/refcount.h> 10 + #include <uapi/linux/iommufd.h> 11 + 12 + #include "amd_iommu.h" 13 + 14 + static const struct iommu_domain_ops nested_domain_ops; 15 + 16 + static inline struct nested_domain *to_ndomain(struct iommu_domain *dom) 17 + { 18 + return container_of(dom, struct nested_domain, domain); 19 + } 20 + 21 + /* 22 + * Validate guest DTE to make sure that configuration for host (v1) 23 + * and guest (v2) page tables are valid when allocating nested domain. 24 + */ 25 + static int validate_gdte_nested(struct iommu_hwpt_amd_guest *gdte) 26 + { 27 + u32 gpt_level = FIELD_GET(DTE_GPT_LEVEL_MASK, gdte->dte[2]); 28 + 29 + /* Must be zero: Mode, Host-TPR */ 30 + if (FIELD_GET(DTE_MODE_MASK, gdte->dte[0]) != 0 || 31 + FIELD_GET(DTE_HOST_TRP, gdte->dte[0]) != 0) 32 + return -EINVAL; 33 + 34 + /* GCR3 TRP must be non-zero if V, GV is set */ 35 + if (FIELD_GET(DTE_FLAG_V, gdte->dte[0]) == 1 && 36 + FIELD_GET(DTE_FLAG_GV, gdte->dte[0]) == 1 && 37 + FIELD_GET(DTE_GCR3_14_12, gdte->dte[0]) == 0 && 38 + FIELD_GET(DTE_GCR3_30_15, gdte->dte[1]) == 0 && 39 + FIELD_GET(DTE_GCR3_51_31, gdte->dte[1]) == 0) 40 + return -EINVAL; 41 + 42 + /* Valid Guest Paging Mode values are 0 and 1 */ 43 + if (gpt_level != GUEST_PGTABLE_4_LEVEL && 44 + gpt_level != GUEST_PGTABLE_5_LEVEL) 45 + return -EINVAL; 46 + 47 + /* GLX = 3 is reserved */ 48 + if (FIELD_GET(DTE_GLX, gdte->dte[0]) == 3) 49 + return -EINVAL; 50 + 51 + /* 52 + * We need to check host capability before setting 53 + * the Guest Paging Mode 54 + */ 55 + if (gpt_level == GUEST_PGTABLE_5_LEVEL && 56 + amd_iommu_gpt_level < PAGE_MODE_5_LEVEL) 57 + return -EOPNOTSUPP; 58 + 59 + return 0; 60 + } 61 + 62 + static void *gdom_info_load_or_alloc_locked(struct xarray *xa, unsigned long index) 63 + { 64 + struct guest_domain_mapping_info *elm, *res; 65 + 66 + elm = xa_load(xa, index); 67 + if (elm) 68 + return elm; 69 + 70 + xa_unlock(xa); 71 + elm = kzalloc(sizeof(struct guest_domain_mapping_info), GFP_KERNEL); 72 + xa_lock(xa); 73 + if (!elm) 74 + return ERR_PTR(-ENOMEM); 75 + 76 + res = __xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); 77 + if (xa_is_err(res)) 78 + res = ERR_PTR(xa_err(res)); 79 + 80 + if (res) { 81 + kfree(elm); 82 + return res; 83 + } 84 + 85 + refcount_set(&elm->users, 0); 86 + return elm; 87 + } 88 + 89 + /* 90 + * This function is assigned to struct iommufd_viommu_ops.alloc_domain_nested() 91 + * during the call to struct iommu_ops.viommu_init(). 
92 + */ 93 + struct iommu_domain * 94 + amd_iommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, 95 + const struct iommu_user_data *user_data) 96 + { 97 + int ret; 98 + struct nested_domain *ndom; 99 + struct guest_domain_mapping_info *gdom_info; 100 + struct amd_iommu_viommu *aviommu = container_of(viommu, struct amd_iommu_viommu, core); 101 + 102 + if (user_data->type != IOMMU_HWPT_DATA_AMD_GUEST) 103 + return ERR_PTR(-EOPNOTSUPP); 104 + 105 + ndom = kzalloc(sizeof(*ndom), GFP_KERNEL); 106 + if (!ndom) 107 + return ERR_PTR(-ENOMEM); 108 + 109 + ret = iommu_copy_struct_from_user(&ndom->gdte, user_data, 110 + IOMMU_HWPT_DATA_AMD_GUEST, 111 + dte); 112 + if (ret) 113 + goto out_err; 114 + 115 + ret = validate_gdte_nested(&ndom->gdte); 116 + if (ret) 117 + goto out_err; 118 + 119 + ndom->gdom_id = FIELD_GET(DTE_DOMID_MASK, ndom->gdte.dte[1]); 120 + ndom->domain.ops = &nested_domain_ops; 121 + ndom->domain.type = IOMMU_DOMAIN_NESTED; 122 + ndom->viommu = aviommu; 123 + 124 + /* 125 + * Normally, when a guest has multiple pass-through devices, 126 + * the IOMMU driver setup DTEs with the same stage-2 table and 127 + * use the same host domain ID (hDomId). In case of nested translation, 128 + * if the guest setup different stage-1 tables with same PASID, 129 + * IOMMU would use the same TLB tag. This will results in TLB 130 + * aliasing issue. 131 + * 132 + * The guest is assigning gDomIDs based on its own algorithm for managing 133 + * cache tags of (DomID, PASID). Within a single viommu, the nest parent domain 134 + * (w/ S2 table) is used by all DTEs. But we need to consistently map the gDomID 135 + * to a single hDomID. This is done using an xarray in the vIOMMU to 136 + * keep track of the gDomID mapping. When the S2 is changed, the INVALIDATE_IOMMU_PAGES 137 + * command must be issued for each hDomID in the xarray. 138 + */ 139 + xa_lock(&aviommu->gdomid_array); 140 + 141 + gdom_info = gdom_info_load_or_alloc_locked(&aviommu->gdomid_array, ndom->gdom_id); 142 + if (IS_ERR(gdom_info)) { 143 + xa_unlock(&aviommu->gdomid_array); 144 + ret = PTR_ERR(gdom_info); 145 + goto out_err; 146 + } 147 + 148 + /* Check if gDomID exist */ 149 + if (refcount_inc_not_zero(&gdom_info->users)) { 150 + ndom->gdom_info = gdom_info; 151 + xa_unlock(&aviommu->gdomid_array); 152 + 153 + pr_debug("%s: Found gdom_id=%#x, hdom_id=%#x\n", 154 + __func__, ndom->gdom_id, gdom_info->hdom_id); 155 + 156 + return &ndom->domain; 157 + } 158 + 159 + /* The gDomID does not exist. 
We allocate new hdom_id */ 160 + gdom_info->hdom_id = amd_iommu_pdom_id_alloc(); 161 + if (gdom_info->hdom_id <= 0) { 162 + __xa_cmpxchg(&aviommu->gdomid_array, 163 + ndom->gdom_id, gdom_info, NULL, GFP_ATOMIC); 164 + xa_unlock(&aviommu->gdomid_array); 165 + ret = -ENOSPC; 166 + goto out_err_gdom_info; 167 + } 168 + 169 + ndom->gdom_info = gdom_info; 170 + refcount_set(&gdom_info->users, 1); 171 + 172 + xa_unlock(&aviommu->gdomid_array); 173 + 174 + pr_debug("%s: Allocate gdom_id=%#x, hdom_id=%#x\n", 175 + __func__, ndom->gdom_id, gdom_info->hdom_id); 176 + 177 + return &ndom->domain; 178 + 179 + out_err_gdom_info: 180 + kfree(gdom_info); 181 + out_err: 182 + kfree(ndom); 183 + return ERR_PTR(ret); 184 + } 185 + 186 + static void set_dte_nested(struct amd_iommu *iommu, struct iommu_domain *dom, 187 + struct iommu_dev_data *dev_data, struct dev_table_entry *new) 188 + { 189 + struct protection_domain *parent; 190 + struct nested_domain *ndom = to_ndomain(dom); 191 + struct iommu_hwpt_amd_guest *gdte = &ndom->gdte; 192 + struct pt_iommu_amdv1_hw_info pt_info; 193 + 194 + /* 195 + * The nest parent domain is attached during the call to the 196 + * struct iommu_ops.viommu_init(), which will be stored as part 197 + * of the struct amd_iommu_viommu.parent. 198 + */ 199 + if (WARN_ON(!ndom->viommu || !ndom->viommu->parent)) 200 + return; 201 + 202 + parent = ndom->viommu->parent; 203 + amd_iommu_make_clear_dte(dev_data, new); 204 + 205 + /* Retrieve the current pagetable info via the IOMMU PT API. */ 206 + pt_iommu_amdv1_hw_info(&parent->amdv1, &pt_info); 207 + 208 + /* 209 + * Use domain ID from nested domain to program DTE. 210 + * See amd_iommu_alloc_domain_nested(). 211 + */ 212 + amd_iommu_set_dte_v1(dev_data, parent, ndom->gdom_info->hdom_id, 213 + &pt_info, new); 214 + 215 + /* GV is required for nested page table */ 216 + new->data[0] |= DTE_FLAG_GV; 217 + 218 + /* Guest PPR */ 219 + new->data[0] |= gdte->dte[0] & DTE_FLAG_PPR; 220 + 221 + /* Guest translation stuff */ 222 + new->data[0] |= gdte->dte[0] & (DTE_GLX | DTE_FLAG_GIOV); 223 + 224 + /* GCR3 table */ 225 + new->data[0] |= gdte->dte[0] & DTE_GCR3_14_12; 226 + new->data[1] |= gdte->dte[1] & (DTE_GCR3_30_15 | DTE_GCR3_51_31); 227 + 228 + /* Guest paging mode */ 229 + new->data[2] |= gdte->dte[2] & DTE_GPT_LEVEL_MASK; 230 + } 231 + 232 + static int nested_attach_device(struct iommu_domain *dom, struct device *dev, 233 + struct iommu_domain *old) 234 + { 235 + struct dev_table_entry new = {0}; 236 + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 237 + struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 238 + int ret = 0; 239 + 240 + /* 241 + * Needs to make sure PASID is not enabled 242 + * for this attach path. 243 + */ 244 + if (WARN_ON(dev_data->pasid_enabled)) 245 + return -EINVAL; 246 + 247 + mutex_lock(&dev_data->mutex); 248 + 249 + set_dte_nested(iommu, dom, dev_data, &new); 250 + 251 + amd_iommu_update_dte(iommu, dev_data, &new); 252 + 253 + mutex_unlock(&dev_data->mutex); 254 + 255 + return ret; 256 + } 257 + 258 + static void nested_domain_free(struct iommu_domain *dom) 259 + { 260 + struct guest_domain_mapping_info *curr; 261 + struct nested_domain *ndom = to_ndomain(dom); 262 + struct amd_iommu_viommu *aviommu = ndom->viommu; 263 + 264 + xa_lock(&aviommu->gdomid_array); 265 + 266 + if (!refcount_dec_and_test(&ndom->gdom_info->users)) { 267 + xa_unlock(&aviommu->gdomid_array); 268 + return; 269 + } 270 + 271 + /* 272 + * The refcount for the gdom_id to hdom_id mapping is zero. 
273 + * It is now safe to remove the mapping. 274 + */ 275 + curr = __xa_cmpxchg(&aviommu->gdomid_array, ndom->gdom_id, 276 + ndom->gdom_info, NULL, GFP_ATOMIC); 277 + 278 + xa_unlock(&aviommu->gdomid_array); 279 + if (WARN_ON(!curr || xa_err(curr))) 280 + return; 281 + 282 + /* success */ 283 + pr_debug("%s: Free gdom_id=%#x, hdom_id=%#x\n", 284 + __func__, ndom->gdom_id, curr->hdom_id); 285 + 286 + amd_iommu_pdom_id_free(ndom->gdom_info->hdom_id); 287 + kfree(curr); 288 + kfree(ndom); 289 + } 290 + 291 + static const struct iommu_domain_ops nested_domain_ops = { 292 + .attach_dev = nested_attach_device, 293 + .free = nested_domain_free, 294 + };
-1
drivers/iommu/arm/Kconfig
··· 121 121 122 122 config TEGRA241_CMDQV 123 123 bool "NVIDIA Tegra241 CMDQ-V extension support for ARM SMMUv3" 124 - depends on ACPI 125 124 help 126 125 Support for NVIDIA CMDQ-Virtualization extension for ARM SMMUv3. The 127 126 CMDQ-V extension is similar to v3.3 ECMDQ for multi command queues
+3 -1
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
··· 177 177 * config bit here base this off the EATS value in the STE. If the EATS 178 178 * is set then the VM must generate ATC flushes. 179 179 */ 180 - state.disable_ats = !nested_domain->enable_ats; 180 + if (FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(nested_domain->ste[0])) == 181 + STRTAB_STE_0_CFG_S1_TRANS) 182 + state.disable_ats = !nested_domain->enable_ats; 181 183 ret = arm_smmu_attach_prepare(&state, domain); 182 184 if (ret) { 183 185 mutex_unlock(&arm_smmu_asid_lock);
+75 -3
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
··· 33 33 enum arm_smmu_test_master_feat { 34 34 ARM_SMMU_MASTER_TEST_ATS = BIT(0), 35 35 ARM_SMMU_MASTER_TEST_STALL = BIT(1), 36 + ARM_SMMU_MASTER_TEST_NESTED = BIT(2), 36 37 }; 38 + 39 + static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, 40 + enum arm_smmu_test_master_feat feat); 37 41 38 42 static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry, 39 43 const __le64 *used_bits, 40 44 const __le64 *target, 45 + const __le64 *safe, 41 46 unsigned int length) 42 47 { 43 48 bool differs = false; 44 49 unsigned int i; 45 50 46 51 for (i = 0; i < length; i++) { 47 - if ((entry[i] & used_bits[i]) != target[i]) 52 + __le64 used = used_bits[i] & ~safe[i]; 53 + 54 + if ((entry[i] & used) != (target[i] & used)) 48 55 differs = true; 49 56 } 50 57 return differs; ··· 63 56 struct arm_smmu_test_writer *test_writer = 64 57 container_of(writer, struct arm_smmu_test_writer, writer); 65 58 __le64 *entry_used_bits; 59 + __le64 *safe_target; 60 + __le64 *safe_init; 66 61 67 62 entry_used_bits = kunit_kzalloc( 68 63 test_writer->test, sizeof(*entry_used_bits) * NUM_ENTRY_QWORDS, 69 64 GFP_KERNEL); 70 65 KUNIT_ASSERT_NOT_NULL(test_writer->test, entry_used_bits); 66 + 67 + safe_target = kunit_kzalloc(test_writer->test, 68 + sizeof(*safe_target) * NUM_ENTRY_QWORDS, 69 + GFP_KERNEL); 70 + KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_target); 71 + 72 + safe_init = kunit_kzalloc(test_writer->test, 73 + sizeof(*safe_init) * NUM_ENTRY_QWORDS, 74 + GFP_KERNEL); 75 + KUNIT_ASSERT_NOT_NULL(test_writer->test, safe_init); 71 76 72 77 pr_debug("STE value is now set to: "); 73 78 print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, ··· 98 79 * configuration. 99 80 */ 100 81 writer->ops->get_used(test_writer->entry, entry_used_bits); 82 + if (writer->ops->get_update_safe) 83 + writer->ops->get_update_safe(test_writer->entry, 84 + test_writer->init_entry, 85 + safe_init); 86 + if (writer->ops->get_update_safe) 87 + writer->ops->get_update_safe(test_writer->entry, 88 + test_writer->target_entry, 89 + safe_target); 101 90 KUNIT_EXPECT_FALSE( 102 91 test_writer->test, 103 92 arm_smmu_entry_differs_in_used_bits( 104 93 test_writer->entry, entry_used_bits, 105 - test_writer->init_entry, NUM_ENTRY_QWORDS) && 94 + test_writer->init_entry, safe_init, 95 + NUM_ENTRY_QWORDS) && 106 96 arm_smmu_entry_differs_in_used_bits( 107 97 test_writer->entry, entry_used_bits, 108 - test_writer->target_entry, 98 + test_writer->target_entry, safe_target, 109 99 NUM_ENTRY_QWORDS)); 110 100 } 111 101 } ··· 134 106 static const struct arm_smmu_entry_writer_ops test_ste_ops = { 135 107 .sync = arm_smmu_test_writer_record_syncs, 136 108 .get_used = arm_smmu_get_ste_used, 109 + .get_update_safe = arm_smmu_get_ste_update_safe, 137 110 }; 138 111 139 112 static const struct arm_smmu_entry_writer_ops test_cd_ops = { ··· 214 185 }; 215 186 216 187 arm_smmu_make_cdtable_ste(ste, &master, ats_enabled, s1dss); 188 + if (feat & ARM_SMMU_MASTER_TEST_NESTED) { 189 + struct arm_smmu_ste s2ste; 190 + int i; 191 + 192 + arm_smmu_test_make_s2_ste(&s2ste, 193 + feat & ~ARM_SMMU_MASTER_TEST_NESTED); 194 + ste->data[0] |= cpu_to_le64( 195 + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_NESTED)); 196 + ste->data[1] |= cpu_to_le64(STRTAB_STE_1_MEV); 197 + for (i = 2; i < NUM_ENTRY_QWORDS; i++) 198 + ste->data[i] = s2ste.data[i]; 199 + } 217 200 } 218 201 219 202 static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test) ··· 583 542 NUM_EXPECTED_SYNCS(3)); 584 543 } 585 544 545 + static void 546 + 
arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass(struct kunit *test) 547 + { 548 + struct arm_smmu_ste s1_ste; 549 + struct arm_smmu_ste s2_ste; 550 + 551 + arm_smmu_test_make_cdtable_ste( 552 + &s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr, 553 + ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED); 554 + arm_smmu_test_make_s2_ste(&s2_ste, 0); 555 + /* Expect an additional sync to unset ignored bits: EATS and MEV */ 556 + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste, 557 + NUM_EXPECTED_SYNCS(3)); 558 + } 559 + 560 + static void 561 + arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass(struct kunit *test) 562 + { 563 + struct arm_smmu_ste s1_ste; 564 + struct arm_smmu_ste s2_ste; 565 + 566 + arm_smmu_test_make_cdtable_ste( 567 + &s1_ste, STRTAB_STE_1_S1DSS_BYPASS, fake_cdtab_dma_addr, 568 + ARM_SMMU_MASTER_TEST_ATS | ARM_SMMU_MASTER_TEST_NESTED); 569 + arm_smmu_test_make_s2_ste(&s2_ste, 0); 570 + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste, 571 + NUM_EXPECTED_SYNCS(2)); 572 + } 573 + 586 574 static void arm_smmu_v3_write_cd_test_sva_clear(struct kunit *test) 587 575 { 588 576 struct arm_smmu_cd cd = {}; ··· 658 588 KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid), 659 589 KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2_stall), 660 590 KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1_stall), 591 + KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1dssbypass_to_s1bypass), 592 + KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass), 661 593 KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear), 662 594 KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release), 663 595 {},
+120 -33
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
··· 487 487 */ 488 488 static void arm_smmu_cmdq_shared_lock(struct arm_smmu_cmdq *cmdq) 489 489 { 490 - int val; 491 - 492 490 /* 493 - * We can try to avoid the cmpxchg() loop by simply incrementing the 494 - * lock counter. When held in exclusive state, the lock counter is set 495 - * to INT_MIN so these increments won't hurt as the value will remain 496 - * negative. 491 + * When held in exclusive state, the lock counter is set to INT_MIN 492 + * so these increments won't hurt as the value will remain negative. 493 + * The increment will also signal the exclusive locker that there are 494 + * shared waiters. 497 495 */ 498 496 if (atomic_fetch_inc_relaxed(&cmdq->lock) >= 0) 499 497 return; 500 498 501 - do { 502 - val = atomic_cond_read_relaxed(&cmdq->lock, VAL >= 0); 503 - } while (atomic_cmpxchg_relaxed(&cmdq->lock, val, val + 1) != val); 499 + /* 500 + * Someone else is holding the lock in exclusive state, so wait 501 + * for them to finish. Since we already incremented the lock counter, 502 + * no exclusive lock can be acquired until we finish. We don't need 503 + * the return value since we only care that the exclusive lock is 504 + * released (i.e. the lock counter is non-negative). 505 + * Once the exclusive locker releases the lock, the sign bit will 506 + * be cleared and our increment will make the lock counter positive, 507 + * allowing us to proceed. 508 + */ 509 + atomic_cond_read_relaxed(&cmdq->lock, VAL > 0); 504 510 } 505 511 506 512 static void arm_smmu_cmdq_shared_unlock(struct arm_smmu_cmdq *cmdq) ··· 533 527 __ret; \ 534 528 }) 535 529 530 + /* 531 + * Only clear the sign bit when releasing the exclusive lock this will 532 + * allow any shared_lock() waiters to proceed without the possibility 533 + * of entering the exclusive lock in a tight loop. 534 + */ 536 535 #define arm_smmu_cmdq_exclusive_unlock_irqrestore(cmdq, flags) \ 537 536 ({ \ 538 - atomic_set_release(&cmdq->lock, 0); \ 537 + atomic_fetch_andnot_release(INT_MIN, &cmdq->lock); \ 539 538 local_irq_restore(flags); \ 540 539 }) 541 540 ··· 1093 1082 } 1094 1083 EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_used); 1095 1084 1085 + VISIBLE_IF_KUNIT 1086 + void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target, 1087 + __le64 *safe_bits) 1088 + { 1089 + const __le64 eats_s1chk = 1090 + FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_S1CHK); 1091 + const __le64 eats_trans = 1092 + FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS); 1093 + 1094 + /* 1095 + * When an STE changes EATS_TRANS, the sequencing code in the attach 1096 + * logic already will have the PCI cap for ATS disabled. Thus at this 1097 + * moment we can expect that the device will not generate ATS queries 1098 + * and so we don't care about the sequencing of EATS. The purpose of 1099 + * EATS_TRANS is to protect the system from hostile untrusted devices 1100 + * that issue ATS when the PCI config space is disabled. However, if 1101 + * EATS_TRANS is being changed, then we must have already trusted the 1102 + * device as the EATS_TRANS security block is being disabled. 1103 + * 1104 + * Note: now the EATS_TRANS update is moved to the first entry_set(). 1105 + * Changing S2S and EATS might transiently result in S2S=1 and EATS=1 1106 + * which is a bad STE (see "5.2 Stream Table Entry"). In such a case, 1107 + * we can't do a hitless update. Also, it should not be added to the 1108 + * safe bits with STRTAB_STE_1_EATS_S1CHK, because EATS=0b11 would be 1109 + * effectively an errant 0b00 configuration. 
1110 + */ 1111 + if (!((cur[1] | target[1]) & cpu_to_le64(eats_s1chk)) && 1112 + !((cur[2] | target[2]) & cpu_to_le64(STRTAB_STE_2_S2S))) 1113 + safe_bits[1] |= cpu_to_le64(eats_trans); 1114 + 1115 + /* 1116 + * MEV does not meaningfully impact the operation of the HW, it only 1117 + * changes how many fault events are generated, thus we can relax it 1118 + * when computing the ordering. The spec notes the device can act like 1119 + * MEV=1 anyhow: 1120 + * 1121 + * Note: Software must expect, and be able to deal with, coalesced 1122 + * fault records even when MEV == 0. 1123 + */ 1124 + safe_bits[1] |= cpu_to_le64(STRTAB_STE_1_MEV); 1125 + } 1126 + EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_update_safe); 1127 + 1096 1128 /* 1097 1129 * Figure out if we can do a hitless update of entry to become target. Returns a 1098 1130 * bit mask where 1 indicates that qword needs to be set disruptively. ··· 1148 1094 { 1149 1095 __le64 target_used[NUM_ENTRY_QWORDS] = {}; 1150 1096 __le64 cur_used[NUM_ENTRY_QWORDS] = {}; 1097 + __le64 safe[NUM_ENTRY_QWORDS] = {}; 1151 1098 u8 used_qword_diff = 0; 1152 1099 unsigned int i; 1153 1100 1154 1101 writer->ops->get_used(entry, cur_used); 1155 1102 writer->ops->get_used(target, target_used); 1103 + if (writer->ops->get_update_safe) 1104 + writer->ops->get_update_safe(entry, target, safe); 1156 1105 1157 1106 for (i = 0; i != NUM_ENTRY_QWORDS; i++) { 1107 + /* 1108 + * Safe is only used for bits that are used by both entries, 1109 + * otherwise it is sequenced according to the unused entry. 1110 + */ 1111 + safe[i] &= target_used[i] & cur_used[i]; 1112 + 1158 1113 /* 1159 1114 * Check that masks are up to date, the make functions are not 1160 1115 * allowed to set a bit to 1 if the used function doesn't say it ··· 1172 1109 WARN_ON_ONCE(target[i] & ~target_used[i]); 1173 1110 1174 1111 /* Bits can change because they are not currently being used */ 1112 + cur_used[i] &= ~safe[i]; 1175 1113 unused_update[i] = (entry[i] & cur_used[i]) | 1176 1114 (target[i] & ~cur_used[i]); 1177 1115 /* ··· 1185 1121 return used_qword_diff; 1186 1122 } 1187 1123 1188 - static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, 1124 + static void entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, 1189 1125 const __le64 *target, unsigned int start, 1190 1126 unsigned int len) 1191 1127 { ··· 1201 1137 1202 1138 if (changed) 1203 1139 writer->ops->sync(writer); 1204 - return changed; 1205 1140 } 1206 1141 1207 1142 /* ··· 1270 1207 entry_set(writer, entry, target, 0, 1); 1271 1208 } else { 1272 1209 /* 1273 - * No inuse bit changed. Sanity check that all unused bits are 0 1274 - * in the entry. The target was already sanity checked by 1275 - * compute_qword_diff(). 1210 + * No inuse bit changed, though safe bits may have changed. 1276 1211 */ 1277 - WARN_ON_ONCE( 1278 - entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS)); 1212 + entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS); 1279 1213 } 1280 1214 } 1281 1215 EXPORT_SYMBOL_IF_KUNIT(arm_smmu_write_entry); ··· 1603 1543 static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = { 1604 1544 .sync = arm_smmu_ste_writer_sync_entry, 1605 1545 .get_used = arm_smmu_get_ste_used, 1546 + .get_update_safe = arm_smmu_get_ste_update_safe, 1606 1547 }; 1607 1548 1608 1549 static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, ··· 2612 2551 ARM_SMMU_FEAT_VAX) ? 
52 : 48; 2613 2552 2614 2553 pgtbl_cfg.ias = min_t(unsigned long, ias, VA_BITS); 2615 - pgtbl_cfg.oas = smmu->ias; 2554 + pgtbl_cfg.oas = smmu->oas; 2616 2555 if (enable_dirty) 2617 2556 pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_HD; 2618 2557 fmt = ARM_64_LPAE_S1; ··· 2622 2561 case ARM_SMMU_DOMAIN_S2: 2623 2562 if (enable_dirty) 2624 2563 return -EOPNOTSUPP; 2625 - pgtbl_cfg.ias = smmu->ias; 2564 + pgtbl_cfg.ias = smmu->oas; 2626 2565 pgtbl_cfg.oas = smmu->oas; 2627 2566 fmt = ARM_64_LPAE_S2; 2628 2567 finalise_stage_fn = arm_smmu_domain_finalise_s2; ··· 3186 3125 struct arm_smmu_domain *smmu_domain, ioasid_t pasid, 3187 3126 struct arm_smmu_cd *cd, struct iommu_domain *old) 3188 3127 { 3189 - struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); 3128 + struct iommu_domain *sid_domain = 3129 + iommu_driver_get_domain_for_dev(master->dev); 3190 3130 struct arm_smmu_attach_state state = { 3191 3131 .master = master, 3192 3132 .ssid = pasid, ··· 3253 3191 */ 3254 3192 if (!arm_smmu_ssids_in_use(&master->cd_table)) { 3255 3193 struct iommu_domain *sid_domain = 3256 - iommu_get_domain_for_dev(master->dev); 3194 + iommu_driver_get_domain_for_dev(master->dev); 3257 3195 3258 3196 if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || 3259 3197 sid_domain->type == IOMMU_DOMAIN_BLOCKED) ··· 4457 4395 } 4458 4396 4459 4397 /* We only support the AArch64 table format at present */ 4460 - switch (FIELD_GET(IDR0_TTF, reg)) { 4461 - case IDR0_TTF_AARCH32_64: 4462 - smmu->ias = 40; 4463 - fallthrough; 4464 - case IDR0_TTF_AARCH64: 4465 - break; 4466 - default: 4398 + if (!(FIELD_GET(IDR0_TTF, reg) & IDR0_TTF_AARCH64)) { 4467 4399 dev_err(smmu->dev, "AArch64 table format not supported!\n"); 4468 4400 return -ENXIO; 4469 4401 } ··· 4570 4514 dev_warn(smmu->dev, 4571 4515 "failed to set DMA mask for table walker\n"); 4572 4516 4573 - smmu->ias = max(smmu->ias, smmu->oas); 4574 - 4575 4517 if ((smmu->features & ARM_SMMU_FEAT_TRANS_S1) && 4576 4518 (smmu->features & ARM_SMMU_FEAT_TRANS_S2)) 4577 4519 smmu->features |= ARM_SMMU_FEAT_NESTING; ··· 4579 4525 if (arm_smmu_sva_supported(smmu)) 4580 4526 smmu->features |= ARM_SMMU_FEAT_SVA; 4581 4527 4582 - dev_info(smmu->dev, "ias %lu-bit, oas %lu-bit (features 0x%08x)\n", 4583 - smmu->ias, smmu->oas, smmu->features); 4528 + dev_info(smmu->dev, "oas %lu-bit (features 0x%08x)\n", 4529 + smmu->oas, smmu->features); 4584 4530 return 0; 4585 4531 } 4532 + 4533 + #ifdef CONFIG_TEGRA241_CMDQV 4534 + static void tegra_cmdqv_dt_probe(struct device_node *smmu_node, 4535 + struct arm_smmu_device *smmu) 4536 + { 4537 + struct platform_device *pdev; 4538 + struct device_node *np; 4539 + 4540 + np = of_parse_phandle(smmu_node, "nvidia,cmdqv", 0); 4541 + if (!np) 4542 + return; 4543 + 4544 + /* Tegra241 CMDQV driver is responsible for put_device() */ 4545 + pdev = of_find_device_by_node(np); 4546 + of_node_put(np); 4547 + if (!pdev) 4548 + return; 4549 + 4550 + smmu->impl_dev = &pdev->dev; 4551 + smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV; 4552 + dev_dbg(smmu->dev, "found companion CMDQV device: %s\n", 4553 + dev_name(smmu->impl_dev)); 4554 + } 4555 + #else 4556 + static void tegra_cmdqv_dt_probe(struct device_node *smmu_node, 4557 + struct arm_smmu_device *smmu) 4558 + { 4559 + } 4560 + #endif 4586 4561 4587 4562 #ifdef CONFIG_ACPI 4588 4563 #ifdef CONFIG_TEGRA241_CMDQV ··· 4625 4542 adev = acpi_dev_get_first_match_dev("NVDA200C", uid, -1); 4626 4543 if (adev) { 4627 4544 /* Tegra241 CMDQV driver is responsible for put_device() */ 4628 - smmu->impl_dev = 
&adev->dev; 4545 + smmu->impl_dev = get_device(acpi_get_first_physical_node(adev)); 4629 4546 smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV; 4630 4547 dev_info(smmu->dev, "found companion CMDQV device: %s\n", 4631 4548 dev_name(smmu->impl_dev)); 4549 + acpi_dev_put(adev); 4632 4550 } 4633 4551 kfree(uid); 4634 4552 } ··· 4717 4633 4718 4634 if (of_dma_is_coherent(dev->of_node)) 4719 4635 smmu->features |= ARM_SMMU_FEAT_COHERENCY; 4636 + 4637 + if (of_device_is_compatible(dev->of_node, "nvidia,tegra264-smmu")) 4638 + tegra_cmdqv_dt_probe(dev->of_node, smmu); 4720 4639 4721 4640 return ret; 4722 4641 }
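Note on the STE update-safe logic above: a bit reported by get_update_safe() only counts as safe if both the current and the target entry actually use it, and it is then dropped from the current entry's in-use mask so it can change in the same non-disruptive write as the genuinely unused bits. A minimal standalone illustration of that masking arithmetic (userspace C with made-up mask values, not real STE fields):

/* Standalone sketch of the safe-bit masking used when computing the STE
 * qword diff above; the mask values are invented for illustration only. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cur_used    = 0x00f0; /* bits the live entry relies on       */
	uint64_t target_used = 0x00f0; /* bits the target entry relies on     */
	uint64_t safe        = 0x0010; /* e.g. an EATS-like field, per driver */

	/* Safe only applies to bits that both entries actually use. */
	safe &= cur_used & target_used;

	/* Dropping safe bits from cur_used lets them change together with
	 * the unused bits instead of forcing a disruptive V=0 transition. */
	cur_used &= ~safe;

	printf("effective cur_used = %#llx\n", (unsigned long long)cur_used);
	return 0;
}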
+4 -2
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
··· 43 43 #define IDR0_COHACC (1 << 4) 44 44 #define IDR0_TTF GENMASK(3, 2) 45 45 #define IDR0_TTF_AARCH64 2 46 - #define IDR0_TTF_AARCH32_64 3 47 46 #define IDR0_S1P (1 << 1) 48 47 #define IDR0_S2P (1 << 0) 49 48 ··· 783 784 int gerr_irq; 784 785 int combined_irq; 785 786 786 - unsigned long ias; /* IPA */ 787 787 unsigned long oas; /* PA */ 788 788 unsigned long pgsize_bitmap; 789 789 ··· 898 900 899 901 struct arm_smmu_entry_writer_ops { 900 902 void (*get_used)(const __le64 *entry, __le64 *used); 903 + void (*get_update_safe)(const __le64 *cur, const __le64 *target, 904 + __le64 *safe_bits); 901 905 void (*sync)(struct arm_smmu_entry_writer *writer); 902 906 }; 903 907 ··· 911 911 912 912 #if IS_ENABLED(CONFIG_KUNIT) 913 913 void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits); 914 + void arm_smmu_get_ste_update_safe(const __le64 *cur, const __le64 *target, 915 + __le64 *safe_bits); 914 916 void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur, 915 917 const __le64 *target); 916 918 void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits);
+12 -72
drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
··· 3 3 4 4 #define dev_fmt(fmt) "tegra241_cmdqv: " fmt 5 5 6 - #include <linux/acpi.h> 7 6 #include <linux/debugfs.h> 8 7 #include <linux/dma-mapping.h> 9 8 #include <linux/interrupt.h> 10 9 #include <linux/iommu.h> 11 10 #include <linux/iommufd.h> 12 11 #include <linux/iopoll.h> 12 + #include <linux/platform_device.h> 13 13 #include <uapi/linux/iommufd.h> 14 - 15 - #include <acpi/acpixf.h> 16 14 17 15 #include "arm-smmu-v3.h" 18 16 ··· 852 854 853 855 /* Probe Functions */ 854 856 855 - static int tegra241_cmdqv_acpi_is_memory(struct acpi_resource *res, void *data) 856 - { 857 - struct resource_win win; 858 - 859 - return !acpi_dev_resource_address_space(res, &win); 860 - } 861 - 862 - static int tegra241_cmdqv_acpi_get_irqs(struct acpi_resource *ares, void *data) 863 - { 864 - struct resource r; 865 - int *irq = data; 866 - 867 - if (*irq <= 0 && acpi_dev_resource_interrupt(ares, 0, &r)) 868 - *irq = r.start; 869 - return 1; /* No need to add resource to the list */ 870 - } 871 - 872 - static struct resource * 873 - tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq) 874 - { 875 - struct acpi_device *adev = to_acpi_device(dev); 876 - struct list_head resource_list; 877 - struct resource_entry *rentry; 878 - struct resource *res = NULL; 879 - int ret; 880 - 881 - INIT_LIST_HEAD(&resource_list); 882 - ret = acpi_dev_get_resources(adev, &resource_list, 883 - tegra241_cmdqv_acpi_is_memory, NULL); 884 - if (ret < 0) { 885 - dev_err(dev, "failed to get memory resource: %d\n", ret); 886 - return NULL; 887 - } 888 - 889 - rentry = list_first_entry_or_null(&resource_list, 890 - struct resource_entry, node); 891 - if (!rentry) { 892 - dev_err(dev, "failed to get memory resource entry\n"); 893 - goto free_list; 894 - } 895 - 896 - /* Caller must free the res */ 897 - res = kzalloc(sizeof(*res), GFP_KERNEL); 898 - if (!res) 899 - goto free_list; 900 - 901 - *res = *rentry->res; 902 - 903 - acpi_dev_free_resource_list(&resource_list); 904 - 905 - INIT_LIST_HEAD(&resource_list); 906 - 907 - if (irq) 908 - ret = acpi_dev_get_resources(adev, &resource_list, 909 - tegra241_cmdqv_acpi_get_irqs, irq); 910 - if (ret < 0 || !irq || *irq <= 0) 911 - dev_warn(dev, "no interrupt. errors will not be reported\n"); 912 - 913 - free_list: 914 - acpi_dev_free_resource_list(&resource_list); 915 - return res; 916 - } 917 - 918 857 static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu) 919 858 { 920 859 struct tegra241_cmdqv *cmdqv = ··· 977 1042 978 1043 struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu) 979 1044 { 1045 + struct platform_device *pdev = to_platform_device(smmu->impl_dev); 980 1046 struct arm_smmu_device *new_smmu; 981 - struct resource *res = NULL; 1047 + struct resource *res; 982 1048 int irq; 983 1049 984 - if (!smmu->dev->of_node) 985 - res = tegra241_cmdqv_find_acpi_resource(smmu->impl_dev, &irq); 986 - if (!res) 1050 + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); 1051 + if (!res) { 1052 + dev_err(&pdev->dev, "no memory resource found for CMDQV\n"); 987 1053 goto out_fallback; 1054 + } 1055 + 1056 + irq = platform_get_irq_optional(pdev, 0); 1057 + if (irq <= 0) 1058 + dev_warn(&pdev->dev, 1059 + "no interrupt. errors will not be reported\n"); 988 1060 989 1061 new_smmu = __tegra241_cmdqv_probe(smmu, res, irq); 990 - kfree(res); 991 - 992 1062 if (new_smmu) 993 1063 return new_smmu; 994 1064
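With the driver decoupled from ACPI, a device-tree system discovers the CMDQV instance through the "nvidia,cmdqv" phandle that tegra_cmdqv_dt_probe() parses on "nvidia,tegra264-smmu" nodes. A hypothetical fragment for orientation; the node names, unit address, reg value, fallback compatible and cell count are placeholders, only the compatible string and the phandle property come from the code in this series:

/* Hypothetical devicetree fragment, not taken from the binding document. */
smmu: iommu@12000000 {
	compatible = "nvidia,tegra264-smmu", "arm,smmu-v3";
	reg = <0x0 0x12000000 0x0 0x100000>;
	#iommu-cells = <1>;
	nvidia,cmdqv = <&cmdqv>;	/* companion CMDQV platform device */
};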
+28
drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
··· 41 41 .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, 42 42 { .compatible = "qcom,fastrpc", 43 43 .data = (const void *) (PREFETCH_DEEP | CPRE | CMTLB) }, 44 + { .compatible = "qcom,qcm2290-mdss", 45 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 46 + { .compatible = "qcom,sa8775p-mdss", 47 + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, 44 48 { .compatible = "qcom,sc7280-mdss", 45 49 .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 46 50 { .compatible = "qcom,sc7280-venus", 47 51 .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 52 + { .compatible = "qcom,sc8180x-mdss", 53 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 54 + { .compatible = "qcom,sc8280xp-mdss", 55 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 56 + { .compatible = "qcom,sm6115-mdss", 57 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 58 + { .compatible = "qcom,sm6125-mdss", 59 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 60 + { .compatible = "qcom,sm6350-mdss", 61 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 62 + { .compatible = "qcom,sm8150-mdss", 63 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 64 + { .compatible = "qcom,sm8250-mdss", 65 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 66 + { .compatible = "qcom,sm8350-mdss", 67 + .data = (const void *) (PREFETCH_SHALLOW | CPRE | CMTLB) }, 68 + { .compatible = "qcom,sm8450-mdss", 69 + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, 48 70 { .compatible = "qcom,sm8550-mdss", 71 + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, 72 + { .compatible = "qcom,sm8650-mdss", 73 + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, 74 + { .compatible = "qcom,sm8750-mdss", 75 + .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, 76 + { .compatible = "qcom,x1e80100-mdss", 49 77 .data = (const void *) (PREFETCH_DEFAULT | CMTLB) }, 50 78 { } 51 79 };
+2 -6
drivers/iommu/arm/arm-smmu/qcom_iommu.c
··· 761 761 762 762 static bool qcom_iommu_has_secure_context(struct qcom_iommu_dev *qcom_iommu) 763 763 { 764 - struct device_node *child; 765 - 766 - for_each_child_of_node(qcom_iommu->dev->of_node, child) { 764 + for_each_child_of_node_scoped(qcom_iommu->dev->of_node, child) { 767 765 if (of_device_is_compatible(child, "qcom,msm-iommu-v1-sec") || 768 - of_device_is_compatible(child, "qcom,msm-iommu-v2-sec")) { 769 - of_node_put(child); 766 + of_device_is_compatible(child, "qcom,msm-iommu-v2-sec")) 770 767 return true; 771 - } 772 768 } 773 769 774 770 return false;
+1 -3
drivers/iommu/dma-iommu.c
··· 2097 2097 } 2098 2098 EXPORT_SYMBOL_GPL(dma_iova_destroy); 2099 2099 2100 - void iommu_setup_dma_ops(struct device *dev) 2100 + void iommu_setup_dma_ops(struct device *dev, struct iommu_domain *domain) 2101 2101 { 2102 - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 2103 - 2104 2102 if (dev_is_pci(dev)) 2105 2103 dev->iommu->pci_32bit_workaround = !iommu_dma_forcedac; 2106 2104
+3 -2
drivers/iommu/dma-iommu.h
··· 9 9 10 10 #ifdef CONFIG_IOMMU_DMA 11 11 12 - void iommu_setup_dma_ops(struct device *dev); 12 + void iommu_setup_dma_ops(struct device *dev, struct iommu_domain *domain); 13 13 14 14 int iommu_get_dma_cookie(struct iommu_domain *domain); 15 15 void iommu_put_dma_cookie(struct iommu_domain *domain); ··· 26 26 27 27 #else /* CONFIG_IOMMU_DMA */ 28 28 29 - static inline void iommu_setup_dma_ops(struct device *dev) 29 + static inline void iommu_setup_dma_ops(struct device *dev, 30 + struct iommu_domain *domain) 30 31 { 31 32 } 32 33
+2 -1
drivers/iommu/generic_pt/fmt/amdv1.h
··· 354 354 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to 355 355 * control this. For now if the tables use sme_set then so do the ptes. 356 356 */ 357 - if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES)) 357 + if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES) && 358 + !(iommu_prot & IOMMU_MMIO)) 358 359 pte = __sme_set(pte); 359 360 360 361 attrs->descriptor_bits = pte;
+2 -1
drivers/iommu/generic_pt/fmt/x86_64.h
··· 227 227 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to 228 228 * control this. For now if the tables use sme_set then so do the ptes. 229 229 */ 230 - if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) 230 + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) && 231 + !(iommu_prot & IOMMU_MMIO)) 231 232 pte = __sme_set(pte); 232 233 233 234 attrs->descriptor_bits = pte;
+1 -2
drivers/iommu/generic_pt/iommu_pt.h
··· 58 58 * Note that the sync frees the gather's free list, so we must 59 59 * not have any pages on that list that are covered by iova/len 60 60 */ 61 - } else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) { 62 - iommu_iotlb_gather_add_range(iotlb_gather, iova, len); 63 61 } 64 62 63 + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); 65 64 iommu_pages_list_splice(free_list, &iotlb_gather->freelist); 66 65 } 67 66
+8 -1
drivers/iommu/intel/cache.c
··· 363 363 qi_batch_increment_index(iommu, batch); 364 364 } 365 365 366 + static bool intel_domain_use_piotlb(struct dmar_domain *domain) 367 + { 368 + return domain->domain.type == IOMMU_DOMAIN_SVA || 369 + domain->domain.type == IOMMU_DOMAIN_NESTED || 370 + intel_domain_is_fs_paging(domain); 371 + } 372 + 366 373 static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, 367 374 unsigned long addr, unsigned long pages, 368 375 unsigned long mask, int ih) ··· 377 370 struct intel_iommu *iommu = tag->iommu; 378 371 u64 type = DMA_TLB_PSI_FLUSH; 379 372 380 - if (intel_domain_is_fs_paging(domain)) { 373 + if (intel_domain_use_piotlb(domain)) { 381 374 qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, 382 375 pages, ih, domain->qi_batch); 383 376 return;
+15 -18
drivers/iommu/intel/iommu.c
··· 1240 1240 } 1241 1241 1242 1242 did = context_domain_id(context); 1243 - context_clear_entry(context); 1243 + context_clear_present(context); 1244 1244 __iommu_flush_cache(iommu, context, sizeof(*context)); 1245 1245 spin_unlock(&iommu->lock); 1246 1246 intel_context_flush_no_pasid(info, context, did); 1247 + context_clear_entry(context); 1248 + __iommu_flush_cache(iommu, context, sizeof(*context)); 1247 1249 } 1248 1250 1249 1251 int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, 1250 1252 ioasid_t pasid, u16 did, phys_addr_t fsptptr, 1251 1253 int flags, struct iommu_domain *old) 1252 1254 { 1253 - if (!old) 1254 - return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, 1255 - did, flags); 1256 - return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did, 1257 - iommu_domain_did(old, iommu), 1258 - flags); 1255 + if (old) 1256 + intel_pasid_tear_down_entry(iommu, dev, pasid, false); 1257 + 1258 + return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid, did, flags); 1259 1259 } 1260 1260 1261 1261 static int domain_setup_second_level(struct intel_iommu *iommu, ··· 1263 1263 struct device *dev, ioasid_t pasid, 1264 1264 struct iommu_domain *old) 1265 1265 { 1266 - if (!old) 1267 - return intel_pasid_setup_second_level(iommu, domain, 1268 - dev, pasid); 1269 - return intel_pasid_replace_second_level(iommu, domain, dev, 1270 - iommu_domain_did(old, iommu), 1271 - pasid); 1266 + if (old) 1267 + intel_pasid_tear_down_entry(iommu, dev, pasid, false); 1268 + 1269 + return intel_pasid_setup_second_level(iommu, domain, dev, pasid); 1272 1270 } 1273 1271 1274 1272 static int domain_setup_passthrough(struct intel_iommu *iommu, 1275 1273 struct device *dev, ioasid_t pasid, 1276 1274 struct iommu_domain *old) 1277 1275 { 1278 - if (!old) 1279 - return intel_pasid_setup_pass_through(iommu, dev, pasid); 1280 - return intel_pasid_replace_pass_through(iommu, dev, 1281 - iommu_domain_did(old, iommu), 1282 - pasid); 1276 + if (old) 1277 + intel_pasid_tear_down_entry(iommu, dev, pasid, false); 1278 + 1279 + return intel_pasid_setup_pass_through(iommu, dev, pasid); 1283 1280 } 1284 1281 1285 1282 static int domain_setup_first_level(struct intel_iommu *iommu,
+20 -1
drivers/iommu/intel/iommu.h
··· 900 900 901 901 static inline void context_set_present(struct context_entry *context) 902 902 { 903 - context->lo |= 1; 903 + u64 val; 904 + 905 + dma_wmb(); 906 + val = READ_ONCE(context->lo) | 1; 907 + WRITE_ONCE(context->lo, val); 908 + } 909 + 910 + /* 911 + * Clear the Present (P) bit (bit 0) of a context table entry. This initiates 912 + * the transition of the entry's ownership from hardware to software. The 913 + * caller is responsible for fulfilling the invalidation handshake recommended 914 + * by the VT-d spec, Section 6.5.3.3 (Guidance to Software for Invalidations). 915 + */ 916 + static inline void context_clear_present(struct context_entry *context) 917 + { 918 + u64 val; 919 + 920 + val = READ_ONCE(context->lo) & GENMASK_ULL(63, 1); 921 + WRITE_ONCE(context->lo, val); 922 + dma_wmb(); 904 923 } 905 924 906 925 static inline void context_set_fault_enable(struct context_entry *context)
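The clear-present helpers imply a specific teardown ordering, which the reworked domain_context_clear_one() and the PASID teardown path earlier in this diff follow. A condensed, illustrative restatement of that ordering (locking and error handling omitted; this only re-arranges calls shown elsewhere in this series, it is not a new API):

/* Illustrative only: the invalidation handshake of VT-d spec sec 6.5.3.3,
 * as followed by domain_context_clear_one() above.  Locking is omitted. */
static void context_teardown_sketch(struct intel_iommu *iommu,
				    struct device_domain_info *info,
				    struct context_entry *context, u16 did)
{
	/* 1. Clear only the Present bit: ownership moves from HW to SW. */
	context_clear_present(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));

	/* 2. Invalidate context cache, IOTLB and dev-TLB for the old entry. */
	intel_context_flush_no_pasid(info, context, did);

	/* 3. Only now is it safe to scrub the rest of the entry. */
	context_clear_entry(context);
	__iommu_flush_cache(iommu, context, sizeof(*context));
}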
+4 -5
drivers/iommu/intel/nested.c
··· 136 136 struct device *dev, ioasid_t pasid, 137 137 struct iommu_domain *old) 138 138 { 139 - if (!old) 140 - return intel_pasid_setup_nested(iommu, dev, pasid, domain); 141 - return intel_pasid_replace_nested(iommu, dev, pasid, 142 - iommu_domain_did(old, iommu), 143 - domain); 139 + if (old) 140 + intel_pasid_tear_down_entry(iommu, dev, pasid, false); 141 + 142 + return intel_pasid_setup_nested(iommu, dev, pasid, domain); 144 143 } 145 144 146 145 static int intel_nested_set_dev_pasid(struct iommu_domain *domain,
+22 -190
drivers/iommu/intel/pasid.c
··· 153 153 if (!entries) 154 154 return NULL; 155 155 156 + if (!ecap_coherent(info->iommu->ecap)) 157 + clflush_cache_range(entries, VTD_PAGE_SIZE); 158 + 156 159 /* 157 160 * The pasid directory table entry won't be freed after 158 161 * allocation. No worry about the race with free and ··· 168 165 iommu_free_pages(entries); 169 166 goto retry; 170 167 } 171 - if (!ecap_coherent(info->iommu->ecap)) { 172 - clflush_cache_range(entries, VTD_PAGE_SIZE); 168 + if (!ecap_coherent(info->iommu->ecap)) 173 169 clflush_cache_range(&dir[dir_index].val, sizeof(*dir)); 174 - } 175 170 } 176 171 177 172 return &entries[index]; ··· 219 218 if (!info || !info->ats_enabled) 220 219 return; 221 220 222 - if (pci_dev_is_disconnected(to_pci_dev(dev))) 221 + if (!pci_device_is_present(to_pci_dev(dev))) 223 222 return; 224 223 225 224 sid = PCI_DEVID(info->bus, info->devfn); ··· 273 272 274 273 did = pasid_get_domain_id(pte); 275 274 pgtt = pasid_pte_get_pgtt(pte); 276 - intel_pasid_clear_entry(dev, pasid, fault_ignore); 275 + pasid_clear_present(pte); 277 276 spin_unlock(&iommu->lock); 278 277 279 278 if (!ecap_coherent(iommu->ecap)) ··· 287 286 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 288 287 289 288 devtlb_invalidation_with_pasid(iommu, dev, pasid); 289 + intel_pasid_clear_entry(dev, pasid, fault_ignore); 290 + if (!ecap_coherent(iommu->ecap)) 291 + clflush_cache_range(pte, sizeof(*pte)); 292 + 290 293 if (!fault_ignore) 291 294 intel_iommu_drain_pasid_prq(dev, pasid); 292 295 } ··· 417 412 return 0; 418 413 } 419 414 420 - int intel_pasid_replace_first_level(struct intel_iommu *iommu, 421 - struct device *dev, phys_addr_t fsptptr, 422 - u32 pasid, u16 did, u16 old_did, 423 - int flags) 424 - { 425 - struct pasid_entry *pte, new_pte; 426 - 427 - if (!ecap_flts(iommu->ecap)) { 428 - pr_err("No first level translation support on %s\n", 429 - iommu->name); 430 - return -EINVAL; 431 - } 432 - 433 - if ((flags & PASID_FLAG_FL5LP) && !cap_fl5lp_support(iommu->cap)) { 434 - pr_err("No 5-level paging support for first-level on %s\n", 435 - iommu->name); 436 - return -EINVAL; 437 - } 438 - 439 - pasid_pte_config_first_level(iommu, &new_pte, fsptptr, did, flags); 440 - 441 - spin_lock(&iommu->lock); 442 - pte = intel_pasid_get_entry(dev, pasid); 443 - if (!pte) { 444 - spin_unlock(&iommu->lock); 445 - return -ENODEV; 446 - } 447 - 448 - if (!pasid_pte_is_present(pte)) { 449 - spin_unlock(&iommu->lock); 450 - return -EINVAL; 451 - } 452 - 453 - WARN_ON(old_did != pasid_get_domain_id(pte)); 454 - 455 - *pte = new_pte; 456 - spin_unlock(&iommu->lock); 457 - 458 - intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); 459 - intel_iommu_drain_pasid_prq(dev, pasid); 460 - 461 - return 0; 462 - } 463 - 464 415 /* 465 416 * Set up the scalable mode pasid entry for second only translation type. 466 417 */ ··· 479 518 spin_unlock(&iommu->lock); 480 519 481 520 pasid_flush_caches(iommu, pte, pasid, did); 482 - 483 - return 0; 484 - } 485 - 486 - int intel_pasid_replace_second_level(struct intel_iommu *iommu, 487 - struct dmar_domain *domain, 488 - struct device *dev, u16 old_did, 489 - u32 pasid) 490 - { 491 - struct pasid_entry *pte, new_pte; 492 - u16 did; 493 - 494 - /* 495 - * If hardware advertises no support for second level 496 - * translation, return directly. 
497 - */ 498 - if (!ecap_slts(iommu->ecap)) { 499 - pr_err("No second level translation support on %s\n", 500 - iommu->name); 501 - return -EINVAL; 502 - } 503 - 504 - did = domain_id_iommu(domain, iommu); 505 - 506 - pasid_pte_config_second_level(iommu, &new_pte, domain, did); 507 - 508 - spin_lock(&iommu->lock); 509 - pte = intel_pasid_get_entry(dev, pasid); 510 - if (!pte) { 511 - spin_unlock(&iommu->lock); 512 - return -ENODEV; 513 - } 514 - 515 - if (!pasid_pte_is_present(pte)) { 516 - spin_unlock(&iommu->lock); 517 - return -EINVAL; 518 - } 519 - 520 - WARN_ON(old_did != pasid_get_domain_id(pte)); 521 - 522 - *pte = new_pte; 523 - spin_unlock(&iommu->lock); 524 - 525 - intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); 526 - intel_iommu_drain_pasid_prq(dev, pasid); 527 521 528 522 return 0; 529 523 } ··· 591 675 spin_unlock(&iommu->lock); 592 676 593 677 pasid_flush_caches(iommu, pte, pasid, did); 594 - 595 - return 0; 596 - } 597 - 598 - int intel_pasid_replace_pass_through(struct intel_iommu *iommu, 599 - struct device *dev, u16 old_did, 600 - u32 pasid) 601 - { 602 - struct pasid_entry *pte, new_pte; 603 - u16 did = FLPT_DEFAULT_DID; 604 - 605 - pasid_pte_config_pass_through(iommu, &new_pte, did); 606 - 607 - spin_lock(&iommu->lock); 608 - pte = intel_pasid_get_entry(dev, pasid); 609 - if (!pte) { 610 - spin_unlock(&iommu->lock); 611 - return -ENODEV; 612 - } 613 - 614 - if (!pasid_pte_is_present(pte)) { 615 - spin_unlock(&iommu->lock); 616 - return -EINVAL; 617 - } 618 - 619 - WARN_ON(old_did != pasid_get_domain_id(pte)); 620 - 621 - *pte = new_pte; 622 - spin_unlock(&iommu->lock); 623 - 624 - intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); 625 - intel_iommu_drain_pasid_prq(dev, pasid); 626 678 627 679 return 0; 628 680 } ··· 728 844 return 0; 729 845 } 730 846 731 - int intel_pasid_replace_nested(struct intel_iommu *iommu, 732 - struct device *dev, u32 pasid, 733 - u16 old_did, struct dmar_domain *domain) 734 - { 735 - struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; 736 - struct dmar_domain *s2_domain = domain->s2_domain; 737 - u16 did = domain_id_iommu(domain, iommu); 738 - struct pasid_entry *pte, new_pte; 739 - 740 - /* Address width should match the address width supported by hardware */ 741 - switch (s1_cfg->addr_width) { 742 - case ADDR_WIDTH_4LEVEL: 743 - break; 744 - case ADDR_WIDTH_5LEVEL: 745 - if (!cap_fl5lp_support(iommu->cap)) { 746 - dev_err_ratelimited(dev, 747 - "5-level paging not supported\n"); 748 - return -EINVAL; 749 - } 750 - break; 751 - default: 752 - dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", 753 - s1_cfg->addr_width); 754 - return -EINVAL; 755 - } 756 - 757 - if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { 758 - pr_err_ratelimited("No supervisor request support on %s\n", 759 - iommu->name); 760 - return -EINVAL; 761 - } 762 - 763 - if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { 764 - pr_err_ratelimited("No extended access flag support on %s\n", 765 - iommu->name); 766 - return -EINVAL; 767 - } 768 - 769 - pasid_pte_config_nestd(iommu, &new_pte, s1_cfg, s2_domain, did); 770 - 771 - spin_lock(&iommu->lock); 772 - pte = intel_pasid_get_entry(dev, pasid); 773 - if (!pte) { 774 - spin_unlock(&iommu->lock); 775 - return -ENODEV; 776 - } 777 - 778 - if (!pasid_pte_is_present(pte)) { 779 - spin_unlock(&iommu->lock); 780 - return -EINVAL; 781 - } 782 - 783 - WARN_ON(old_did != pasid_get_domain_id(pte)); 784 - 785 - *pte = new_pte; 786 - spin_unlock(&iommu->lock); 787 - 788 - 
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte); 789 - intel_iommu_drain_pasid_prq(dev, pasid); 790 - 791 - return 0; 792 - } 793 - 794 847 /* 795 848 * Interfaces to setup or teardown a pasid table to the scalable-mode 796 849 * context table entry: ··· 840 1019 } 841 1020 842 1021 if (context_copied(iommu, bus, devfn)) { 843 - context_clear_entry(context); 1022 + context_clear_present(context); 844 1023 __iommu_flush_cache(iommu, context, sizeof(*context)); 845 1024 846 1025 /* ··· 859 1038 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); 860 1039 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); 861 1040 devtlb_invalidation_with_pasid(iommu, dev, IOMMU_NO_PASID); 1041 + 1042 + context_clear_entry(context); 1043 + __iommu_flush_cache(iommu, context, sizeof(*context)); 862 1044 863 1045 /* 864 1046 * At this point, the device is supposed to finish reset at ··· 924 1100 static void __context_flush_dev_iotlb(struct device_domain_info *info) 925 1101 { 926 1102 if (!info->ats_enabled) 1103 + return; 1104 + 1105 + /* 1106 + * Skip dev-IOTLB flush for inaccessible PCIe devices to prevent the 1107 + * Intel IOMMU from waiting indefinitely for an ATS invalidation that 1108 + * cannot complete. 1109 + */ 1110 + if (!pci_device_is_present(to_pci_dev(info->dev))) 927 1111 return; 928 1112 929 1113 qi_flush_dev_iotlb(info->iommu, PCI_DEVID(info->bus, info->devfn),
+14 -14
drivers/iommu/intel/pasid.h
··· 234 234 */ 235 235 static inline void pasid_set_present(struct pasid_entry *pe) 236 236 { 237 + dma_wmb(); 237 238 pasid_set_bits(&pe->val[0], 1 << 0, 1); 239 + } 240 + 241 + /* 242 + * Clear the Present (P) bit (bit 0) of a scalable-mode PASID table entry. 243 + * This initiates the transition of the entry's ownership from hardware 244 + * to software. The caller is responsible for fulfilling the invalidation 245 + * handshake recommended by the VT-d spec, Section 6.5.3.3 (Guidance to 246 + * Software for Invalidations). 247 + */ 248 + static inline void pasid_clear_present(struct pasid_entry *pe) 249 + { 250 + pasid_set_bits(&pe->val[0], 1 << 0, 0); 251 + dma_wmb(); 238 252 } 239 253 240 254 /* ··· 316 302 struct device *dev, u32 pasid); 317 303 int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, 318 304 u32 pasid, struct dmar_domain *domain); 319 - int intel_pasid_replace_first_level(struct intel_iommu *iommu, 320 - struct device *dev, phys_addr_t fsptptr, 321 - u32 pasid, u16 did, u16 old_did, int flags); 322 - int intel_pasid_replace_second_level(struct intel_iommu *iommu, 323 - struct dmar_domain *domain, 324 - struct device *dev, u16 old_did, 325 - u32 pasid); 326 - int intel_pasid_replace_pass_through(struct intel_iommu *iommu, 327 - struct device *dev, u16 old_did, 328 - u32 pasid); 329 - int intel_pasid_replace_nested(struct intel_iommu *iommu, 330 - struct device *dev, u32 pasid, 331 - u16 old_did, struct dmar_domain *domain); 332 - 333 305 void intel_pasid_tear_down_entry(struct intel_iommu *iommu, 334 306 struct device *dev, u32 pasid, 335 307 bool fault_ignore);
+164
drivers/iommu/iommu-debug-pagealloc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2025 - Google Inc 4 + * Author: Mostafa Saleh <smostafa@google.com> 5 + * IOMMU API debug page alloc sanitizer 6 + */ 7 + #include <linux/atomic.h> 8 + #include <linux/iommu.h> 9 + #include <linux/iommu-debug-pagealloc.h> 10 + #include <linux/kernel.h> 11 + #include <linux/page_ext.h> 12 + #include <linux/page_owner.h> 13 + 14 + #include "iommu-priv.h" 15 + 16 + static bool needed; 17 + DEFINE_STATIC_KEY_FALSE(iommu_debug_initialized); 18 + 19 + struct iommu_debug_metadata { 20 + atomic_t ref; 21 + }; 22 + 23 + static __init bool need_iommu_debug(void) 24 + { 25 + return needed; 26 + } 27 + 28 + struct page_ext_operations page_iommu_debug_ops = { 29 + .size = sizeof(struct iommu_debug_metadata), 30 + .need = need_iommu_debug, 31 + }; 32 + 33 + static struct iommu_debug_metadata *get_iommu_data(struct page_ext *page_ext) 34 + { 35 + return page_ext_data(page_ext, &page_iommu_debug_ops); 36 + } 37 + 38 + static void iommu_debug_inc_page(phys_addr_t phys) 39 + { 40 + struct page_ext *page_ext = page_ext_from_phys(phys); 41 + struct iommu_debug_metadata *d; 42 + 43 + if (!page_ext) 44 + return; 45 + 46 + d = get_iommu_data(page_ext); 47 + WARN_ON(atomic_inc_return_relaxed(&d->ref) <= 0); 48 + page_ext_put(page_ext); 49 + } 50 + 51 + static void iommu_debug_dec_page(phys_addr_t phys) 52 + { 53 + struct page_ext *page_ext = page_ext_from_phys(phys); 54 + struct iommu_debug_metadata *d; 55 + 56 + if (!page_ext) 57 + return; 58 + 59 + d = get_iommu_data(page_ext); 60 + WARN_ON(atomic_dec_return_relaxed(&d->ref) < 0); 61 + page_ext_put(page_ext); 62 + } 63 + 64 + /* 65 + * IOMMU page size doesn't have to match the CPU page size. So, we use 66 + * the smallest IOMMU page size to refcount the pages in the vmemmap. 67 + * That is important as both map and unmap has to use the same page size 68 + * to update the refcount to avoid double counting the same page. 69 + * And as we can't know from iommu_unmap() what was the original page size 70 + * used for map, we just use the minimum supported one for both. 
71 + */ 72 + static size_t iommu_debug_page_size(struct iommu_domain *domain) 73 + { 74 + return 1UL << __ffs(domain->pgsize_bitmap); 75 + } 76 + 77 + static bool iommu_debug_page_count(const struct page *page) 78 + { 79 + unsigned int ref; 80 + struct page_ext *page_ext = page_ext_get(page); 81 + struct iommu_debug_metadata *d = get_iommu_data(page_ext); 82 + 83 + ref = atomic_read(&d->ref); 84 + page_ext_put(page_ext); 85 + return ref != 0; 86 + } 87 + 88 + void __iommu_debug_check_unmapped(const struct page *page, int numpages) 89 + { 90 + while (numpages--) { 91 + if (WARN_ON(iommu_debug_page_count(page))) { 92 + pr_warn("iommu: Detected page leak!\n"); 93 + dump_page_owner(page); 94 + } 95 + page++; 96 + } 97 + } 98 + 99 + void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, size_t size) 100 + { 101 + size_t off, end; 102 + size_t page_size = iommu_debug_page_size(domain); 103 + 104 + if (WARN_ON(!phys || check_add_overflow(phys, size, &end))) 105 + return; 106 + 107 + for (off = 0 ; off < size ; off += page_size) 108 + iommu_debug_inc_page(phys + off); 109 + } 110 + 111 + static void __iommu_debug_update_iova(struct iommu_domain *domain, 112 + unsigned long iova, size_t size, bool inc) 113 + { 114 + size_t off, end; 115 + size_t page_size = iommu_debug_page_size(domain); 116 + 117 + if (WARN_ON(check_add_overflow(iova, size, &end))) 118 + return; 119 + 120 + for (off = 0 ; off < size ; off += page_size) { 121 + phys_addr_t phys = iommu_iova_to_phys(domain, iova + off); 122 + 123 + if (!phys) 124 + continue; 125 + 126 + if (inc) 127 + iommu_debug_inc_page(phys); 128 + else 129 + iommu_debug_dec_page(phys); 130 + } 131 + } 132 + 133 + void __iommu_debug_unmap_begin(struct iommu_domain *domain, 134 + unsigned long iova, size_t size) 135 + { 136 + __iommu_debug_update_iova(domain, iova, size, false); 137 + } 138 + 139 + void __iommu_debug_unmap_end(struct iommu_domain *domain, 140 + unsigned long iova, size_t size, 141 + size_t unmapped) 142 + { 143 + if ((unmapped == size) || WARN_ON_ONCE(unmapped > size)) 144 + return; 145 + 146 + /* If unmap failed, re-increment the refcount. */ 147 + __iommu_debug_update_iova(domain, iova + unmapped, 148 + size - unmapped, true); 149 + } 150 + 151 + void iommu_debug_init(void) 152 + { 153 + if (!needed) 154 + return; 155 + 156 + pr_info("iommu: Debugging page allocations, expect overhead or disable iommu.debug_pagealloc"); 157 + static_branch_enable(&iommu_debug_initialized); 158 + } 159 + 160 + static int __init iommu_debug_pagealloc(char *str) 161 + { 162 + return kstrtobool(str, &needed); 163 + } 164 + early_param("iommu.debug_pagealloc", iommu_debug_pagealloc);
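The page-size comment above boils down to 1UL << __ffs(domain->pgsize_bitmap): the refcounting granule is the smallest translation size the domain supports, regardless of what size a particular map used. A tiny standalone illustration (userspace C; the bitmap value is made up):

#include <stdio.h>

int main(void)
{
	/* e.g. a domain supporting 4K, 2M and 1G mappings */
	unsigned long pgsize_bitmap = (1UL << 12) | (1UL << 21) | (1UL << 30);

	/* lowest set bit == smallest supported IOMMU page size */
	unsigned long granule = pgsize_bitmap & -pgsize_bitmap;

	printf("refcount granule: %lu KiB\n", granule >> 10);	/* prints 4 */
	return 0;
}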
+58
drivers/iommu/iommu-priv.h
··· 5 5 #define __LINUX_IOMMU_PRIV_H 6 6 7 7 #include <linux/iommu.h> 8 + #include <linux/iommu-debug-pagealloc.h> 8 9 #include <linux/msi.h> 9 10 10 11 static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) ··· 66 65 int iommu_replace_device_pasid(struct iommu_domain *domain, 67 66 struct device *dev, ioasid_t pasid, 68 67 struct iommu_attach_handle *handle); 68 + 69 + #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC 70 + 71 + void __iommu_debug_map(struct iommu_domain *domain, phys_addr_t phys, 72 + size_t size); 73 + void __iommu_debug_unmap_begin(struct iommu_domain *domain, 74 + unsigned long iova, size_t size); 75 + void __iommu_debug_unmap_end(struct iommu_domain *domain, 76 + unsigned long iova, size_t size, size_t unmapped); 77 + 78 + static inline void iommu_debug_map(struct iommu_domain *domain, 79 + phys_addr_t phys, size_t size) 80 + { 81 + if (static_branch_unlikely(&iommu_debug_initialized)) 82 + __iommu_debug_map(domain, phys, size); 83 + } 84 + 85 + static inline void iommu_debug_unmap_begin(struct iommu_domain *domain, 86 + unsigned long iova, size_t size) 87 + { 88 + if (static_branch_unlikely(&iommu_debug_initialized)) 89 + __iommu_debug_unmap_begin(domain, iova, size); 90 + } 91 + 92 + static inline void iommu_debug_unmap_end(struct iommu_domain *domain, 93 + unsigned long iova, size_t size, 94 + size_t unmapped) 95 + { 96 + if (static_branch_unlikely(&iommu_debug_initialized)) 97 + __iommu_debug_unmap_end(domain, iova, size, unmapped); 98 + } 99 + 100 + void iommu_debug_init(void); 101 + 102 + #else 103 + static inline void iommu_debug_map(struct iommu_domain *domain, 104 + phys_addr_t phys, size_t size) 105 + { 106 + } 107 + 108 + static inline void iommu_debug_unmap_begin(struct iommu_domain *domain, 109 + unsigned long iova, size_t size) 110 + { 111 + } 112 + 113 + static inline void iommu_debug_unmap_end(struct iommu_domain *domain, 114 + unsigned long iova, size_t size, 115 + size_t unmapped) 116 + { 117 + } 118 + 119 + static inline void iommu_debug_init(void) 120 + { 121 + } 122 + 123 + #endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */ 124 + 69 125 #endif /* __LINUX_IOMMU_PRIV_H */
+224 -10
drivers/iommu/iommu.c
··· 61 61 int id; 62 62 struct iommu_domain *default_domain; 63 63 struct iommu_domain *blocking_domain; 64 + /* 65 + * During a group device reset, @resetting_domain points to the physical 66 + * domain, while @domain points to the attached domain before the reset. 67 + */ 68 + struct iommu_domain *resetting_domain; 64 69 struct iommu_domain *domain; 65 70 struct list_head entry; 66 71 unsigned int owner_cnt; ··· 236 231 nb = kcalloc(ARRAY_SIZE(iommu_buses), sizeof(*nb), GFP_KERNEL); 237 232 if (!nb) 238 233 return -ENOMEM; 234 + 235 + iommu_debug_init(); 239 236 240 237 for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) { 241 238 nb[i].notifier_call = iommu_bus_notifier; ··· 668 661 } 669 662 670 663 if (group->default_domain) 671 - iommu_setup_dma_ops(dev); 664 + iommu_setup_dma_ops(dev, group->default_domain); 672 665 673 666 mutex_unlock(&group->mutex); 674 667 ··· 1180 1173 struct device *dev) 1181 1174 { 1182 1175 struct iommu_resv_region *entry; 1183 - struct list_head mappings; 1176 + LIST_HEAD(mappings); 1184 1177 unsigned long pg_size; 1185 1178 int ret = 0; 1186 1179 1187 1180 pg_size = domain->pgsize_bitmap ? 1UL << __ffs(domain->pgsize_bitmap) : 0; 1188 - INIT_LIST_HEAD(&mappings); 1189 1181 1190 1182 if (WARN_ON_ONCE(iommu_is_dma_domain(domain) && !pg_size)) 1191 1183 return -EINVAL; ··· 1955 1949 return ret; 1956 1950 } 1957 1951 for_each_group_device(group, gdev) 1958 - iommu_setup_dma_ops(gdev->dev); 1952 + iommu_setup_dma_ops(gdev->dev, group->default_domain); 1959 1953 mutex_unlock(&group->mutex); 1960 1954 1961 1955 /* ··· 2191 2185 2192 2186 int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) 2193 2187 { 2194 - if (dev->iommu && dev->iommu->attach_deferred) 2195 - return __iommu_attach_device(domain, dev, NULL); 2188 + /* 2189 + * This is called on the dma mapping fast path so avoid locking. This is 2190 + * racy, but we have an expectation that the driver will setup its DMAs 2191 + * inside probe while being single threaded to avoid racing. 2192 + */ 2193 + if (!dev->iommu || !dev->iommu->attach_deferred) 2194 + return 0; 2196 2195 2197 - return 0; 2196 + guard(mutex)(&dev->iommu_group->mutex); 2197 + 2198 + /* 2199 + * This is a concurrent attach during a device reset. Reject it until 2200 + * pci_dev_reset_iommu_done() attaches the device to group->domain. 2201 + * 2202 + * Note that this might fail the iommu_dma_map(). But there's nothing 2203 + * more we can do here. 2204 + */ 2205 + if (dev->iommu_group->resetting_domain) 2206 + return -EBUSY; 2207 + return __iommu_attach_device(domain, dev, NULL); 2198 2208 } 2199 2209 2200 2210 void iommu_detach_device(struct iommu_domain *domain, struct device *dev) ··· 2232 2210 } 2233 2211 EXPORT_SYMBOL_GPL(iommu_detach_device); 2234 2212 2213 + /** 2214 + * iommu_get_domain_for_dev() - Return the DMA API domain pointer 2215 + * @dev: Device to query 2216 + * 2217 + * This function can be called within a driver bound to dev. The returned 2218 + * pointer is valid for the lifetime of the bound driver. 2219 + * 2220 + * It should not be called by drivers with driver_managed_dma = true. 
2221 + */ 2235 2222 struct iommu_domain *iommu_get_domain_for_dev(struct device *dev) 2236 2223 { 2237 2224 /* Caller must be a probed driver on dev */ ··· 2249 2218 if (!group) 2250 2219 return NULL; 2251 2220 2221 + lockdep_assert_not_held(&group->mutex); 2222 + 2252 2223 return group->domain; 2253 2224 } 2254 2225 EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev); 2226 + 2227 + /** 2228 + * iommu_driver_get_domain_for_dev() - Return the driver-level domain pointer 2229 + * @dev: Device to query 2230 + * 2231 + * This function can be called by an iommu driver that wants to get the physical 2232 + * domain within an iommu callback function where group->mutex is held. 2233 + */ 2234 + struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev) 2235 + { 2236 + struct iommu_group *group = dev->iommu_group; 2237 + 2238 + lockdep_assert_held(&group->mutex); 2239 + 2240 + /* 2241 + * Driver handles the low-level __iommu_attach_device(), including the 2242 + * one invoked by pci_dev_reset_iommu_done() re-attaching the device to 2243 + * the cached group->domain. In this case, the driver must get the old 2244 + * domain from group->resetting_domain rather than group->domain. This 2245 + * prevents it from re-attaching the device from group->domain (old) to 2246 + * group->domain (new). 2247 + */ 2248 + if (group->resetting_domain) 2249 + return group->resetting_domain; 2250 + 2251 + return group->domain; 2252 + } 2253 + EXPORT_SYMBOL_GPL(iommu_driver_get_domain_for_dev); 2255 2254 2256 2255 /* 2257 2256 * For IOMMU_DOMAIN_DMA implementations which already provide their own ··· 2434 2373 2435 2374 if (WARN_ON(!new_domain)) 2436 2375 return -EINVAL; 2376 + 2377 + /* 2378 + * This is a concurrent attach during a device reset. Reject it until 2379 + * pci_dev_reset_iommu_done() attaches the device to group->domain. 2380 + */ 2381 + if (group->resetting_domain) 2382 + return -EBUSY; 2437 2383 2438 2384 /* 2439 2385 * Changing the domain is done by calling attach_dev() on the new ··· 2630 2562 } 2631 2563 2632 2564 /* unroll mapping in case something went wrong */ 2633 - if (ret) 2565 + if (ret) { 2634 2566 iommu_unmap(domain, orig_iova, orig_size - size); 2635 - else 2567 + } else { 2636 2568 trace_map(orig_iova, orig_paddr, orig_size); 2569 + iommu_debug_map(domain, orig_paddr, orig_size); 2570 + } 2637 2571 2638 2572 return ret; 2639 2573 } ··· 2697 2627 2698 2628 pr_debug("unmap this: iova 0x%lx size 0x%zx\n", iova, size); 2699 2629 2630 + iommu_debug_unmap_begin(domain, iova, size); 2631 + 2700 2632 /* 2701 2633 * Keep iterating until we either unmap 'size' bytes (or more) 2702 2634 * or we hit an area that isn't mapped. ··· 2719 2647 } 2720 2648 2721 2649 trace_unmap(orig_iova, size, unmapped); 2650 + iommu_debug_unmap_end(domain, orig_iova, size, unmapped); 2722 2651 return unmapped; 2723 2652 } 2724 2653 ··· 3221 3148 3222 3149 /* Make sure dma_ops is appropriatley set */ 3223 3150 for_each_group_device(group, gdev) 3224 - iommu_setup_dma_ops(gdev->dev); 3151 + iommu_setup_dma_ops(gdev->dev, group->default_domain); 3225 3152 3226 3153 out_unlock: 3227 3154 mutex_unlock(&group->mutex); ··· 3565 3492 return -EINVAL; 3566 3493 3567 3494 mutex_lock(&group->mutex); 3495 + 3496 + /* 3497 + * This is a concurrent attach during a device reset. Reject it until 3498 + * pci_dev_reset_iommu_done() attaches the device to group->domain. 
3499 + */
3500 + if (group->resetting_domain) {
3501 + ret = -EBUSY;
3502 + goto out_unlock;
3503 + }
3504 +
3568 3505 for_each_group_device(group, device) {
3569 3506 /*
3570 3507 * Skip PASID validation for devices without PASID support
··· 3658 3575 return -EINVAL;
3659 3576
3660 3577 mutex_lock(&group->mutex);
3578 +
3579 + /*
3580 + * This is a concurrent attach during a device reset. Reject it until
3581 + * pci_dev_reset_iommu_done() attaches the device to group->domain.
3582 + */
3583 + if (group->resetting_domain) {
3584 + ret = -EBUSY;
3585 + goto out_unlock;
3586 + }
3587 +
3661 3588 entry = iommu_make_pasid_array_entry(domain, handle);
3662 3589 curr = xa_cmpxchg(&group->pasid_array, pasid, NULL,
3663 3590 XA_ZERO_ENTRY, GFP_KERNEL);
··· 3924 3831 return ret;
3925 3832 }
3926 3833 EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL");
3834 +
3835 + /**
3836 + * pci_dev_reset_iommu_prepare() - Block IOMMU to prepare for a PCI device reset
3837 + * @pdev: PCI device that is going to enter a reset routine
3838 + *
3839 + * The PCIe r6.0, sec 10.3.1 IMPLEMENTATION NOTE recommends disabling and blocking
3840 + * ATS before initiating a reset. This means that a PCIe device undergoing a reset
3841 + * wants any IOMMU activity blocked: translation and ATS invalidation.
3842 + *
3843 + * This function attaches the device's RID/PASID(s) to the group->blocking_domain,
3844 + * setting group->resetting_domain. This allows the IOMMU driver to pause any
3845 + * IOMMU activity while leaving the group->domain pointer intact. Later, when the
3846 + * reset is finished, pci_dev_reset_iommu_done() can restore everything.
3847 + *
3848 + * Callers must pair pci_dev_reset_iommu_prepare() with pci_dev_reset_iommu_done()
3849 + * before/after the core-level reset routine, to unset the resetting_domain.
3850 + *
3851 + * Return: 0 on success or negative error code if the preparation failed.
3852 + *
3853 + * These two functions are designed to be used by PCI reset functions that would
3854 + * not invoke any racy iommu_release_device(), since the PCI sysfs node is removed
3855 + * before it notifies with a BUS_NOTIFY_REMOVED_DEVICE. When using them in other
3856 + * cases, callers must ensure there will be no racy iommu_release_device() call,
3857 + * which otherwise would UAF the dev->iommu_group pointer.
3858 + */
3859 + int pci_dev_reset_iommu_prepare(struct pci_dev *pdev)
3860 + {
3861 + struct iommu_group *group = pdev->dev.iommu_group;
3862 + unsigned long pasid;
3863 + void *entry;
3864 + int ret;
3865 +
3866 + if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
3867 + return 0;
3868 +
3869 + guard(mutex)(&group->mutex);
3870 +
3871 + /* Re-entry is not allowed */
3872 + if (WARN_ON(group->resetting_domain))
3873 + return -EBUSY;
3874 +
3875 + ret = __iommu_group_alloc_blocking_domain(group);
3876 + if (ret)
3877 + return ret;
3878 +
3879 + /* Stage RID domain at blocking_domain while retaining group->domain */
3880 + if (group->domain != group->blocking_domain) {
3881 + ret = __iommu_attach_device(group->blocking_domain, &pdev->dev,
3882 + group->domain);
3883 + if (ret)
3884 + return ret;
3885 + }
3886 +
3887 + /*
3888 + * Stage PASID domains at blocking_domain while retaining pasid_array.
3889 + *
3890 + * The pasid_array is mostly fenced by group->mutex, except one reader
3891 + * in iommu_attach_handle_get(), so it's safe to read without xa_lock.
3892 + */
3893 + xa_for_each_start(&group->pasid_array, pasid, entry, 1)
3894 + iommu_remove_dev_pasid(&pdev->dev, pasid,
3895 + pasid_array_entry_to_domain(entry));
3896 +
3897 + group->resetting_domain = group->blocking_domain;
3898 + return ret;
3899 + }
3900 + EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_prepare);
3901 +
3902 + /**
3903 + * pci_dev_reset_iommu_done() - Restore IOMMU after a PCI device reset is done
3904 + * @pdev: PCI device that has finished a reset routine
3905 + *
3906 + * After a PCIe device finishes a reset routine, it wants to restore its
3907 + * IOMMU activity, including new translation as well as cache invalidation, by
3908 + * re-attaching all of the device's RIDs/PASIDs back to the domains retained in
3909 + * the core-level structure.
3910 + *
3911 + * The caller must pair it with a successful pci_dev_reset_iommu_prepare().
3912 + *
3913 + * Note that, although unlikely, there is a risk that re-attaching domains might
3914 + * fail due to something unexpected like OOM.
3915 + */
3916 + void pci_dev_reset_iommu_done(struct pci_dev *pdev)
3917 + {
3918 + struct iommu_group *group = pdev->dev.iommu_group;
3919 + unsigned long pasid;
3920 + void *entry;
3921 +
3922 + if (!pci_ats_supported(pdev) || !dev_has_iommu(&pdev->dev))
3923 + return;
3924 +
3925 + guard(mutex)(&group->mutex);
3926 +
3927 + /* pci_dev_reset_iommu_prepare() was bypassed for the device */
3928 + if (!group->resetting_domain)
3929 + return;
3930 +
3931 + /* pci_dev_reset_iommu_prepare() was not successfully called */
3932 + if (WARN_ON(!group->blocking_domain))
3933 + return;
3934 +
3935 + /* Re-attach RID domain back to group->domain */
3936 + if (group->domain != group->blocking_domain) {
3937 + WARN_ON(__iommu_attach_device(group->domain, &pdev->dev,
3938 + group->blocking_domain));
3939 + }
3940 +
3941 + /*
3942 + * Re-attach PASID domains back to the domains retained in pasid_array.
3943 + *
3944 + * The pasid_array is mostly fenced by group->mutex, except one reader
3945 + * in iommu_attach_handle_get(), so it's safe to read without xa_lock.
3946 + */
3947 + xa_for_each_start(&group->pasid_array, pasid, entry, 1)
3948 + WARN_ON(__iommu_set_group_pasid(
3949 + pasid_array_entry_to_domain(entry), group, pasid,
3950 + group->blocking_domain));
3951 +
3952 + group->resetting_domain = NULL;
3953 + }
3954 + EXPORT_SYMBOL_GPL(pci_dev_reset_iommu_done);
3927 3955
3928 3956 #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU)
3929 3957 /**
+13 -4
drivers/pci/pci-acpi.c
··· 9 9 10 10 #include <linux/delay.h> 11 11 #include <linux/init.h> 12 + #include <linux/iommu.h> 12 13 #include <linux/irqdomain.h> 13 14 #include <linux/pci.h> 14 15 #include <linux/msi.h> ··· 969 968 int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) 970 969 { 971 970 acpi_handle handle = ACPI_HANDLE(&dev->dev); 971 + int ret; 972 972 973 973 if (!handle || !acpi_has_method(handle, "_RST")) 974 974 return -ENOTTY; ··· 977 975 if (probe) 978 976 return 0; 979 977 980 - if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) { 981 - pci_warn(dev, "ACPI _RST failed\n"); 982 - return -ENOTTY; 978 + ret = pci_dev_reset_iommu_prepare(dev); 979 + if (ret) { 980 + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); 981 + return ret; 983 982 } 984 983 985 - return 0; 984 + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_RST", NULL, NULL))) { 985 + pci_warn(dev, "ACPI _RST failed\n"); 986 + ret = -ENOTTY; 987 + } 988 + 989 + pci_dev_reset_iommu_done(dev); 990 + return ret; 986 991 } 987 992 988 993 bool acpi_pci_power_manageable(struct pci_dev *dev)
+58 -7
drivers/pci/pci.c
··· 13 13 #include <linux/delay.h> 14 14 #include <linux/dmi.h> 15 15 #include <linux/init.h> 16 + #include <linux/iommu.h> 16 17 #include <linux/msi.h> 17 18 #include <linux/of.h> 18 19 #include <linux/pci.h> ··· 26 25 #include <linux/logic_pio.h> 27 26 #include <linux/device.h> 28 27 #include <linux/pm_runtime.h> 28 + #include <linux/pci-ats.h> 29 29 #include <linux/pci_hotplug.h> 30 30 #include <linux/vmalloc.h> 31 31 #include <asm/dma.h> ··· 4332 4330 */ 4333 4331 int pcie_flr(struct pci_dev *dev) 4334 4332 { 4333 + int ret; 4334 + 4335 4335 if (!pci_wait_for_pending_transaction(dev)) 4336 4336 pci_err(dev, "timed out waiting for pending transaction; performing function level reset anyway\n"); 4337 + 4338 + /* Have to call it after waiting for pending DMA transaction */ 4339 + ret = pci_dev_reset_iommu_prepare(dev); 4340 + if (ret) { 4341 + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); 4342 + return ret; 4343 + } 4337 4344 4338 4345 pcie_capability_set_word(dev, PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_BCR_FLR); 4339 4346 4340 4347 if (dev->imm_ready) 4341 - return 0; 4348 + goto done; 4342 4349 4343 4350 /* 4344 4351 * Per PCIe r4.0, sec 6.6.2, a device must complete an FLR within ··· 4356 4345 */ 4357 4346 msleep(100); 4358 4347 4359 - return pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); 4348 + ret = pci_dev_wait(dev, "FLR", PCIE_RESET_READY_POLL_MS); 4349 + done: 4350 + pci_dev_reset_iommu_done(dev); 4351 + return ret; 4360 4352 } 4361 4353 EXPORT_SYMBOL_GPL(pcie_flr); 4362 4354 ··· 4387 4373 4388 4374 static int pci_af_flr(struct pci_dev *dev, bool probe) 4389 4375 { 4376 + int ret; 4390 4377 int pos; 4391 4378 u8 cap; 4392 4379 ··· 4414 4399 PCI_AF_STATUS_TP << 8)) 4415 4400 pci_err(dev, "timed out waiting for pending transaction; performing AF function level reset anyway\n"); 4416 4401 4402 + /* Have to call it after waiting for pending DMA transaction */ 4403 + ret = pci_dev_reset_iommu_prepare(dev); 4404 + if (ret) { 4405 + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); 4406 + return ret; 4407 + } 4408 + 4417 4409 pci_write_config_byte(dev, pos + PCI_AF_CTRL, PCI_AF_CTRL_FLR); 4418 4410 4419 4411 if (dev->imm_ready) 4420 - return 0; 4412 + goto done; 4421 4413 4422 4414 /* 4423 4415 * Per Advanced Capabilities for Conventional PCI ECN, 13 April 2006, ··· 4434 4412 */ 4435 4413 msleep(100); 4436 4414 4437 - return pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); 4415 + ret = pci_dev_wait(dev, "AF_FLR", PCIE_RESET_READY_POLL_MS); 4416 + done: 4417 + pci_dev_reset_iommu_done(dev); 4418 + return ret; 4438 4419 } 4439 4420 4440 4421 /** ··· 4458 4433 static int pci_pm_reset(struct pci_dev *dev, bool probe) 4459 4434 { 4460 4435 u16 csr; 4436 + int ret; 4461 4437 4462 4438 if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET) 4463 4439 return -ENOTTY; ··· 4473 4447 if (dev->current_state != PCI_D0) 4474 4448 return -EINVAL; 4475 4449 4450 + ret = pci_dev_reset_iommu_prepare(dev); 4451 + if (ret) { 4452 + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); 4453 + return ret; 4454 + } 4455 + 4476 4456 csr &= ~PCI_PM_CTRL_STATE_MASK; 4477 4457 csr |= PCI_D3hot; 4478 4458 pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); ··· 4489 4457 pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr); 4490 4458 pci_dev_d3_sleep(dev); 4491 4459 4492 - return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); 4460 + ret = pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS); 4461 + 
pci_dev_reset_iommu_done(dev); 4462 + return ret; 4493 4463 } 4494 4464 4495 4465 /** ··· 4919 4885 return -ENOTTY; 4920 4886 } 4921 4887 4888 + rc = pci_dev_reset_iommu_prepare(dev); 4889 + if (rc) { 4890 + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc); 4891 + return rc; 4892 + } 4893 + 4922 4894 rc = pci_dev_reset_slot_function(dev, probe); 4923 4895 if (rc != -ENOTTY) 4924 - return rc; 4925 - return pci_parent_bus_reset(dev, probe); 4896 + goto done; 4897 + 4898 + rc = pci_parent_bus_reset(dev, probe); 4899 + done: 4900 + pci_dev_reset_iommu_done(dev); 4901 + return rc; 4926 4902 } 4927 4903 4928 4904 static int cxl_reset_bus_function(struct pci_dev *dev, bool probe) ··· 4956 4912 if (rc) 4957 4913 return -ENOTTY; 4958 4914 4915 + rc = pci_dev_reset_iommu_prepare(dev); 4916 + if (rc) { 4917 + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", rc); 4918 + return rc; 4919 + } 4920 + 4959 4921 if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) { 4960 4922 val = reg; 4961 4923 } else { ··· 4976 4926 pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, 4977 4927 reg); 4978 4928 4929 + pci_dev_reset_iommu_done(dev); 4979 4930 return rc; 4980 4931 } 4981 4932
+18 -1
drivers/pci/quirks.c
··· 21 21 #include <linux/pci.h> 22 22 #include <linux/isa-dma.h> /* isa_dma_bridge_buggy */ 23 23 #include <linux/init.h> 24 + #include <linux/iommu.h> 24 25 #include <linux/delay.h> 25 26 #include <linux/acpi.h> 26 27 #include <linux/dmi.h> ··· 4229 4228 { 0 } 4230 4229 }; 4231 4230 4231 + static int __pci_dev_specific_reset(struct pci_dev *dev, bool probe, 4232 + const struct pci_dev_reset_methods *i) 4233 + { 4234 + int ret; 4235 + 4236 + ret = pci_dev_reset_iommu_prepare(dev); 4237 + if (ret) { 4238 + pci_err(dev, "failed to stop IOMMU for a PCI reset: %d\n", ret); 4239 + return ret; 4240 + } 4241 + 4242 + ret = i->reset(dev, probe); 4243 + pci_dev_reset_iommu_done(dev); 4244 + return ret; 4245 + } 4246 + 4232 4247 /* 4233 4248 * These device-specific reset methods are here rather than in a driver 4234 4249 * because when a host assigns a device to a guest VM, the host may need ··· 4259 4242 i->vendor == (u16)PCI_ANY_ID) && 4260 4243 (i->device == dev->device || 4261 4244 i->device == (u16)PCI_ANY_ID)) 4262 - return i->reset(dev, probe); 4245 + return __pci_dev_specific_reset(dev, probe, i); 4263 4246 } 4264 4247 4265 4248 return -ENOTTY;
+32
include/linux/iommu-debug-pagealloc.h
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2025 - Google Inc 4 + * Author: Mostafa Saleh <smostafa@google.com> 5 + * IOMMU API debug page alloc sanitizer 6 + */ 7 + 8 + #ifndef __LINUX_IOMMU_DEBUG_PAGEALLOC_H 9 + #define __LINUX_IOMMU_DEBUG_PAGEALLOC_H 10 + 11 + #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC 12 + DECLARE_STATIC_KEY_FALSE(iommu_debug_initialized); 13 + 14 + extern struct page_ext_operations page_iommu_debug_ops; 15 + 16 + void __iommu_debug_check_unmapped(const struct page *page, int numpages); 17 + 18 + static inline void iommu_debug_check_unmapped(const struct page *page, int numpages) 19 + { 20 + if (static_branch_unlikely(&iommu_debug_initialized)) 21 + __iommu_debug_check_unmapped(page, numpages); 22 + } 23 + 24 + #else 25 + static inline void iommu_debug_check_unmapped(const struct page *page, 26 + int numpages) 27 + { 28 + } 29 + 30 + #endif /* CONFIG_IOMMU_DEBUG_PAGEALLOC */ 31 + 32 + #endif /* __LINUX_IOMMU_DEBUG_PAGEALLOC_H */
+14
include/linux/iommu.h
··· 910 910 extern void iommu_detach_device(struct iommu_domain *domain, 911 911 struct device *dev); 912 912 extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev); 913 + struct iommu_domain *iommu_driver_get_domain_for_dev(struct device *dev); 913 914 extern struct iommu_domain *iommu_get_dma_domain(struct device *dev); 914 915 extern int iommu_map(struct iommu_domain *domain, unsigned long iova, 915 916 phys_addr_t paddr, size_t size, int prot, gfp_t gfp); ··· 1188 1187 struct device *dev, ioasid_t pasid); 1189 1188 ioasid_t iommu_alloc_global_pasid(struct device *dev); 1190 1189 void iommu_free_global_pasid(ioasid_t pasid); 1190 + 1191 + /* PCI device reset functions */ 1192 + int pci_dev_reset_iommu_prepare(struct pci_dev *pdev); 1193 + void pci_dev_reset_iommu_done(struct pci_dev *pdev); 1191 1194 #else /* CONFIG_IOMMU_API */ 1192 1195 1193 1196 struct iommu_ops {}; ··· 1515 1510 } 1516 1511 1517 1512 static inline void iommu_free_global_pasid(ioasid_t pasid) {} 1513 + 1514 + static inline int pci_dev_reset_iommu_prepare(struct pci_dev *pdev) 1515 + { 1516 + return 0; 1517 + } 1518 + 1519 + static inline void pci_dev_reset_iommu_done(struct pci_dev *pdev) 1520 + { 1521 + } 1518 1522 #endif /* CONFIG_IOMMU_API */ 1519 1523 1520 1524 #ifdef CONFIG_IRQ_MSI_IOMMU
+5
include/linux/mm.h
··· 36 36 #include <linux/rcuwait.h> 37 37 #include <linux/bitmap.h> 38 38 #include <linux/bitops.h> 39 + #include <linux/iommu-debug-pagealloc.h> 39 40 40 41 struct mempolicy; 41 42 struct anon_vma; ··· 4115 4114 #ifdef CONFIG_DEBUG_PAGEALLOC 4116 4115 static inline void debug_pagealloc_map_pages(struct page *page, int numpages) 4117 4116 { 4117 + iommu_debug_check_unmapped(page, numpages); 4118 + 4118 4119 if (debug_pagealloc_enabled_static()) 4119 4120 __kernel_map_pages(page, numpages, 1); 4120 4121 } 4121 4122 4122 4123 static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) 4123 4124 { 4125 + iommu_debug_check_unmapped(page, numpages); 4126 + 4124 4127 if (debug_pagealloc_enabled_static()) 4125 4128 __kernel_map_pages(page, numpages, 0); 4126 4129 }
+6
include/linux/page_ext.h
··· 93 93 #endif 94 94 95 95 extern struct page_ext *page_ext_get(const struct page *page); 96 + extern struct page_ext *page_ext_from_phys(phys_addr_t phys); 96 97 extern void page_ext_put(struct page_ext *page_ext); 97 98 extern struct page_ext *page_ext_lookup(unsigned long pfn); 98 99 ··· 212 211 } 213 212 214 213 static inline struct page_ext *page_ext_get(const struct page *page) 214 + { 215 + return NULL; 216 + } 217 + 218 + static inline struct page_ext *page_ext_from_phys(phys_addr_t phys) 215 219 { 216 220 return NULL; 217 221 }
+39
include/uapi/linux/iommufd.h
··· 466 466 };
467 467
468 468 /**
469 + * struct iommu_hwpt_amd_guest - AMD IOMMU guest I/O page table data
470 + * (IOMMU_HWPT_DATA_AMD_GUEST)
471 + * @dte: Guest Device Table Entry (DTE)
472 + */
473 + struct iommu_hwpt_amd_guest {
474 + __aligned_u64 dte[4];
475 + };
476 +
477 + /**
469 478 * enum iommu_hwpt_data_type - IOMMU HWPT Data Type
470 479 * @IOMMU_HWPT_DATA_NONE: no data
471 480 * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table
472 481 * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table
482 + * @IOMMU_HWPT_DATA_AMD_GUEST: AMD IOMMU guest page table
473 483 */
474 484 enum iommu_hwpt_data_type {
475 485 IOMMU_HWPT_DATA_NONE = 0,
476 486 IOMMU_HWPT_DATA_VTD_S1 = 1,
477 487 IOMMU_HWPT_DATA_ARM_SMMUV3 = 2,
488 + IOMMU_HWPT_DATA_AMD_GUEST = 3,
478 489 };
479 490
480 491 /**
··· 635 624 };
636 625
637 626 /**
627 + * struct iommu_hw_info_amd - AMD IOMMU device info
628 + *
629 + * @efr : Value of AMD IOMMU Extended Feature Register (EFR)
630 + * @efr2: Value of AMD IOMMU Extended Feature 2 Register (EFR2)
631 + *
632 + * Please see the description of these registers in the following sections of
633 + * the AMD I/O Virtualization Technology (IOMMU) Specification.
634 + * (https://docs.amd.com/v/u/en-US/48882_3.10_PUB)
635 + *
636 + * - MMIO Offset 0030h IOMMU Extended Feature Register
637 + * - MMIO Offset 01A0h IOMMU Extended Feature 2 Register
638 + *
639 + * Note: The EFR and EFR2 are raw values reported by hardware.
640 + * The VMM is responsible for determining the appropriate flags to expose to
641 + * the VM, since certain features are not currently supported by the kernel
642 + * for HW-vIOMMU.
643 + *
644 + * The current VMM-allowed list of feature flags is:
645 + * - EFR[GTSup, GASup, GioSup, PPRSup, EPHSup, GATS, GLX, PASmax]
646 + */
647 + struct iommu_hw_info_amd {
648 + __aligned_u64 efr;
649 + __aligned_u64 efr2;
650 + };
651 +
652 + /**
638 653 * enum iommu_hw_info_type - IOMMU Hardware Info Types
639 654 * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware
640 655 * info
··· 669 632 * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type
670 633 * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM
671 634 * SMMUv3) info type
635 + * @IOMMU_HW_INFO_TYPE_AMD: AMD IOMMU info type
672 636 */
673 637 enum iommu_hw_info_type {
674 638 IOMMU_HW_INFO_TYPE_NONE = 0,
··· 677 639 IOMMU_HW_INFO_TYPE_INTEL_VTD = 1,
678 640 IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2,
679 641 IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3,
642 + IOMMU_HW_INFO_TYPE_AMD = 4,
680 643 };
681 644
682 645 /**
+4
include/uapi/linux/vfio.h
··· 964 964 * hwpt corresponding to the given pt_id.
965 965 *
966 966 * Return: 0 on success, -errno on failure.
967 + *
968 + * When a device is resetting, -EBUSY will be returned to reject any concurrent
969 + * attachment to the resetting device itself or to any sibling device in the
970 + * same IOMMU group as the resetting device.
967 971 */
968 972 struct vfio_device_attach_iommufd_pt {
969 973 __u32 argsz;
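A hedged userspace sketch of the behaviour documented above: if the attach races with a reset of the device (or of a sibling in its IOMMU group), the ioctl fails with EBUSY and can simply be retried. The backoff policy here is illustrative, not part of the uAPI.

#include <errno.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Illustrative: retry the attach while the device (or a group sibling) resets. */
static int attach_with_retry(int device_fd, uint32_t pt_id)
{
	struct vfio_device_attach_iommufd_pt attach = {
		.argsz = sizeof(attach),
		.pt_id = pt_id,
	};

	while (ioctl(device_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach)) {
		if (errno != EBUSY)
			return -errno;
		usleep(1000);	/* arbitrary backoff until the reset completes */
	}
	return 0;
}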
+27
mm/page_ext.c
··· 11 11 #include <linux/page_table_check.h> 12 12 #include <linux/rcupdate.h> 13 13 #include <linux/pgalloc_tag.h> 14 + #include <linux/iommu-debug-pagealloc.h> 14 15 15 16 /* 16 17 * struct page extension ··· 89 88 #endif 90 89 #ifdef CONFIG_PAGE_TABLE_CHECK 91 90 &page_table_check_ops, 91 + #endif 92 + #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC 93 + &page_iommu_debug_ops, 92 94 #endif 93 95 }; 94 96 ··· 536 532 } 537 533 538 534 return page_ext; 535 + } 536 + 537 + /** 538 + * page_ext_from_phys() - Get the page_ext structure for a physical address. 539 + * @phys: The physical address to query. 540 + * 541 + * This function safely gets the `struct page_ext` associated with a given 542 + * physical address. It performs validation to ensure the address corresponds 543 + * to a valid, online struct page before attempting to access it. 544 + * It returns NULL for MMIO, ZONE_DEVICE, holes and offline memory. 545 + * 546 + * Return: NULL if no page_ext exists for this physical address. 547 + * Context: Any context. Caller may not sleep until they have called 548 + * page_ext_put(). 549 + */ 550 + struct page_ext *page_ext_from_phys(phys_addr_t phys) 551 + { 552 + struct page *page = pfn_to_online_page(__phys_to_pfn(phys)); 553 + 554 + if (!page) 555 + return NULL; 556 + 557 + return page_ext_get(page); 539 558 } 540 559 541 560 /**
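A minimal sketch of the call pattern the kerneldoc above describes; the surrounding function is hypothetical, and only page_ext_from_phys() and page_ext_put() come from this hunk.

#include <linux/page_ext.h>

/* Hypothetical debug hook: inspect page_ext state behind a physical address. */
static void demo_inspect_phys(phys_addr_t phys)
{
	struct page_ext *page_ext = page_ext_from_phys(phys);

	if (!page_ext)
		return;	/* MMIO, ZONE_DEVICE, a hole, or offline memory */

	/* ... look at the per-page extension data; must not sleep here ... */

	page_ext_put(page_ext);
}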
+2 -1
rust/bindings/bindings_helper.h
··· 56 56 #include <linux/fdtable.h> 57 57 #include <linux/file.h> 58 58 #include <linux/firmware.h> 59 - #include <linux/interrupt.h> 60 59 #include <linux/fs.h> 61 60 #include <linux/i2c.h> 61 + #include <linux/interrupt.h> 62 + #include <linux/io-pgtable.h> 62 63 #include <linux/ioport.h> 63 64 #include <linux/jiffies.h> 64 65 #include <linux/jump_label.h>
+5
rust/kernel/iommu/mod.rs
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + //! Rust support related to IOMMU. 4 + 5 + pub mod pgtable;
+279
rust/kernel/iommu/pgtable.rs
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + //! IOMMU page table management. 4 + //! 5 + //! C header: [`include/linux/io-pgtable.h`](srctree/include/linux/io-pgtable.h) 6 + 7 + use core::{ 8 + marker::PhantomData, 9 + ptr::NonNull, // 10 + }; 11 + 12 + use crate::{ 13 + alloc, 14 + bindings, 15 + device::{ 16 + Bound, 17 + Device, // 18 + }, 19 + devres::Devres, 20 + error::to_result, 21 + io::PhysAddr, 22 + prelude::*, // 23 + }; 24 + 25 + use bindings::io_pgtable_fmt; 26 + 27 + /// Protection flags used with IOMMU mappings. 28 + pub mod prot { 29 + /// Read access. 30 + pub const READ: u32 = bindings::IOMMU_READ; 31 + /// Write access. 32 + pub const WRITE: u32 = bindings::IOMMU_WRITE; 33 + /// Request cache coherency. 34 + pub const CACHE: u32 = bindings::IOMMU_CACHE; 35 + /// Request no-execute permission. 36 + pub const NOEXEC: u32 = bindings::IOMMU_NOEXEC; 37 + /// MMIO peripheral mapping. 38 + pub const MMIO: u32 = bindings::IOMMU_MMIO; 39 + /// Privileged mapping. 40 + pub const PRIVILEGED: u32 = bindings::IOMMU_PRIV; 41 + } 42 + 43 + /// Represents a requested `io_pgtable` configuration. 44 + pub struct Config { 45 + /// Quirk bitmask (type-specific). 46 + pub quirks: usize, 47 + /// Valid page sizes, as a bitmask of powers of two. 48 + pub pgsize_bitmap: usize, 49 + /// Input address space size in bits. 50 + pub ias: u32, 51 + /// Output address space size in bits. 52 + pub oas: u32, 53 + /// IOMMU uses coherent accesses for page table walks. 54 + pub coherent_walk: bool, 55 + } 56 + 57 + /// An io page table using a specific format. 58 + /// 59 + /// # Invariants 60 + /// 61 + /// The pointer references a valid io page table. 62 + pub struct IoPageTable<F: IoPageTableFmt> { 63 + ptr: NonNull<bindings::io_pgtable_ops>, 64 + _marker: PhantomData<F>, 65 + } 66 + 67 + // SAFETY: `struct io_pgtable_ops` is not restricted to a single thread. 68 + unsafe impl<F: IoPageTableFmt> Send for IoPageTable<F> {} 69 + // SAFETY: `struct io_pgtable_ops` may be accessed concurrently. 70 + unsafe impl<F: IoPageTableFmt> Sync for IoPageTable<F> {} 71 + 72 + /// The format used by this page table. 73 + pub trait IoPageTableFmt: 'static { 74 + /// The value representing this format. 75 + const FORMAT: io_pgtable_fmt; 76 + } 77 + 78 + impl<F: IoPageTableFmt> IoPageTable<F> { 79 + /// Create a new `IoPageTable` as a device resource. 80 + #[inline] 81 + pub fn new( 82 + dev: &Device<Bound>, 83 + config: Config, 84 + ) -> impl PinInit<Devres<IoPageTable<F>>, Error> + '_ { 85 + // SAFETY: Devres ensures that the value is dropped during device unbind. 86 + Devres::new(dev, unsafe { Self::new_raw(dev, config) }) 87 + } 88 + 89 + /// Create a new `IoPageTable`. 90 + /// 91 + /// # Safety 92 + /// 93 + /// If successful, then the returned `IoPageTable` must be dropped before the device is 94 + /// unbound. 95 + #[inline] 96 + pub unsafe fn new_raw(dev: &Device<Bound>, config: Config) -> Result<IoPageTable<F>> { 97 + let mut raw_cfg = bindings::io_pgtable_cfg { 98 + quirks: config.quirks, 99 + pgsize_bitmap: config.pgsize_bitmap, 100 + ias: config.ias, 101 + oas: config.oas, 102 + coherent_walk: config.coherent_walk, 103 + tlb: &raw const NOOP_FLUSH_OPS, 104 + iommu_dev: dev.as_raw(), 105 + // SAFETY: All zeroes is a valid value for `struct io_pgtable_cfg`. 106 + ..unsafe { core::mem::zeroed() } 107 + }; 108 + 109 + // SAFETY: 110 + // * The raw_cfg pointer is valid for the duration of this call. 
111 + // * The provided `FLUSH_OPS` contains valid function pointers that accept a null pointer 112 + // as cookie. 113 + // * The caller ensures that the io pgtable does not outlive the device. 114 + let ops = unsafe { 115 + bindings::alloc_io_pgtable_ops(F::FORMAT, &mut raw_cfg, core::ptr::null_mut()) 116 + }; 117 + 118 + // INVARIANT: We successfully created a valid page table. 119 + Ok(IoPageTable { 120 + ptr: NonNull::new(ops).ok_or(ENOMEM)?, 121 + _marker: PhantomData, 122 + }) 123 + } 124 + 125 + /// Obtain a raw pointer to the underlying `struct io_pgtable_ops`. 126 + #[inline] 127 + pub fn raw_ops(&self) -> *mut bindings::io_pgtable_ops { 128 + self.ptr.as_ptr() 129 + } 130 + 131 + /// Obtain a raw pointer to the underlying `struct io_pgtable`. 132 + #[inline] 133 + pub fn raw_pgtable(&self) -> *mut bindings::io_pgtable { 134 + // SAFETY: The io_pgtable_ops of an io-pgtable is always the ops field of a io_pgtable. 135 + unsafe { kernel::container_of!(self.raw_ops(), bindings::io_pgtable, ops) } 136 + } 137 + 138 + /// Obtain a raw pointer to the underlying `struct io_pgtable_cfg`. 139 + #[inline] 140 + pub fn raw_cfg(&self) -> *mut bindings::io_pgtable_cfg { 141 + // SAFETY: The `raw_pgtable()` method returns a valid pointer. 142 + unsafe { &raw mut (*self.raw_pgtable()).cfg } 143 + } 144 + 145 + /// Map a physically contiguous range of pages of the same size. 146 + /// 147 + /// Even if successful, this operation may not map the entire range. In that case, only a 148 + /// prefix of the range is mapped, and the returned integer indicates its length in bytes. In 149 + /// this case, the caller will usually call `map_pages` again for the remaining range. 150 + /// 151 + /// The returned [`Result`] indicates whether an error was encountered while mapping pages. 152 + /// Note that this may return a non-zero length even if an error was encountered. The caller 153 + /// will usually [unmap the relevant pages](Self::unmap_pages) on error. 154 + /// 155 + /// The caller must flush the TLB before using the pgtable to access the newly created mapping. 156 + /// 157 + /// # Safety 158 + /// 159 + /// * No other io-pgtable operation may access the range `iova .. iova+pgsize*pgcount` while 160 + /// this `map_pages` operation executes. 161 + /// * This page table must not contain any mapping that overlaps with the mapping created by 162 + /// this call. 163 + /// * If this page table is live, then the caller must ensure that it's okay to access the 164 + /// physical address being mapped for the duration in which it is mapped. 165 + #[inline] 166 + pub unsafe fn map_pages( 167 + &self, 168 + iova: usize, 169 + paddr: PhysAddr, 170 + pgsize: usize, 171 + pgcount: usize, 172 + prot: u32, 173 + flags: alloc::Flags, 174 + ) -> (usize, Result) { 175 + let mut mapped: usize = 0; 176 + 177 + // SAFETY: The `map_pages` function in `io_pgtable_ops` is never null. 178 + let map_pages = unsafe { (*self.raw_ops()).map_pages.unwrap_unchecked() }; 179 + 180 + // SAFETY: The safety requirements of this method are sufficient to call `map_pages`. 181 + let ret = to_result(unsafe { 182 + (map_pages)( 183 + self.raw_ops(), 184 + iova, 185 + paddr, 186 + pgsize, 187 + pgcount, 188 + prot as i32, 189 + flags.as_raw(), 190 + &mut mapped, 191 + ) 192 + }); 193 + 194 + (mapped, ret) 195 + } 196 + 197 + /// Unmap a range of virtually contiguous pages of the same size. 198 + /// 199 + /// This may not unmap the entire range, and returns the length of the unmapped prefix in 200 + /// bytes. 
201 + ///
202 + /// # Safety
203 + ///
204 + /// * No other io-pgtable operation may access the range `iova .. iova+pgsize*pgcount` while
205 + /// this `unmap_pages` operation executes.
206 + /// * This page table must contain one or more consecutive mappings starting at `iova` whose
207 + /// total size is `pgcount * pgsize`.
208 + #[inline]
209 + #[must_use]
210 + pub unsafe fn unmap_pages(&self, iova: usize, pgsize: usize, pgcount: usize) -> usize {
211 + // SAFETY: The `unmap_pages` function in `io_pgtable_ops` is never null.
212 + let unmap_pages = unsafe { (*self.raw_ops()).unmap_pages.unwrap_unchecked() };
213 +
214 + // SAFETY: The safety requirements of this method are sufficient to call `unmap_pages`.
215 + unsafe { (unmap_pages)(self.raw_ops(), iova, pgsize, pgcount, core::ptr::null_mut()) }
216 + }
217 + }
218 +
219 + // For the initial users of these Rust bindings, the GPU FW is managing the IOTLB and performs all
220 + // required invalidations using a range. There is no need for it to get ARM style invalidation
221 + // instructions from the page table code.
222 + //
223 + // Support for flushing the TLB with ARM style invalidation instructions may be added in the
224 + // future.
225 + static NOOP_FLUSH_OPS: bindings::iommu_flush_ops = bindings::iommu_flush_ops {
226 + tlb_flush_all: Some(rust_tlb_flush_all_noop),
227 + tlb_flush_walk: Some(rust_tlb_flush_walk_noop),
228 + tlb_add_page: None,
229 + };
230 +
231 + #[no_mangle]
232 + extern "C" fn rust_tlb_flush_all_noop(_cookie: *mut core::ffi::c_void) {}
233 +
234 + #[no_mangle]
235 + extern "C" fn rust_tlb_flush_walk_noop(
236 + _iova: usize,
237 + _size: usize,
238 + _granule: usize,
239 + _cookie: *mut core::ffi::c_void,
240 + ) {
241 + }
242 +
243 + impl<F: IoPageTableFmt> Drop for IoPageTable<F> {
244 + fn drop(&mut self) {
245 + // SAFETY: The caller of `Self::ttbr()` promised that the page table is not live when this
246 + // destructor runs.
247 + unsafe { bindings::free_io_pgtable_ops(self.raw_ops()) };
248 + }
249 + }
250 +
251 + /// The `ARM_64_LPAE_S1` page table format.
252 + pub enum ARM64LPAES1 {}
253 +
254 + impl IoPageTableFmt for ARM64LPAES1 {
255 + const FORMAT: io_pgtable_fmt = bindings::io_pgtable_fmt_ARM_64_LPAE_S1 as io_pgtable_fmt;
256 + }
257 +
258 + impl IoPageTable<ARM64LPAES1> {
259 + /// Access the `ttbr` field of the configuration.
260 + ///
261 + /// This is the physical address of the page table, which may be passed to the device that
262 + /// needs to use it.
263 + ///
264 + /// # Safety
265 + ///
266 + /// The caller must ensure that the device stops using the page table before dropping it.
267 + #[inline]
268 + pub unsafe fn ttbr(&self) -> u64 {
269 + // SAFETY: `arm_lpae_s1_cfg` is the right cfg type for `ARM64LPAES1`.
270 + unsafe { (*self.raw_cfg()).__bindgen_anon_1.arm_lpae_s1_cfg.ttbr }
271 + }
272 +
273 + /// Access the `mair` field of the configuration.
274 + #[inline]
275 + pub fn mair(&self) -> u64 {
276 + // SAFETY: `arm_lpae_s1_cfg` is the right cfg type for `ARM64LPAES1`.
277 + unsafe { (*self.raw_cfg()).__bindgen_anon_1.arm_lpae_s1_cfg.mair }
278 + }
279 + }
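The Rust type above is a thin wrapper around the existing C io-pgtable interface. For readers more familiar with that side, here is a minimal C sketch of roughly the calls the binding ends up making; names, sizes, and addresses are illustrative, and a real user would keep the table alive for as long as the device walks it.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/iommu.h>
#include <linux/io-pgtable.h>
#include <linux/sizes.h>

static void demo_tlb_flush_all(void *cookie) {}
static void demo_tlb_flush_walk(unsigned long iova, size_t size,
				size_t granule, void *cookie) {}

/* No-op flush ops, mirroring NOOP_FLUSH_OPS above. */
static const struct iommu_flush_ops demo_flush_ops = {
	.tlb_flush_all	= demo_tlb_flush_all,
	.tlb_flush_walk	= demo_tlb_flush_walk,
};

/* Illustrative: allocate an ARM_64_LPAE_S1 table, map one page, tear it down. */
static int demo_io_pgtable(struct device *dev, unsigned long iova,
			   phys_addr_t paddr)
{
	struct io_pgtable_cfg cfg = {
		.pgsize_bitmap	= SZ_4K,
		.ias		= 48,
		.oas		= 48,
		.coherent_walk	= true,
		.tlb		= &demo_flush_ops,
		.iommu_dev	= dev,
	};
	struct io_pgtable_ops *ops;
	size_t mapped = 0;
	int ret;

	ops = alloc_io_pgtable_ops(ARM_64_LPAE_S1, &cfg, NULL);
	if (!ops)
		return -ENOMEM;

	/* 'mapped' reports how many bytes were actually mapped. */
	ret = ops->map_pages(ops, iova, paddr, SZ_4K, 1,
			     IOMMU_READ | IOMMU_WRITE, GFP_KERNEL, &mapped);

	/* cfg.arm_lpae_s1_cfg.ttbr is what the Rust ttbr() accessor exposes. */

	if (mapped)
		ops->unmap_pages(ops, iova, SZ_4K, 1, NULL);
	free_io_pgtable_ops(ops);
	return ret;
}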
+1
rust/kernel/lib.rs
··· 105 105 pub mod init; 106 106 pub mod io; 107 107 pub mod ioctl; 108 + pub mod iommu; 108 109 pub mod iov; 109 110 pub mod irq; 110 111 pub mod jump_label;