Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu

Pull iommu fixes from Joerg Roedel:
"A couple of fixes for the AMD IOMMU driver have piled up:

- Some fixes for the reworked IO page-table, which caused memory leaks
or did not allow mappings to be downgraded under some conditions.

- Locking fixes for a couple of possible races around accessing
'struct protection_domain'. The races were introduced when the
dma-ops path became lock-less in the fast-path"

* tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
iommu/amd: Lock code paths traversing protection_domain->dev_list
iommu/amd: Lock dev_data in attach/detach code paths
iommu/amd: Check for busy devices earlier in attach_device()
iommu/amd: Take domain->lock for complete attach/detach path
iommu/amd: Remove amd_iommu_devtable_lock
iommu/amd: Remove domain->updated
iommu/amd: Wait for completion of IOTLB flush in attach_device
iommu/amd: Unmap all L7 PTEs when downgrading page-sizes
iommu/amd: Introduce first_pte_l7() helper
iommu/amd: Fix downgrading default page-sizes in alloc_pte()
iommu/amd: Fix pages leak in free_pagetable()

+141 -96
+138 -95
drivers/iommu/amd_iommu.c
··· 70 70 */ 71 71 #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) 72 72 73 - static DEFINE_SPINLOCK(amd_iommu_devtable_lock); 74 73 static DEFINE_SPINLOCK(pd_bitmap_lock); 75 74 76 75 /* List of all available dev_data structures */ ··· 201 202 if (!dev_data) 202 203 return NULL; 203 204 205 + spin_lock_init(&dev_data->lock); 204 206 dev_data->devid = devid; 205 207 ratelimit_default_init(&dev_data->rs); 206 208 ··· 499 499 * We keep dev_data around for unplugged devices and reuse it when the 500 500 * device is re-plugged - not doing so would introduce a ton of races. 501 501 */ 502 + } 503 + 504 + /* 505 + * Helper function to get the first pte of a large mapping 506 + */ 507 + static u64 *first_pte_l7(u64 *pte, unsigned long *page_size, 508 + unsigned long *count) 509 + { 510 + unsigned long pte_mask, pg_size, cnt; 511 + u64 *fpte; 512 + 513 + pg_size = PTE_PAGE_SIZE(*pte); 514 + cnt = PAGE_SIZE_PTE_COUNT(pg_size); 515 + pte_mask = ~((cnt << 3) - 1); 516 + fpte = (u64 *)(((unsigned long)pte) & pte_mask); 517 + 518 + if (page_size) 519 + *page_size = pg_size; 520 + 521 + if (count) 522 + *count = cnt; 523 + 524 + return fpte; 502 525 } 503 526 504 527 /**************************************************************************** ··· 1334 1311 dma_addr_t iova, size_t size) 1335 1312 { 1336 1313 if (unlikely(amd_iommu_np_cache)) { 1314 + unsigned long flags; 1315 + 1316 + spin_lock_irqsave(&domain->lock, flags); 1337 1317 domain_flush_pages(domain, iova, size); 1338 1318 domain_flush_complete(domain); 1319 + spin_unlock_irqrestore(&domain->lock, flags); 1339 1320 } 1340 1321 } 1341 1322 ··· 1452 1425 BUG_ON(domain->mode < PAGE_MODE_NONE || 1453 1426 domain->mode > PAGE_MODE_6_LEVEL); 1454 1427 1455 - free_sub_pt(root, domain->mode, freelist); 1428 + freelist = free_sub_pt(root, domain->mode, freelist); 1456 1429 1457 1430 free_page_list(freelist); 1458 1431 } ··· 1462 1435 * another level increases the size of the address space by 9 bits to a size up 1463 
1436 * to 64 bits. 1464 1437 */ 1465 - static void increase_address_space(struct protection_domain *domain, 1438 + static bool increase_address_space(struct protection_domain *domain, 1466 1439 gfp_t gfp) 1467 1440 { 1468 1441 unsigned long flags; 1442 + bool ret = false; 1469 1443 u64 *pte; 1470 1444 1471 1445 spin_lock_irqsave(&domain->lock, flags); ··· 1483 1455 iommu_virt_to_phys(domain->pt_root)); 1484 1456 domain->pt_root = pte; 1485 1457 domain->mode += 1; 1486 - domain->updated = true; 1458 + 1459 + ret = true; 1487 1460 1488 1461 out: 1489 1462 spin_unlock_irqrestore(&domain->lock, flags); 1490 1463 1491 - return; 1464 + return ret; 1492 1465 } 1493 1466 1494 1467 static u64 *alloc_pte(struct protection_domain *domain, 1495 1468 unsigned long address, 1496 1469 unsigned long page_size, 1497 1470 u64 **pte_page, 1498 - gfp_t gfp) 1471 + gfp_t gfp, 1472 + bool *updated) 1499 1473 { 1500 1474 int level, end_lvl; 1501 1475 u64 *pte, *page; ··· 1505 1475 BUG_ON(!is_power_of_2(page_size)); 1506 1476 1507 1477 while (address > PM_LEVEL_SIZE(domain->mode)) 1508 - increase_address_space(domain, gfp); 1478 + *updated = increase_address_space(domain, gfp) || *updated; 1509 1479 1510 1480 level = domain->mode - 1; 1511 1481 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; ··· 1519 1489 __pte = *pte; 1520 1490 pte_level = PM_PTE_LEVEL(__pte); 1521 1491 1522 - if (!IOMMU_PTE_PRESENT(__pte) || 1492 + /* 1493 + * If we replace a series of large PTEs, we need 1494 + * to tear down all of them. 
1495 + */ 1496 + if (IOMMU_PTE_PRESENT(__pte) && 1523 1497 pte_level == PAGE_MODE_7_LEVEL) { 1498 + unsigned long count, i; 1499 + u64 *lpte; 1500 + 1501 + lpte = first_pte_l7(pte, NULL, &count); 1502 + 1503 + /* 1504 + * Unmap the replicated PTEs that still match the 1505 + * original large mapping 1506 + */ 1507 + for (i = 0; i < count; ++i) 1508 + cmpxchg64(&lpte[i], __pte, 0ULL); 1509 + 1510 + *updated = true; 1511 + continue; 1512 + } 1513 + 1514 + if (!IOMMU_PTE_PRESENT(__pte) || 1515 + pte_level == PAGE_MODE_NONE) { 1524 1516 page = (u64 *)get_zeroed_page(gfp); 1517 + 1525 1518 if (!page) 1526 1519 return NULL; 1527 1520 ··· 1553 1500 /* pte could have been changed somewhere. */ 1554 1501 if (cmpxchg64(pte, __pte, __npte) != __pte) 1555 1502 free_page((unsigned long)page); 1556 - else if (pte_level == PAGE_MODE_7_LEVEL) 1557 - domain->updated = true; 1503 + else if (IOMMU_PTE_PRESENT(__pte)) 1504 + *updated = true; 1558 1505 1559 1506 continue; 1560 1507 } ··· 1619 1566 *page_size = PTE_LEVEL_PAGE_SIZE(level); 1620 1567 } 1621 1568 1622 - if (PM_PTE_LEVEL(*pte) == 0x07) { 1623 - unsigned long pte_mask; 1624 - 1625 - /* 1626 - * If we have a series of large PTEs, make 1627 - * sure to return a pointer to the first one. 1628 - */ 1629 - *page_size = pte_mask = PTE_PAGE_SIZE(*pte); 1630 - pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); 1631 - pte = (u64 *)(((unsigned long)pte) & pte_mask); 1632 - } 1569 + /* 1570 + * If we have a series of large PTEs, make 1571 + * sure to return a pointer to the first one. 
1572 + */ 1573 + if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL) 1574 + pte = first_pte_l7(pte, page_size, NULL); 1633 1575 1634 1576 return pte; 1635 1577 } ··· 1663 1615 gfp_t gfp) 1664 1616 { 1665 1617 struct page *freelist = NULL; 1618 + bool updated = false; 1666 1619 u64 __pte, *pte; 1667 - int i, count; 1620 + int ret, i, count; 1668 1621 1669 1622 BUG_ON(!IS_ALIGNED(bus_addr, page_size)); 1670 1623 BUG_ON(!IS_ALIGNED(phys_addr, page_size)); 1671 1624 1625 + ret = -EINVAL; 1672 1626 if (!(prot & IOMMU_PROT_MASK)) 1673 - return -EINVAL; 1627 + goto out; 1674 1628 1675 1629 count = PAGE_SIZE_PTE_COUNT(page_size); 1676 - pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp); 1630 + pte = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated); 1677 1631 1632 + ret = -ENOMEM; 1678 1633 if (!pte) 1679 - return -ENOMEM; 1634 + goto out; 1680 1635 1681 1636 for (i = 0; i < count; ++i) 1682 1637 freelist = free_clear_pte(&pte[i], pte[i], freelist); 1683 1638 1684 1639 if (freelist != NULL) 1685 - dom->updated = true; 1640 + updated = true; 1686 1641 1687 1642 if (count > 1) { 1688 1643 __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); ··· 1701 1650 for (i = 0; i < count; ++i) 1702 1651 pte[i] = __pte; 1703 1652 1704 - update_domain(dom); 1653 + ret = 0; 1654 + 1655 + out: 1656 + if (updated) { 1657 + unsigned long flags; 1658 + 1659 + spin_lock_irqsave(&dom->lock, flags); 1660 + update_domain(dom); 1661 + spin_unlock_irqrestore(&dom->lock, flags); 1662 + } 1705 1663 1706 1664 /* Everything flushed out, free pages now */ 1707 1665 free_page_list(freelist); 1708 1666 1709 - return 0; 1667 + return ret; 1710 1668 } 1711 1669 1712 1670 static unsigned long iommu_unmap_page(struct protection_domain *dom, ··· 1866 1806 1867 1807 static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom) 1868 1808 { 1809 + unsigned long flags; 1810 + 1811 + spin_lock_irqsave(&dom->domain.lock, flags); 1869 1812 domain_flush_tlb(&dom->domain); 1870 1813 
domain_flush_complete(&dom->domain); 1814 + spin_unlock_irqrestore(&dom->domain.lock, flags); 1871 1815 } 1872 1816 1873 1817 static void iova_domain_flush_tlb(struct iova_domain *iovad) ··· 2086 2022 domain->dev_cnt -= 1; 2087 2023 } 2088 2024 2089 - /* 2090 - * If a device is not yet associated with a domain, this function makes the 2091 - * device visible in the domain 2092 - */ 2093 - static int __attach_device(struct iommu_dev_data *dev_data, 2094 - struct protection_domain *domain) 2095 - { 2096 - int ret; 2097 - 2098 - /* lock domain */ 2099 - spin_lock(&domain->lock); 2100 - 2101 - ret = -EBUSY; 2102 - if (dev_data->domain != NULL) 2103 - goto out_unlock; 2104 - 2105 - /* Attach alias group root */ 2106 - do_attach(dev_data, domain); 2107 - 2108 - ret = 0; 2109 - 2110 - out_unlock: 2111 - 2112 - /* ready */ 2113 - spin_unlock(&domain->lock); 2114 - 2115 - return ret; 2116 - } 2117 - 2118 - 2119 2025 static void pdev_iommuv2_disable(struct pci_dev *pdev) 2120 2026 { 2121 2027 pci_disable_ats(pdev); ··· 2167 2133 unsigned long flags; 2168 2134 int ret; 2169 2135 2136 + spin_lock_irqsave(&domain->lock, flags); 2137 + 2170 2138 dev_data = get_dev_data(dev); 2139 + 2140 + spin_lock(&dev_data->lock); 2141 + 2142 + ret = -EBUSY; 2143 + if (dev_data->domain != NULL) 2144 + goto out; 2171 2145 2172 2146 if (!dev_is_pci(dev)) 2173 2147 goto skip_ats_check; 2174 2148 2175 2149 pdev = to_pci_dev(dev); 2176 2150 if (domain->flags & PD_IOMMUV2_MASK) { 2151 + ret = -EINVAL; 2177 2152 if (!dev_data->passthrough) 2178 - return -EINVAL; 2153 + goto out; 2179 2154 2180 2155 if (dev_data->iommu_v2) { 2181 2156 if (pdev_iommuv2_enable(pdev) != 0) 2182 - return -EINVAL; 2157 + goto out; 2183 2158 2184 2159 dev_data->ats.enabled = true; 2185 2160 dev_data->ats.qdep = pci_ats_queue_depth(pdev); ··· 2201 2158 } 2202 2159 2203 2160 skip_ats_check: 2204 - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); 2205 - ret = __attach_device(dev_data, domain); 2206 - 
spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2161 + ret = 0; 2162 + 2163 + do_attach(dev_data, domain); 2207 2164 2208 2165 /* 2209 2166 * We might boot into a crash-kernel here. The crashed kernel ··· 2212 2169 */ 2213 2170 domain_flush_tlb_pde(domain); 2214 2171 2172 + domain_flush_complete(domain); 2173 + 2174 + out: 2175 + spin_unlock(&dev_data->lock); 2176 + 2177 + spin_unlock_irqrestore(&domain->lock, flags); 2178 + 2215 2179 return ret; 2216 - } 2217 - 2218 - /* 2219 - * Removes a device from a protection domain (unlocked) 2220 - */ 2221 - static void __detach_device(struct iommu_dev_data *dev_data) 2222 - { 2223 - struct protection_domain *domain; 2224 - 2225 - domain = dev_data->domain; 2226 - 2227 - spin_lock(&domain->lock); 2228 - 2229 - do_detach(dev_data); 2230 - 2231 - spin_unlock(&domain->lock); 2232 2180 } 2233 2181 2234 2182 /* ··· 2234 2200 dev_data = get_dev_data(dev); 2235 2201 domain = dev_data->domain; 2236 2202 2203 + spin_lock_irqsave(&domain->lock, flags); 2204 + 2205 + spin_lock(&dev_data->lock); 2206 + 2237 2207 /* 2238 2208 * First check if the device is still attached. It might already 2239 2209 * be detached from its domain because the generic ··· 2245 2207 * our alias handling. 
2246 2208 */ 2247 2209 if (WARN_ON(!dev_data->domain)) 2248 - return; 2210 + goto out; 2249 2211 2250 - /* lock device table */ 2251 - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); 2252 - __detach_device(dev_data); 2253 - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2212 + do_detach(dev_data); 2254 2213 2255 2214 if (!dev_is_pci(dev)) 2256 - return; 2215 + goto out; 2257 2216 2258 2217 if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2) 2259 2218 pdev_iommuv2_disable(to_pci_dev(dev)); ··· 2258 2223 pci_disable_ats(to_pci_dev(dev)); 2259 2224 2260 2225 dev_data->ats.enabled = false; 2226 + 2227 + out: 2228 + spin_unlock(&dev_data->lock); 2229 + 2230 + spin_unlock_irqrestore(&domain->lock, flags); 2261 2231 } 2262 2232 2263 2233 static int amd_iommu_add_device(struct device *dev) ··· 2394 2354 2395 2355 static void update_domain(struct protection_domain *domain) 2396 2356 { 2397 - if (!domain->updated) 2398 - return; 2399 - 2400 2357 update_device_table(domain); 2401 2358 2402 2359 domain_flush_devices(domain); 2403 2360 domain_flush_tlb_pde(domain); 2404 - 2405 - domain->updated = false; 2406 2361 } 2407 2362 2408 2363 static int dir2prot(enum dma_data_direction direction) ··· 2427 2392 { 2428 2393 dma_addr_t offset = paddr & ~PAGE_MASK; 2429 2394 dma_addr_t address, start, ret; 2395 + unsigned long flags; 2430 2396 unsigned int pages; 2431 2397 int prot = 0; 2432 2398 int i; ··· 2465 2429 iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE); 2466 2430 } 2467 2431 2432 + spin_lock_irqsave(&dma_dom->domain.lock, flags); 2468 2433 domain_flush_tlb(&dma_dom->domain); 2469 2434 domain_flush_complete(&dma_dom->domain); 2435 + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); 2470 2436 2471 2437 dma_ops_free_iova(dma_dom, address, pages); 2472 2438 ··· 2497 2459 } 2498 2460 2499 2461 if (amd_iommu_unmap_flush) { 2462 + unsigned long flags; 2463 + 2464 + spin_lock_irqsave(&dma_dom->domain.lock, flags); 2500 2465 
domain_flush_tlb(&dma_dom->domain); 2501 2466 domain_flush_complete(&dma_dom->domain); 2467 + spin_unlock_irqrestore(&dma_dom->domain.lock, flags); 2502 2468 dma_ops_free_iova(dma_dom, dma_addr, pages); 2503 2469 } else { 2504 2470 pages = __roundup_pow_of_two(pages); ··· 2908 2866 struct iommu_dev_data *entry; 2909 2867 unsigned long flags; 2910 2868 2911 - spin_lock_irqsave(&amd_iommu_devtable_lock, flags); 2869 + spin_lock_irqsave(&domain->lock, flags); 2912 2870 2913 2871 while (!list_empty(&domain->dev_list)) { 2914 2872 entry = list_first_entry(&domain->dev_list, 2915 2873 struct iommu_dev_data, list); 2916 2874 BUG_ON(!entry->domain); 2917 - __detach_device(entry); 2875 + do_detach(entry); 2918 2876 } 2919 2877 2920 - spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2878 + spin_unlock_irqrestore(&domain->lock, flags); 2921 2879 } 2922 2880 2923 2881 static void protection_domain_free(struct protection_domain *domain) ··· 3268 3226 static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 3269 3227 { 3270 3228 struct protection_domain *dom = to_pdomain(domain); 3229 + unsigned long flags; 3271 3230 3231 + spin_lock_irqsave(&dom->lock, flags); 3272 3232 domain_flush_tlb_pde(dom); 3273 3233 domain_flush_complete(dom); 3234 + spin_unlock_irqrestore(&dom->lock, flags); 3274 3235 } 3275 3236 3276 3237 static void amd_iommu_iotlb_sync(struct iommu_domain *domain, ··· 3335 3290 3336 3291 /* Update data structure */ 3337 3292 domain->mode = PAGE_MODE_NONE; 3338 - domain->updated = true; 3339 3293 3340 3294 /* Make changes visible to IOMMUs */ 3341 3295 update_domain(domain); ··· 3380 3336 3381 3337 domain->glx = levels; 3382 3338 domain->flags |= PD_IOMMUV2_MASK; 3383 - domain->updated = true; 3384 3339 3385 3340 update_domain(domain); 3386 3341
+3 -1
drivers/iommu/amd_iommu_types.h
··· 475 475 int glx; /* Number of levels for GCR3 table */ 476 476 u64 *gcr3_tbl; /* Guest CR3 table */ 477 477 unsigned long flags; /* flags to find out type of domain */ 478 - bool updated; /* complete domain flush required */ 479 478 unsigned dev_cnt; /* devices assigned to this domain */ 480 479 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ 481 480 }; ··· 633 634 * This struct contains device specific data for the IOMMU 634 635 */ 635 636 struct iommu_dev_data { 637 + /*Protect against attach/detach races */ 638 + spinlock_t lock; 639 + 636 640 struct list_head list; /* For domain->dev_list */ 637 641 struct llist_node dev_data_list; /* For global dev_data_list */ 638 642 struct protection_domain *domain; /* Domain the device is bound to */