Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'iommu-fixes-v5.7-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu

Pull iommu fixes from Joerg Roedel:

- Race condition fixes for the AMD IOMMU driver.

These are five patches fixing two race conditions around
increase_address_space(). The first race condition was around the
non-atomic update of the domain page-table root pointer and the
variable containing the page-table depth (called mode). This is fixed
now by merging page-table root and mode into one 64-bit field which
is read/written atomically.

The second race condition was around updating the page-table root
pointer and making it public before the hardware caches were flushed.
This could cause addresses to be mapped and returned to drivers which
are not reachable by IOMMU hardware yet, causing IO page-faults. This
is fixed too by adding the necessary flushes before a new page-table
root is published.

Related to the race condition fixes these patches also add a missing
domain_flush_complete() barrier to update_domain() and a fix to bail
out of the loop which tries to increase the address space when the
call to increase_address_space() fails.

Qian was able to trigger the race conditions under high load and
memory pressure within a few days of testing. He confirmed that he
has seen no issues anymore with the fixes included here.

- Fix for a list-handling bug in the VirtIO IOMMU driver.

* tag 'iommu-fixes-v5.7-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
iommu/virtio: Reverse arguments to list_add
iommu/amd: Do not flush Device Table in iommu_map_page()
iommu/amd: Update Device Table in increase_address_space()
iommu/amd: Call domain_flush_complete() in update_domain()
iommu/amd: Do not loop forever when trying to increase address space
iommu/amd: Fix race in increase_address_space()/fetch_pte()

+162 -47
+154 -44
drivers/iommu/amd_iommu.c
··· 101 101 static void update_domain(struct protection_domain *domain); 102 102 static int protection_domain_init(struct protection_domain *domain); 103 103 static void detach_device(struct device *dev); 104 + static void update_and_flush_device_table(struct protection_domain *domain, 105 + struct domain_pgtable *pgtable); 104 106 105 107 /**************************************************************************** 106 108 * ··· 151 149 static struct protection_domain *to_pdomain(struct iommu_domain *dom) 152 150 { 153 151 return container_of(dom, struct protection_domain, domain); 152 + } 153 + 154 + static void amd_iommu_domain_get_pgtable(struct protection_domain *domain, 155 + struct domain_pgtable *pgtable) 156 + { 157 + u64 pt_root = atomic64_read(&domain->pt_root); 158 + 159 + pgtable->root = (u64 *)(pt_root & PAGE_MASK); 160 + pgtable->mode = pt_root & 7; /* lowest 3 bits encode pgtable mode */ 161 + } 162 + 163 + static u64 amd_iommu_domain_encode_pgtable(u64 *root, int mode) 164 + { 165 + u64 pt_root; 166 + 167 + /* lowest 3 bits encode pgtable mode */ 168 + pt_root = mode & 7; 169 + pt_root |= (u64)root; 170 + 171 + return pt_root; 154 172 } 155 173 156 174 static struct iommu_dev_data *alloc_dev_data(u16 devid) ··· 1419 1397 1420 1398 static void free_pagetable(struct protection_domain *domain) 1421 1399 { 1422 - unsigned long root = (unsigned long)domain->pt_root; 1400 + struct domain_pgtable pgtable; 1423 1401 struct page *freelist = NULL; 1402 + unsigned long root; 1424 1403 1425 - BUG_ON(domain->mode < PAGE_MODE_NONE || 1426 - domain->mode > PAGE_MODE_6_LEVEL); 1404 + amd_iommu_domain_get_pgtable(domain, &pgtable); 1405 + atomic64_set(&domain->pt_root, 0); 1427 1406 1428 - freelist = free_sub_pt(root, domain->mode, freelist); 1407 + BUG_ON(pgtable.mode < PAGE_MODE_NONE || 1408 + pgtable.mode > PAGE_MODE_6_LEVEL); 1409 + 1410 + root = (unsigned long)pgtable.root; 1411 + freelist = free_sub_pt(root, pgtable.mode, freelist); 1429 1412 1430 1413 
free_page_list(freelist); 1431 1414 } ··· 1444 1417 unsigned long address, 1445 1418 gfp_t gfp) 1446 1419 { 1420 + struct domain_pgtable pgtable; 1447 1421 unsigned long flags; 1448 - bool ret = false; 1449 - u64 *pte; 1422 + bool ret = true; 1423 + u64 *pte, root; 1450 1424 1451 1425 spin_lock_irqsave(&domain->lock, flags); 1452 1426 1453 - if (address <= PM_LEVEL_SIZE(domain->mode) || 1454 - WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL)) 1427 + amd_iommu_domain_get_pgtable(domain, &pgtable); 1428 + 1429 + if (address <= PM_LEVEL_SIZE(pgtable.mode)) 1430 + goto out; 1431 + 1432 + ret = false; 1433 + if (WARN_ON_ONCE(pgtable.mode == PAGE_MODE_6_LEVEL)) 1455 1434 goto out; 1456 1435 1457 1436 pte = (void *)get_zeroed_page(gfp); 1458 1437 if (!pte) 1459 1438 goto out; 1460 1439 1461 - *pte = PM_LEVEL_PDE(domain->mode, 1462 - iommu_virt_to_phys(domain->pt_root)); 1463 - domain->pt_root = pte; 1464 - domain->mode += 1; 1440 + *pte = PM_LEVEL_PDE(pgtable.mode, iommu_virt_to_phys(pgtable.root)); 1441 + 1442 + pgtable.root = pte; 1443 + pgtable.mode += 1; 1444 + update_and_flush_device_table(domain, &pgtable); 1445 + domain_flush_complete(domain); 1446 + 1447 + /* 1448 + * Device Table needs to be updated and flushed before the new root can 1449 + * be published. 
1450 + */ 1451 + root = amd_iommu_domain_encode_pgtable(pte, pgtable.mode); 1452 + atomic64_set(&domain->pt_root, root); 1465 1453 1466 1454 ret = true; 1467 1455 ··· 1493 1451 gfp_t gfp, 1494 1452 bool *updated) 1495 1453 { 1454 + struct domain_pgtable pgtable; 1496 1455 int level, end_lvl; 1497 1456 u64 *pte, *page; 1498 1457 1499 1458 BUG_ON(!is_power_of_2(page_size)); 1500 1459 1501 - while (address > PM_LEVEL_SIZE(domain->mode)) 1502 - *updated = increase_address_space(domain, address, gfp) || *updated; 1460 + amd_iommu_domain_get_pgtable(domain, &pgtable); 1503 1461 1504 - level = domain->mode - 1; 1505 - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 1462 + while (address > PM_LEVEL_SIZE(pgtable.mode)) { 1463 + /* 1464 + * Return an error if there is no memory to update the 1465 + * page-table. 1466 + */ 1467 + if (!increase_address_space(domain, address, gfp)) 1468 + return NULL; 1469 + 1470 + /* Read new values to check if update was successful */ 1471 + amd_iommu_domain_get_pgtable(domain, &pgtable); 1472 + } 1473 + 1474 + 1475 + level = pgtable.mode - 1; 1476 + pte = &pgtable.root[PM_LEVEL_INDEX(level, address)]; 1506 1477 address = PAGE_SIZE_ALIGN(address, page_size); 1507 1478 end_lvl = PAGE_SIZE_LEVEL(page_size); 1508 1479 ··· 1591 1536 unsigned long address, 1592 1537 unsigned long *page_size) 1593 1538 { 1539 + struct domain_pgtable pgtable; 1594 1540 int level; 1595 1541 u64 *pte; 1596 1542 1597 1543 *page_size = 0; 1598 1544 1599 - if (address > PM_LEVEL_SIZE(domain->mode)) 1545 + amd_iommu_domain_get_pgtable(domain, &pgtable); 1546 + 1547 + if (address > PM_LEVEL_SIZE(pgtable.mode)) 1600 1548 return NULL; 1601 1549 1602 - level = domain->mode - 1; 1603 - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; 1550 + level = pgtable.mode - 1; 1551 + pte = &pgtable.root[PM_LEVEL_INDEX(level, address)]; 1604 1552 *page_size = PTE_LEVEL_PAGE_SIZE(level); 1605 1553 1606 1554 while (level > 0) { ··· 1718 1660 unsigned long flags; 1719 1661 
1720 1662 spin_lock_irqsave(&dom->lock, flags); 1721 - update_domain(dom); 1663 + /* 1664 + * Flush domain TLB(s) and wait for completion. Any Device-Table 1665 + * Updates and flushing already happened in 1666 + * increase_address_space(). 1667 + */ 1668 + domain_flush_tlb_pde(dom); 1669 + domain_flush_complete(dom); 1722 1670 spin_unlock_irqrestore(&dom->lock, flags); 1723 1671 } 1724 1672 ··· 1870 1806 static struct protection_domain *dma_ops_domain_alloc(void) 1871 1807 { 1872 1808 struct protection_domain *domain; 1809 + u64 *pt_root, root; 1873 1810 1874 1811 domain = kzalloc(sizeof(struct protection_domain), GFP_KERNEL); 1875 1812 if (!domain) ··· 1879 1814 if (protection_domain_init(domain)) 1880 1815 goto free_domain; 1881 1816 1882 - domain->mode = PAGE_MODE_3_LEVEL; 1883 - domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1884 - domain->flags = PD_DMA_OPS_MASK; 1885 - if (!domain->pt_root) 1817 + pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1818 + if (!pt_root) 1886 1819 goto free_domain; 1820 + 1821 + root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL); 1822 + atomic64_set(&domain->pt_root, root); 1823 + domain->flags = PD_DMA_OPS_MASK; 1887 1824 1888 1825 if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM) 1889 1826 goto free_domain; ··· 1908 1841 } 1909 1842 1910 1843 static void set_dte_entry(u16 devid, struct protection_domain *domain, 1844 + struct domain_pgtable *pgtable, 1911 1845 bool ats, bool ppr) 1912 1846 { 1913 1847 u64 pte_root = 0; 1914 1848 u64 flags = 0; 1915 1849 u32 old_domid; 1916 1850 1917 - if (domain->mode != PAGE_MODE_NONE) 1918 - pte_root = iommu_virt_to_phys(domain->pt_root); 1851 + if (pgtable->mode != PAGE_MODE_NONE) 1852 + pte_root = iommu_virt_to_phys(pgtable->root); 1919 1853 1920 - pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 1854 + pte_root |= (pgtable->mode & DEV_ENTRY_MODE_MASK) 1921 1855 << DEV_ENTRY_MODE_SHIFT; 1922 1856 pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV; 
1923 1857 ··· 1991 1923 static void do_attach(struct iommu_dev_data *dev_data, 1992 1924 struct protection_domain *domain) 1993 1925 { 1926 + struct domain_pgtable pgtable; 1994 1927 struct amd_iommu *iommu; 1995 1928 bool ats; 1996 1929 ··· 2007 1938 domain->dev_cnt += 1; 2008 1939 2009 1940 /* Update device table */ 2010 - set_dte_entry(dev_data->devid, domain, ats, dev_data->iommu_v2); 1941 + amd_iommu_domain_get_pgtable(domain, &pgtable); 1942 + set_dte_entry(dev_data->devid, domain, &pgtable, 1943 + ats, dev_data->iommu_v2); 2011 1944 clone_aliases(dev_data->pdev); 2012 1945 2013 1946 device_flush_dte(dev_data); ··· 2320 2249 * 2321 2250 *****************************************************************************/ 2322 2251 2323 - static void update_device_table(struct protection_domain *domain) 2252 + static void update_device_table(struct protection_domain *domain, 2253 + struct domain_pgtable *pgtable) 2324 2254 { 2325 2255 struct iommu_dev_data *dev_data; 2326 2256 2327 2257 list_for_each_entry(dev_data, &domain->dev_list, list) { 2328 - set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled, 2329 - dev_data->iommu_v2); 2258 + set_dte_entry(dev_data->devid, domain, pgtable, 2259 + dev_data->ats.enabled, dev_data->iommu_v2); 2330 2260 clone_aliases(dev_data->pdev); 2331 2261 } 2332 2262 } 2333 2263 2264 + static void update_and_flush_device_table(struct protection_domain *domain, 2265 + struct domain_pgtable *pgtable) 2266 + { 2267 + update_device_table(domain, pgtable); 2268 + domain_flush_devices(domain); 2269 + } 2270 + 2334 2271 static void update_domain(struct protection_domain *domain) 2335 2272 { 2336 - update_device_table(domain); 2273 + struct domain_pgtable pgtable; 2337 2274 2338 - domain_flush_devices(domain); 2275 + /* Update device table */ 2276 + amd_iommu_domain_get_pgtable(domain, &pgtable); 2277 + update_and_flush_device_table(domain, &pgtable); 2278 + 2279 + /* Flush domain TLB(s) and wait for completion */ 2339 2280 
domain_flush_tlb_pde(domain); 2281 + domain_flush_complete(domain); 2340 2282 } 2341 2283 2342 2284 int __init amd_iommu_init_api(void) ··· 2459 2375 static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) 2460 2376 { 2461 2377 struct protection_domain *pdomain; 2378 + u64 *pt_root, root; 2462 2379 2463 2380 switch (type) { 2464 2381 case IOMMU_DOMAIN_UNMANAGED: ··· 2467 2382 if (!pdomain) 2468 2383 return NULL; 2469 2384 2470 - pdomain->mode = PAGE_MODE_3_LEVEL; 2471 - pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); 2472 - if (!pdomain->pt_root) { 2385 + pt_root = (void *)get_zeroed_page(GFP_KERNEL); 2386 + if (!pt_root) { 2473 2387 protection_domain_free(pdomain); 2474 2388 return NULL; 2475 2389 } 2390 + 2391 + root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL); 2392 + atomic64_set(&pdomain->pt_root, root); 2476 2393 2477 2394 pdomain->domain.geometry.aperture_start = 0; 2478 2395 pdomain->domain.geometry.aperture_end = ~0ULL; ··· 2493 2406 if (!pdomain) 2494 2407 return NULL; 2495 2408 2496 - pdomain->mode = PAGE_MODE_NONE; 2409 + atomic64_set(&pdomain->pt_root, PAGE_MODE_NONE); 2497 2410 break; 2498 2411 default: 2499 2412 return NULL; ··· 2505 2418 static void amd_iommu_domain_free(struct iommu_domain *dom) 2506 2419 { 2507 2420 struct protection_domain *domain; 2421 + struct domain_pgtable pgtable; 2508 2422 2509 2423 domain = to_pdomain(dom); 2510 2424 ··· 2523 2435 dma_ops_domain_free(domain); 2524 2436 break; 2525 2437 default: 2526 - if (domain->mode != PAGE_MODE_NONE) 2438 + amd_iommu_domain_get_pgtable(domain, &pgtable); 2439 + 2440 + if (pgtable.mode != PAGE_MODE_NONE) 2527 2441 free_pagetable(domain); 2528 2442 2529 2443 if (domain->flags & PD_IOMMUV2_MASK) ··· 2608 2518 gfp_t gfp) 2609 2519 { 2610 2520 struct protection_domain *domain = to_pdomain(dom); 2521 + struct domain_pgtable pgtable; 2611 2522 int prot = 0; 2612 2523 int ret; 2613 2524 2614 - if (domain->mode == PAGE_MODE_NONE) 2525 + 
amd_iommu_domain_get_pgtable(domain, &pgtable); 2526 + if (pgtable.mode == PAGE_MODE_NONE) 2615 2527 return -EINVAL; 2616 2528 2617 2529 if (iommu_prot & IOMMU_READ) ··· 2633 2541 struct iommu_iotlb_gather *gather) 2634 2542 { 2635 2543 struct protection_domain *domain = to_pdomain(dom); 2544 + struct domain_pgtable pgtable; 2636 2545 2637 - if (domain->mode == PAGE_MODE_NONE) 2546 + amd_iommu_domain_get_pgtable(domain, &pgtable); 2547 + if (pgtable.mode == PAGE_MODE_NONE) 2638 2548 return 0; 2639 2549 2640 2550 return iommu_unmap_page(domain, iova, page_size); ··· 2647 2553 { 2648 2554 struct protection_domain *domain = to_pdomain(dom); 2649 2555 unsigned long offset_mask, pte_pgsize; 2556 + struct domain_pgtable pgtable; 2650 2557 u64 *pte, __pte; 2651 2558 2652 - if (domain->mode == PAGE_MODE_NONE) 2559 + amd_iommu_domain_get_pgtable(domain, &pgtable); 2560 + if (pgtable.mode == PAGE_MODE_NONE) 2653 2561 return iova; 2654 2562 2655 2563 pte = fetch_pte(domain, iova, &pte_pgsize); ··· 2804 2708 void amd_iommu_domain_direct_map(struct iommu_domain *dom) 2805 2709 { 2806 2710 struct protection_domain *domain = to_pdomain(dom); 2711 + struct domain_pgtable pgtable; 2807 2712 unsigned long flags; 2713 + u64 pt_root; 2808 2714 2809 2715 spin_lock_irqsave(&domain->lock, flags); 2810 2716 2717 + /* First save pgtable configuration*/ 2718 + amd_iommu_domain_get_pgtable(domain, &pgtable); 2719 + 2811 2720 /* Update data structure */ 2812 - domain->mode = PAGE_MODE_NONE; 2721 + pt_root = amd_iommu_domain_encode_pgtable(NULL, PAGE_MODE_NONE); 2722 + atomic64_set(&domain->pt_root, pt_root); 2813 2723 2814 2724 /* Make changes visible to IOMMUs */ 2815 2725 update_domain(domain); 2726 + 2727 + /* Restore old pgtable in domain->ptroot to free page-table */ 2728 + pt_root = amd_iommu_domain_encode_pgtable(pgtable.root, pgtable.mode); 2729 + atomic64_set(&domain->pt_root, pt_root); 2816 2730 2817 2731 /* Page-table is not visible to IOMMU anymore, so free it */ 2818 2732 
free_pagetable(domain); ··· 3014 2908 static int __set_gcr3(struct protection_domain *domain, int pasid, 3015 2909 unsigned long cr3) 3016 2910 { 2911 + struct domain_pgtable pgtable; 3017 2912 u64 *pte; 3018 2913 3019 - if (domain->mode != PAGE_MODE_NONE) 2914 + amd_iommu_domain_get_pgtable(domain, &pgtable); 2915 + if (pgtable.mode != PAGE_MODE_NONE) 3020 2916 return -EINVAL; 3021 2917 3022 2918 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); ··· 3032 2924 3033 2925 static int __clear_gcr3(struct protection_domain *domain, int pasid) 3034 2926 { 2927 + struct domain_pgtable pgtable; 3035 2928 u64 *pte; 3036 2929 3037 - if (domain->mode != PAGE_MODE_NONE) 2930 + amd_iommu_domain_get_pgtable(domain, &pgtable); 2931 + if (pgtable.mode != PAGE_MODE_NONE) 3038 2932 return -EINVAL; 3039 2933 3040 2934 pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);
+7 -2
drivers/iommu/amd_iommu_types.h
··· 468 468 iommu core code */ 469 469 spinlock_t lock; /* mostly used to lock the page table*/ 470 470 u16 id; /* the domain id written to the device table */ 471 - int mode; /* paging mode (0-6 levels) */ 472 - u64 *pt_root; /* page table root pointer */ 471 + atomic64_t pt_root; /* pgtable root and pgtable mode */ 473 472 int glx; /* Number of levels for GCR3 table */ 474 473 u64 *gcr3_tbl; /* Guest CR3 table */ 475 474 unsigned long flags; /* flags to find out type of domain */ 476 475 unsigned dev_cnt; /* devices assigned to this domain */ 477 476 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ 477 + }; 478 + 479 + /* For decocded pt_root */ 480 + struct domain_pgtable { 481 + int mode; 482 + u64 *root; 478 483 }; 479 484 480 485 /*
+1 -1
drivers/iommu/virtio-iommu.c
··· 453 453 if (!region) 454 454 return -ENOMEM; 455 455 456 - list_add(&vdev->resv_regions, &region->list); 456 + list_add(&region->list, &vdev->resv_regions); 457 457 return 0; 458 458 } 459 459