Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'iommu-updates-v7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux

Pull iommu updates from Joerg Roedel:
"Core:

- Support for RISC-V IO-page-table format in generic iommupt code

ARM-SMMU Updates:

- Introduction of an "invalidation array" for SMMUv3, which enables
future scalability work and optimisations for devices with a large
number of SMMUv3 instances

- Update the conditions under which the SMMUv3 driver works around
hardware errata for invalidation on MMU-700 implementations

- Fix broken command filtering for the host view of NVIDIA's "cmdqv"
SMMUv3 extension

- MMU-500 device-tree binding additions for Qualcomm Eliza & Hawi
SoCs

Intel VT-d:

- Support for dirty tracking on domains attached to PASID

- Removal of unnecessary read*()/write*() wrappers

- Improvements to the invalidation paths

AMD Vi:

- Fix a race condition in the debugfs code

- Make log buffer allocation NUMA aware

RISC-V:

- IO-TLB flushing improvements

- Minor fixes"

* tag 'iommu-updates-v7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux: (48 commits)
iommu/vt-d: Restore IOMMU_CAP_CACHE_COHERENCY
dt-bindings: arm-smmu: qcom: Add compatible for Hawi SoC
iommu/amd: Invalidate IRT cache for DMA aliases
iommu/riscv: Remove overflows on the invalidation path
iommu/amd: Fix clone_alias() to use the original device's devid
iommu/vt-d: Remove the remaining pages along the invalidation path
iommu/vt-d: Pass size_order to qi_desc_piotlb() not npages
iommu/vt-d: Split piotlb invalidation into range and all
iommu/vt-d: Remove dmar_writel() and dmar_writeq()
iommu/vt-d: Remove dmar_readl() and dmar_readq()
iommufd/selftest: Test dirty tracking on PASID
iommu/vt-d: Support dirty tracking on PASID
iommu/vt-d: Rename device_set_dirty_tracking() and pass dmar_domain pointer
iommu/vt-d: Block PASID attachment to nested domain with dirty tracking
iommu/dma: Always allow DMA-FQ when iommupt provides the iommu_domain
iommu/riscv: Fix signedness bug
iommu/amd: Fix illegal cap/mmio access in IOMMU debugfs
iommu/amd: Fix illegal device-id access in IOMMU debugfs
iommu/tegra241-cmdqv: Update uAPI to clarify HYP_OWN requirement
iommu/tegra241-cmdqv: Set supports_cmd op in tegra241_vcmdq_hw_init()
...

+2157 -796
+7 -1
Documentation/arch/arm64/silicon-errata.rst
 +----------------+-----------------+-----------------+-----------------------------+
 | ARM            | MMU-600         | #1076982,1209401| N/A                         |
 +----------------+-----------------+-----------------+-----------------------------+
-| ARM            | MMU-700         | #2268618,2812531| N/A                         |
+| ARM            | MMU-700         | #2133013,       | N/A                         |
+|                |                 | #2268618,       |                             |
+|                |                 | #2812531,       |                             |
+|                |                 | #3777127        |                             |
 +----------------+-----------------+-----------------+-----------------------------+
+| ARM            | MMU L1          | #3878312        | N/A                         |
++----------------+-----------------+-----------------+-----------------------------+
+| ARM            | MMU S3          | #3995052        | N/A                         |
 +----------------+-----------------+-----------------+-----------------------------+
 | ARM            | GIC-700         | #2941627        | ARM64_ERRATUM_2941627       |
 +----------------+-----------------+-----------------+-----------------------------+
+2
Documentation/devicetree/bindings/iommu/arm,smmu.yaml
       - description: Qcom SoCs implementing "qcom,smmu-500" and "arm,mmu-500"
         items:
           - enum:
+              - qcom,eliza-smmu-500
               - qcom,glymur-smmu-500
               - qcom,kaanapali-smmu-500
               - qcom,milos-smmu-500
···
         items:
           - enum:
               - qcom,glymur-smmu-500
+              - qcom,hawi-smmu-500
               - qcom,kaanapali-smmu-500
               - qcom,milos-smmu-500
               - qcom,qcm2290-smmu-500
+31 -32
drivers/iommu/amd/debugfs.c
··· 26 26 { 27 27 struct seq_file *m = filp->private_data; 28 28 struct amd_iommu *iommu = m->private; 29 - int ret; 30 - 31 - iommu->dbg_mmio_offset = -1; 29 + int ret, dbg_mmio_offset = iommu->dbg_mmio_offset = -1; 32 30 33 31 if (cnt > OFS_IN_SZ) 34 32 return -EINVAL; 35 33 36 - ret = kstrtou32_from_user(ubuf, cnt, 0, &iommu->dbg_mmio_offset); 34 + ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_mmio_offset); 37 35 if (ret) 38 36 return ret; 39 37 40 - if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) { 41 - iommu->dbg_mmio_offset = -1; 42 - return -EINVAL; 43 - } 38 + if (dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) 39 + return -EINVAL; 44 40 41 + iommu->dbg_mmio_offset = dbg_mmio_offset; 45 42 return cnt; 46 43 } 47 44 ··· 46 49 { 47 50 struct amd_iommu *iommu = m->private; 48 51 u64 value; 52 + int dbg_mmio_offset = iommu->dbg_mmio_offset; 49 53 50 - if (iommu->dbg_mmio_offset < 0) { 54 + if (dbg_mmio_offset < 0 || dbg_mmio_offset > 55 + iommu->mmio_phys_end - sizeof(u64)) { 51 56 seq_puts(m, "Please provide mmio register's offset\n"); 52 57 return 0; 53 58 } 54 59 55 - value = readq(iommu->mmio_base + iommu->dbg_mmio_offset); 56 - seq_printf(m, "Offset:0x%x Value:0x%016llx\n", iommu->dbg_mmio_offset, value); 60 + value = readq(iommu->mmio_base + dbg_mmio_offset); 61 + seq_printf(m, "Offset:0x%x Value:0x%016llx\n", dbg_mmio_offset, value); 57 62 58 63 return 0; 59 64 } ··· 66 67 { 67 68 struct seq_file *m = filp->private_data; 68 69 struct amd_iommu *iommu = m->private; 69 - int ret; 70 - 71 - iommu->dbg_cap_offset = -1; 70 + int ret, dbg_cap_offset = iommu->dbg_cap_offset = -1; 72 71 73 72 if (cnt > OFS_IN_SZ) 74 73 return -EINVAL; 75 74 76 - ret = kstrtou32_from_user(ubuf, cnt, 0, &iommu->dbg_cap_offset); 75 + ret = kstrtou32_from_user(ubuf, cnt, 0, &dbg_cap_offset); 77 76 if (ret) 78 77 return ret; 79 78 80 79 /* Capability register at offset 0x14 is the last IOMMU capability register. 
*/ 81 - if (iommu->dbg_cap_offset > 0x14) { 82 - iommu->dbg_cap_offset = -1; 80 + if (dbg_cap_offset > 0x14) 83 81 return -EINVAL; 84 - } 85 82 83 + iommu->dbg_cap_offset = dbg_cap_offset; 86 84 return cnt; 87 85 } 88 86 ··· 87 91 { 88 92 struct amd_iommu *iommu = m->private; 89 93 u32 value; 90 - int err; 94 + int err, dbg_cap_offset = iommu->dbg_cap_offset; 91 95 92 - if (iommu->dbg_cap_offset < 0) { 96 + if (dbg_cap_offset < 0 || dbg_cap_offset > 0x14) { 93 97 seq_puts(m, "Please provide capability register's offset in the range [0x00 - 0x14]\n"); 94 98 return 0; 95 99 } 96 100 97 - err = pci_read_config_dword(iommu->dev, iommu->cap_ptr + iommu->dbg_cap_offset, &value); 101 + err = pci_read_config_dword(iommu->dev, iommu->cap_ptr + dbg_cap_offset, &value); 98 102 if (err) { 99 103 seq_printf(m, "Not able to read capability register at 0x%x\n", 100 - iommu->dbg_cap_offset); 104 + dbg_cap_offset); 101 105 return 0; 102 106 } 103 107 104 - seq_printf(m, "Offset:0x%x Value:0x%08x\n", iommu->dbg_cap_offset, value); 108 + seq_printf(m, "Offset:0x%x Value:0x%08x\n", dbg_cap_offset, value); 105 109 106 110 return 0; 107 111 } ··· 193 197 static int devid_show(struct seq_file *m, void *unused) 194 198 { 195 199 u16 devid; 200 + int sbdf_shadow = sbdf; 196 201 197 - if (sbdf >= 0) { 198 - devid = PCI_SBDF_TO_DEVID(sbdf); 199 - seq_printf(m, "%04x:%02x:%02x.%x\n", PCI_SBDF_TO_SEGID(sbdf), 202 + if (sbdf_shadow >= 0) { 203 + devid = PCI_SBDF_TO_DEVID(sbdf_shadow); 204 + seq_printf(m, "%04x:%02x:%02x.%x\n", PCI_SBDF_TO_SEGID(sbdf_shadow), 200 205 PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid)); 201 206 } else 202 207 seq_puts(m, "No or Invalid input provided\n"); ··· 234 237 { 235 238 struct amd_iommu_pci_seg *pci_seg; 236 239 u16 seg, devid; 240 + int sbdf_shadow = sbdf; 237 241 238 - if (sbdf < 0) { 242 + if (sbdf_shadow < 0) { 239 243 seq_puts(m, "Enter a valid device ID to 'devid' file\n"); 240 244 return 0; 241 245 } 242 - seg = PCI_SBDF_TO_SEGID(sbdf); 243 - devid = PCI_SBDF_TO_DEVID(sbdf); 246 + seg = PCI_SBDF_TO_SEGID(sbdf_shadow); 247 + devid = PCI_SBDF_TO_DEVID(sbdf_shadow); 244 248 245 249 for_each_pci_segment(pci_seg) { 246 250 if (pci_seg->id != seg) ··· 334 336 { 335 337 struct amd_iommu_pci_seg *pci_seg; 336 338 u16 devid, seg; 339 + int sbdf_shadow = sbdf; 337 340 338 341 if (!irq_remapping_enabled) { 339 342 seq_puts(m, "Interrupt remapping is disabled\n"); 340 343 return 0; 341 344 } 342 345 343 - if (sbdf < 0) { 346 + if (sbdf_shadow < 0) { 344 347 seq_puts(m, "Enter a valid device ID to 'devid' file\n"); 345 348 return 0; 346 349 } 347 350 348 - seg = PCI_SBDF_TO_SEGID(sbdf); 349 - devid = PCI_SBDF_TO_DEVID(sbdf); 351 + seg = PCI_SBDF_TO_SEGID(sbdf_shadow); 352 + devid = PCI_SBDF_TO_DEVID(sbdf_shadow); 350 353 351 354 for_each_pci_segment(pci_seg) { 352 355 if (pci_seg->id != seg)
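Editor's note: the debugfs fixes above all follow the same pattern. Instead of validating a field that another debugfs write can change concurrently (iommu->dbg_mmio_offset, iommu->dbg_cap_offset, the shared sbdf value) and then re-reading it for the actual access, each handler takes one local snapshot and validates and uses only that copy. Below is a minimal user-space model of that snapshot-then-validate idea; the names (shared_offset, show_register, REG_SPACE_END) are invented for illustration and this is not the kernel code itself.

    #include <stdio.h>
    #include <stdatomic.h>

    #define REG_SPACE_END 0x2000            /* hypothetical MMIO window size */

    /* Shared state, writable from a "store" handler at any time. */
    static _Atomic int shared_offset = -1;

    /*
     * Snapshot-then-validate: read the shared offset exactly once into a
     * local, then validate and use only the local copy.  Re-reading the
     * shared field after the check (as the old debugfs code did) lets a
     * concurrent write slip an out-of-range value in between the check
     * and the access.
     */
    static void show_register(const unsigned long *regs)
    {
        int off = atomic_load(&shared_offset);      /* single read */

        if (off < 0 || off > REG_SPACE_END - (int)sizeof(*regs)) {
            puts("Please provide a register offset");
            return;
        }
        printf("Offset:0x%x Value:0x%lx\n", off, regs[off / sizeof(*regs)]);
    }

    int main(void)
    {
        unsigned long regs[REG_SPACE_END / sizeof(unsigned long)];

        for (size_t i = 0; i < sizeof(regs) / sizeof(regs[0]); i++)
            regs[i] = i;

        atomic_store(&shared_offset, 0x40);
        show_register(regs);                        /* Offset:0x40 Value:0x8 */
        atomic_store(&shared_offset, -1);
        show_register(regs);                        /* rejected */
        return 0;
    }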
+6 -3
drivers/iommu/amd/init.c
 void *__init iommu_alloc_4k_pages(struct amd_iommu *iommu, gfp_t gfp,
                                   size_t size)
 {
+        int nid = iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
         void *buf;

         size = PAGE_ALIGN(size);
-        buf = iommu_alloc_pages_sz(gfp, size);
+        buf = iommu_alloc_pages_node_sz(nid, gfp, size);
         if (!buf)
                 return NULL;
         if (check_feature(FEATURE_SNP) &&
···

 static int iommu_init_ga_log(struct amd_iommu *iommu)
 {
+        int nid = iommu->dev ? dev_to_node(&iommu->dev->dev) : NUMA_NO_NODE;
+
         if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
                 return 0;

-        iommu->ga_log = iommu_alloc_pages_sz(GFP_KERNEL, GA_LOG_SIZE);
+        iommu->ga_log = iommu_alloc_pages_node_sz(nid, GFP_KERNEL, GA_LOG_SIZE);
         if (!iommu->ga_log)
                 goto err_out;

-        iommu->ga_log_tail = iommu_alloc_pages_sz(GFP_KERNEL, 8);
+        iommu->ga_log_tail = iommu_alloc_pages_node_sz(nid, GFP_KERNEL, 8);
         if (!iommu->ga_log_tail)
                 goto err_out;
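Editor's note: the change above derives a NUMA node from the IOMMU's own device and allocates its log buffers on that node, falling back to NUMA_NO_NODE when no device is known yet. A tiny stand-alone model of that node-selection fallback is sketched below; dev_to_node() and NUMA_NO_NODE are the real kernel symbols being mimicked, everything else (toy_device, alloc_log_buffer) is invented for illustration.

    #include <stdio.h>
    #include <stdlib.h>

    #define NUMA_NO_NODE (-1)              /* same sentinel the kernel uses */

    /* Toy stand-in for struct device / dev_to_node(); not the kernel types. */
    struct toy_device { int numa_node; };

    static int toy_dev_to_node(const struct toy_device *dev)
    {
        return dev ? dev->numa_node : NUMA_NO_NODE;
    }

    /*
     * Models the patched allocation: prefer the node the IOMMU device sits
     * on, so the hardware logs into memory local to it.
     */
    static void *alloc_log_buffer(const struct toy_device *iommu_dev, size_t size)
    {
        int nid = toy_dev_to_node(iommu_dev);

        printf("allocating %zu bytes on node %d%s\n", size, nid,
               nid == NUMA_NO_NODE ? " (any node)" : "");
        return malloc(size);               /* kernel: iommu_alloc_pages_node_sz() */
    }

    int main(void)
    {
        struct toy_device iommu_on_node1 = { .numa_node = 1 };

        free(alloc_log_buffer(&iommu_on_node1, 8192)); /* node 1 */
        free(alloc_log_buffer(NULL, 8192));            /* no device known yet */
        return 0;
    }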
+33 -10
drivers/iommu/amd/iommu.c
··· 403 403 return NULL; 404 404 } 405 405 406 - static int clone_alias(struct pci_dev *pdev, u16 alias, void *data) 406 + static int clone_alias(struct pci_dev *pdev_origin, u16 alias, void *data) 407 407 { 408 408 struct dev_table_entry new; 409 409 struct amd_iommu *iommu; 410 410 struct iommu_dev_data *dev_data, *alias_data; 411 + struct pci_dev *pdev = data; 411 412 u16 devid = pci_dev_id(pdev); 412 413 int ret = 0; 413 414 ··· 455 454 * part of the PCI DMA aliases if it's bus differs 456 455 * from the original device. 457 456 */ 458 - clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], NULL); 457 + clone_alias(pdev, iommu->pci_seg->alias_table[pci_dev_id(pdev)], pdev); 459 458 460 - pci_for_each_dma_alias(pdev, clone_alias, NULL); 459 + pci_for_each_dma_alias(pdev, clone_alias, pdev); 461 460 } 462 461 463 462 static void setup_aliases(struct amd_iommu *iommu, struct device *dev) ··· 2992 2991 return amdr_ivrs_remap_support; 2993 2992 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: 2994 2993 return true; 2995 - case IOMMU_CAP_DEFERRED_FLUSH: 2996 - return true; 2997 2994 case IOMMU_CAP_DIRTY_TRACKING: { 2998 2995 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2999 2996 3000 2997 return amd_iommu_hd_support(iommu); 2998 + } 2999 + case IOMMU_CAP_PCI_ATS_SUPPORTED: { 3000 + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 3001 + 3002 + return amd_iommu_iotlb_sup && 3003 + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP); 3001 3004 } 3002 3005 default: 3003 3006 break; ··· 3184 3179 static struct irq_chip amd_ir_chip; 3185 3180 static DEFINE_SPINLOCK(iommu_table_lock); 3186 3181 3182 + static int iommu_flush_dev_irt(struct pci_dev *unused, u16 devid, void *data) 3183 + { 3184 + int ret; 3185 + struct iommu_cmd cmd; 3186 + struct amd_iommu *iommu = data; 3187 + 3188 + build_inv_irt(&cmd, devid); 3189 + ret = __iommu_queue_command_sync(iommu, &cmd, true); 3190 + return ret; 3191 + } 3192 + 3187 3193 static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) 3188 3194 { 3189 3195 int ret; 3190 3196 u64 data; 3191 3197 unsigned long flags; 3192 - struct iommu_cmd cmd, cmd2; 3198 + struct iommu_cmd cmd; 3199 + struct pci_dev *pdev = NULL; 3200 + struct iommu_dev_data *dev_data = search_dev_data(iommu, devid); 3193 3201 3194 3202 if (iommu->irtcachedis_enabled) 3195 3203 return; 3196 3204 3197 - build_inv_irt(&cmd, devid); 3205 + if (dev_data && dev_data->dev && dev_is_pci(dev_data->dev)) 3206 + pdev = to_pci_dev(dev_data->dev); 3198 3207 3199 3208 raw_spin_lock_irqsave(&iommu->lock, flags); 3200 3209 data = get_cmdsem_val(iommu); 3201 - build_completion_wait(&cmd2, iommu, data); 3210 + build_completion_wait(&cmd, iommu, data); 3202 3211 3203 - ret = __iommu_queue_command_sync(iommu, &cmd, true); 3212 + if (pdev) 3213 + ret = pci_for_each_dma_alias(pdev, iommu_flush_dev_irt, iommu); 3214 + else 3215 + ret = iommu_flush_dev_irt(NULL, devid, iommu); 3204 3216 if (ret) 3205 3217 goto out_err; 3206 - ret = __iommu_queue_command_sync(iommu, &cmd2, false); 3218 + 3219 + ret = __iommu_queue_command_sync(iommu, &cmd, false); 3207 3220 if (ret) 3208 3221 goto out_err; 3209 3222 raw_spin_unlock_irqrestore(&iommu->lock, flags);
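Editor's note: two of the AMD changes above hinge on pci_for_each_dma_alias() invoking a callback once per alias ID. clone_alias() now receives the original pci_dev through the opaque data argument instead of NULL, and the interrupt-remapping-table invalidation is repeated for every DMA alias of the device. The sketch below models that callback-with-context iteration in plain C; walk_dma_aliases, toy_pci_dev and the callback names are invented for illustration, only the overall shape mirrors the driver.

    #include <stdio.h>

    /* Toy model of a device that is known to the bus under several IDs. */
    struct toy_pci_dev {
        unsigned short devid;              /* the device's own requester ID */
        unsigned short aliases[4];         /* other IDs it may use for DMA/MSI */
        int num_aliases;
    };

    /*
     * Shape of the pci_for_each_dma_alias() contract: call fn() for the
     * device itself and once per alias, forwarding an opaque context pointer.
     */
    static int walk_dma_aliases(struct toy_pci_dev *pdev,
                                int (*fn)(unsigned short alias, void *data),
                                void *data)
    {
        int ret = fn(pdev->devid, data);

        for (int i = 0; !ret && i < pdev->num_aliases; i++)
            ret = fn(pdev->aliases[i], data);
        return ret;
    }

    /* Like the fixed clone_alias(): the original device arrives via data. */
    static int clone_alias_cb(unsigned short alias, void *data)
    {
        struct toy_pci_dev *orig = data;

        printf("clone DTE of %#x into alias %#x\n", orig->devid, alias);
        return 0;
    }

    /* Like iommu_flush_dev_irt(): one IRT invalidation per alias ID. */
    static int flush_irt_cb(unsigned short alias, void *data)
    {
        (void)data;                        /* no context needed here */
        printf("INVALIDATE_INTERRUPT_TABLE devid=%#x\n", alias);
        return 0;
    }

    int main(void)
    {
        struct toy_pci_dev bridge_child = {
            .devid = 0x0800, .aliases = { 0x0700 }, .num_aliases = 1,
        };

        walk_dma_aliases(&bridge_child, clone_alias_cb, &bridge_child);
        walk_dma_aliases(&bridge_child, flush_irt_cb, NULL);
        return 0;
    }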
+7 -28
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c
··· 122 122 } 123 123 EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd); 124 124 125 - /* 126 - * Cloned from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h, this 127 - * is used as a threshold to replace per-page TLBI commands to issue in the 128 - * command queue with an address-space TLBI command, when SMMU w/o a range 129 - * invalidation feature handles too many per-page TLBI commands, which will 130 - * otherwise result in a soft lockup. 131 - */ 132 - #define CMDQ_MAX_TLBI_OPS (1 << (PAGE_SHIFT - 3)) 133 - 134 125 static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, 135 126 struct mm_struct *mm, 136 127 unsigned long start, ··· 137 146 * range. So do a simple translation here by calculating size correctly. 138 147 */ 139 148 size = end - start; 140 - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_RANGE_INV)) { 141 - if (size >= CMDQ_MAX_TLBI_OPS * PAGE_SIZE) 142 - size = 0; 143 - } else { 144 - if (size == ULONG_MAX) 145 - size = 0; 146 - } 147 149 148 - if (!size) 149 - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); 150 - else 151 - arm_smmu_tlb_inv_range_asid(start, size, smmu_domain->cd.asid, 152 - PAGE_SIZE, false, smmu_domain); 153 - 154 - arm_smmu_atc_inv_domain(smmu_domain, start, size); 150 + arm_smmu_domain_inv_range(smmu_domain, start, size, PAGE_SIZE, false); 155 151 } 156 152 157 153 static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) ··· 169 191 } 170 192 spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); 171 193 172 - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); 173 - arm_smmu_atc_inv_domain(smmu_domain, 0, 0); 194 + arm_smmu_domain_inv(smmu_domain); 174 195 } 175 196 176 197 static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn) 177 198 { 178 - kfree(container_of(mn, struct arm_smmu_domain, mmu_notifier)); 199 + arm_smmu_domain_free( 200 + container_of(mn, struct arm_smmu_domain, mmu_notifier)); 179 201 } 180 202 181 203 static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { ··· 279 301 /* 280 302 * Ensure the ASID is empty in the iommu cache before allowing reuse. 281 303 */ 282 - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); 304 + arm_smmu_domain_inv(smmu_domain); 283 305 284 306 /* 285 307 * Notice that the arm_smmu_mm_arch_invalidate_secondary_tlbs op can ··· 324 346 * ARM_SMMU_FEAT_RANGE_INV is present 325 347 */ 326 348 smmu_domain->domain.pgsize_bitmap = PAGE_SIZE; 349 + smmu_domain->stage = ARM_SMMU_DOMAIN_SVA; 327 350 smmu_domain->smmu = smmu; 328 351 329 352 ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain, ··· 343 364 err_asid: 344 365 xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid); 345 366 err_free: 346 - kfree(smmu_domain); 367 + arm_smmu_domain_free(smmu_domain); 347 368 return ERR_PTR(ret); 348 369 }
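Editor's note: the CMDQ_MAX_TLBI_OPS heuristic removed from the SVA code above does not disappear; per the arm-smmu-v3.c diff further down it moves into the common path as arm_smmu_inv_size_too_big(), computed from the invalidation granule rather than PAGE_SHIFT, and only applies when the SMMU lacks range invalidation. The arithmetic is worth spelling out: with a 4 KiB granule the threshold is 1 << (12 - 3) = 512 per-page commands, so a range of 2 MiB or more collapses into a single by-ASID/by-VMID invalidation. A small program working the numbers for the three translation granules is shown below (an illustration of the formula, not driver code).

    #include <stdio.h>

    /* ilog2 for power-of-two sizes, enough for this illustration */
    static unsigned int ilog2u(unsigned long v)
    {
        unsigned int r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    /*
     * Mirrors the threshold in arm_smmu_inv_size_too_big(): if a range would
     * need more than max_tlbi_ops per-granule TLBI commands, one full
     * ASID/VMID invalidation is cheaper and avoids soft lockups on SMMUs
     * without the range invalidation feature.
     */
    static int size_too_big(unsigned long size, unsigned long granule)
    {
        unsigned long max_tlbi_ops = 1UL << (ilog2u(granule) - 3);

        return size >= max_tlbi_ops * granule;
    }

    int main(void)
    {
        const unsigned long granules[] = { 4096, 16384, 65536 };

        for (int i = 0; i < 3; i++) {
            unsigned long g = granules[i];
            unsigned long ops = 1UL << (ilog2u(g) - 3);

            printf("%3lu KiB granule: %4lu ops max -> full invalidation "
                   "from %lu MiB\n", g >> 10, ops, (ops * g) >> 20);
        }

        /* a 1 MiB unmap with 4 KiB pages stays per-page ... */
        printf("1 MiB/4K too big? %d\n", size_too_big(1UL << 20, 4096));
        /* ... a 2 MiB unmap does not */
        printf("2 MiB/4K too big? %d\n", size_too_big(2UL << 20, 4096));
        return 0;
    }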
+135
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c
··· 637 637 NUM_EXPECTED_SYNCS(2)); 638 638 } 639 639 640 + static void arm_smmu_v3_invs_test_verify(struct kunit *test, 641 + struct arm_smmu_invs *invs, 642 + int num_invs, const int num_trashes, 643 + const int *ids, const int *users, 644 + const int *ssids) 645 + { 646 + KUNIT_EXPECT_EQ(test, invs->num_invs, num_invs); 647 + KUNIT_EXPECT_EQ(test, invs->num_trashes, num_trashes); 648 + while (num_invs--) { 649 + KUNIT_EXPECT_EQ(test, invs->inv[num_invs].id, ids[num_invs]); 650 + KUNIT_EXPECT_EQ(test, READ_ONCE(invs->inv[num_invs].users), 651 + users[num_invs]); 652 + KUNIT_EXPECT_EQ(test, invs->inv[num_invs].ssid, ssids[num_invs]); 653 + } 654 + } 655 + 656 + static struct arm_smmu_invs invs1 = { 657 + .num_invs = 3, 658 + .inv = { { .type = INV_TYPE_S2_VMID, .id = 1, }, 659 + { .type = INV_TYPE_S2_VMID_S1_CLEAR, .id = 1, }, 660 + { .type = INV_TYPE_ATS, .id = 3, }, }, 661 + }; 662 + 663 + static struct arm_smmu_invs invs2 = { 664 + .num_invs = 3, 665 + .inv = { { .type = INV_TYPE_S2_VMID, .id = 1, }, /* duplicated */ 666 + { .type = INV_TYPE_ATS, .id = 4, }, 667 + { .type = INV_TYPE_ATS, .id = 5, }, }, 668 + }; 669 + 670 + static struct arm_smmu_invs invs3 = { 671 + .num_invs = 3, 672 + .inv = { { .type = INV_TYPE_S2_VMID, .id = 1, }, /* duplicated */ 673 + { .type = INV_TYPE_ATS, .id = 5, }, /* recover a trash */ 674 + { .type = INV_TYPE_ATS, .id = 6, }, }, 675 + }; 676 + 677 + static struct arm_smmu_invs invs4 = { 678 + .num_invs = 3, 679 + .inv = { { .type = INV_TYPE_ATS, .id = 10, .ssid = 1 }, 680 + { .type = INV_TYPE_ATS, .id = 10, .ssid = 3 }, 681 + { .type = INV_TYPE_ATS, .id = 12, .ssid = 1 }, }, 682 + }; 683 + 684 + static struct arm_smmu_invs invs5 = { 685 + .num_invs = 3, 686 + .inv = { { .type = INV_TYPE_ATS, .id = 10, .ssid = 2 }, 687 + { .type = INV_TYPE_ATS, .id = 10, .ssid = 3 }, /* duplicate */ 688 + { .type = INV_TYPE_ATS, .id = 12, .ssid = 2 }, }, 689 + }; 690 + 691 + static void arm_smmu_v3_invs_test(struct kunit *test) 692 + { 693 + const int results1[3][3] = { { 1, 1, 3, }, { 1, 1, 1, }, { 0, 0, 0, } }; 694 + const int results2[3][5] = { { 1, 1, 3, 4, 5, }, { 2, 1, 1, 1, 1, }, { 0, 0, 0, 0, 0, } }; 695 + const int results3[3][3] = { { 1, 1, 3, }, { 1, 1, 1, }, { 0, 0, 0, } }; 696 + const int results4[3][5] = { { 1, 1, 3, 5, 6, }, { 2, 1, 1, 1, 1, }, { 0, 0, 0, 0, 0, } }; 697 + const int results5[3][5] = { { 1, 1, 3, 5, 6, }, { 1, 0, 0, 1, 1, }, { 0, 0, 0, 0, 0, } }; 698 + const int results6[3][3] = { { 1, 5, 6, }, { 1, 1, 1, }, { 0, 0, 0, } }; 699 + const int results7[3][3] = { { 10, 10, 12, }, { 1, 1, 1, }, { 1, 3, 1, } }; 700 + const int results8[3][5] = { { 10, 10, 10, 12, 12, }, { 1, 1, 2, 1, 1, }, { 1, 2, 3, 1, 2, } }; 701 + const int results9[3][4] = { { 10, 10, 10, 12, }, { 1, 0, 1, 1, }, { 1, 2, 3, 1, } }; 702 + const int results10[3][3] = { { 10, 10, 12, }, { 1, 1, 1, }, { 1, 3, 1, } }; 703 + struct arm_smmu_invs *test_a, *test_b; 704 + 705 + /* New array */ 706 + test_a = arm_smmu_invs_alloc(0); 707 + KUNIT_EXPECT_EQ(test, test_a->num_invs, 0); 708 + 709 + /* Test1: merge invs1 (new array) */ 710 + test_b = arm_smmu_invs_merge(test_a, &invs1); 711 + kfree(test_a); 712 + arm_smmu_v3_invs_test_verify(test, test_b, ARRAY_SIZE(results1[0]), 0, 713 + results1[0], results1[1], results1[2]); 714 + 715 + /* Test2: merge invs2 (new array) */ 716 + test_a = arm_smmu_invs_merge(test_b, &invs2); 717 + kfree(test_b); 718 + arm_smmu_v3_invs_test_verify(test, test_a, ARRAY_SIZE(results2[0]), 0, 719 + results2[0], results2[1], results2[2]); 720 + 721 + /* Test3: unref 
invs2 (same array) */ 722 + arm_smmu_invs_unref(test_a, &invs2); 723 + arm_smmu_v3_invs_test_verify(test, test_a, ARRAY_SIZE(results3[0]), 0, 724 + results3[0], results3[1], results3[2]); 725 + 726 + /* Test4: merge invs3 (new array) */ 727 + test_b = arm_smmu_invs_merge(test_a, &invs3); 728 + kfree(test_a); 729 + arm_smmu_v3_invs_test_verify(test, test_b, ARRAY_SIZE(results4[0]), 0, 730 + results4[0], results4[1], results4[2]); 731 + 732 + /* Test5: unref invs1 (same array) */ 733 + arm_smmu_invs_unref(test_b, &invs1); 734 + arm_smmu_v3_invs_test_verify(test, test_b, ARRAY_SIZE(results5[0]), 2, 735 + results5[0], results5[1], results5[2]); 736 + 737 + /* Test6: purge test_b (new array) */ 738 + test_a = arm_smmu_invs_purge(test_b); 739 + kfree(test_b); 740 + arm_smmu_v3_invs_test_verify(test, test_a, ARRAY_SIZE(results6[0]), 0, 741 + results6[0], results6[1], results6[2]); 742 + 743 + /* Test7: unref invs3 (same array) */ 744 + arm_smmu_invs_unref(test_a, &invs3); 745 + KUNIT_EXPECT_EQ(test, test_a->num_invs, 0); 746 + KUNIT_EXPECT_EQ(test, test_a->num_trashes, 0); 747 + 748 + /* Test8: merge invs4 (new array) */ 749 + test_b = arm_smmu_invs_merge(test_a, &invs4); 750 + kfree(test_a); 751 + arm_smmu_v3_invs_test_verify(test, test_b, ARRAY_SIZE(results7[0]), 0, 752 + results7[0], results7[1], results7[2]); 753 + 754 + /* Test9: merge invs5 (new array) */ 755 + test_a = arm_smmu_invs_merge(test_b, &invs5); 756 + kfree(test_b); 757 + arm_smmu_v3_invs_test_verify(test, test_a, ARRAY_SIZE(results8[0]), 0, 758 + results8[0], results8[1], results8[2]); 759 + 760 + /* Test10: unref invs5 (same array) */ 761 + arm_smmu_invs_unref(test_a, &invs5); 762 + arm_smmu_v3_invs_test_verify(test, test_a, ARRAY_SIZE(results9[0]), 1, 763 + results9[0], results9[1], results9[2]); 764 + 765 + /* Test11: purge test_a (new array) */ 766 + test_b = arm_smmu_invs_purge(test_a); 767 + kfree(test_a); 768 + arm_smmu_v3_invs_test_verify(test, test_b, ARRAY_SIZE(results10[0]), 0, 769 + results10[0], results10[1], results10[2]); 770 + 771 + kfree(test_b); 772 + } 773 + 640 774 static struct kunit_case arm_smmu_v3_test_cases[] = { 641 775 KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_abort), 642 776 KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_bypass), ··· 796 662 KUNIT_CASE(arm_smmu_v3_write_ste_test_nested_s1bypass_to_s1dssbypass), 797 663 KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear), 798 664 KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release), 665 + KUNIT_CASE(arm_smmu_v3_invs_test), 799 666 {}, 800 667 }; 801 668
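Editor's note: the kunit cases above exercise the three operations on the sorted invalidation array: merge (add entries, bumping users on duplicates), unref (drop users, leaving zero-user "trash" slots) and purge (compact the trash away). The stand-alone sketch below models just the merge step on two sorted integer arrays with user counts; the ordering and the users field mirror the driver's arrays, everything else is simplified and the names are invented.

    #include <stdio.h>

    struct entry { int id; int users; };

    /*
     * Merge the sorted array "add" into the sorted array "base": duplicates
     * get users+1, new ids are inserted with users=1.  The result stays
     * sorted, like arm_smmu_invs_merge() building a new array from the old.
     */
    static int merge(const struct entry *base, int nb,
                     const struct entry *add, int na,
                     struct entry *out)
    {
        int i = 0, j = 0, n = 0;

        while (i < nb || j < na) {
            if (j >= na || (i < nb && base[i].id < add[j].id)) {
                out[n++] = base[i++];                   /* base-only entry */
            } else if (i >= nb || add[j].id < base[i].id) {
                out[n++] = (struct entry){ .id = add[j].id, .users = 1 };
                j++;                                    /* new entry */
            } else {
                out[n] = base[i++];                     /* shared entry */
                out[n++].users++;
                j++;
            }
        }
        return n;
    }

    int main(void)
    {
        /* loosely modelled on invs1/invs2 in the kunit test */
        struct entry domain_invs[] = { { 1, 1 }, { 3, 1 } };
        struct entry master_invs[] = { { 1, 0 }, { 4, 0 }, { 5, 0 } };
        struct entry merged[8];
        int n = merge(domain_invs, 2, master_invs, 3, merged);

        for (int k = 0; k < n; k++)
            printf("id=%d users=%d\n", merged[k].id, merged[k].users);
        /* id=1 users=2, id=3 users=1, id=4 users=1, id=5 users=1 */
        return 0;
    }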
+800 -156
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
··· 26 26 #include <linux/pci.h> 27 27 #include <linux/pci-ats.h> 28 28 #include <linux/platform_device.h> 29 + #include <linux/sort.h> 29 30 #include <linux/string_choices.h> 30 31 #include <kunit/visibility.h> 31 32 #include <uapi/linux/iommufd.h> ··· 108 107 }; 109 108 110 109 static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); 110 + static bool arm_smmu_ats_supported(struct arm_smmu_master *master); 111 111 112 112 static void parse_driver_options(struct arm_smmu_device *smmu) 113 113 { ··· 1028 1026 */ 1029 1027 } 1030 1028 1031 - /* Context descriptor manipulation functions */ 1032 - void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) 1029 + /* Invalidation array manipulation functions */ 1030 + static inline struct arm_smmu_inv * 1031 + arm_smmu_invs_iter_next(struct arm_smmu_invs *invs, size_t next, size_t *idx) 1033 1032 { 1034 - struct arm_smmu_cmdq_ent cmd = { 1035 - .opcode = smmu->features & ARM_SMMU_FEAT_E2H ? 1036 - CMDQ_OP_TLBI_EL2_ASID : CMDQ_OP_TLBI_NH_ASID, 1037 - .tlbi.asid = asid, 1038 - }; 1039 - 1040 - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); 1033 + while (true) { 1034 + if (next >= invs->num_invs) { 1035 + *idx = next; 1036 + return NULL; 1037 + } 1038 + if (!READ_ONCE(invs->inv[next].users)) { 1039 + next++; 1040 + continue; 1041 + } 1042 + *idx = next; 1043 + return &invs->inv[next]; 1044 + } 1041 1045 } 1046 + 1047 + /** 1048 + * arm_smmu_invs_for_each_entry - Iterate over all non-trash entries in invs 1049 + * @invs: the base invalidation array 1050 + * @idx: a stack variable of 'size_t', to store the array index 1051 + * @cur: a stack variable of 'struct arm_smmu_inv *' 1052 + */ 1053 + #define arm_smmu_invs_for_each_entry(invs, idx, cur) \ 1054 + for (cur = arm_smmu_invs_iter_next(invs, 0, &(idx)); cur; \ 1055 + cur = arm_smmu_invs_iter_next(invs, idx + 1, &(idx))) 1056 + 1057 + static int arm_smmu_inv_cmp(const struct arm_smmu_inv *inv_l, 1058 + const struct arm_smmu_inv *inv_r) 1059 + { 1060 + if (inv_l->smmu != inv_r->smmu) 1061 + return cmp_int((uintptr_t)inv_l->smmu, (uintptr_t)inv_r->smmu); 1062 + if (inv_l->type != inv_r->type) 1063 + return cmp_int(inv_l->type, inv_r->type); 1064 + if (inv_l->id != inv_r->id) 1065 + return cmp_int(inv_l->id, inv_r->id); 1066 + if (arm_smmu_inv_is_ats(inv_l)) 1067 + return cmp_int(inv_l->ssid, inv_r->ssid); 1068 + return 0; 1069 + } 1070 + 1071 + static inline int arm_smmu_invs_iter_next_cmp(struct arm_smmu_invs *invs_l, 1072 + size_t next_l, size_t *idx_l, 1073 + struct arm_smmu_invs *invs_r, 1074 + size_t next_r, size_t *idx_r) 1075 + { 1076 + struct arm_smmu_inv *cur_l = 1077 + arm_smmu_invs_iter_next(invs_l, next_l, idx_l); 1078 + 1079 + /* 1080 + * We have to update the idx_r manually, because the invs_r cannot call 1081 + * arm_smmu_invs_iter_next() as the invs_r never sets any users counter. 1082 + */ 1083 + *idx_r = next_r; 1084 + 1085 + /* 1086 + * Compare of two sorted arrays items. If one side is past the end of 1087 + * the array, return the other side to let it run out the iteration. 1088 + * 1089 + * If the left entry is empty, return 1 to pick the right entry. 1090 + * If the right entry is empty, return -1 to pick the left entry. 
1091 + */ 1092 + if (!cur_l) 1093 + return 1; 1094 + if (next_r >= invs_r->num_invs) 1095 + return -1; 1096 + return arm_smmu_inv_cmp(cur_l, &invs_r->inv[next_r]); 1097 + } 1098 + 1099 + /** 1100 + * arm_smmu_invs_for_each_cmp - Iterate over two sorted arrays computing for 1101 + * arm_smmu_invs_merge() or arm_smmu_invs_unref() 1102 + * @invs_l: the base invalidation array 1103 + * @idx_l: a stack variable of 'size_t', to store the base array index 1104 + * @invs_r: the build_invs array as to_merge or to_unref 1105 + * @idx_r: a stack variable of 'size_t', to store the build_invs index 1106 + * @cmp: a stack variable of 'int', to store return value (-1, 0, or 1) 1107 + */ 1108 + #define arm_smmu_invs_for_each_cmp(invs_l, idx_l, invs_r, idx_r, cmp) \ 1109 + for (idx_l = idx_r = 0, \ 1110 + cmp = arm_smmu_invs_iter_next_cmp(invs_l, 0, &(idx_l), \ 1111 + invs_r, 0, &(idx_r)); \ 1112 + idx_l < invs_l->num_invs || idx_r < invs_r->num_invs; \ 1113 + cmp = arm_smmu_invs_iter_next_cmp( \ 1114 + invs_l, idx_l + (cmp <= 0 ? 1 : 0), &(idx_l), \ 1115 + invs_r, idx_r + (cmp >= 0 ? 1 : 0), &(idx_r))) 1116 + 1117 + /** 1118 + * arm_smmu_invs_merge() - Merge @to_merge into @invs and generate a new array 1119 + * @invs: the base invalidation array 1120 + * @to_merge: an array of invalidations to merge 1121 + * 1122 + * Return: a newly allocated array on success, or ERR_PTR 1123 + * 1124 + * This function must be locked and serialized with arm_smmu_invs_unref() and 1125 + * arm_smmu_invs_purge(), but do not lockdep on any lock for KUNIT test. 1126 + * 1127 + * Both @invs and @to_merge must be sorted, to ensure the returned array will be 1128 + * sorted as well. 1129 + * 1130 + * Caller is responsible for freeing the @invs and the returned new one. 1131 + * 1132 + * Entries marked as trash will be purged in the returned array. 1133 + */ 1134 + VISIBLE_IF_KUNIT 1135 + struct arm_smmu_invs *arm_smmu_invs_merge(struct arm_smmu_invs *invs, 1136 + struct arm_smmu_invs *to_merge) 1137 + { 1138 + struct arm_smmu_invs *new_invs; 1139 + struct arm_smmu_inv *new; 1140 + size_t num_invs = 0; 1141 + size_t i, j; 1142 + int cmp; 1143 + 1144 + arm_smmu_invs_for_each_cmp(invs, i, to_merge, j, cmp) 1145 + num_invs++; 1146 + 1147 + new_invs = arm_smmu_invs_alloc(num_invs); 1148 + if (!new_invs) 1149 + return ERR_PTR(-ENOMEM); 1150 + 1151 + new = new_invs->inv; 1152 + arm_smmu_invs_for_each_cmp(invs, i, to_merge, j, cmp) { 1153 + if (cmp < 0) { 1154 + *new = invs->inv[i]; 1155 + } else if (cmp == 0) { 1156 + *new = invs->inv[i]; 1157 + WRITE_ONCE(new->users, READ_ONCE(new->users) + 1); 1158 + } else { 1159 + *new = to_merge->inv[j]; 1160 + WRITE_ONCE(new->users, 1); 1161 + } 1162 + 1163 + /* 1164 + * Check that the new array is sorted. This also validates that 1165 + * to_merge is sorted. 
1166 + */ 1167 + if (new != new_invs->inv) 1168 + WARN_ON_ONCE(arm_smmu_inv_cmp(new - 1, new) == 1); 1169 + if (arm_smmu_inv_is_ats(new)) 1170 + new_invs->has_ats = true; 1171 + new++; 1172 + } 1173 + 1174 + WARN_ON(new != new_invs->inv + new_invs->num_invs); 1175 + 1176 + return new_invs; 1177 + } 1178 + EXPORT_SYMBOL_IF_KUNIT(arm_smmu_invs_merge); 1179 + 1180 + /** 1181 + * arm_smmu_invs_unref() - Find in @invs for all entries in @to_unref, decrease 1182 + * the user counts without deletions 1183 + * @invs: the base invalidation array 1184 + * @to_unref: an array of invalidations to decrease their user counts 1185 + * 1186 + * Return: the number of trash entries in the array, for arm_smmu_invs_purge() 1187 + * 1188 + * This function will not fail. Any entry with users=0 will be marked as trash, 1189 + * and caller will be notified about the trashed entry via @to_unref by setting 1190 + * a users=0. 1191 + * 1192 + * All tailing trash entries in the array will be dropped. And the size of the 1193 + * array will be trimmed properly. All trash entries in-between will remain in 1194 + * the @invs until being completely deleted by the next arm_smmu_invs_merge() 1195 + * or an arm_smmu_invs_purge() function call. 1196 + * 1197 + * This function must be locked and serialized with arm_smmu_invs_merge() and 1198 + * arm_smmu_invs_purge(), but do not lockdep on any mutex for KUNIT test. 1199 + * 1200 + * Note that the final @invs->num_invs might not reflect the actual number of 1201 + * invalidations due to trash entries. Any reader should take the read lock to 1202 + * iterate each entry and check its users counter till the last entry. 1203 + */ 1204 + VISIBLE_IF_KUNIT 1205 + void arm_smmu_invs_unref(struct arm_smmu_invs *invs, 1206 + struct arm_smmu_invs *to_unref) 1207 + { 1208 + unsigned long flags; 1209 + size_t num_invs = 0; 1210 + size_t i, j; 1211 + int cmp; 1212 + 1213 + arm_smmu_invs_for_each_cmp(invs, i, to_unref, j, cmp) { 1214 + if (cmp < 0) { 1215 + /* not found in to_unref, leave alone */ 1216 + num_invs = i + 1; 1217 + } else if (cmp == 0) { 1218 + int users = READ_ONCE(invs->inv[i].users) - 1; 1219 + 1220 + if (WARN_ON(users < 0)) 1221 + continue; 1222 + 1223 + /* same item */ 1224 + WRITE_ONCE(invs->inv[i].users, users); 1225 + if (users) { 1226 + WRITE_ONCE(to_unref->inv[j].users, 1); 1227 + num_invs = i + 1; 1228 + continue; 1229 + } 1230 + 1231 + /* Notify the caller about the trash entry */ 1232 + WRITE_ONCE(to_unref->inv[j].users, 0); 1233 + invs->num_trashes++; 1234 + } else { 1235 + /* item in to_unref is not in invs or already a trash */ 1236 + WARN_ON(true); 1237 + } 1238 + } 1239 + 1240 + /* Exclude any tailing trash */ 1241 + invs->num_trashes -= invs->num_invs - num_invs; 1242 + 1243 + /* The lock is required to fence concurrent ATS operations. */ 1244 + write_lock_irqsave(&invs->rwlock, flags); 1245 + WRITE_ONCE(invs->num_invs, num_invs); /* Remove tailing trash entries */ 1246 + write_unlock_irqrestore(&invs->rwlock, flags); 1247 + } 1248 + EXPORT_SYMBOL_IF_KUNIT(arm_smmu_invs_unref); 1249 + 1250 + /** 1251 + * arm_smmu_invs_purge() - Purge all the trash entries in the @invs 1252 + * @invs: the base invalidation array 1253 + * 1254 + * Return: a newly allocated array on success removing all the trash entries, or 1255 + * NULL if there is no trash entry in the array or if allocation failed 1256 + * 1257 + * This function must be locked and serialized with arm_smmu_invs_merge() and 1258 + * arm_smmu_invs_unref(), but do not lockdep on any lock for KUNIT test. 
1259 + * 1260 + * Caller is responsible for freeing the @invs and the returned new one. 1261 + */ 1262 + VISIBLE_IF_KUNIT 1263 + struct arm_smmu_invs *arm_smmu_invs_purge(struct arm_smmu_invs *invs) 1264 + { 1265 + struct arm_smmu_invs *new_invs; 1266 + struct arm_smmu_inv *inv; 1267 + size_t i, num_invs = 0; 1268 + 1269 + if (WARN_ON(invs->num_invs < invs->num_trashes)) 1270 + return NULL; 1271 + if (!invs->num_invs || !invs->num_trashes) 1272 + return NULL; 1273 + 1274 + new_invs = arm_smmu_invs_alloc(invs->num_invs - invs->num_trashes); 1275 + if (!new_invs) 1276 + return NULL; 1277 + 1278 + arm_smmu_invs_for_each_entry(invs, i, inv) { 1279 + new_invs->inv[num_invs] = *inv; 1280 + if (arm_smmu_inv_is_ats(inv)) 1281 + new_invs->has_ats = true; 1282 + num_invs++; 1283 + } 1284 + 1285 + WARN_ON(num_invs != new_invs->num_invs); 1286 + return new_invs; 1287 + } 1288 + EXPORT_SYMBOL_IF_KUNIT(arm_smmu_invs_purge); 1289 + 1290 + /* Context descriptor manipulation functions */ 1042 1291 1043 1292 /* 1044 1293 * Based on the value of ent report which bits of the STE the HW will access. It ··· 1488 1235 { 1489 1236 __le64 unused_update[NUM_ENTRY_QWORDS]; 1490 1237 u8 used_qword_diff; 1238 + 1239 + /* 1240 + * Many of the entry structures have pointers to other structures that 1241 + * need to have their updates be visible before any writes of the entry 1242 + * happen. 1243 + */ 1244 + dma_wmb(); 1491 1245 1492 1246 used_qword_diff = 1493 1247 arm_smmu_entry_qword_diff(writer, entry, target, unused_update); ··· 2500 2240 return arm_smmu_cmdq_batch_submit(master->smmu, &cmds); 2501 2241 } 2502 2242 2503 - int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, 2504 - unsigned long iova, size_t size) 2505 - { 2506 - struct arm_smmu_master_domain *master_domain; 2507 - int i; 2508 - unsigned long flags; 2509 - struct arm_smmu_cmdq_ent cmd = { 2510 - .opcode = CMDQ_OP_ATC_INV, 2511 - }; 2512 - struct arm_smmu_cmdq_batch cmds; 2513 - 2514 - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) 2515 - return 0; 2516 - 2517 - /* 2518 - * Ensure that we've completed prior invalidation of the main TLBs 2519 - * before we read 'nr_ats_masters' in case of a concurrent call to 2520 - * arm_smmu_enable_ats(): 2521 - * 2522 - * // unmap() // arm_smmu_enable_ats() 2523 - * TLBI+SYNC atomic_inc(&nr_ats_masters); 2524 - * smp_mb(); [...] 2525 - * atomic_read(&nr_ats_masters); pci_enable_ats() // writel() 2526 - * 2527 - * Ensures that we always see the incremented 'nr_ats_masters' count if 2528 - * ATS was enabled at the PCI device before completion of the TLBI. 2529 - */ 2530 - smp_mb(); 2531 - if (!atomic_read(&smmu_domain->nr_ats_masters)) 2532 - return 0; 2533 - 2534 - arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, &cmd); 2535 - 2536 - spin_lock_irqsave(&smmu_domain->devices_lock, flags); 2537 - list_for_each_entry(master_domain, &smmu_domain->devices, 2538 - devices_elm) { 2539 - struct arm_smmu_master *master = master_domain->master; 2540 - 2541 - if (!master->ats_enabled) 2542 - continue; 2543 - 2544 - if (master_domain->nested_ats_flush) { 2545 - /* 2546 - * If a S2 used as a nesting parent is changed we have 2547 - * no option but to completely flush the ATC. 
2548 - */ 2549 - arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); 2550 - } else { 2551 - arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, 2552 - &cmd); 2553 - } 2554 - 2555 - for (i = 0; i < master->num_streams; i++) { 2556 - cmd.atc.sid = master->streams[i].id; 2557 - arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd); 2558 - } 2559 - } 2560 - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); 2561 - 2562 - return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds); 2563 - } 2564 - 2565 2243 /* IO_PGTABLE API */ 2566 2244 static void arm_smmu_tlb_inv_context(void *cookie) 2567 2245 { 2568 2246 struct arm_smmu_domain *smmu_domain = cookie; 2569 - struct arm_smmu_device *smmu = smmu_domain->smmu; 2570 - struct arm_smmu_cmdq_ent cmd; 2571 2247 2572 2248 /* 2573 - * NOTE: when io-pgtable is in non-strict mode, we may get here with 2574 - * PTEs previously cleared by unmaps on the current CPU not yet visible 2575 - * to the SMMU. We are relying on the dma_wmb() implicit during cmd 2576 - * insertion to guarantee those are observed before the TLBI. Do be 2577 - * careful, 007. 2249 + * If the DMA API is running in non-strict mode then another CPU could 2250 + * have changed the page table and not invoked any flush op. Instead the 2251 + * other CPU will do an atomic_read() and this CPU will have done an 2252 + * atomic_write(). That handshake is enough to acquire the page table 2253 + * writes from the other CPU. 2254 + * 2255 + * All command execution has a dma_wmb() to release all the in-memory 2256 + * structures written by this CPU, that barrier must also release the 2257 + * writes acquired from all the other CPUs too. 2258 + * 2259 + * There are other barriers and atomics on this path, but the above is 2260 + * the essential mechanism for ensuring that HW sees the page table 2261 + * writes from another CPU before it executes the IOTLB invalidation. 
2578 2262 */ 2579 - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { 2580 - arm_smmu_tlb_inv_asid(smmu, smmu_domain->cd.asid); 2581 - } else { 2582 - cmd.opcode = CMDQ_OP_TLBI_S12_VMALL; 2583 - cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; 2584 - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); 2585 - } 2586 - arm_smmu_atc_inv_domain(smmu_domain, 0, 0); 2263 + arm_smmu_domain_inv(smmu_domain); 2587 2264 } 2588 2265 2589 - static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, 2590 - unsigned long iova, size_t size, 2591 - size_t granule, 2592 - struct arm_smmu_domain *smmu_domain) 2266 + static void arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu, 2267 + struct arm_smmu_cmdq_batch *cmds, 2268 + struct arm_smmu_cmdq_ent *cmd, 2269 + unsigned long iova, size_t size, 2270 + size_t granule, size_t pgsize) 2593 2271 { 2594 - struct arm_smmu_device *smmu = smmu_domain->smmu; 2595 - unsigned long end = iova + size, num_pages = 0, tg = 0; 2272 + unsigned long end = iova + size, num_pages = 0, tg = pgsize; 2596 2273 size_t inv_range = granule; 2597 - struct arm_smmu_cmdq_batch cmds; 2598 2274 2599 - if (!size) 2275 + if (WARN_ON_ONCE(!size)) 2600 2276 return; 2601 2277 2602 2278 if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { 2603 - /* Get the leaf page size */ 2604 - tg = __ffs(smmu_domain->domain.pgsize_bitmap); 2605 - 2606 2279 num_pages = size >> tg; 2607 2280 2608 2281 /* Convert page size of 12,14,16 (log2) to 1,2,3 */ ··· 2554 2361 else if ((num_pages & CMDQ_TLBI_RANGE_NUM_MAX) == 1) 2555 2362 num_pages++; 2556 2363 } 2557 - 2558 - arm_smmu_cmdq_batch_init(smmu, &cmds, cmd); 2559 2364 2560 2365 while (iova < end) { 2561 2366 if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { ··· 2582 2391 } 2583 2392 2584 2393 cmd->tlbi.addr = iova; 2585 - arm_smmu_cmdq_batch_add(smmu, &cmds, cmd); 2394 + arm_smmu_cmdq_batch_add(smmu, cmds, cmd); 2586 2395 iova += inv_range; 2587 2396 } 2588 - arm_smmu_cmdq_batch_submit(smmu, &cmds); 2589 2397 } 2590 2398 2591 - static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, 2592 - size_t granule, bool leaf, 2593 - struct arm_smmu_domain *smmu_domain) 2399 + static bool arm_smmu_inv_size_too_big(struct arm_smmu_device *smmu, size_t size, 2400 + size_t granule) 2594 2401 { 2595 - struct arm_smmu_cmdq_ent cmd = { 2596 - .tlbi = { 2597 - .leaf = leaf, 2598 - }, 2599 - }; 2402 + size_t max_tlbi_ops; 2600 2403 2601 - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { 2602 - cmd.opcode = smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ? 2603 - CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA; 2604 - cmd.tlbi.asid = smmu_domain->cd.asid; 2605 - } else { 2606 - cmd.opcode = CMDQ_OP_TLBI_S2_IPA; 2607 - cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; 2608 - } 2609 - __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain); 2404 + /* 0 size means invalidate all */ 2405 + if (!size || size == SIZE_MAX) 2406 + return true; 2610 2407 2611 - if (smmu_domain->nest_parent) { 2612 - /* 2613 - * When the S2 domain changes all the nested S1 ASIDs have to be 2614 - * flushed too. 2615 - */ 2616 - cmd.opcode = CMDQ_OP_TLBI_NH_ALL; 2617 - arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd); 2618 - } 2408 + if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) 2409 + return false; 2619 2410 2620 2411 /* 2621 - * Unfortunately, this can't be leaf-only since we may have 2622 - * zapped an entire table. 
2412 + * Borrowed from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h, 2413 + * this is used as a threshold to replace "size_opcode" commands with a 2414 + * single "nsize_opcode" command, when SMMU doesn't implement the range 2415 + * invalidation feature, where there can be too many per-granule TLBIs, 2416 + * resulting in a soft lockup. 2623 2417 */ 2624 - arm_smmu_atc_inv_domain(smmu_domain, iova, size); 2418 + max_tlbi_ops = 1 << (ilog2(granule) - 3); 2419 + return size >= max_tlbi_ops * granule; 2625 2420 } 2626 2421 2627 - void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, 2628 - size_t granule, bool leaf, 2629 - struct arm_smmu_domain *smmu_domain) 2422 + /* Used by non INV_TYPE_ATS* invalidations */ 2423 + static void arm_smmu_inv_to_cmdq_batch(struct arm_smmu_inv *inv, 2424 + struct arm_smmu_cmdq_batch *cmds, 2425 + struct arm_smmu_cmdq_ent *cmd, 2426 + unsigned long iova, size_t size, 2427 + unsigned int granule) 2630 2428 { 2631 - struct arm_smmu_cmdq_ent cmd = { 2632 - .opcode = smmu_domain->smmu->features & ARM_SMMU_FEAT_E2H ? 2633 - CMDQ_OP_TLBI_EL2_VA : CMDQ_OP_TLBI_NH_VA, 2634 - .tlbi = { 2635 - .asid = asid, 2636 - .leaf = leaf, 2637 - }, 2638 - }; 2429 + if (arm_smmu_inv_size_too_big(inv->smmu, size, granule)) { 2430 + cmd->opcode = inv->nsize_opcode; 2431 + arm_smmu_cmdq_batch_add(inv->smmu, cmds, cmd); 2432 + return; 2433 + } 2639 2434 2640 - __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain); 2435 + cmd->opcode = inv->size_opcode; 2436 + arm_smmu_cmdq_batch_add_range(inv->smmu, cmds, cmd, iova, size, granule, 2437 + inv->pgsize); 2438 + } 2439 + 2440 + static inline bool arm_smmu_invs_end_batch(struct arm_smmu_inv *cur, 2441 + struct arm_smmu_inv *next) 2442 + { 2443 + /* Changing smmu means changing command queue */ 2444 + if (cur->smmu != next->smmu) 2445 + return true; 2446 + /* The batch for S2 TLBI must be done before nested S1 ASIDs */ 2447 + if (cur->type != INV_TYPE_S2_VMID_S1_CLEAR && 2448 + next->type == INV_TYPE_S2_VMID_S1_CLEAR) 2449 + return true; 2450 + /* ATS must be after a sync of the S1/S2 invalidations */ 2451 + if (!arm_smmu_inv_is_ats(cur) && arm_smmu_inv_is_ats(next)) 2452 + return true; 2453 + return false; 2454 + } 2455 + 2456 + static void __arm_smmu_domain_inv_range(struct arm_smmu_invs *invs, 2457 + unsigned long iova, size_t size, 2458 + unsigned int granule, bool leaf) 2459 + { 2460 + struct arm_smmu_cmdq_batch cmds = {}; 2461 + struct arm_smmu_inv *cur; 2462 + struct arm_smmu_inv *end; 2463 + 2464 + cur = invs->inv; 2465 + end = cur + READ_ONCE(invs->num_invs); 2466 + /* Skip any leading entry marked as a trash */ 2467 + for (; cur != end; cur++) 2468 + if (READ_ONCE(cur->users)) 2469 + break; 2470 + while (cur != end) { 2471 + struct arm_smmu_device *smmu = cur->smmu; 2472 + struct arm_smmu_cmdq_ent cmd = { 2473 + /* 2474 + * Pick size_opcode to run arm_smmu_get_cmdq(). This can 2475 + * be changed to nsize_opcode, which would result in the 2476 + * same CMDQ pointer. 
2477 + */ 2478 + .opcode = cur->size_opcode, 2479 + }; 2480 + struct arm_smmu_inv *next; 2481 + 2482 + if (!cmds.num) 2483 + arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); 2484 + 2485 + switch (cur->type) { 2486 + case INV_TYPE_S1_ASID: 2487 + cmd.tlbi.asid = cur->id; 2488 + cmd.tlbi.leaf = leaf; 2489 + arm_smmu_inv_to_cmdq_batch(cur, &cmds, &cmd, iova, size, 2490 + granule); 2491 + break; 2492 + case INV_TYPE_S2_VMID: 2493 + cmd.tlbi.vmid = cur->id; 2494 + cmd.tlbi.leaf = leaf; 2495 + arm_smmu_inv_to_cmdq_batch(cur, &cmds, &cmd, iova, size, 2496 + granule); 2497 + break; 2498 + case INV_TYPE_S2_VMID_S1_CLEAR: 2499 + /* CMDQ_OP_TLBI_S12_VMALL already flushed S1 entries */ 2500 + if (arm_smmu_inv_size_too_big(cur->smmu, size, granule)) 2501 + break; 2502 + cmd.tlbi.vmid = cur->id; 2503 + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); 2504 + break; 2505 + case INV_TYPE_ATS: 2506 + arm_smmu_atc_inv_to_cmd(cur->ssid, iova, size, &cmd); 2507 + cmd.atc.sid = cur->id; 2508 + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); 2509 + break; 2510 + case INV_TYPE_ATS_FULL: 2511 + arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); 2512 + cmd.atc.sid = cur->id; 2513 + arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); 2514 + break; 2515 + default: 2516 + WARN_ON_ONCE(1); 2517 + break; 2518 + } 2519 + 2520 + /* Skip any trash entry in-between */ 2521 + for (next = cur + 1; next != end; next++) 2522 + if (READ_ONCE(next->users)) 2523 + break; 2524 + 2525 + if (cmds.num && 2526 + (next == end || arm_smmu_invs_end_batch(cur, next))) { 2527 + arm_smmu_cmdq_batch_submit(smmu, &cmds); 2528 + cmds.num = 0; 2529 + } 2530 + cur = next; 2531 + } 2532 + } 2533 + 2534 + void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain, 2535 + unsigned long iova, size_t size, 2536 + unsigned int granule, bool leaf) 2537 + { 2538 + struct arm_smmu_invs *invs; 2539 + 2540 + /* 2541 + * An invalidation request must follow some IOPTE change and then load 2542 + * an invalidation array. In the meantime, a domain attachment mutates 2543 + * the array and then stores an STE/CD asking SMMU HW to acquire those 2544 + * changed IOPTEs. 2545 + * 2546 + * When running alone, a domain attachment relies on the dma_wmb() in 2547 + * arm_smmu_write_entry() used by arm_smmu_install_ste_for_dev(). 2548 + * 2549 + * But in a race, these two can be interdependent, making it a special 2550 + * case requiring an additional smp_mb() for the write->read ordering. 2551 + * Pairing with the dma_wmb() in arm_smmu_install_ste_for_dev(), this 2552 + * makes sure that IOPTE update prior to this point is visible to SMMU 2553 + * hardware before we load the updated invalidation array. 2554 + * 2555 + * [CPU0] | [CPU1] 2556 + * change IOPTE on new domain: | 2557 + * arm_smmu_domain_inv_range() { | arm_smmu_install_new_domain_invs() 2558 + * smp_mb(); // ensures IOPTE | arm_smmu_install_ste_for_dev { 2559 + * // seen by SMMU | dma_wmb(); // ensures invs update 2560 + * // load the updated invs | // before updating STE 2561 + * invs = rcu_dereference(); | STE = TTB0; 2562 + * ... | ... 2563 + * } | } 2564 + */ 2565 + smp_mb(); 2566 + 2567 + rcu_read_lock(); 2568 + invs = rcu_dereference(smmu_domain->invs); 2569 + 2570 + /* 2571 + * Avoid locking unless ATS is being used. No ATC invalidation can be 2572 + * going on after a domain is detached. 
2573 + */ 2574 + if (invs->has_ats) { 2575 + unsigned long flags; 2576 + 2577 + read_lock_irqsave(&invs->rwlock, flags); 2578 + __arm_smmu_domain_inv_range(invs, iova, size, granule, leaf); 2579 + read_unlock_irqrestore(&invs->rwlock, flags); 2580 + } else { 2581 + __arm_smmu_domain_inv_range(invs, iova, size, granule, leaf); 2582 + } 2583 + 2584 + rcu_read_unlock(); 2641 2585 } 2642 2586 2643 2587 static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather, ··· 2788 2462 static void arm_smmu_tlb_inv_walk(unsigned long iova, size_t size, 2789 2463 size_t granule, void *cookie) 2790 2464 { 2791 - arm_smmu_tlb_inv_range_domain(iova, size, granule, false, cookie); 2465 + struct arm_smmu_domain *smmu_domain = cookie; 2466 + 2467 + arm_smmu_domain_inv_range(smmu_domain, iova, size, granule, false); 2792 2468 } 2793 2469 2794 2470 static const struct iommu_flush_ops arm_smmu_flush_ops = { ··· 2822 2494 return true; 2823 2495 case IOMMU_CAP_DIRTY_TRACKING: 2824 2496 return arm_smmu_dbm_capable(master->smmu); 2497 + case IOMMU_CAP_PCI_ATS_SUPPORTED: 2498 + return arm_smmu_ats_supported(master); 2825 2499 default: 2826 2500 return false; 2827 2501 } ··· 2852 2522 struct arm_smmu_domain *arm_smmu_domain_alloc(void) 2853 2523 { 2854 2524 struct arm_smmu_domain *smmu_domain; 2525 + struct arm_smmu_invs *new_invs; 2855 2526 2856 2527 smmu_domain = kzalloc_obj(*smmu_domain); 2857 2528 if (!smmu_domain) 2858 2529 return ERR_PTR(-ENOMEM); 2859 2530 2531 + new_invs = arm_smmu_invs_alloc(0); 2532 + if (!new_invs) { 2533 + kfree(smmu_domain); 2534 + return ERR_PTR(-ENOMEM); 2535 + } 2536 + 2860 2537 INIT_LIST_HEAD(&smmu_domain->devices); 2861 2538 spin_lock_init(&smmu_domain->devices_lock); 2539 + rcu_assign_pointer(smmu_domain->invs, new_invs); 2862 2540 2863 2541 return smmu_domain; 2864 2542 } ··· 2890 2552 ida_free(&smmu->vmid_map, cfg->vmid); 2891 2553 } 2892 2554 2893 - kfree(smmu_domain); 2555 + arm_smmu_domain_free(smmu_domain); 2894 2556 } 2895 2557 2896 2558 static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, ··· 3208 2870 iopf_queue_remove_device(master->smmu->evtq.iopf, master->dev); 3209 2871 } 3210 2872 2873 + static struct arm_smmu_inv * 2874 + arm_smmu_master_build_inv(struct arm_smmu_master *master, 2875 + enum arm_smmu_inv_type type, u32 id, ioasid_t ssid, 2876 + size_t pgsize) 2877 + { 2878 + struct arm_smmu_invs *build_invs = master->build_invs; 2879 + struct arm_smmu_inv *cur, inv = { 2880 + .smmu = master->smmu, 2881 + .type = type, 2882 + .id = id, 2883 + .pgsize = pgsize, 2884 + }; 2885 + 2886 + if (WARN_ON(build_invs->num_invs >= build_invs->max_invs)) 2887 + return NULL; 2888 + cur = &build_invs->inv[build_invs->num_invs]; 2889 + build_invs->num_invs++; 2890 + 2891 + *cur = inv; 2892 + switch (type) { 2893 + case INV_TYPE_S1_ASID: 2894 + /* 2895 + * For S1 page tables the driver always uses VMID=0, and the 2896 + * invalidation logic for this type will set it as well. 
2897 + */ 2898 + if (master->smmu->features & ARM_SMMU_FEAT_E2H) { 2899 + cur->size_opcode = CMDQ_OP_TLBI_EL2_VA; 2900 + cur->nsize_opcode = CMDQ_OP_TLBI_EL2_ASID; 2901 + } else { 2902 + cur->size_opcode = CMDQ_OP_TLBI_NH_VA; 2903 + cur->nsize_opcode = CMDQ_OP_TLBI_NH_ASID; 2904 + } 2905 + break; 2906 + case INV_TYPE_S2_VMID: 2907 + cur->size_opcode = CMDQ_OP_TLBI_S2_IPA; 2908 + cur->nsize_opcode = CMDQ_OP_TLBI_S12_VMALL; 2909 + break; 2910 + case INV_TYPE_S2_VMID_S1_CLEAR: 2911 + cur->size_opcode = cur->nsize_opcode = CMDQ_OP_TLBI_NH_ALL; 2912 + break; 2913 + case INV_TYPE_ATS: 2914 + case INV_TYPE_ATS_FULL: 2915 + cur->size_opcode = cur->nsize_opcode = CMDQ_OP_ATC_INV; 2916 + cur->ssid = ssid; 2917 + break; 2918 + } 2919 + 2920 + return cur; 2921 + } 2922 + 2923 + /* 2924 + * Use the preallocated scratch array at master->build_invs, to build a to_merge 2925 + * or to_unref array, to pass into a following arm_smmu_invs_merge/unref() call. 2926 + * 2927 + * Do not free the returned invs array. It is reused, and will be overwritten by 2928 + * the next arm_smmu_master_build_invs() call. 2929 + */ 2930 + static struct arm_smmu_invs * 2931 + arm_smmu_master_build_invs(struct arm_smmu_master *master, bool ats_enabled, 2932 + ioasid_t ssid, struct arm_smmu_domain *smmu_domain) 2933 + { 2934 + const bool nesting = smmu_domain->nest_parent; 2935 + size_t pgsize = 0, i; 2936 + 2937 + iommu_group_mutex_assert(master->dev); 2938 + 2939 + master->build_invs->num_invs = 0; 2940 + 2941 + /* Range-based invalidation requires the leaf pgsize for calculation */ 2942 + if (master->smmu->features & ARM_SMMU_FEAT_RANGE_INV) 2943 + pgsize = __ffs(smmu_domain->domain.pgsize_bitmap); 2944 + 2945 + switch (smmu_domain->stage) { 2946 + case ARM_SMMU_DOMAIN_SVA: 2947 + case ARM_SMMU_DOMAIN_S1: 2948 + if (!arm_smmu_master_build_inv(master, INV_TYPE_S1_ASID, 2949 + smmu_domain->cd.asid, 2950 + IOMMU_NO_PASID, pgsize)) 2951 + return NULL; 2952 + break; 2953 + case ARM_SMMU_DOMAIN_S2: 2954 + if (!arm_smmu_master_build_inv(master, INV_TYPE_S2_VMID, 2955 + smmu_domain->s2_cfg.vmid, 2956 + IOMMU_NO_PASID, pgsize)) 2957 + return NULL; 2958 + break; 2959 + default: 2960 + WARN_ON(true); 2961 + return NULL; 2962 + } 2963 + 2964 + /* All the nested S1 ASIDs have to be flushed when S2 parent changes */ 2965 + if (nesting) { 2966 + if (!arm_smmu_master_build_inv( 2967 + master, INV_TYPE_S2_VMID_S1_CLEAR, 2968 + smmu_domain->s2_cfg.vmid, IOMMU_NO_PASID, 0)) 2969 + return NULL; 2970 + } 2971 + 2972 + for (i = 0; ats_enabled && i < master->num_streams; i++) { 2973 + /* 2974 + * If an S2 used as a nesting parent is changed we have no 2975 + * option but to completely flush the ATC. 2976 + */ 2977 + if (!arm_smmu_master_build_inv( 2978 + master, nesting ? INV_TYPE_ATS_FULL : INV_TYPE_ATS, 2979 + master->streams[i].id, ssid, 0)) 2980 + return NULL; 2981 + } 2982 + 2983 + /* Note this build_invs must have been sorted */ 2984 + 2985 + return master->build_invs; 2986 + } 2987 + 3211 2988 static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, 3212 2989 struct iommu_domain *domain, 3213 2990 ioasid_t ssid) ··· 3350 2897 3351 2898 arm_smmu_disable_iopf(master, master_domain); 3352 2899 kfree(master_domain); 2900 + } 2901 + 2902 + /* 2903 + * During attachment, the updates of the two domain->invs arrays are sequenced: 2904 + * 1. new domain updates its invs array, merging master->build_invs 2905 + * 2. new domain starts to include the master during its invalidation 2906 + * 3. 
master updates its STE switching from the old domain to the new domain 2907 + * 4. old domain still includes the master during its invalidation 2908 + * 5. old domain updates its invs array, unreferencing master->build_invs 2909 + * 2910 + * For 1 and 5, prepare the two updated arrays in advance, handling any changes 2911 + * that can possibly failure. So the actual update of either 1 or 5 won't fail. 2912 + * arm_smmu_asid_lock ensures that the old invs in the domains are intact while 2913 + * we are sequencing to update them. 2914 + */ 2915 + static int arm_smmu_attach_prepare_invs(struct arm_smmu_attach_state *state, 2916 + struct iommu_domain *new_domain) 2917 + { 2918 + struct arm_smmu_domain *old_smmu_domain = 2919 + to_smmu_domain_devices(state->old_domain); 2920 + struct arm_smmu_domain *new_smmu_domain = 2921 + to_smmu_domain_devices(new_domain); 2922 + struct arm_smmu_master *master = state->master; 2923 + ioasid_t ssid = state->ssid; 2924 + 2925 + /* 2926 + * At this point a NULL domain indicates the domain doesn't use the 2927 + * IOTLB, see to_smmu_domain_devices(). 2928 + */ 2929 + if (new_smmu_domain) { 2930 + struct arm_smmu_inv_state *invst = &state->new_domain_invst; 2931 + struct arm_smmu_invs *build_invs; 2932 + 2933 + invst->invs_ptr = &new_smmu_domain->invs; 2934 + invst->old_invs = rcu_dereference_protected( 2935 + new_smmu_domain->invs, 2936 + lockdep_is_held(&arm_smmu_asid_lock)); 2937 + build_invs = arm_smmu_master_build_invs( 2938 + master, state->ats_enabled, ssid, new_smmu_domain); 2939 + if (!build_invs) 2940 + return -EINVAL; 2941 + 2942 + invst->new_invs = 2943 + arm_smmu_invs_merge(invst->old_invs, build_invs); 2944 + if (IS_ERR(invst->new_invs)) 2945 + return PTR_ERR(invst->new_invs); 2946 + } 2947 + 2948 + if (old_smmu_domain) { 2949 + struct arm_smmu_inv_state *invst = &state->old_domain_invst; 2950 + 2951 + invst->invs_ptr = &old_smmu_domain->invs; 2952 + /* A re-attach case might have a different ats_enabled state */ 2953 + if (new_smmu_domain == old_smmu_domain) 2954 + invst->old_invs = state->new_domain_invst.new_invs; 2955 + else 2956 + invst->old_invs = rcu_dereference_protected( 2957 + old_smmu_domain->invs, 2958 + lockdep_is_held(&arm_smmu_asid_lock)); 2959 + /* For old_smmu_domain, new_invs points to master->build_invs */ 2960 + invst->new_invs = arm_smmu_master_build_invs( 2961 + master, master->ats_enabled, ssid, old_smmu_domain); 2962 + } 2963 + 2964 + return 0; 2965 + } 2966 + 2967 + /* Must be installed before arm_smmu_install_ste_for_dev() */ 2968 + static void 2969 + arm_smmu_install_new_domain_invs(struct arm_smmu_attach_state *state) 2970 + { 2971 + struct arm_smmu_inv_state *invst = &state->new_domain_invst; 2972 + 2973 + if (!invst->invs_ptr) 2974 + return; 2975 + 2976 + rcu_assign_pointer(*invst->invs_ptr, invst->new_invs); 2977 + kfree_rcu(invst->old_invs, rcu); 2978 + } 2979 + 2980 + static void arm_smmu_inv_flush_iotlb_tag(struct arm_smmu_inv *inv) 2981 + { 2982 + struct arm_smmu_cmdq_ent cmd = {}; 2983 + 2984 + switch (inv->type) { 2985 + case INV_TYPE_S1_ASID: 2986 + cmd.tlbi.asid = inv->id; 2987 + break; 2988 + case INV_TYPE_S2_VMID: 2989 + /* S2_VMID using nsize_opcode covers S2_VMID_S1_CLEAR */ 2990 + cmd.tlbi.vmid = inv->id; 2991 + break; 2992 + default: 2993 + return; 2994 + } 2995 + 2996 + cmd.opcode = inv->nsize_opcode; 2997 + arm_smmu_cmdq_issue_cmd_with_sync(inv->smmu, &cmd); 2998 + } 2999 + 3000 + /* Should be installed after arm_smmu_install_ste_for_dev() */ 3001 + static void 3002 + 
arm_smmu_install_old_domain_invs(struct arm_smmu_attach_state *state) 3003 + { 3004 + struct arm_smmu_inv_state *invst = &state->old_domain_invst; 3005 + struct arm_smmu_invs *old_invs = invst->old_invs; 3006 + struct arm_smmu_invs *new_invs; 3007 + 3008 + lockdep_assert_held(&arm_smmu_asid_lock); 3009 + 3010 + if (!invst->invs_ptr) 3011 + return; 3012 + 3013 + arm_smmu_invs_unref(old_invs, invst->new_invs); 3014 + /* 3015 + * When an IOTLB tag (the first entry in invs->new_invs) is no longer used, 3016 + * it means the ASID or VMID will no longer be invalidated by map/unmap and 3017 + * must be cleaned right now. The rule is that any ASID/VMID not in an invs 3018 + * array must be left cleared in the IOTLB. 3019 + */ 3020 + if (!READ_ONCE(invst->new_invs->inv[0].users)) 3021 + arm_smmu_inv_flush_iotlb_tag(&invst->new_invs->inv[0]); 3022 + 3023 + new_invs = arm_smmu_invs_purge(old_invs); 3024 + if (!new_invs) 3025 + return; 3026 + 3027 + rcu_assign_pointer(*invst->invs_ptr, new_invs); 3028 + kfree_rcu(old_invs, rcu); 3353 3029 } 3354 3030 3355 3031 /* ··· 3538 2956 arm_smmu_ats_supported(master); 3539 2957 } 3540 2958 2959 + ret = arm_smmu_attach_prepare_invs(state, new_domain); 2960 + if (ret) 2961 + return ret; 2962 + 3541 2963 if (smmu_domain) { 3542 2964 if (new_domain->type == IOMMU_DOMAIN_NESTED) { 3543 2965 ret = arm_smmu_attach_prepare_vmaster( 3544 2966 state, to_smmu_nested_domain(new_domain)); 3545 2967 if (ret) 3546 - return ret; 2968 + goto err_unprepare_invs; 3547 2969 } 3548 2970 3549 2971 master_domain = kzalloc_obj(*master_domain); ··· 3595 3009 atomic_inc(&smmu_domain->nr_ats_masters); 3596 3010 list_add(&master_domain->devices_elm, &smmu_domain->devices); 3597 3011 spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); 3012 + 3013 + arm_smmu_install_new_domain_invs(state); 3598 3014 } 3599 3015 3600 3016 if (!state->ats_enabled && master->ats_enabled) { ··· 3616 3028 kfree(master_domain); 3617 3029 err_free_vmaster: 3618 3030 kfree(state->vmaster); 3031 + err_unprepare_invs: 3032 + kfree(state->new_domain_invst.new_invs); 3619 3033 return ret; 3620 3034 } 3621 3035 ··· 3649 3059 } 3650 3060 3651 3061 arm_smmu_remove_master_domain(master, state->old_domain, state->ssid); 3062 + arm_smmu_install_old_domain_invs(state); 3652 3063 master->ats_enabled = state->ats_enabled; 3653 3064 } 3654 3065 ··· 3715 3124 state.ats_enabled); 3716 3125 arm_smmu_install_ste_for_dev(master, &target); 3717 3126 arm_smmu_clear_cd(master, IOMMU_NO_PASID); 3127 + break; 3128 + default: 3129 + WARN_ON(true); 3718 3130 break; 3719 3131 } 3720 3132 ··· 3832 3238 { 3833 3239 struct arm_smmu_domain *smmu_domain = to_smmu_domain(old_domain); 3834 3240 struct arm_smmu_master *master = dev_iommu_priv_get(dev); 3241 + struct arm_smmu_attach_state state = { 3242 + .master = master, 3243 + .old_domain = old_domain, 3244 + .ssid = pasid, 3245 + }; 3835 3246 3836 3247 mutex_lock(&arm_smmu_asid_lock); 3248 + arm_smmu_attach_prepare_invs(&state, NULL); 3837 3249 arm_smmu_clear_cd(master, pasid); 3838 3250 if (master->ats_enabled) 3839 3251 arm_smmu_atc_inv_master(master, pasid); 3840 3252 arm_smmu_remove_master_domain(master, &smmu_domain->domain, pasid); 3253 + arm_smmu_install_old_domain_invs(&state); 3841 3254 mutex_unlock(&arm_smmu_asid_lock); 3842 3255 3843 3256 /* ··· 4018 3417 return &smmu_domain->domain; 4019 3418 4020 3419 err_free: 4021 - kfree(smmu_domain); 3420 + arm_smmu_domain_free(smmu_domain); 4022 3421 return ERR_PTR(ret); 4023 3422 } 4024 3423 ··· 4063 3462 if (!gather->pgsize) 4064 3463 
return; 4065 3464 4066 - arm_smmu_tlb_inv_range_domain(gather->start, 4067 - gather->end - gather->start + 1, 4068 - gather->pgsize, true, smmu_domain); 3465 + arm_smmu_domain_inv_range(smmu_domain, gather->start, 3466 + gather->end - gather->start + 1, 3467 + gather->pgsize, true); 4069 3468 } 4070 3469 4071 3470 static phys_addr_t ··· 4110 3509 return 0; 4111 3510 } 4112 3511 3512 + static int arm_smmu_stream_id_cmp(const void *_l, const void *_r) 3513 + { 3514 + const typeof_member(struct arm_smmu_stream, id) *l = _l; 3515 + const typeof_member(struct arm_smmu_stream, id) *r = _r; 3516 + 3517 + return cmp_int(*l, *r); 3518 + } 3519 + 4113 3520 static int arm_smmu_insert_master(struct arm_smmu_device *smmu, 4114 3521 struct arm_smmu_master *master) 4115 3522 { 4116 3523 int i; 4117 3524 int ret = 0; 4118 3525 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); 3526 + bool ats_supported = dev_is_pci(master->dev) && 3527 + pci_ats_supported(to_pci_dev(master->dev)); 4119 3528 4120 3529 master->streams = kzalloc_objs(*master->streams, fwspec->num_ids); 4121 3530 if (!master->streams) 4122 3531 return -ENOMEM; 4123 3532 master->num_streams = fwspec->num_ids; 4124 3533 3534 + if (!ats_supported) { 3535 + /* Base case has 1 ASID entry or maximum 2 VMID entries */ 3536 + master->build_invs = arm_smmu_invs_alloc(2); 3537 + } else { 3538 + /* ATS case adds num_ids of entries, on top of the base case */ 3539 + master->build_invs = arm_smmu_invs_alloc(2 + fwspec->num_ids); 3540 + } 3541 + if (!master->build_invs) { 3542 + kfree(master->streams); 3543 + return -ENOMEM; 3544 + } 3545 + 3546 + for (i = 0; i < fwspec->num_ids; i++) { 3547 + struct arm_smmu_stream *new_stream = &master->streams[i]; 3548 + 3549 + new_stream->id = fwspec->ids[i]; 3550 + new_stream->master = master; 3551 + } 3552 + 3553 + /* Put the ids into order for sorted to_merge/to_unref arrays */ 3554 + sort_nonatomic(master->streams, master->num_streams, 3555 + sizeof(master->streams[0]), arm_smmu_stream_id_cmp, 3556 + NULL); 3557 + 4125 3558 mutex_lock(&smmu->streams_mutex); 4126 3559 for (i = 0; i < fwspec->num_ids; i++) { 4127 3560 struct arm_smmu_stream *new_stream = &master->streams[i]; 4128 3561 struct rb_node *existing; 4129 - u32 sid = fwspec->ids[i]; 4130 - 4131 - new_stream->id = sid; 4132 - new_stream->master = master; 3562 + u32 sid = new_stream->id; 4133 3563 4134 3564 ret = arm_smmu_init_sid_strtab(smmu, sid); 4135 3565 if (ret) ··· 4190 3558 for (i--; i >= 0; i--) 4191 3559 rb_erase(&master->streams[i].node, &smmu->streams); 4192 3560 kfree(master->streams); 3561 + kfree(master->build_invs); 4193 3562 } 4194 3563 mutex_unlock(&smmu->streams_mutex); 4195 3564 ··· 4212 3579 mutex_unlock(&smmu->streams_mutex); 4213 3580 4214 3581 kfree(master->streams); 3582 + kfree(master->build_invs); 4215 3583 } 4216 3584 4217 3585 static struct iommu_device *arm_smmu_probe_device(struct device *dev) ··· 4942 4308 #define IIDR_IMPLEMENTER_ARM 0x43b 4943 4309 #define IIDR_PRODUCTID_ARM_MMU_600 0x483 4944 4310 #define IIDR_PRODUCTID_ARM_MMU_700 0x487 4311 + #define IIDR_PRODUCTID_ARM_MMU_L1 0x48a 4312 + #define IIDR_PRODUCTID_ARM_MMU_S3 0x498 4945 4313 4946 4314 static void arm_smmu_device_iidr_probe(struct arm_smmu_device *smmu) 4947 4315 { ··· 4968 4332 smmu->features &= ~ARM_SMMU_FEAT_NESTING; 4969 4333 break; 4970 4334 case IIDR_PRODUCTID_ARM_MMU_700: 4971 - /* Arm erratum 2812531 */ 4335 + /* Many errata... 
*/ 4972 4336 smmu->features &= ~ARM_SMMU_FEAT_BTM; 4973 - smmu->options |= ARM_SMMU_OPT_CMDQ_FORCE_SYNC; 4974 - /* Arm errata 2268618, 2812531 */ 4975 - smmu->features &= ~ARM_SMMU_FEAT_NESTING; 4337 + if (variant < 1 || revision < 1) { 4338 + /* Arm erratum 2812531 */ 4339 + smmu->options |= ARM_SMMU_OPT_CMDQ_FORCE_SYNC; 4340 + /* Arm errata 2268618, 2812531 */ 4341 + smmu->features &= ~ARM_SMMU_FEAT_NESTING; 4342 + } 4343 + break; 4344 + case IIDR_PRODUCTID_ARM_MMU_L1: 4345 + case IIDR_PRODUCTID_ARM_MMU_S3: 4346 + /* Arm errata 3878312/3995052 */ 4347 + smmu->features &= ~ARM_SMMU_FEAT_BTM; 4976 4348 break; 4977 4349 } 4978 4350 break;
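The arm-smmu-v3.c hunks above split the per-domain invs update into a fallible prepare step (arm_smmu_attach_prepare_invs() does all the arm_smmu_invs_merge() allocations) and install steps on either side of the STE update that only publish pre-built arrays with rcu_assign_pointer() and retire the old ones with kfree_rcu(). A minimal sketch of that publish-and-retire shape, assuming illustrative names (cfg, cfg_lock and replace_cfg are not driver symbols):

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	struct rcu_head rcu;
	size_t num;
	int data[];
};

static struct cfg __rcu *active_cfg;
static DEFINE_MUTEX(cfg_lock);

/* All failures happen while building 'newc'; publishing cannot fail. */
static int replace_cfg(struct cfg *newc)
{
	struct cfg *old;

	if (!newc)
		return -ENOMEM;

	mutex_lock(&cfg_lock);
	old = rcu_dereference_protected(active_cfg, lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(active_cfg, newc);	/* readers switch over here */
	mutex_unlock(&cfg_lock);

	if (old)
		kfree_rcu(old, rcu);		/* freed after readers drop it */
	return 0;
}

The point of the split is that once the STE has been switched there is no failure path left: only the pointer publication and the RCU-deferred free remain.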
+136 -6
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
··· 648 648 int num; 649 649 }; 650 650 651 + /* 652 + * The order here also determines the sequence in which commands are sent to the 653 + * command queue. E.g. TLBI must be done before ATC_INV. 654 + */ 655 + enum arm_smmu_inv_type { 656 + INV_TYPE_S1_ASID, 657 + INV_TYPE_S2_VMID, 658 + INV_TYPE_S2_VMID_S1_CLEAR, 659 + INV_TYPE_ATS, 660 + INV_TYPE_ATS_FULL, 661 + }; 662 + 663 + struct arm_smmu_inv { 664 + struct arm_smmu_device *smmu; 665 + u8 type; 666 + u8 size_opcode; 667 + u8 nsize_opcode; 668 + u32 id; /* ASID or VMID or SID */ 669 + union { 670 + size_t pgsize; /* ARM_SMMU_FEAT_RANGE_INV */ 671 + u32 ssid; /* INV_TYPE_ATS */ 672 + }; 673 + 674 + int users; /* users=0 to mark as a trash to be purged */ 675 + }; 676 + 677 + static inline bool arm_smmu_inv_is_ats(const struct arm_smmu_inv *inv) 678 + { 679 + return inv->type == INV_TYPE_ATS || inv->type == INV_TYPE_ATS_FULL; 680 + } 681 + 682 + /** 683 + * struct arm_smmu_invs - Per-domain invalidation array 684 + * @max_invs: maximum capacity of the flexible array 685 + * @num_invs: number of invalidations in the flexible array. May be smaller than 686 + * @max_invs after a tailing trash entry is excluded, but must not be 687 + * greater than @max_invs 688 + * @num_trashes: number of trash entries in the array for arm_smmu_invs_purge(). 689 + * Must not be greater than @num_invs 690 + * @rwlock: optional rwlock to fence ATS operations 691 + * @has_ats: flag if the array contains an INV_TYPE_ATS or INV_TYPE_ATS_FULL 692 + * @rcu: rcu head for kfree_rcu() 693 + * @inv: flexible invalidation array 694 + * 695 + * The arm_smmu_invs is an RCU data structure. During a ->attach_dev callback, 696 + * arm_smmu_invs_merge(), arm_smmu_invs_unref() and arm_smmu_invs_purge() will 697 + * be used to allocate a new copy of an old array for addition and deletion in 698 + * the old domain's and new domain's invs arrays. 699 + * 700 + * The arm_smmu_invs_unref() mutates a given array, by internally reducing the 701 + * users counts of some given entries. This exists to support a no-fail routine 702 + * like attaching to an IOMMU_DOMAIN_BLOCKED. And it could pair with a followup 703 + * arm_smmu_invs_purge() call to generate a new clean array. 704 + * 705 + * Concurrent invalidation thread will push every invalidation described in the 706 + * array into the command queue for each invalidation event. It is designed like 707 + * this to optimize the invalidation fast path by avoiding locks. 708 + * 709 + * A domain can be shared across SMMU instances. When an instance gets removed, 710 + * it would delete all the entries that belong to that SMMU instance. Then, a 711 + * synchronize_rcu() would have to be called to sync the array, to prevent any 712 + * concurrent invalidation thread accessing the old array from issuing commands 713 + * to the command queue of a removed SMMU instance. 
714 + */ 715 + struct arm_smmu_invs { 716 + size_t max_invs; 717 + size_t num_invs; 718 + size_t num_trashes; 719 + rwlock_t rwlock; 720 + bool has_ats; 721 + struct rcu_head rcu; 722 + struct arm_smmu_inv inv[] __counted_by(max_invs); 723 + }; 724 + 725 + static inline struct arm_smmu_invs *arm_smmu_invs_alloc(size_t num_invs) 726 + { 727 + struct arm_smmu_invs *new_invs; 728 + 729 + new_invs = kzalloc(struct_size(new_invs, inv, num_invs), GFP_KERNEL); 730 + if (!new_invs) 731 + return NULL; 732 + new_invs->max_invs = num_invs; 733 + new_invs->num_invs = num_invs; 734 + rwlock_init(&new_invs->rwlock); 735 + return new_invs; 736 + } 737 + 651 738 struct arm_smmu_evtq { 652 739 struct arm_smmu_queue q; 653 740 struct iopf_queue *iopf; ··· 928 841 struct arm_smmu_device *smmu; 929 842 struct device *dev; 930 843 struct arm_smmu_stream *streams; 844 + /* 845 + * Scratch memory for a to_merge or to_unref array to build a per-domain 846 + * invalidation array. It'll be pre-allocated with enough enries for all 847 + * possible build scenarios. It can be used by only one caller at a time 848 + * until the arm_smmu_invs_merge/unref() finishes. Must be locked by the 849 + * iommu_group mutex. 850 + */ 851 + struct arm_smmu_invs *build_invs; 931 852 struct arm_smmu_vmaster *vmaster; /* use smmu->streams_mutex */ 932 853 /* Locked by the iommu core using the group mutex */ 933 854 struct arm_smmu_ctx_desc_cfg cd_table; ··· 951 856 enum arm_smmu_domain_stage { 952 857 ARM_SMMU_DOMAIN_S1 = 0, 953 858 ARM_SMMU_DOMAIN_S2, 859 + ARM_SMMU_DOMAIN_SVA, 954 860 }; 955 861 956 862 struct arm_smmu_domain { ··· 967 871 }; 968 872 969 873 struct iommu_domain domain; 874 + 875 + struct arm_smmu_invs __rcu *invs; 970 876 971 877 /* List of struct arm_smmu_master_domain */ 972 878 struct list_head devices; ··· 1022 924 void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, 1023 925 struct arm_smmu_master *master, struct mm_struct *mm, 1024 926 u16 asid); 927 + 928 + struct arm_smmu_invs *arm_smmu_invs_merge(struct arm_smmu_invs *invs, 929 + struct arm_smmu_invs *to_merge); 930 + void arm_smmu_invs_unref(struct arm_smmu_invs *invs, 931 + struct arm_smmu_invs *to_unref); 932 + struct arm_smmu_invs *arm_smmu_invs_purge(struct arm_smmu_invs *invs); 1025 933 #endif 1026 934 1027 935 struct arm_smmu_master_domain { ··· 1059 955 1060 956 struct arm_smmu_domain *arm_smmu_domain_alloc(void); 1061 957 958 + static inline void arm_smmu_domain_free(struct arm_smmu_domain *smmu_domain) 959 + { 960 + /* No concurrency with invalidation is possible at this point */ 961 + kfree(rcu_dereference_protected(smmu_domain->invs, true)); 962 + kfree(smmu_domain); 963 + } 964 + 1062 965 void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid); 1063 966 struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, 1064 967 u32 ssid); ··· 1080 969 struct arm_smmu_domain *smmu_domain, ioasid_t pasid, 1081 970 struct arm_smmu_cd *cd, struct iommu_domain *old); 1082 971 1083 - void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); 1084 - void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, 1085 - size_t granule, bool leaf, 1086 - struct arm_smmu_domain *smmu_domain); 1087 - int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, 1088 - unsigned long iova, size_t size); 972 + void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain, 973 + unsigned long iova, size_t size, 974 + unsigned int granule, bool leaf); 975 + 976 + static inline void arm_smmu_domain_inv(struct 
arm_smmu_domain *smmu_domain) 977 + { 978 + arm_smmu_domain_inv_range(smmu_domain, 0, 0, 0, false); 979 + } 1089 980 1090 981 void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, 1091 982 struct arm_smmu_cmdq *cmdq); ··· 1104 991 IOMMU_FWSPEC_PCI_RC_CANWBS; 1105 992 } 1106 993 994 + /** 995 + * struct arm_smmu_inv_state - Per-domain invalidation array state 996 + * @invs_ptr: points to the domain->invs (unwinding nesting/etc.) or is NULL if 997 + * no change should be made 998 + * @old_invs: the original invs array 999 + * @new_invs: for new domain, this is the new invs array to update domain->invs; 1000 + * for old domain, this is the master->build_invs to pass in as the 1001 + * to_unref argument to an arm_smmu_invs_unref() call 1002 + */ 1003 + struct arm_smmu_inv_state { 1004 + struct arm_smmu_invs __rcu **invs_ptr; 1005 + struct arm_smmu_invs *old_invs; 1006 + struct arm_smmu_invs *new_invs; 1007 + }; 1008 + 1107 1009 struct arm_smmu_attach_state { 1108 1010 /* Inputs */ 1109 1011 struct iommu_domain *old_domain; ··· 1128 1000 ioasid_t ssid; 1129 1001 /* Resulting state */ 1130 1002 struct arm_smmu_vmaster *vmaster; 1003 + struct arm_smmu_inv_state old_domain_invst; 1004 + struct arm_smmu_inv_state new_domain_invst; 1131 1005 bool ats_enabled; 1132 1006 }; 1133 1007
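The struct arm_smmu_invs kerneldoc above describes the reader side as lock-free: each invalidation event walks whatever array is currently published under RCU and skips entries whose users count has dropped to zero. A hedged sketch of such a reader, where domain_invs and issue_inv() are placeholders rather than driver symbols:

/* Stands in for a domain's published smmu_domain->invs pointer. */
static struct arm_smmu_invs __rcu *domain_invs;
static void issue_inv(struct arm_smmu_inv *inv);	/* placeholder */

static void flush_domain(void)
{
	struct arm_smmu_invs *invs;
	size_t i;

	rcu_read_lock();
	invs = rcu_dereference(domain_invs);
	for (i = 0; i < invs->num_invs; i++) {
		if (!READ_ONCE(invs->inv[i].users))	/* trash entry, skip */
			continue;
		issue_inv(&invs->inv[i]);
	}
	rcu_read_unlock();
}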
+4 -3
drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
··· 479 479 /* Reset VCMDQ */ 480 480 tegra241_vcmdq_hw_deinit(vcmdq); 481 481 482 + /* vintf->hyp_own is a HW state finalized in tegra241_vintf_hw_init() */ 483 + if (!vcmdq->vintf->hyp_own) 484 + vcmdq->cmdq.supports_cmd = tegra241_guest_vcmdq_supports_cmd; 485 + 482 486 /* Configure and enable VCMDQ */ 483 487 writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE)); 484 488 ··· 642 638 /* ...override q_base to write VCMDQ_BASE registers */ 643 639 q->q_base = q->base_dma & VCMDQ_ADDR; 644 640 q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift); 645 - 646 - if (!vcmdq->vintf->hyp_own) 647 - cmdq->supports_cmd = tegra241_guest_vcmdq_supports_cmd; 648 641 649 642 return arm_smmu_cmdq_init(smmu, cmdq); 650 643 }
+12 -1
drivers/iommu/dma-iommu.c
··· 14 14 #include <linux/device.h> 15 15 #include <linux/dma-direct.h> 16 16 #include <linux/dma-map-ops.h> 17 + #include <linux/generic_pt/iommu.h> 17 18 #include <linux/gfp.h> 18 19 #include <linux/huge_mm.h> 19 20 #include <linux/iommu.h> ··· 649 648 } 650 649 } 651 650 651 + static bool iommu_domain_supports_fq(struct device *dev, 652 + struct iommu_domain *domain) 653 + { 654 + /* iommupt always supports DMA-FQ */ 655 + if (iommupt_from_domain(domain)) 656 + return true; 657 + return device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH); 658 + } 659 + 652 660 /** 653 661 * iommu_dma_init_domain - Initialise a DMA mapping domain 654 662 * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() ··· 716 706 717 707 /* If the FQ fails we can simply fall back to strict mode */ 718 708 if (domain->type == IOMMU_DOMAIN_DMA_FQ && 719 - (!device_iommu_capable(dev, IOMMU_CAP_DEFERRED_FLUSH) || iommu_dma_init_fq(domain))) 709 + (!iommu_domain_supports_fq(dev, domain) || 710 + iommu_dma_init_fq(domain))) 720 711 domain->type = IOMMU_DOMAIN_DMA; 721 712 722 713 return iova_reserve_iommu_regions(dev, domain);
+1
drivers/iommu/generic_pt/.kunitconfig
··· 5 5 CONFIG_IOMMU_PT=y 6 6 CONFIG_IOMMU_PT_AMDV1=y 7 7 CONFIG_IOMMU_PT_VTDSS=y 8 + CONFIG_IOMMU_PT_RISCV64=y 8 9 CONFIG_IOMMU_PT_X86_64=y 9 10 CONFIG_IOMMU_PT_KUNIT_TEST=y 10 11
+11
drivers/iommu/generic_pt/Kconfig
··· 52 52 53 53 Selected automatically by an IOMMU driver that uses this format. 54 54 55 + config IOMMU_PT_RISCV64 56 + tristate "IOMMU page table for RISC-V 64 bit Sv57/Sv48/Sv39" 57 + depends on !GENERIC_ATOMIC64 # for cmpxchg64 58 + help 59 + iommu_domain implementation for RISC-V 64 bit 3/4/5 level page table. 60 + It supports 4K/2M/1G/512G/256T page sizes and can decode a sign 61 + extended portion of the 64 bit IOVA space. 62 + 63 + Selected automatically by an IOMMU driver that uses this format. 64 + 55 65 config IOMMU_PT_X86_64 56 66 tristate "IOMMU page table for x86 64-bit, 4/5 levels" 57 67 depends on !GENERIC_ATOMIC64 # for cmpxchg64 ··· 76 66 tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS 77 67 depends on KUNIT 78 68 depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 69 + depends on IOMMU_PT_RISCV64 || !IOMMU_PT_RISCV64 79 70 depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 80 71 depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS 81 72 default KUNIT_ALL_TESTS
+2
drivers/iommu/generic_pt/fmt/Makefile
··· 5 5 6 6 iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss 7 7 8 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_RISCV64) += riscv64 9 + 8 10 iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 9 11 10 12 IOMMU_PT_KUNIT_TEST :=
+29
drivers/iommu/generic_pt/fmt/defs_riscv.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + */ 6 + #ifndef __GENERIC_PT_FMT_DEFS_RISCV_H 7 + #define __GENERIC_PT_FMT_DEFS_RISCV_H 8 + 9 + #include <linux/generic_pt/common.h> 10 + #include <linux/types.h> 11 + 12 + #ifdef PT_RISCV_32BIT 13 + typedef u32 pt_riscv_entry_t; 14 + #define riscvpt_write_attrs riscv32pt_write_attrs 15 + #else 16 + typedef u64 pt_riscv_entry_t; 17 + #define riscvpt_write_attrs riscv64pt_write_attrs 18 + #endif 19 + 20 + typedef pt_riscv_entry_t pt_vaddr_t; 21 + typedef u64 pt_oaddr_t; 22 + 23 + struct riscvpt_write_attrs { 24 + pt_riscv_entry_t descriptor_bits; 25 + gfp_t gfp; 26 + }; 27 + #define pt_write_attrs riscvpt_write_attrs 28 + 29 + #endif
+11
drivers/iommu/generic_pt/fmt/iommu_riscv64.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES 4 + */ 5 + #define PT_FMT riscv 6 + #define PT_FMT_VARIANT 64 7 + #define PT_SUPPORTED_FEATURES \ 8 + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ 9 + BIT(PT_FEAT_RISCV_SVNAPOT_64K)) 10 + 11 + #include "iommu_template.h"
+313
drivers/iommu/generic_pt/fmt/riscv.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + * RISC-V page table 6 + * 7 + * This is described in Sections: 8 + * 12.3. Sv32: Page-Based 32-bit Virtual-Memory Systems 9 + * 12.4. Sv39: Page-Based 39-bit Virtual-Memory System 10 + * 12.5. Sv48: Page-Based 48-bit Virtual-Memory System 11 + * 12.6. Sv57: Page-Based 57-bit Virtual-Memory System 12 + * of the "The RISC-V Instruction Set Manual: Volume II" 13 + * 14 + * This includes the contiguous page extension from: 15 + * Chapter 13. "Svnapot" Extension for NAPOT Translation Contiguity, 16 + * Version 1.0 17 + * 18 + * The table format is sign extended and supports leafs in every level. The spec 19 + * doesn't talk a lot about levels, but level here is the same as i=LEVELS-1 in 20 + * the spec. 21 + */ 22 + #ifndef __GENERIC_PT_FMT_RISCV_H 23 + #define __GENERIC_PT_FMT_RISCV_H 24 + 25 + #include "defs_riscv.h" 26 + #include "../pt_defs.h" 27 + 28 + #include <linux/bitfield.h> 29 + #include <linux/container_of.h> 30 + #include <linux/log2.h> 31 + #include <linux/sizes.h> 32 + 33 + enum { 34 + PT_ITEM_WORD_SIZE = sizeof(pt_riscv_entry_t), 35 + #ifdef PT_RISCV_32BIT 36 + PT_MAX_VA_ADDRESS_LG2 = 32, 37 + PT_MAX_OUTPUT_ADDRESS_LG2 = 34, 38 + PT_MAX_TOP_LEVEL = 1, 39 + #else 40 + PT_MAX_VA_ADDRESS_LG2 = 57, 41 + PT_MAX_OUTPUT_ADDRESS_LG2 = 56, 42 + PT_MAX_TOP_LEVEL = 4, 43 + #endif 44 + PT_GRANULE_LG2SZ = 12, 45 + PT_TABLEMEM_LG2SZ = 12, 46 + 47 + /* fsc.PPN is 44 bits wide, all PPNs are 4k aligned */ 48 + PT_TOP_PHYS_MASK = GENMASK_ULL(55, 12), 49 + }; 50 + 51 + /* PTE bits */ 52 + enum { 53 + RISCVPT_V = BIT(0), 54 + RISCVPT_R = BIT(1), 55 + RISCVPT_W = BIT(2), 56 + RISCVPT_X = BIT(3), 57 + RISCVPT_U = BIT(4), 58 + RISCVPT_G = BIT(5), 59 + RISCVPT_A = BIT(6), 60 + RISCVPT_D = BIT(7), 61 + RISCVPT_RSW = GENMASK(9, 8), 62 + RISCVPT_PPN32 = GENMASK(31, 10), 63 + 64 + RISCVPT_PPN64 = GENMASK_ULL(53, 10), 65 + RISCVPT_PPN64_64K = GENMASK_ULL(53, 14), 66 + RISCVPT_PBMT = GENMASK_ULL(62, 61), 67 + RISCVPT_N = BIT_ULL(63), 68 + 69 + /* Svnapot encodings for ppn[0] */ 70 + RISCVPT_PPN64_64K_SZ = BIT(13), 71 + }; 72 + 73 + #ifdef PT_RISCV_32BIT 74 + #define RISCVPT_PPN RISCVPT_PPN32 75 + #define pt_riscv pt_riscv_32 76 + #else 77 + #define RISCVPT_PPN RISCVPT_PPN64 78 + #define pt_riscv pt_riscv_64 79 + #endif 80 + 81 + #define common_to_riscvpt(common_ptr) \ 82 + container_of_const(common_ptr, struct pt_riscv, common) 83 + #define to_riscvpt(pts) common_to_riscvpt((pts)->range->common) 84 + 85 + static inline pt_oaddr_t riscvpt_table_pa(const struct pt_state *pts) 86 + { 87 + return oalog2_mul(FIELD_GET(RISCVPT_PPN, pts->entry), PT_GRANULE_LG2SZ); 88 + } 89 + #define pt_table_pa riscvpt_table_pa 90 + 91 + static inline pt_oaddr_t riscvpt_entry_oa(const struct pt_state *pts) 92 + { 93 + if (pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K) && 94 + pts->entry & RISCVPT_N) { 95 + PT_WARN_ON(pts->level != 0); 96 + return oalog2_mul(FIELD_GET(RISCVPT_PPN64_64K, pts->entry), 97 + ilog2(SZ_64K)); 98 + } 99 + return oalog2_mul(FIELD_GET(RISCVPT_PPN, pts->entry), PT_GRANULE_LG2SZ); 100 + } 101 + #define pt_entry_oa riscvpt_entry_oa 102 + 103 + static inline bool riscvpt_can_have_leaf(const struct pt_state *pts) 104 + { 105 + return true; 106 + } 107 + #define pt_can_have_leaf riscvpt_can_have_leaf 108 + 109 + /* Body in pt_fmt_defaults.h */ 110 + static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts); 111 + 112 + static inline unsigned int 113 + 
riscvpt_entry_num_contig_lg2(const struct pt_state *pts) 114 + { 115 + if (PT_SUPPORTED_FEATURE(PT_FEAT_RISCV_SVNAPOT_64K) && 116 + pts->entry & RISCVPT_N) { 117 + PT_WARN_ON(!pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K)); 118 + PT_WARN_ON(pts->level); 119 + return ilog2(16); 120 + } 121 + return ilog2(1); 122 + } 123 + #define pt_entry_num_contig_lg2 riscvpt_entry_num_contig_lg2 124 + 125 + static inline unsigned int riscvpt_num_items_lg2(const struct pt_state *pts) 126 + { 127 + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); 128 + } 129 + #define pt_num_items_lg2 riscvpt_num_items_lg2 130 + 131 + static inline unsigned short 132 + riscvpt_contig_count_lg2(const struct pt_state *pts) 133 + { 134 + if (pts->level == 0 && pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K)) 135 + return ilog2(16); 136 + return ilog2(1); 137 + } 138 + #define pt_contig_count_lg2 riscvpt_contig_count_lg2 139 + 140 + static inline enum pt_entry_type riscvpt_load_entry_raw(struct pt_state *pts) 141 + { 142 + const pt_riscv_entry_t *tablep = pt_cur_table(pts, pt_riscv_entry_t); 143 + pt_riscv_entry_t entry; 144 + 145 + pts->entry = entry = READ_ONCE(tablep[pts->index]); 146 + if (!(entry & RISCVPT_V)) 147 + return PT_ENTRY_EMPTY; 148 + if (pts->level == 0 || 149 + ((entry & (RISCVPT_X | RISCVPT_W | RISCVPT_R)) != 0)) 150 + return PT_ENTRY_OA; 151 + return PT_ENTRY_TABLE; 152 + } 153 + #define pt_load_entry_raw riscvpt_load_entry_raw 154 + 155 + static inline void 156 + riscvpt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, 157 + unsigned int oasz_lg2, 158 + const struct pt_write_attrs *attrs) 159 + { 160 + pt_riscv_entry_t *tablep = pt_cur_table(pts, pt_riscv_entry_t); 161 + pt_riscv_entry_t entry; 162 + 163 + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) 164 + return; 165 + 166 + entry = RISCVPT_V | 167 + FIELD_PREP(RISCVPT_PPN, log2_div(oa, PT_GRANULE_LG2SZ)) | 168 + attrs->descriptor_bits; 169 + 170 + if (pts_feature(pts, PT_FEAT_RISCV_SVNAPOT_64K) && pts->level == 0 && 171 + oasz_lg2 != PT_GRANULE_LG2SZ) { 172 + u64 *end; 173 + 174 + entry |= RISCVPT_N | RISCVPT_PPN64_64K_SZ; 175 + tablep += pts->index; 176 + end = tablep + log2_div(SZ_64K, PT_GRANULE_LG2SZ); 177 + for (; tablep != end; tablep++) 178 + WRITE_ONCE(*tablep, entry); 179 + } else { 180 + /* FIXME does riscv need this to be cmpxchg? 
*/ 181 + WRITE_ONCE(tablep[pts->index], entry); 182 + } 183 + pts->entry = entry; 184 + } 185 + #define pt_install_leaf_entry riscvpt_install_leaf_entry 186 + 187 + static inline bool riscvpt_install_table(struct pt_state *pts, 188 + pt_oaddr_t table_pa, 189 + const struct pt_write_attrs *attrs) 190 + { 191 + pt_riscv_entry_t entry; 192 + 193 + entry = RISCVPT_V | 194 + FIELD_PREP(RISCVPT_PPN, log2_div(table_pa, PT_GRANULE_LG2SZ)); 195 + return pt_table_install64(pts, entry); 196 + } 197 + #define pt_install_table riscvpt_install_table 198 + 199 + static inline void riscvpt_attr_from_entry(const struct pt_state *pts, 200 + struct pt_write_attrs *attrs) 201 + { 202 + attrs->descriptor_bits = 203 + pts->entry & (RISCVPT_R | RISCVPT_W | RISCVPT_X | RISCVPT_U | 204 + RISCVPT_G | RISCVPT_A | RISCVPT_D); 205 + } 206 + #define pt_attr_from_entry riscvpt_attr_from_entry 207 + 208 + /* --- iommu */ 209 + #include <linux/generic_pt/iommu.h> 210 + #include <linux/iommu.h> 211 + 212 + #define pt_iommu_table pt_iommu_riscv_64 213 + 214 + /* The common struct is in the per-format common struct */ 215 + static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) 216 + { 217 + return &container_of(iommu_table, struct pt_iommu_table, iommu) 218 + ->riscv_64pt.common; 219 + } 220 + 221 + static inline struct pt_iommu *iommu_from_common(struct pt_common *common) 222 + { 223 + return &container_of(common, struct pt_iommu_table, riscv_64pt.common) 224 + ->iommu; 225 + } 226 + 227 + static inline int riscvpt_iommu_set_prot(struct pt_common *common, 228 + struct pt_write_attrs *attrs, 229 + unsigned int iommu_prot) 230 + { 231 + u64 pte; 232 + 233 + pte = RISCVPT_A | RISCVPT_U; 234 + if (iommu_prot & IOMMU_WRITE) 235 + pte |= RISCVPT_W | RISCVPT_R | RISCVPT_D; 236 + if (iommu_prot & IOMMU_READ) 237 + pte |= RISCVPT_R; 238 + if (!(iommu_prot & IOMMU_NOEXEC)) 239 + pte |= RISCVPT_X; 240 + 241 + /* Caller must specify a supported combination of flags */ 242 + if (unlikely((pte & (RISCVPT_X | RISCVPT_W | RISCVPT_R)) == 0)) 243 + return -EOPNOTSUPP; 244 + 245 + attrs->descriptor_bits = pte; 246 + return 0; 247 + } 248 + #define pt_iommu_set_prot riscvpt_iommu_set_prot 249 + 250 + static inline int 251 + riscvpt_iommu_fmt_init(struct pt_iommu_riscv_64 *iommu_table, 252 + const struct pt_iommu_riscv_64_cfg *cfg) 253 + { 254 + struct pt_riscv *table = &iommu_table->riscv_64pt; 255 + 256 + switch (cfg->common.hw_max_vasz_lg2) { 257 + case 39: 258 + pt_top_set_level(&table->common, 2); 259 + break; 260 + case 48: 261 + pt_top_set_level(&table->common, 3); 262 + break; 263 + case 57: 264 + pt_top_set_level(&table->common, 4); 265 + break; 266 + default: 267 + return -EINVAL; 268 + } 269 + table->common.max_oasz_lg2 = 270 + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); 271 + return 0; 272 + } 273 + #define pt_iommu_fmt_init riscvpt_iommu_fmt_init 274 + 275 + static inline void 276 + riscvpt_iommu_fmt_hw_info(struct pt_iommu_riscv_64 *table, 277 + const struct pt_range *top_range, 278 + struct pt_iommu_riscv_64_hw_info *info) 279 + { 280 + phys_addr_t top_phys = virt_to_phys(top_range->top_table); 281 + 282 + info->ppn = oalog2_div(top_phys, PT_GRANULE_LG2SZ); 283 + PT_WARN_ON(top_phys & ~PT_TOP_PHYS_MASK); 284 + 285 + /* 286 + * See Table 3. 
Encodings of iosatp.MODE field" for DC.tx.SXL = 0: 287 + * 8 = Sv39 = top level 2 288 + * 9 = Sv48 = top level 3 289 + * 10 = Sv57 = top level 4 290 + */ 291 + info->fsc_iosatp_mode = top_range->top_level + 6; 292 + } 293 + #define pt_iommu_fmt_hw_info riscvpt_iommu_fmt_hw_info 294 + 295 + #if defined(GENERIC_PT_KUNIT) 296 + static const struct pt_iommu_riscv_64_cfg riscv_64_kunit_fmt_cfgs[] = { 297 + [0] = { .common.features = BIT(PT_FEAT_RISCV_SVNAPOT_64K), 298 + .common.hw_max_oasz_lg2 = 56, 299 + .common.hw_max_vasz_lg2 = 39 }, 300 + [1] = { .common.features = 0, 301 + .common.hw_max_oasz_lg2 = 56, 302 + .common.hw_max_vasz_lg2 = 48 }, 303 + [2] = { .common.features = BIT(PT_FEAT_RISCV_SVNAPOT_64K), 304 + .common.hw_max_oasz_lg2 = 56, 305 + .common.hw_max_vasz_lg2 = 57 }, 306 + }; 307 + #define kunit_fmt_cfgs riscv_64_kunit_fmt_cfgs 308 + enum { 309 + KUNIT_FMT_FEATURES = BIT(PT_FEAT_RISCV_SVNAPOT_64K), 310 + }; 311 + #endif 312 + 313 + #endif
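The riscv.h format above packs a 4 KiB-aligned PPN into bits 53:10 and keeps the permission bits in the low byte, with riscvpt_iommu_set_prot() always setting A and U. A self-contained sketch of that leaf packing (the address and protection flags are only example values):

#include <stdint.h>
#include <stdio.h>

#define PTE_V (1ULL << 0)
#define PTE_R (1ULL << 1)
#define PTE_W (1ULL << 2)
#define PTE_X (1ULL << 3)
#define PTE_U (1ULL << 4)
#define PTE_A (1ULL << 6)
#define PTE_D (1ULL << 7)

int main(void)
{
	uint64_t pa = 0x80001000ULL;	/* 4 KiB-aligned output address */
	/* IOMMU_READ | IOMMU_WRITE, as riscvpt_iommu_set_prot() would build it */
	uint64_t prot = PTE_A | PTE_U | PTE_R | PTE_W | PTE_D | PTE_X;
	uint64_t pte = PTE_V | ((pa >> 12) << 10) | prot;	/* PPN in bits 53:10 */

	printf("pte = 0x%016llx\n", (unsigned long long)pte);
	return 0;
}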
+105 -84
drivers/iommu/generic_pt/iommu_pt.h
··· 51 51 iommu_pages_stop_incoherent_list(free_list, 52 52 iommu_table->iommu_device); 53 53 54 - if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && 55 - iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { 56 - iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); 57 - /* 58 - * Note that the sync frees the gather's free list, so we must 59 - * not have any pages on that list that are covered by iova/len 60 - */ 54 + /* 55 + * If running in DMA-FQ mode then the unmap will be followed by an IOTLB 56 + * flush all so we need to optimize by never flushing the IOTLB here. 57 + * 58 + * For NO_GAPS the user gets to pick if flushing all or doing micro 59 + * flushes is better for their work load by choosing DMA vs DMA-FQ 60 + * operation. Drivers should also see shadow_on_flush. 61 + */ 62 + if (!iommu_iotlb_gather_queued(iotlb_gather)) { 63 + if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) && 64 + iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) { 65 + iommu_iotlb_sync(&iommu_table->domain, iotlb_gather); 66 + /* 67 + * Note that the sync frees the gather's free list, so 68 + * we must not have any pages on that list that are 69 + * covered by iova/len 70 + */ 71 + } 72 + iommu_iotlb_gather_add_range(iotlb_gather, iova, len); 61 73 } 62 74 63 - iommu_iotlb_gather_add_range(iotlb_gather, iova, len); 64 75 iommu_pages_list_splice(free_list, &iotlb_gather->freelist); 65 76 } 66 77 ··· 477 466 pt_oaddr_t oa; 478 467 unsigned int leaf_pgsize_lg2; 479 468 unsigned int leaf_level; 469 + pt_vaddr_t num_leaves; 480 470 }; 481 471 482 472 /* ··· 530 518 static int __map_range_leaf(struct pt_range *range, void *arg, 531 519 unsigned int level, struct pt_table_p *table) 532 520 { 521 + struct pt_iommu *iommu_table = iommu_from_common(range->common); 533 522 struct pt_state pts = pt_init(range, level, table); 534 523 struct pt_iommu_map_args *map = arg; 535 524 unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; 536 525 unsigned int start_index; 537 526 pt_oaddr_t oa = map->oa; 527 + unsigned int num_leaves; 528 + unsigned int orig_end; 529 + pt_vaddr_t last_va; 538 530 unsigned int step; 539 531 bool need_contig; 540 532 int ret = 0; ··· 552 536 553 537 _pt_iter_first(&pts); 554 538 start_index = pts.index; 539 + orig_end = pts.end_index; 540 + if (pts.index + map->num_leaves < pts.end_index) { 541 + /* Need to stop in the middle of the table to change sizes */ 542 + pts.end_index = pts.index + map->num_leaves; 543 + num_leaves = 0; 544 + } else { 545 + num_leaves = map->num_leaves - (pts.end_index - pts.index); 546 + } 547 + 555 548 do { 556 549 pts.type = pt_load_entry_raw(&pts); 557 550 if (pts.type != PT_ENTRY_EMPTY || need_contig) { ··· 586 561 flush_writes_range(&pts, start_index, pts.index); 587 562 588 563 map->oa = oa; 589 - return ret; 564 + map->num_leaves = num_leaves; 565 + if (ret || num_leaves) 566 + return ret; 567 + 568 + /* range->va is not valid if we reached the end of the table */ 569 + pts.index -= step; 570 + pt_index_to_va(&pts); 571 + pts.index += step; 572 + last_va = range->va + log2_to_int(leaf_pgsize_lg2); 573 + 574 + if (last_va - 1 == range->last_va) { 575 + PT_WARN_ON(pts.index != orig_end); 576 + return 0; 577 + } 578 + 579 + /* 580 + * Reached a point where the page size changed, compute the new 581 + * parameters. 
582 + */ 583 + map->leaf_pgsize_lg2 = pt_compute_best_pgsize( 584 + iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa); 585 + map->leaf_level = 586 + pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2); 587 + map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap, 588 + last_va, range->last_va, oa, 589 + map->leaf_pgsize_lg2); 590 + 591 + /* Didn't finish this table level, caller will repeat it */ 592 + if (pts.index != orig_end) { 593 + if (pts.index != start_index) 594 + pt_index_to_va(&pts); 595 + return -EAGAIN; 596 + } 597 + return 0; 590 598 } 591 599 592 600 static int __map_range(struct pt_range *range, void *arg, unsigned int level, ··· 642 584 if (pts.type != PT_ENTRY_EMPTY) 643 585 return -EADDRINUSE; 644 586 ret = pt_iommu_new_table(&pts, &map->attrs); 645 - if (ret) { 646 - /* 647 - * Racing with another thread installing a table 648 - */ 649 - if (ret == -EAGAIN) 650 - continue; 587 + /* EAGAIN on a race will loop again */ 588 + if (ret) 651 589 return ret; 652 - } 653 590 } else { 654 591 pts.table_lower = pt_table_ptr(&pts); 655 592 /* ··· 668 615 * The already present table can possibly be shared with another 669 616 * concurrent map. 670 617 */ 671 - if (map->leaf_level == level - 1) 672 - ret = pt_descend(&pts, arg, __map_range_leaf); 673 - else 674 - ret = pt_descend(&pts, arg, __map_range); 618 + do { 619 + if (map->leaf_level == level - 1) 620 + ret = pt_descend(&pts, arg, __map_range_leaf); 621 + else 622 + ret = pt_descend(&pts, arg, __map_range); 623 + } while (ret == -EAGAIN); 675 624 if (ret) 676 625 return ret; 677 626 ··· 681 626 pt_index_to_va(&pts); 682 627 if (pts.index >= pts.end_index) 683 628 break; 629 + 630 + /* 631 + * This level is currently running __map_range_leaf() which is 632 + * not correct if the target level has been updated to this 633 + * level. Have the caller invoke __map_range_leaf. 634 + */ 635 + if (map->leaf_level == level) 636 + return -EAGAIN; 684 637 } while (true); 685 638 return 0; 686 639 } ··· 860 797 static int do_map(struct pt_range *range, struct pt_common *common, 861 798 bool single_page, struct pt_iommu_map_args *map) 862 799 { 800 + int ret; 801 + 863 802 /* 864 803 * The __map_single_page() fast path does not support DMA_INCOHERENT 865 804 * flushing to keep its .text small. 866 805 */ 867 806 if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { 868 - int ret; 869 807 870 808 ret = pt_walk_range(range, __map_single_page, map); 871 809 if (ret != -EAGAIN) ··· 874 810 /* EAGAIN falls through to the full path */ 875 811 } 876 812 877 - if (map->leaf_level == range->top_level) 878 - return pt_walk_range(range, __map_range_leaf, map); 879 - return pt_walk_range(range, __map_range, map); 813 + do { 814 + if (map->leaf_level == range->top_level) 815 + ret = pt_walk_range(range, __map_range_leaf, map); 816 + else 817 + ret = pt_walk_range(range, __map_range, map); 818 + } while (ret == -EAGAIN); 819 + return ret; 880 820 } 881 821 882 - /** 883 - * map_pages() - Install translation for an IOVA range 884 - * @domain: Domain to manipulate 885 - * @iova: IO virtual address to start 886 - * @paddr: Physical/Output address to start 887 - * @pgsize: Length of each page 888 - * @pgcount: Length of the range in pgsize units starting from @iova 889 - * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO 890 - * @gfp: GFP flags for any memory allocations 891 - * @mapped: Total bytes successfully mapped 892 - * 893 - * The range starting at IOVA will have paddr installed into it. 
The caller 894 - * must specify a valid pgsize and pgcount to segment the range into compatible 895 - * blocks. 896 - * 897 - * On error the caller will probably want to invoke unmap on the range from iova 898 - * up to the amount indicated by @mapped to return the table back to an 899 - * unchanged state. 900 - * 901 - * Context: The caller must hold a write range lock that includes the whole 902 - * range. 903 - * 904 - * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were 905 - * mapped are added to @mapped, @mapped is not zerod first. 906 - */ 907 - int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, 908 - phys_addr_t paddr, size_t pgsize, size_t pgcount, 909 - int prot, gfp_t gfp, size_t *mapped) 822 + static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 823 + phys_addr_t paddr, dma_addr_t len, unsigned int prot, 824 + gfp_t gfp, size_t *mapped) 910 825 { 911 - struct pt_iommu *iommu_table = 912 - container_of(domain, struct pt_iommu, domain); 913 826 pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; 914 827 struct pt_common *common = common_from_iommu(iommu_table); 915 828 struct iommu_iotlb_gather iotlb_gather; 916 - pt_vaddr_t len = pgsize * pgcount; 917 829 struct pt_iommu_map_args map = { 918 830 .iotlb_gather = &iotlb_gather, 919 831 .oa = paddr, 920 - .leaf_pgsize_lg2 = vaffs(pgsize), 921 832 }; 922 833 bool single_page = false; 923 834 struct pt_range range; ··· 920 881 return ret; 921 882 922 883 /* Calculate target page size and level for the leaves */ 923 - if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && 924 - pgcount == 1) { 884 + if (pt_has_system_page_size(common) && len == PAGE_SIZE) { 925 885 PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); 926 886 if (log2_mod(iova | paddr, PAGE_SHIFT)) 927 887 return -ENXIO; 928 888 map.leaf_pgsize_lg2 = PAGE_SHIFT; 929 889 map.leaf_level = 0; 890 + map.num_leaves = 1; 930 891 single_page = true; 931 892 } else { 932 893 map.leaf_pgsize_lg2 = pt_compute_best_pgsize( ··· 935 896 return -ENXIO; 936 897 map.leaf_level = 937 898 pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); 899 + map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va, 900 + range.last_va, paddr, 901 + map.leaf_pgsize_lg2); 938 902 } 939 903 940 904 ret = check_map_range(iommu_table, &range, &map); ··· 960 918 *mapped += map.oa - paddr; 961 919 return ret; 962 920 } 963 - EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); 964 921 965 922 struct pt_unmap_args { 966 923 struct iommu_pages_list free_list; ··· 1061 1020 return ret; 1062 1021 } 1063 1022 1064 - /** 1065 - * unmap_pages() - Make a range of IOVA empty/not present 1066 - * @domain: Domain to manipulate 1067 - * @iova: IO virtual address to start 1068 - * @pgsize: Length of each page 1069 - * @pgcount: Length of the range in pgsize units starting from @iova 1070 - * @iotlb_gather: Gather struct that must be flushed on return 1071 - * 1072 - * unmap_pages() will remove a translation created by map_pages(). It cannot 1073 - * subdivide a mapping created by map_pages(), so it should be called with IOVA 1074 - * ranges that match those passed to map_pages(). The IOVA range can aggregate 1075 - * contiguous map_pages() calls so long as no individual range is split. 1076 - * 1077 - * Context: The caller must hold a write range lock that includes 1078 - * the whole range. 1079 - * 1080 - * Returns: Number of bytes of VA unmapped. iova + res will be the point 1081 - * unmapping stopped. 
1082 - */ 1083 - size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, 1084 - size_t pgsize, size_t pgcount, 1023 + static size_t NS(unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 1024 + dma_addr_t len, 1085 1025 struct iommu_iotlb_gather *iotlb_gather) 1086 1026 { 1087 - struct pt_iommu *iommu_table = 1088 - container_of(domain, struct pt_iommu, domain); 1089 1027 struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( 1090 1028 unmap.free_list) }; 1091 - pt_vaddr_t len = pgsize * pgcount; 1092 1029 struct pt_range range; 1093 1030 int ret; 1094 1031 ··· 1081 1062 1082 1063 return unmap.unmapped; 1083 1064 } 1084 - EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); 1085 1065 1086 1066 static void NS(get_info)(struct pt_iommu *iommu_table, 1087 1067 struct pt_iommu_info *info) ··· 1128 1110 } 1129 1111 1130 1112 static const struct pt_iommu_ops NS(ops) = { 1113 + .map_range = NS(map_range), 1114 + .unmap_range = NS(unmap_range), 1131 1115 #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ 1132 1116 IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) 1133 1117 .set_dirty = NS(set_dirty), ··· 1192 1172 1193 1173 domain->type = __IOMMU_DOMAIN_PAGING; 1194 1174 domain->pgsize_bitmap = info.pgsize_bitmap; 1175 + domain->is_iommupt = true; 1195 1176 1196 1177 if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) 1197 1178 range = _pt_top_range(common,
+12
drivers/iommu/generic_pt/kunit_generic_pt.h
··· 312 312 } 313 313 } 314 314 315 + static void test_pgsz_count(struct kunit *test) 316 + { 317 + KUNIT_EXPECT_EQ(test, 318 + pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K)), 319 + SZ_1G / SZ_4K); 320 + KUNIT_EXPECT_EQ(test, 321 + pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K, 322 + ilog2(SZ_4K)), 323 + (SZ_2M - SZ_4K) / SZ_4K); 324 + } 325 + 315 326 /* 316 327 * Check that pt_install_table() and pt_table_pa() match 317 328 */ ··· 781 770 KUNIT_CASE_FMT(test_init), 782 771 KUNIT_CASE_FMT(test_bitops), 783 772 KUNIT_CASE_FMT(test_best_pgsize), 773 + KUNIT_CASE_FMT(test_pgsz_count), 784 774 KUNIT_CASE_FMT(test_table_ptr), 785 775 KUNIT_CASE_FMT(test_max_va), 786 776 KUNIT_CASE_FMT(test_table_radix),
+22
drivers/iommu/generic_pt/pt_iter.h
··· 569 569 return pgsz_lg2; 570 570 } 571 571 572 + /* 573 + * Return the number of pgsize_lg2 leaf entries that can be mapped for 574 + * va to oa. This accounts for any requirement to reduce or increase the page 575 + * size across the VA range. 576 + */ 577 + static inline pt_vaddr_t pt_pgsz_count(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, 578 + pt_vaddr_t last_va, pt_oaddr_t oa, 579 + unsigned int pgsize_lg2) 580 + { 581 + pt_vaddr_t len = last_va - va + 1; 582 + pt_vaddr_t next_pgsizes = log2_set_mod(pgsz_bitmap, 0, pgsize_lg2 + 1); 583 + 584 + if (next_pgsizes) { 585 + unsigned int next_pgsize_lg2 = vaffs(next_pgsizes); 586 + 587 + if (log2_mod(va ^ oa, next_pgsize_lg2) == 0) 588 + len = min(len, log2_set_mod_max(va, next_pgsize_lg2) - 589 + va + 1); 590 + } 591 + return log2_div(len, pgsize_lg2); 592 + } 593 + 572 594 #define _PT_MAKE_CALL_LEVEL(fn) \ 573 595 static __always_inline int fn(struct pt_range *range, void *arg, \ 574 596 unsigned int level, \
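pt_pgsz_count() above caps a run of same-sized leaves at the point where a larger page size from the bitmap becomes usable for both the VA and the OA. A self-contained re-derivation of that arithmetic, checked against the two kunit expectations added in kunit_generic_pt.h (the helper name and open-coded bit operations stand in for the kernel's log2_* macros):

#include <stdint.h>
#include <stdio.h>

/* Count how many (1 << pgsize_lg2) leaves fit before a larger, mutually
 * aligned page size from 'bitmap' could take over. */
static uint64_t pgsz_count(uint64_t bitmap, uint64_t va, uint64_t last_va,
			   uint64_t oa, unsigned int pgsize_lg2)
{
	uint64_t len = last_va - va + 1;
	uint64_t bigger = bitmap & ~((2ULL << pgsize_lg2) - 1);

	if (bigger) {
		unsigned int next_lg2 = __builtin_ctzll(bigger);

		/* Only stop early if VA and OA share alignment at that size */
		if (((va ^ oa) & ((1ULL << next_lg2) - 1)) == 0) {
			uint64_t run = (va | ((1ULL << next_lg2) - 1)) - va + 1;

			if (run < len)
				len = run;
		}
	}
	return len >> pgsize_lg2;
}

int main(void)
{
	/* 4K-only bitmap over 1G: prints 262144 (SZ_1G / SZ_4K) */
	printf("%llu\n", (unsigned long long)
	       pgsz_count(0x1000, 0, (1ULL << 30) - 1, 0, 12));
	/* 4K|2M bitmap starting at 4K: prints 511 ((SZ_2M - SZ_4K) / SZ_4K) */
	printf("%llu\n", (unsigned long long)
	       pgsz_count(0x201000, 0x1000, (1ULL << 30) - 1, 0x1000, 12));
	return 0;
}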
+26 -27
drivers/iommu/intel/cache.c
··· 255 255 256 256 static unsigned long calculate_psi_aligned_address(unsigned long start, 257 257 unsigned long end, 258 - unsigned long *_pages, 259 258 unsigned long *_mask) 260 259 { 261 260 unsigned long pages = aligned_nrpages(start, end - start + 1); ··· 280 281 */ 281 282 shared_bits = ~(pfn ^ end_pfn) & ~bitmask; 282 283 mask = shared_bits ? __ffs(shared_bits) : MAX_AGAW_PFN_WIDTH; 283 - aligned_pages = 1UL << mask; 284 284 } 285 285 286 - *_pages = aligned_pages; 287 286 *_mask = mask; 288 287 289 288 return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask); ··· 327 330 qi_batch_increment_index(iommu, batch); 328 331 } 329 332 333 + static void qi_batch_add_piotlb_all(struct intel_iommu *iommu, u16 did, 334 + u32 pasid, struct qi_batch *batch) 335 + { 336 + qi_desc_piotlb_all(did, pasid, &batch->descs[batch->index]); 337 + qi_batch_increment_index(iommu, batch); 338 + } 339 + 330 340 static void qi_batch_add_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, 331 - u64 addr, unsigned long npages, bool ih, 341 + u64 addr, unsigned int size_order, bool ih, 332 342 struct qi_batch *batch) 333 343 { 334 - /* 335 - * npages == -1 means a PASID-selective invalidation, otherwise, 336 - * a positive value for Page-selective-within-PASID invalidation. 337 - * 0 is not a valid input. 338 - */ 339 - if (!npages) 340 - return; 341 - 342 - qi_desc_piotlb(did, pasid, addr, npages, ih, &batch->descs[batch->index]); 344 + qi_desc_piotlb(did, pasid, addr, size_order, ih, 345 + &batch->descs[batch->index]); 343 346 qi_batch_increment_index(iommu, batch); 344 347 } 345 348 ··· 368 371 } 369 372 370 373 static void cache_tag_flush_iotlb(struct dmar_domain *domain, struct cache_tag *tag, 371 - unsigned long addr, unsigned long pages, 372 - unsigned long mask, int ih) 374 + unsigned long addr, unsigned long mask, int ih) 373 375 { 374 376 struct intel_iommu *iommu = tag->iommu; 375 377 u64 type = DMA_TLB_PSI_FLUSH; 376 378 377 379 if (intel_domain_use_piotlb(domain)) { 378 - qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, addr, 379 - pages, ih, domain->qi_batch); 380 + if (mask >= MAX_AGAW_PFN_WIDTH) 381 + qi_batch_add_piotlb_all(iommu, tag->domain_id, 382 + tag->pasid, domain->qi_batch); 383 + else 384 + qi_batch_add_piotlb(iommu, tag->domain_id, tag->pasid, 385 + addr, mask, ih, domain->qi_batch); 380 386 return; 381 387 } 382 388 ··· 388 388 * is too big. 
389 389 */ 390 390 if (!cap_pgsel_inv(iommu->cap) || 391 - mask > cap_max_amask_val(iommu->cap) || pages == -1) { 391 + mask > cap_max_amask_val(iommu->cap)) { 392 392 addr = 0; 393 393 mask = 0; 394 394 ih = 0; ··· 437 437 unsigned long end, int ih) 438 438 { 439 439 struct intel_iommu *iommu = NULL; 440 - unsigned long pages, mask, addr; 440 + unsigned long mask, addr; 441 441 struct cache_tag *tag; 442 442 unsigned long flags; 443 443 444 444 if (start == 0 && end == ULONG_MAX) { 445 445 addr = 0; 446 - pages = -1; 447 446 mask = MAX_AGAW_PFN_WIDTH; 448 447 } else { 449 - addr = calculate_psi_aligned_address(start, end, &pages, &mask); 448 + addr = calculate_psi_aligned_address(start, end, &mask); 450 449 } 451 450 452 451 spin_lock_irqsave(&domain->cache_lock, flags); ··· 457 458 switch (tag->type) { 458 459 case CACHE_TAG_IOTLB: 459 460 case CACHE_TAG_NESTING_IOTLB: 460 - cache_tag_flush_iotlb(domain, tag, addr, pages, mask, ih); 461 + cache_tag_flush_iotlb(domain, tag, addr, mask, ih); 461 462 break; 462 463 case CACHE_TAG_NESTING_DEVTLB: 463 464 /* ··· 475 476 break; 476 477 } 477 478 478 - trace_cache_tag_flush_range(tag, start, end, addr, pages, mask); 479 + trace_cache_tag_flush_range(tag, start, end, addr, mask); 479 480 } 480 481 qi_batch_flush_descs(iommu, domain->qi_batch); 481 482 spin_unlock_irqrestore(&domain->cache_lock, flags); ··· 505 506 unsigned long end) 506 507 { 507 508 struct intel_iommu *iommu = NULL; 508 - unsigned long pages, mask, addr; 509 + unsigned long mask, addr; 509 510 struct cache_tag *tag; 510 511 unsigned long flags; 511 512 512 - addr = calculate_psi_aligned_address(start, end, &pages, &mask); 513 + addr = calculate_psi_aligned_address(start, end, &mask); 513 514 514 515 spin_lock_irqsave(&domain->cache_lock, flags); 515 516 list_for_each_entry(tag, &domain->cache_tags, node) { ··· 525 526 526 527 if (tag->type == CACHE_TAG_IOTLB || 527 528 tag->type == CACHE_TAG_NESTING_IOTLB) 528 - cache_tag_flush_iotlb(domain, tag, addr, pages, mask, 0); 529 + cache_tag_flush_iotlb(domain, tag, addr, mask, 0); 529 530 530 - trace_cache_tag_flush_range_np(tag, start, end, addr, pages, mask); 531 + trace_cache_tag_flush_range_np(tag, start, end, addr, mask); 531 532 } 532 533 qi_batch_flush_descs(iommu, domain->qi_batch); 533 534 spin_unlock_irqrestore(&domain->cache_lock, flags);
+9 -9
drivers/iommu/intel/debugfs.c
··· 133 133 */ 134 134 raw_spin_lock_irqsave(&iommu->register_lock, flag); 135 135 for (i = 0 ; i < ARRAY_SIZE(iommu_regs_32); i++) { 136 - value = dmar_readl(iommu->reg + iommu_regs_32[i].offset); 136 + value = readl(iommu->reg + iommu_regs_32[i].offset); 137 137 seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", 138 138 iommu_regs_32[i].regs, iommu_regs_32[i].offset, 139 139 value); 140 140 } 141 141 for (i = 0 ; i < ARRAY_SIZE(iommu_regs_64); i++) { 142 - value = dmar_readq(iommu->reg + iommu_regs_64[i].offset); 142 + value = readq(iommu->reg + iommu_regs_64[i].offset); 143 143 seq_printf(m, "%-16s\t0x%02x\t\t0x%016llx\n", 144 144 iommu_regs_64[i].regs, iommu_regs_64[i].offset, 145 145 value); ··· 247 247 tbl_wlk.ctx_entry = context; 248 248 m->private = &tbl_wlk; 249 249 250 - if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) { 250 + if (readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) { 251 251 pasid_dir_ptr = context->lo & VTD_PAGE_MASK; 252 252 pasid_dir_size = get_pasid_dir_size(context); 253 253 pasid_dir_walk(m, pasid_dir_ptr, pasid_dir_size); ··· 285 285 286 286 rcu_read_lock(); 287 287 for_each_active_iommu(iommu, drhd) { 288 - sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); 288 + sts = readl(iommu->reg + DMAR_GSTS_REG); 289 289 if (!(sts & DMA_GSTS_TES)) { 290 290 seq_printf(m, "DMA Remapping is not enabled on %s\n", 291 291 iommu->name); ··· 364 364 if (seg != iommu->segment) 365 365 continue; 366 366 367 - sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); 367 + sts = readl(iommu->reg + DMAR_GSTS_REG); 368 368 if (!(sts & DMA_GSTS_TES)) { 369 369 seq_printf(m, "DMA Remapping is not enabled on %s\n", 370 370 iommu->name); 371 371 continue; 372 372 } 373 - if (dmar_readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) 373 + if (readq(iommu->reg + DMAR_RTADDR_REG) & DMA_RTADDR_SMT) 374 374 scalable = true; 375 375 else 376 376 scalable = false; ··· 538 538 raw_spin_lock_irqsave(&qi->q_lock, flags); 539 539 seq_printf(m, " Base: 0x%llx\tHead: %lld\tTail: %lld\n", 540 540 (u64)virt_to_phys(qi->desc), 541 - dmar_readq(iommu->reg + DMAR_IQH_REG) >> shift, 542 - dmar_readq(iommu->reg + DMAR_IQT_REG) >> shift); 541 + readq(iommu->reg + DMAR_IQH_REG) >> shift, 542 + readq(iommu->reg + DMAR_IQT_REG) >> shift); 543 543 invalidation_queue_entry_show(m, iommu); 544 544 raw_spin_unlock_irqrestore(&qi->q_lock, flags); 545 545 seq_putc(m, '\n'); ··· 620 620 seq_printf(m, "Remapped Interrupt supported on IOMMU: %s\n", 621 621 iommu->name); 622 622 623 - sts = dmar_readl(iommu->reg + DMAR_GSTS_REG); 623 + sts = readl(iommu->reg + DMAR_GSTS_REG); 624 624 if (iommu->ir_table && (sts & DMA_GSTS_IRES)) { 625 625 irta = virt_to_phys(iommu->ir_table->base); 626 626 seq_printf(m, " IR table address:%llx\n", irta);
+16 -27
drivers/iommu/intel/dmar.c
··· 899 899 return -EINVAL; 900 900 } 901 901 902 - cap = dmar_readq(addr + DMAR_CAP_REG); 903 - ecap = dmar_readq(addr + DMAR_ECAP_REG); 902 + cap = readq(addr + DMAR_CAP_REG); 903 + ecap = readq(addr + DMAR_ECAP_REG); 904 904 905 905 if (arg) 906 906 iounmap(addr); ··· 982 982 goto release; 983 983 } 984 984 985 - iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG); 986 - iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG); 985 + iommu->cap = readq(iommu->reg + DMAR_CAP_REG); 986 + iommu->ecap = readq(iommu->reg + DMAR_ECAP_REG); 987 987 988 988 if (iommu->cap == (uint64_t)-1 && iommu->ecap == (uint64_t)-1) { 989 989 err = -EINVAL; ··· 1017 1017 int i; 1018 1018 1019 1019 for (i = 0; i < DMA_MAX_NUM_ECMDCAP; i++) { 1020 - iommu->ecmdcap[i] = dmar_readq(iommu->reg + DMAR_ECCAP_REG + 1021 - i * DMA_ECMD_REG_STEP); 1020 + iommu->ecmdcap[i] = readq(iommu->reg + DMAR_ECCAP_REG + 1021 + i * DMA_ECMD_REG_STEP); 1022 1022 } 1023 1023 } 1024 1024 ··· 1239 1239 1240 1240 static void qi_dump_fault(struct intel_iommu *iommu, u32 fault) 1241 1241 { 1242 - unsigned int head = dmar_readl(iommu->reg + DMAR_IQH_REG); 1243 - u64 iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); 1242 + unsigned int head = readl(iommu->reg + DMAR_IQH_REG); 1243 + u64 iqe_err = readq(iommu->reg + DMAR_IQER_REG); 1244 1244 struct qi_desc *desc = iommu->qi->desc + head; 1245 1245 1246 1246 if (fault & DMA_FSTS_IQE) ··· 1321 1321 * SID field is valid only when the ITE field is Set in FSTS_REG 1322 1322 * see Intel VT-d spec r4.1, section 11.4.9.9 1323 1323 */ 1324 - iqe_err = dmar_readq(iommu->reg + DMAR_IQER_REG); 1324 + iqe_err = readq(iommu->reg + DMAR_IQER_REG); 1325 1325 ite_sid = DMAR_IQER_REG_ITESID(iqe_err); 1326 1326 1327 1327 writel(DMA_FSTS_ITE, iommu->reg + DMAR_FSTS_REG); ··· 1550 1550 qi_submit_sync(iommu, &desc, 1, 0); 1551 1551 } 1552 1552 1553 - /* PASID-based IOTLB invalidation */ 1554 - void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, 1555 - unsigned long npages, bool ih) 1553 + /* PASID-selective IOTLB invalidation */ 1554 + void qi_flush_piotlb_all(struct intel_iommu *iommu, u16 did, u32 pasid) 1556 1555 { 1557 - struct qi_desc desc = {.qw2 = 0, .qw3 = 0}; 1556 + struct qi_desc desc = {}; 1558 1557 1559 - /* 1560 - * npages == -1 means a PASID-selective invalidation, otherwise, 1561 - * a positive value for Page-selective-within-PASID invalidation. 1562 - * 0 is not a valid input. 1563 - */ 1564 - if (WARN_ON(!npages)) { 1565 - pr_err("Invalid input npages = %ld\n", npages); 1566 - return; 1567 - } 1568 - 1569 - qi_desc_piotlb(did, pasid, addr, npages, ih, &desc); 1558 + qi_desc_piotlb_all(did, pasid, &desc); 1570 1559 qi_submit_sync(iommu, &desc, 1, 0); 1571 1560 } 1572 1561 ··· 1650 1661 /* write zero to the tail reg */ 1651 1662 writel(0, iommu->reg + DMAR_IQT_REG); 1652 1663 1653 - dmar_writeq(iommu->reg + DMAR_IQA_REG, val); 1664 + writeq(val, iommu->reg + DMAR_IQA_REG); 1654 1665 1655 1666 iommu->gcmd |= DMA_GCMD_QIE; 1656 1667 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); ··· 1969 1980 source_id = dma_frcd_source_id(data); 1970 1981 1971 1982 pasid_present = dma_frcd_pasid_present(data); 1972 - guest_addr = dmar_readq(iommu->reg + reg + 1973 - fault_index * PRIMARY_FAULT_REG_LEN); 1983 + guest_addr = readq(iommu->reg + reg + 1984 + fault_index * PRIMARY_FAULT_REG_LEN); 1974 1985 guest_addr = dma_frcd_page_addr(guest_addr); 1975 1986 } 1976 1987
+31 -27
drivers/iommu/intel/iommu.c
··· 697 697 addr |= DMA_RTADDR_SMT; 698 698 699 699 raw_spin_lock_irqsave(&iommu->register_lock, flag); 700 - dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr); 700 + writeq(addr, iommu->reg + DMAR_RTADDR_REG); 701 701 702 702 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); 703 703 ··· 765 765 val |= DMA_CCMD_ICC; 766 766 767 767 raw_spin_lock_irqsave(&iommu->register_lock, flag); 768 - dmar_writeq(iommu->reg + DMAR_CCMD_REG, val); 768 + writeq(val, iommu->reg + DMAR_CCMD_REG); 769 769 770 770 /* Make sure hardware complete it */ 771 771 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, 772 - dmar_readq, (!(val & DMA_CCMD_ICC)), val); 772 + readq, (!(val & DMA_CCMD_ICC)), val); 773 773 774 774 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 775 775 } ··· 806 806 raw_spin_lock_irqsave(&iommu->register_lock, flag); 807 807 /* Note: Only uses first TLB reg currently */ 808 808 if (val_iva) 809 - dmar_writeq(iommu->reg + tlb_offset, val_iva); 810 - dmar_writeq(iommu->reg + tlb_offset + 8, val); 809 + writeq(val_iva, iommu->reg + tlb_offset); 810 + writeq(val, iommu->reg + tlb_offset + 8); 811 811 812 812 /* Make sure hardware complete it */ 813 813 IOMMU_WAIT_OP(iommu, tlb_offset + 8, 814 - dmar_readq, (!(val & DMA_TLB_IVT)), val); 814 + readq, (!(val & DMA_TLB_IVT)), val); 815 815 816 816 raw_spin_unlock_irqrestore(&iommu->register_lock, flag); 817 817 ··· 1533 1533 int bus, ret; 1534 1534 bool new_ext, ext; 1535 1535 1536 - rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG); 1536 + rtaddr_reg = readq(iommu->reg + DMAR_RTADDR_REG); 1537 1537 ext = !!(rtaddr_reg & DMA_RTADDR_SMT); 1538 1538 new_ext = !!sm_supported(iommu); 1539 1539 ··· 3212 3212 3213 3213 switch (cap) { 3214 3214 case IOMMU_CAP_CACHE_COHERENCY: 3215 - case IOMMU_CAP_DEFERRED_FLUSH: 3216 3215 return true; 3217 3216 case IOMMU_CAP_PRE_BOOT_PROTECTION: 3218 3217 return dmar_platform_optin(); ··· 3219 3220 return ecap_sc_support(info->iommu->ecap); 3220 3221 case IOMMU_CAP_DIRTY_TRACKING: 3221 3222 return ssads_supported(info->iommu); 3223 + case IOMMU_CAP_PCI_ATS_SUPPORTED: 3224 + return info->ats_supported; 3222 3225 default: 3223 3226 return false; 3224 3227 } ··· 3619 3618 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) 3620 3619 return -EOPNOTSUPP; 3621 3620 3622 - if (domain->dirty_ops) 3623 - return -EINVAL; 3624 - 3625 3621 if (context_copied(iommu, info->bus, info->devfn)) 3626 3622 return -EBUSY; 3627 3623 ··· 3682 3684 return vtd; 3683 3685 } 3684 3686 3685 - /* 3686 - * Set dirty tracking for the device list of a domain. The caller must 3687 - * hold the domain->lock when calling it. 3688 - */ 3689 - static int device_set_dirty_tracking(struct list_head *devices, bool enable) 3687 + /* Set dirty tracking for the devices that the domain has been attached. 
*/ 3688 + static int domain_set_dirty_tracking(struct dmar_domain *domain, bool enable) 3690 3689 { 3691 3690 struct device_domain_info *info; 3691 + struct dev_pasid_info *dev_pasid; 3692 3692 int ret = 0; 3693 3693 3694 - list_for_each_entry(info, devices, link) { 3694 + lockdep_assert_held(&domain->lock); 3695 + 3696 + list_for_each_entry(info, &domain->devices, link) { 3695 3697 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 3696 3698 IOMMU_NO_PASID, enable); 3699 + if (ret) 3700 + return ret; 3701 + } 3702 + 3703 + list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) { 3704 + info = dev_iommu_priv_get(dev_pasid->dev); 3705 + ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev, 3706 + dev_pasid->pasid, enable); 3697 3707 if (ret) 3698 3708 break; 3699 3709 } ··· 3719 3713 spin_lock(&domain->s1_lock); 3720 3714 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 3721 3715 spin_lock_irqsave(&s1_domain->lock, flags); 3722 - ret = device_set_dirty_tracking(&s1_domain->devices, enable); 3716 + ret = domain_set_dirty_tracking(s1_domain, enable); 3723 3717 spin_unlock_irqrestore(&s1_domain->lock, flags); 3724 3718 if (ret) 3725 3719 goto err_unwind; ··· 3730 3724 err_unwind: 3731 3725 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) { 3732 3726 spin_lock_irqsave(&s1_domain->lock, flags); 3733 - device_set_dirty_tracking(&s1_domain->devices, 3734 - domain->dirty_tracking); 3727 + domain_set_dirty_tracking(s1_domain, domain->dirty_tracking); 3735 3728 spin_unlock_irqrestore(&s1_domain->lock, flags); 3736 3729 } 3737 3730 spin_unlock(&domain->s1_lock); ··· 3747 3742 if (dmar_domain->dirty_tracking == enable) 3748 3743 goto out_unlock; 3749 3744 3750 - ret = device_set_dirty_tracking(&dmar_domain->devices, enable); 3745 + ret = domain_set_dirty_tracking(dmar_domain, enable); 3751 3746 if (ret) 3752 3747 goto err_unwind; 3753 3748 ··· 3764 3759 return 0; 3765 3760 3766 3761 err_unwind: 3767 - device_set_dirty_tracking(&dmar_domain->devices, 3768 - dmar_domain->dirty_tracking); 3762 + domain_set_dirty_tracking(dmar_domain, dmar_domain->dirty_tracking); 3769 3763 spin_unlock(&dmar_domain->lock); 3770 3764 return ret; 3771 3765 } ··· 4189 4185 4190 4186 raw_spin_lock_irqsave(&iommu->register_lock, flags); 4191 4187 4192 - res = dmar_readq(iommu->reg + DMAR_ECRSP_REG); 4188 + res = readq(iommu->reg + DMAR_ECRSP_REG); 4193 4189 if (res & DMA_ECMD_ECRSP_IP) { 4194 4190 ret = -EBUSY; 4195 4191 goto err; ··· 4202 4198 * - It's not invoked in any critical path. The extra MMIO 4203 4199 * write doesn't bring any performance concerns. 4204 4200 */ 4205 - dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob); 4206 - dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT)); 4201 + writeq(ob, iommu->reg + DMAR_ECEO_REG); 4202 + writeq(ecmd | (oa << DMA_ECMD_OA_SHIFT), iommu->reg + DMAR_ECMD_REG); 4207 4203 4208 - IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq, 4204 + IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, readq, 4209 4205 !(res & DMA_ECMD_ECRSP_IP), res); 4210 4206 4211 4207 if (res & DMA_ECMD_ECRSP_IP) {
+18 -29
drivers/iommu/intel/iommu.h
··· 148 148 149 149 #define OFFSET_STRIDE (9) 150 150 151 - #define dmar_readq(a) readq(a) 152 - #define dmar_writeq(a,v) writeq(v,a) 153 - #define dmar_readl(a) readl(a) 154 - #define dmar_writel(a, v) writel(v, a) 155 - 156 151 #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) 157 152 #define DMAR_VER_MINOR(v) ((v) & 0x0f) 158 153 ··· 1077 1082 desc->qw3 = 0; 1078 1083 } 1079 1084 1085 + /* PASID-selective IOTLB invalidation */ 1086 + static inline void qi_desc_piotlb_all(u16 did, u32 pasid, struct qi_desc *desc) 1087 + { 1088 + desc->qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | 1089 + QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | QI_EIOTLB_TYPE; 1090 + desc->qw1 = 0; 1091 + } 1092 + 1093 + /* Page-selective-within-PASID IOTLB invalidation */ 1080 1094 static inline void qi_desc_piotlb(u16 did, u32 pasid, u64 addr, 1081 - unsigned long npages, bool ih, 1095 + unsigned int size_order, bool ih, 1082 1096 struct qi_desc *desc) 1083 1097 { 1084 - if (npages == -1) { 1085 - desc->qw0 = QI_EIOTLB_PASID(pasid) | 1086 - QI_EIOTLB_DID(did) | 1087 - QI_EIOTLB_GRAN(QI_GRAN_NONG_PASID) | 1088 - QI_EIOTLB_TYPE; 1089 - desc->qw1 = 0; 1090 - } else { 1091 - int mask = ilog2(__roundup_pow_of_two(npages)); 1092 - unsigned long align = (1ULL << (VTD_PAGE_SHIFT + mask)); 1093 - 1094 - if (WARN_ON_ONCE(!IS_ALIGNED(addr, align))) 1095 - addr = ALIGN_DOWN(addr, align); 1096 - 1097 - desc->qw0 = QI_EIOTLB_PASID(pasid) | 1098 - QI_EIOTLB_DID(did) | 1099 - QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | 1100 - QI_EIOTLB_TYPE; 1101 - desc->qw1 = QI_EIOTLB_ADDR(addr) | 1102 - QI_EIOTLB_IH(ih) | 1103 - QI_EIOTLB_AM(mask); 1104 - } 1098 + /* 1099 + * calculate_psi_aligned_address() must be used for addr and size_order 1100 + */ 1101 + desc->qw0 = QI_EIOTLB_PASID(pasid) | QI_EIOTLB_DID(did) | 1102 + QI_EIOTLB_GRAN(QI_GRAN_PSI_PASID) | QI_EIOTLB_TYPE; 1103 + desc->qw1 = QI_EIOTLB_ADDR(addr) | QI_EIOTLB_IH(ih) | 1104 + QI_EIOTLB_AM(size_order); 1105 1105 } 1106 1106 1107 1107 static inline void qi_desc_dev_iotlb_pasid(u16 sid, u16 pfsid, u32 pasid, ··· 1158 1168 void qi_flush_dev_iotlb(struct intel_iommu *iommu, u16 sid, u16 pfsid, 1159 1169 u16 qdep, u64 addr, unsigned mask); 1160 1170 1161 - void qi_flush_piotlb(struct intel_iommu *iommu, u16 did, u32 pasid, u64 addr, 1162 - unsigned long npages, bool ih); 1171 + void qi_flush_piotlb_all(struct intel_iommu *iommu, u16 did, u32 pasid); 1163 1172 1164 1173 void qi_flush_dev_iotlb_pasid(struct intel_iommu *iommu, u16 sid, u16 pfsid, 1165 1174 u32 pasid, u16 qdep, u64 addr,
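qi_desc_piotlb() now takes a precomputed size_order (the descriptor's address-mask field) instead of a raw page count, with the PASID-wide case split out into qi_desc_piotlb_all(), so callers are expected to pass an address and order already produced by calculate_psi_aligned_address(). A rough userspace sketch of the arithmetic the removed helper used to do inline; psi_size_order() and psi_aligned_addr() are invented names for illustration, not the kernel helpers:

#include <stdint.h>
#include <stdio.h>

#define VTD_PAGE_SHIFT 12

/* Hypothetical helper: log2 of npages rounded up to a power of two (the AM/size_order value). */
static unsigned int psi_size_order(unsigned long npages)
{
	unsigned int order = 0;

	while ((1UL << order) < npages)
		order++;
	return order;
}

/* Hypothetical helper: align the start address down to the granule implied by size_order. */
static uint64_t psi_aligned_addr(uint64_t addr, unsigned int size_order)
{
	uint64_t align = 1ULL << (VTD_PAGE_SHIFT + size_order);

	return addr & ~(align - 1);
}

int main(void)
{
	unsigned long npages = 9;	/* rounds up to 16 pages */
	unsigned int order = psi_size_order(npages);

	printf("size_order=%u addr=0x%llx\n", order,
	       (unsigned long long)psi_aligned_addr(0x12345000ULL, order));
	return 0;	/* prints size_order=4 addr=0x12340000 */
}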
+3 -3
drivers/iommu/intel/irq_remapping.c
··· 422 422 u64 irta; 423 423 424 424 /* Check whether the old ir-table has the same size as ours */ 425 - irta = dmar_readq(iommu->reg + DMAR_IRTA_REG); 425 + irta = readq(iommu->reg + DMAR_IRTA_REG); 426 426 if ((irta & INTR_REMAP_TABLE_REG_SIZE_MASK) 427 427 != INTR_REMAP_TABLE_REG_SIZE) 428 428 return -EINVAL; ··· 465 465 466 466 raw_spin_lock_irqsave(&iommu->register_lock, flags); 467 467 468 - dmar_writeq(iommu->reg + DMAR_IRTA_REG, 469 - (addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE); 468 + writeq((addr) | IR_X2APIC_MODE(mode) | INTR_REMAP_TABLE_REG_SIZE, 469 + iommu->reg + DMAR_IRTA_REG); 470 470 471 471 /* Set interrupt-remapping table pointer */ 472 472 writel(iommu->gcmd | DMA_GCMD_SIRTP, iommu->reg + DMAR_GCMD_REG);
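Because the removed dmar_readq()/dmar_writeq()/dmar_readl()/dmar_writel() wrappers only reordered arguments before calling the standard MMIO accessors, the conversions in this and the neighbouring files are a mechanical swap: the value moves to the first argument and the register address to the second. A stand-alone sketch of the equivalence, using a mock writeq() since the real accessor cannot run in plain userspace:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's writeq(value, address); the real one performs an MMIO store. */
static void writeq(uint64_t val, void *addr)
{
	printf("writing 0x%llx to %p\n", (unsigned long long)val, addr);
}

/* The removed wrapper took the address first and forwarded with the operands swapped. */
#define dmar_writeq(a, v) writeq(v, a)

int main(void)
{
	uint64_t fake_reg = 0;

	dmar_writeq(&fake_reg, 0x1234ULL);	/* old spelling */
	writeq(0x1234ULL, &fake_reg);		/* converted spelling, same operation */
	return 0;
}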
+3 -3
drivers/iommu/intel/pasid.c
··· 282 282 pasid_cache_invalidation_with_pasid(iommu, did, pasid); 283 283 284 284 if (pgtt == PASID_ENTRY_PGTT_PT || pgtt == PASID_ENTRY_PGTT_FL_ONLY) 285 - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); 285 + qi_flush_piotlb_all(iommu, did, pasid); 286 286 else 287 287 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); 288 288 ··· 308 308 309 309 if (cap_caching_mode(iommu->cap)) { 310 310 pasid_cache_invalidation_with_pasid(iommu, did, pasid); 311 - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); 311 + qi_flush_piotlb_all(iommu, did, pasid); 312 312 } else { 313 313 iommu_flush_write_buffer(iommu); 314 314 } ··· 342 342 * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions 343 343 */ 344 344 pasid_cache_invalidation_with_pasid(iommu, did, pasid); 345 - qi_flush_piotlb(iommu, did, pasid, 0, -1, 0); 345 + qi_flush_piotlb_all(iommu, did, pasid); 346 346 347 347 devtlb_invalidation_with_pasid(iommu, dev, pasid); 348 348 }
+25 -25
drivers/iommu/intel/perfmon.c
··· 99 99 #define iommu_pmu_set_filter(_name, _config, _filter, _idx, _econfig) \ 100 100 { \ 101 101 if ((iommu_pmu->filter & _filter) && iommu_pmu_en_##_name(_econfig)) { \ 102 - dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ 103 - IOMMU_PMU_CFG_SIZE + \ 104 - (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \ 105 - iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\ 102 + writel(iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN, \ 103 + iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ 104 + IOMMU_PMU_CFG_SIZE + \ 105 + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET); \ 106 106 } \ 107 107 } 108 108 109 109 #define iommu_pmu_clear_filter(_filter, _idx) \ 110 110 { \ 111 111 if (iommu_pmu->filter & _filter) { \ 112 - dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ 113 - IOMMU_PMU_CFG_SIZE + \ 114 - (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET, \ 115 - 0); \ 112 + writel(0, \ 113 + iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET + \ 114 + IOMMU_PMU_CFG_SIZE + \ 115 + (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET); \ 116 116 } \ 117 117 } 118 118 ··· 307 307 308 308 again: 309 309 prev_count = local64_read(&hwc->prev_count); 310 - new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx)); 310 + new_count = readq(iommu_event_base(iommu_pmu, hwc->idx)); 311 311 if (local64_xchg(&hwc->prev_count, new_count) != prev_count) 312 312 goto again; 313 313 ··· 340 340 hwc->state = 0; 341 341 342 342 /* Always reprogram the period */ 343 - count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx)); 343 + count = readq(iommu_event_base(iommu_pmu, hwc->idx)); 344 344 local64_set((&hwc->prev_count), count); 345 345 346 346 /* ··· 411 411 hwc->idx = idx; 412 412 413 413 /* config events */ 414 - dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config); 414 + writeq(hwc->config, iommu_config_base(iommu_pmu, idx)); 415 415 416 416 iommu_pmu_set_filter(requester_id, event->attr.config1, 417 417 IOMMU_PMU_FILTER_REQUESTER_ID, idx, ··· 496 496 * Two counters may be overflowed very close. Always check 497 497 * whether there are more to handle. 498 498 */ 499 - while ((status = dmar_readq(iommu_pmu->overflow))) { 499 + while ((status = readq(iommu_pmu->overflow))) { 500 500 for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) { 501 501 /* 502 502 * Find the assigned event of the counter. 
··· 510 510 iommu_pmu_event_update(event); 511 511 } 512 512 513 - dmar_writeq(iommu_pmu->overflow, status); 513 + writeq(status, iommu_pmu->overflow); 514 514 } 515 515 } 516 516 ··· 518 518 { 519 519 struct intel_iommu *iommu = dev_id; 520 520 521 - if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG)) 521 + if (!readl(iommu->reg + DMAR_PERFINTRSTS_REG)) 522 522 return IRQ_NONE; 523 523 524 524 iommu_pmu_counter_overflow(iommu->pmu); 525 525 526 526 /* Clear the status bit */ 527 - dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS); 527 + writel(DMA_PERFINTRSTS_PIS, iommu->reg + DMAR_PERFINTRSTS_REG); 528 528 529 529 return IRQ_HANDLED; 530 530 } ··· 555 555 static inline void __iomem * 556 556 get_perf_reg_address(struct intel_iommu *iommu, u32 offset) 557 557 { 558 - u32 off = dmar_readl(iommu->reg + offset); 558 + u32 off = readl(iommu->reg + offset); 559 559 560 560 return iommu->reg + off; 561 561 } ··· 574 574 if (!cap_ecmds(iommu->cap)) 575 575 return -ENODEV; 576 576 577 - perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG); 577 + perfcap = readq(iommu->reg + DMAR_PERFCAP_REG); 578 578 /* The performance monitoring is not supported. */ 579 579 if (!perfcap) 580 580 return -ENODEV; ··· 617 617 for (i = 0; i < iommu_pmu->num_eg; i++) { 618 618 u64 pcap; 619 619 620 - pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG + 621 - i * IOMMU_PMU_CAP_REGS_STEP); 620 + pcap = readq(iommu->reg + DMAR_PERFEVNTCAP_REG + 621 + i * IOMMU_PMU_CAP_REGS_STEP); 622 622 iommu_pmu->evcap[i] = pecap_es(pcap); 623 623 } 624 624 ··· 651 651 * Width. 652 652 */ 653 653 for (i = 0; i < iommu_pmu->num_cntr; i++) { 654 - cap = dmar_readl(iommu_pmu->cfg_reg + 655 - i * IOMMU_PMU_CFG_OFFSET + 656 - IOMMU_PMU_CFG_CNTRCAP_OFFSET); 654 + cap = readl(iommu_pmu->cfg_reg + 655 + i * IOMMU_PMU_CFG_OFFSET + 656 + IOMMU_PMU_CFG_CNTRCAP_OFFSET); 657 657 if (!iommu_cntrcap_pcc(cap)) 658 658 continue; 659 659 ··· 675 675 676 676 /* Override with per-counter event capabilities */ 677 677 for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) { 678 - cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET + 679 - IOMMU_PMU_CFG_CNTREVCAP_OFFSET + 680 - (j * IOMMU_PMU_OFF_REGS_STEP)); 678 + cap = readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET + 679 + IOMMU_PMU_CFG_CNTREVCAP_OFFSET + 680 + (j * IOMMU_PMU_OFF_REGS_STEP)); 681 681 iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap); 682 682 /* 683 683 * Some events may only be supported by a specific counter.
+14 -14
drivers/iommu/intel/prq.c
··· 81 81 */ 82 82 prq_retry: 83 83 reinit_completion(&iommu->prq_complete); 84 - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; 85 - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; 84 + tail = readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; 85 + head = readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; 86 86 while (head != tail) { 87 87 struct page_req_dsc *req; 88 88 ··· 113 113 qi_desc_dev_iotlb(sid, info->pfsid, info->ats_qdep, 0, 114 114 MAX_AGAW_PFN_WIDTH, &desc[2]); 115 115 } else { 116 - qi_desc_piotlb(did, pasid, 0, -1, 0, &desc[1]); 116 + qi_desc_piotlb_all(did, pasid, &desc[1]); 117 117 qi_desc_dev_iotlb_pasid(sid, info->pfsid, pasid, info->ats_qdep, 118 118 0, MAX_AGAW_PFN_WIDTH, &desc[2]); 119 119 } ··· 208 208 */ 209 209 writel(DMA_PRS_PPR, iommu->reg + DMAR_PRS_REG); 210 210 211 - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; 212 - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; 211 + tail = readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; 212 + head = readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; 213 213 handled = (head != tail); 214 214 while (head != tail) { 215 215 req = &iommu->prq[head / sizeof(*req)]; ··· 259 259 head = (head + sizeof(*req)) & PRQ_RING_MASK; 260 260 } 261 261 262 - dmar_writeq(iommu->reg + DMAR_PQH_REG, tail); 262 + writeq(tail, iommu->reg + DMAR_PQH_REG); 263 263 264 264 /* 265 265 * Clear the page request overflow bit and wake up all threads that ··· 268 268 if (readl(iommu->reg + DMAR_PRS_REG) & DMA_PRS_PRO) { 269 269 pr_info_ratelimited("IOMMU: %s: PRQ overflow detected\n", 270 270 iommu->name); 271 - head = dmar_readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; 272 - tail = dmar_readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; 271 + head = readq(iommu->reg + DMAR_PQH_REG) & PRQ_RING_MASK; 272 + tail = readq(iommu->reg + DMAR_PQT_REG) & PRQ_RING_MASK; 273 273 if (head == tail) { 274 274 iopf_queue_discard_partial(iommu->iopf_queue); 275 275 writel(DMA_PRS_PRO, iommu->reg + DMAR_PRS_REG); ··· 325 325 iommu->name); 326 326 goto free_iopfq; 327 327 } 328 - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); 329 - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); 330 - dmar_writeq(iommu->reg + DMAR_PQA_REG, virt_to_phys(iommu->prq) | PRQ_ORDER); 328 + writeq(0ULL, iommu->reg + DMAR_PQH_REG); 329 + writeq(0ULL, iommu->reg + DMAR_PQT_REG); 330 + writeq(virt_to_phys(iommu->prq) | PRQ_ORDER, iommu->reg + DMAR_PQA_REG); 331 331 332 332 init_completion(&iommu->prq_complete); 333 333 ··· 348 348 349 349 int intel_iommu_finish_prq(struct intel_iommu *iommu) 350 350 { 351 - dmar_writeq(iommu->reg + DMAR_PQH_REG, 0ULL); 352 - dmar_writeq(iommu->reg + DMAR_PQT_REG, 0ULL); 353 - dmar_writeq(iommu->reg + DMAR_PQA_REG, 0ULL); 351 + writeq(0ULL, iommu->reg + DMAR_PQH_REG); 352 + writeq(0ULL, iommu->reg + DMAR_PQT_REG); 353 + writeq(0ULL, iommu->reg + DMAR_PQA_REG); 354 354 355 355 if (iommu->pr_irq) { 356 356 free_irq(iommu->pr_irq, iommu);
+8 -10
drivers/iommu/intel/trace.h
··· 132 132 133 133 DECLARE_EVENT_CLASS(cache_tag_flush, 134 134 TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, 135 - unsigned long addr, unsigned long pages, unsigned long mask), 136 - TP_ARGS(tag, start, end, addr, pages, mask), 135 + unsigned long addr, unsigned long mask), 136 + TP_ARGS(tag, start, end, addr, mask), 137 137 TP_STRUCT__entry( 138 138 __string(iommu, tag->iommu->name) 139 139 __string(dev, dev_name(tag->dev)) ··· 143 143 __field(unsigned long, start) 144 144 __field(unsigned long, end) 145 145 __field(unsigned long, addr) 146 - __field(unsigned long, pages) 147 146 __field(unsigned long, mask) 148 147 ), 149 148 TP_fast_assign( ··· 154 155 __entry->start = start; 155 156 __entry->end = end; 156 157 __entry->addr = addr; 157 - __entry->pages = pages; 158 158 __entry->mask = mask; 159 159 ), 160 - TP_printk("%s %s[%d] type %s did %d [0x%lx-0x%lx] addr 0x%lx pages 0x%lx mask 0x%lx", 160 + TP_printk("%s %s[%d] type %s did %d [0x%lx-0x%lx] addr 0x%lx mask 0x%lx", 161 161 __get_str(iommu), __get_str(dev), __entry->pasid, 162 162 __print_symbolic(__entry->type, 163 163 { CACHE_TAG_IOTLB, "iotlb" }, ··· 164 166 { CACHE_TAG_NESTING_IOTLB, "nesting_iotlb" }, 165 167 { CACHE_TAG_NESTING_DEVTLB, "nesting_devtlb" }), 166 168 __entry->domain_id, __entry->start, __entry->end, 167 - __entry->addr, __entry->pages, __entry->mask 169 + __entry->addr, __entry->mask 168 170 ) 169 171 ); 170 172 171 173 DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range, 172 174 TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, 173 - unsigned long addr, unsigned long pages, unsigned long mask), 174 - TP_ARGS(tag, start, end, addr, pages, mask) 175 + unsigned long addr, unsigned long mask), 176 + TP_ARGS(tag, start, end, addr, mask) 175 177 ); 176 178 177 179 DEFINE_EVENT(cache_tag_flush, cache_tag_flush_range_np, 178 180 TP_PROTO(struct cache_tag *tag, unsigned long start, unsigned long end, 179 - unsigned long addr, unsigned long pages, unsigned long mask), 180 - TP_ARGS(tag, start, end, addr, pages, mask) 181 + unsigned long addr, unsigned long mask), 182 + TP_ARGS(tag, start, end, addr, mask) 181 183 ); 182 184 #endif /* _TRACE_INTEL_IOMMU_H */ 183 185
+52 -14
drivers/iommu/iommu.c
··· 34 34 #include <linux/sched/mm.h> 35 35 #include <linux/msi.h> 36 36 #include <uapi/linux/iommufd.h> 37 + #include <linux/generic_pt/iommu.h> 37 38 38 39 #include "dma-iommu.h" 39 40 #include "iommu-priv.h" ··· 2573 2572 return pgsize; 2574 2573 } 2575 2574 2576 - int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, 2577 - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) 2575 + static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, 2576 + unsigned long iova, phys_addr_t paddr, 2577 + size_t size, int prot, gfp_t gfp) 2578 2578 { 2579 2579 const struct iommu_domain_ops *ops = domain->ops; 2580 2580 unsigned long orig_iova = iova; 2581 2581 unsigned int min_pagesz; 2582 2582 size_t orig_size = size; 2583 - phys_addr_t orig_paddr = paddr; 2584 2583 int ret = 0; 2585 2584 2586 2585 might_sleep_if(gfpflags_allow_blocking(gfp)); ··· 2637 2636 /* unroll mapping in case something went wrong */ 2638 2637 if (ret) { 2639 2638 iommu_unmap(domain, orig_iova, orig_size - size); 2640 - } else { 2641 - trace_map(orig_iova, orig_paddr, orig_size); 2642 - iommu_debug_map(domain, orig_paddr, orig_size); 2639 + return ret; 2643 2640 } 2644 - 2645 - return ret; 2641 + return 0; 2646 2642 } 2647 2643 2648 2644 int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) ··· 2649 2651 if (!ops->iotlb_sync_map) 2650 2652 return 0; 2651 2653 return ops->iotlb_sync_map(domain, iova, size); 2654 + } 2655 + 2656 + int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, 2657 + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) 2658 + { 2659 + struct pt_iommu *pt = iommupt_from_domain(domain); 2660 + int ret; 2661 + 2662 + if (pt) { 2663 + size_t mapped = 0; 2664 + 2665 + ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, 2666 + &mapped); 2667 + if (ret) { 2668 + iommu_unmap(domain, iova, mapped); 2669 + return ret; 2670 + } 2671 + return 0; 2672 + } 2673 + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); 2674 + if (ret) 2675 + return ret; 2676 + 2677 + trace_map(iova, paddr, size); 2678 + iommu_debug_map(domain, paddr, size); 2679 + return 0; 2652 2680 } 2653 2681 2654 2682 int iommu_map(struct iommu_domain *domain, unsigned long iova, ··· 2694 2670 } 2695 2671 EXPORT_SYMBOL_GPL(iommu_map); 2696 2672 2697 - static size_t __iommu_unmap(struct iommu_domain *domain, 2698 - unsigned long iova, size_t size, 2699 - struct iommu_iotlb_gather *iotlb_gather) 2673 + static size_t 2674 + __iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, 2675 + size_t size, struct iommu_iotlb_gather *iotlb_gather) 2700 2676 { 2701 2677 const struct iommu_domain_ops *ops = domain->ops; 2702 2678 size_t unmapped_page, unmapped = 0; 2703 2679 unsigned long orig_iova = iova; 2704 2680 unsigned int min_pagesz; 2705 2681 2706 2682 if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) ··· 2751 2728 unmapped += unmapped_page; 2752 2729 } 2753 2730 2754 - trace_unmap(orig_iova, size, unmapped); 2755 - iommu_debug_unmap_end(domain, orig_iova, size, unmapped); 2731 + return unmapped; 2732 + } 2733 + 2734 + static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, 2735 + size_t size, 2736 + struct iommu_iotlb_gather *iotlb_gather) 2737 + { 2738 + struct pt_iommu *pt = iommupt_from_domain(domain); 2739 + size_t unmapped; 2740 + 2741 + if (pt) 2742 + unmapped = pt->ops->unmap_range(pt, iova, size, iotlb_gather); 2743 + else 2744 + unmapped = __iommu_unmap_domain_pgtbl(domain, iova, size, 2745 + iotlb_gather); 2746 + 
trace_unmap(iova, size, unmapped); 2747 + iommu_debug_unmap_end(domain, iova, size, unmapped); 2756 2748 return unmapped; 2757 2749 } 2758 2750
+4
drivers/iommu/iommufd/device.c
··· 1624 1624 if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) 1625 1625 cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; 1626 1626 1627 + /* Report when ATS cannot be used for this device */ 1628 + if (!device_iommu_capable(idev->dev, IOMMU_CAP_PCI_ATS_SUPPORTED)) 1629 + cmd->out_capabilities |= IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED; 1630 + 1627 1631 cmd->out_max_pasid_log2 = 0; 1628 1632 /* 1629 1633 * Currently, all iommu drivers enable PASID in the probe_device()
+2 -1
drivers/iommu/iova.c
··· 611 611 612 612 static void iova_magazine_free(struct iova_magazine *mag) 613 613 { 614 - kmem_cache_free(iova_magazine_cache, mag); 614 + if (mag) 615 + kmem_cache_free(iova_magazine_cache, mag); 615 616 } 616 617 617 618 static void
+6 -2
drivers/iommu/riscv/Kconfig
··· 3 3 4 4 config RISCV_IOMMU 5 5 bool "RISC-V IOMMU Support" 6 - depends on RISCV && 64BIT 7 - default y 6 + default RISCV 7 + depends on GENERIC_MSI_IRQ 8 + depends on (RISCV || COMPILE_TEST) && 64BIT 8 9 select IOMMU_API 10 + select GENERIC_PT 11 + select IOMMU_PT 12 + select IOMMU_PT_RISCV64 9 13 help 10 14 Support for implementations of the RISC-V IOMMU architecture that 11 15 complements the RISC-V MMU capabilities, providing similar address
+3 -1
drivers/iommu/riscv/iommu-bits.h
··· 17 17 #include <linux/types.h> 18 18 #include <linux/bitfield.h> 19 19 #include <linux/bits.h> 20 + #include <asm/page.h> 20 21 21 22 /* 22 23 * Chapter 5: Memory Mapped register interface ··· 719 718 static inline void riscv_iommu_cmd_inval_set_addr(struct riscv_iommu_command *cmd, 720 719 u64 addr) 721 720 { 722 - cmd->dword1 = FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, phys_to_pfn(addr)); 721 + cmd->dword1 = 722 + FIELD_PREP(RISCV_IOMMU_CMD_IOTINVAL_ADDR, PHYS_PFN(addr)); 723 723 cmd->dword0 |= RISCV_IOMMU_CMD_IOTINVAL_AV; 724 724 } 725 725
+11 -6
drivers/iommu/riscv/iommu-platform.c
··· 68 68 iommu->caps = riscv_iommu_readq(iommu, RISCV_IOMMU_REG_CAPABILITIES); 69 69 iommu->fctl = riscv_iommu_readl(iommu, RISCV_IOMMU_REG_FCTL); 70 70 71 - iommu->irqs_count = platform_irq_count(pdev); 72 - if (iommu->irqs_count <= 0) 73 - return dev_err_probe(dev, -ENODEV, 74 - "no IRQ resources provided\n"); 75 - if (iommu->irqs_count > RISCV_IOMMU_INTR_COUNT) 76 - iommu->irqs_count = RISCV_IOMMU_INTR_COUNT; 71 + iommu->irqs_count = RISCV_IOMMU_INTR_COUNT; 77 72 78 73 igs = FIELD_GET(RISCV_IOMMU_CAPABILITIES_IGS, iommu->caps); 79 74 switch (igs) { ··· 115 120 fallthrough; 116 121 117 122 case RISCV_IOMMU_CAPABILITIES_IGS_WSI: 123 + ret = platform_irq_count(pdev); 124 + if (ret <= 0) 125 + return dev_err_probe(dev, -ENODEV, 126 + "no IRQ resources provided\n"); 127 + 128 + iommu->irqs_count = ret; 129 + 130 + if (iommu->irqs_count > RISCV_IOMMU_INTR_COUNT) 131 + iommu->irqs_count = RISCV_IOMMU_INTR_COUNT; 132 + 118 133 for (vec = 0; vec < iommu->irqs_count; vec++) 119 134 iommu->irqs[vec] = platform_get_irq(pdev, vec); 120 135
+123 -263
drivers/iommu/riscv/iommu.c
··· 21 21 #include <linux/iopoll.h> 22 22 #include <linux/kernel.h> 23 23 #include <linux/pci.h> 24 + #include <linux/generic_pt/iommu.h> 24 25 25 26 #include "../iommu-pages.h" 26 27 #include "iommu-bits.h" ··· 160 159 if (FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)) { 161 160 const size_t queue_size = entry_size << (logsz + 1); 162 161 163 - queue->phys = pfn_to_phys(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); 162 + queue->phys = PFN_PHYS(FIELD_GET(RISCV_IOMMU_PPN_FIELD, qb)); 164 163 queue->base = devm_ioremap(iommu->dev, queue->phys, queue_size); 165 164 } else { 166 165 do { ··· 369 368 unsigned int timeout_us) 370 369 { 371 370 unsigned int cons = atomic_read(&queue->head); 371 + unsigned int flags = RISCV_IOMMU_CQCSR_CQMF | RISCV_IOMMU_CQCSR_CMD_TO | 372 + RISCV_IOMMU_CQCSR_CMD_ILL; 372 373 373 374 /* Already processed by the consumer */ 374 375 if ((int)(cons - index) > 0) ··· 378 375 379 376 /* Monitor consumer index */ 380 377 return readx_poll_timeout(riscv_iommu_queue_cons, queue, cons, 378 + (riscv_iommu_readl(queue->iommu, queue->qcr) & flags) || 381 379 (int)(cons - index) > 0, 0, timeout_us); 382 380 } 383 381 ··· 439 435 * 6. Make sure the doorbell write to the device has finished before updating 440 436 * the shadow tail index in normal memory. 'fence o, w' 441 437 */ 438 + #ifdef CONFIG_MMIOWB 442 439 mmiowb(); 440 + #endif 443 441 atomic_inc(&queue->tail); 444 442 445 443 /* 7. Complete submission and restore local interrupts */ ··· 812 806 813 807 /* This struct contains protection domain specific IOMMU driver data. */ 814 808 struct riscv_iommu_domain { 815 - struct iommu_domain domain; 809 + union { 810 + struct iommu_domain domain; 811 + struct pt_iommu_riscv_64 riscvpt; 812 + }; 816 813 struct list_head bonds; 817 814 spinlock_t lock; /* protect bonds list updates. 
*/ 818 815 int pscid; 819 - bool amo_enabled; 820 - int numa_node; 821 - unsigned int pgd_mode; 822 - unsigned long *pgd_root; 823 816 }; 817 + PT_IOMMU_CHECK_DOMAIN(struct riscv_iommu_domain, riscvpt.iommu, domain); 824 818 825 819 #define iommu_domain_to_riscv(iommu_domain) \ 826 820 container_of(iommu_domain, struct riscv_iommu_domain, domain) ··· 934 928 struct riscv_iommu_bond *bond; 935 929 struct riscv_iommu_device *iommu, *prev; 936 930 struct riscv_iommu_command cmd; 937 - unsigned long len = end - start + 1; 938 - unsigned long iova; 939 931 940 932 /* 941 933 * For each IOMMU linked with this protection domain (via bonds->dev), ··· 976 972 977 973 riscv_iommu_cmd_inval_vma(&cmd); 978 974 riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid); 979 - if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) { 980 - for (iova = start; iova < end; iova += PAGE_SIZE) { 975 + if (end - start < RISCV_IOMMU_IOTLB_INVAL_LIMIT - 1) { 976 + unsigned long iova = start; 977 + 978 + do { 981 979 riscv_iommu_cmd_inval_set_addr(&cmd, iova); 982 980 riscv_iommu_cmd_send(iommu, &cmd); 983 - } 981 + } while (!check_add_overflow(iova, PAGE_SIZE, &iova) && 982 + iova < end); 984 983 } else { 985 984 riscv_iommu_cmd_send(iommu, &cmd); 986 985 } ··· 1003 996 } 1004 997 1005 998 #define RISCV_IOMMU_FSC_BARE 0 999 + /* 1000 + * This function sends IOTINVAL commands as required by the RISC-V 1001 + * IOMMU specification (Section 6.3.1 and 6.3.2 in 1.0 spec version) 1002 + * after modifying DDT or PDT entries 1003 + */ 1004 + static void riscv_iommu_iodir_iotinval(struct riscv_iommu_device *iommu, 1005 + bool inval_pdt, unsigned long iohgatp, 1006 + struct riscv_iommu_dc *dc, 1007 + struct riscv_iommu_pc *pc) 1008 + { 1009 + struct riscv_iommu_command cmd; 1006 1010 1011 + riscv_iommu_cmd_inval_vma(&cmd); 1012 + 1013 + if (FIELD_GET(RISCV_IOMMU_DC_IOHGATP_MODE, iohgatp) == 1014 + RISCV_IOMMU_DC_IOHGATP_MODE_BARE) { 1015 + if (inval_pdt) { 1016 + /* 1017 + * IOTINVAL.VMA with GV=AV=0, and PSCV=1, and 1018 + * PSCID=PC.PSCID 1019 + */ 1020 + riscv_iommu_cmd_inval_set_pscid(&cmd, 1021 + FIELD_GET(RISCV_IOMMU_PC_TA_PSCID, pc->ta)); 1022 + } else { 1023 + if (!FIELD_GET(RISCV_IOMMU_DC_TC_PDTV, dc->tc) && 1024 + FIELD_GET(RISCV_IOMMU_DC_FSC_MODE, dc->fsc) != 1025 + RISCV_IOMMU_DC_FSC_MODE_BARE) { 1026 + /* 1027 + * DC.tc.PDTV == 0 && DC.fsc.MODE != Bare 1028 + * IOTINVAL.VMA with GV=AV=0, and PSCV=1, and 1029 + * PSCID=DC.ta.PSCID 1030 + */ 1031 + riscv_iommu_cmd_inval_set_pscid(&cmd, 1032 + FIELD_GET(RISCV_IOMMU_DC_TA_PSCID, dc->ta)); 1033 + } 1034 + /* else: IOTINVAL.VMA with GV=AV=PSCV=0 */ 1035 + } 1036 + } else { 1037 + riscv_iommu_cmd_inval_set_gscid(&cmd, 1038 + FIELD_GET(RISCV_IOMMU_DC_IOHGATP_GSCID, iohgatp)); 1039 + 1040 + if (inval_pdt) { 1041 + /* 1042 + * IOTINVAL.VMA with GV=1, AV=0, and PSCV=1, and 1043 + * GSCID=DC.iohgatp.GSCID, PSCID=PC.PSCID 1044 + */ 1045 + riscv_iommu_cmd_inval_set_pscid(&cmd, 1046 + FIELD_GET(RISCV_IOMMU_PC_TA_PSCID, pc->ta)); 1047 + } 1048 + /* 1049 + * else: IOTINVAL.VMA with GV=1,AV=PSCV=0,and 1050 + * GSCID=DC.iohgatp.GSCID 1051 + * 1052 + * IOTINVAL.GVMA with GV=1,AV=0,and 1053 + * GSCID=DC.iohgatp.GSCID 1054 + * TODO: For now, the Second-Stage feature have not yet been merged, 1055 + * also issue IOTINVAL.GVMA once second-stage support is merged. 1056 + */ 1057 + } 1058 + riscv_iommu_cmd_send(iommu, &cmd); 1059 + } 1007 1060 /* 1008 1061 * Update IODIR for the device. 
1009 1062 * ··· 1098 1031 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 1099 1032 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); 1100 1033 riscv_iommu_cmd_send(iommu, &cmd); 1034 + /* 1035 + * For now, the SVA and PASID features have not yet been merged, the 1036 + * default configuration is inval_pdt=false and pc=NULL. 1037 + */ 1038 + riscv_iommu_iodir_iotinval(iommu, false, dc->iohgatp, dc, NULL); 1101 1039 sync_required = true; 1102 1040 } 1103 1041 ··· 1128 1056 riscv_iommu_cmd_iodir_inval_ddt(&cmd); 1129 1057 riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]); 1130 1058 riscv_iommu_cmd_send(iommu, &cmd); 1059 + /* 1060 + * For now, the SVA and PASID features have not yet been merged, the 1061 + * default configuration is inval_pdt=false and pc=NULL. 1062 + */ 1063 + riscv_iommu_iodir_iotinval(iommu, false, dc->iohgatp, dc, NULL); 1131 1064 } 1132 1065 1133 1066 riscv_iommu_cmd_sync(iommu, RISCV_IOMMU_IOTINVAL_TIMEOUT); ··· 1154 1077 { 1155 1078 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1156 1079 1157 - riscv_iommu_iotlb_inval(domain, gather->start, gather->end); 1158 - } 1159 - 1160 - #define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t))) 1161 - 1162 - #define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)) 1163 - #define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF) 1164 - #define _io_pte_none(pte) ((pte) == 0) 1165 - #define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)) 1166 - 1167 - static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain, 1168 - unsigned long pte, 1169 - struct iommu_pages_list *freelist) 1170 - { 1171 - unsigned long *ptr; 1172 - int i; 1173 - 1174 - if (!_io_pte_present(pte) || _io_pte_leaf(pte)) 1175 - return; 1176 - 1177 - ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1178 - 1179 - /* Recursively free all sub page table pages */ 1180 - for (i = 0; i < PTRS_PER_PTE; i++) { 1181 - pte = READ_ONCE(ptr[i]); 1182 - if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte) 1183 - riscv_iommu_pte_free(domain, pte, freelist); 1184 - } 1185 - 1186 - if (freelist) 1187 - iommu_pages_list_add(freelist, ptr); 1188 - else 1189 - iommu_free_pages(ptr); 1190 - } 1191 - 1192 - static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain, 1193 - unsigned long iova, size_t pgsize, 1194 - gfp_t gfp) 1195 - { 1196 - unsigned long *ptr = domain->pgd_root; 1197 - unsigned long pte, old; 1198 - int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; 1199 - void *addr; 1200 - 1201 - do { 1202 - const int shift = PAGE_SHIFT + PT_SHIFT * level; 1203 - 1204 - ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); 1205 - /* 1206 - * Note: returned entry might be a non-leaf if there was 1207 - * existing mapping with smaller granularity. Up to the caller 1208 - * to replace and invalidate. 1209 - */ 1210 - if (((size_t)1 << shift) == pgsize) 1211 - return ptr; 1212 - pte_retry: 1213 - pte = READ_ONCE(*ptr); 1214 - /* 1215 - * This is very likely incorrect as we should not be adding 1216 - * new mapping with smaller granularity on top 1217 - * of existing 2M/1G mapping. Fail. 1218 - */ 1219 - if (_io_pte_present(pte) && _io_pte_leaf(pte)) 1220 - return NULL; 1221 - /* 1222 - * Non-leaf entry is missing, allocate and try to add to the 1223 - * page table. This might race with other mappings, retry. 
1224 - */ 1225 - if (_io_pte_none(pte)) { 1226 - addr = iommu_alloc_pages_node_sz(domain->numa_node, gfp, 1227 - SZ_4K); 1228 - if (!addr) 1229 - return NULL; 1230 - old = pte; 1231 - pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE); 1232 - if (cmpxchg_relaxed(ptr, old, pte) != old) { 1233 - iommu_free_pages(addr); 1234 - goto pte_retry; 1235 - } 1236 - } 1237 - ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1238 - } while (level-- > 0); 1239 - 1240 - return NULL; 1241 - } 1242 - 1243 - static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain, 1244 - unsigned long iova, size_t *pte_pgsize) 1245 - { 1246 - unsigned long *ptr = domain->pgd_root; 1247 - unsigned long pte; 1248 - int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2; 1249 - 1250 - do { 1251 - const int shift = PAGE_SHIFT + PT_SHIFT * level; 1252 - 1253 - ptr += ((iova >> shift) & (PTRS_PER_PTE - 1)); 1254 - pte = READ_ONCE(*ptr); 1255 - if (_io_pte_present(pte) && _io_pte_leaf(pte)) { 1256 - *pte_pgsize = (size_t)1 << shift; 1257 - return ptr; 1258 - } 1259 - if (_io_pte_none(pte)) 1260 - return NULL; 1261 - ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte)); 1262 - } while (level-- > 0); 1263 - 1264 - return NULL; 1265 - } 1266 - 1267 - static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain, 1268 - unsigned long iova, phys_addr_t phys, 1269 - size_t pgsize, size_t pgcount, int prot, 1270 - gfp_t gfp, size_t *mapped) 1271 - { 1272 - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1273 - size_t size = 0; 1274 - unsigned long *ptr; 1275 - unsigned long pte, old, pte_prot; 1276 - int rc = 0; 1277 - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); 1278 - 1279 - if (!(prot & IOMMU_WRITE)) 1280 - pte_prot = _PAGE_BASE | _PAGE_READ; 1281 - else if (domain->amo_enabled) 1282 - pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE; 1283 - else 1284 - pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY; 1285 - 1286 - while (pgcount) { 1287 - ptr = riscv_iommu_pte_alloc(domain, iova, pgsize, gfp); 1288 - if (!ptr) { 1289 - rc = -ENOMEM; 1290 - break; 1291 - } 1292 - 1293 - old = READ_ONCE(*ptr); 1294 - pte = _io_pte_entry(phys_to_pfn(phys), pte_prot); 1295 - if (cmpxchg_relaxed(ptr, old, pte) != old) 1296 - continue; 1297 - 1298 - riscv_iommu_pte_free(domain, old, &freelist); 1299 - 1300 - size += pgsize; 1301 - iova += pgsize; 1302 - phys += pgsize; 1303 - --pgcount; 1304 - } 1305 - 1306 - *mapped = size; 1307 - 1308 - if (!iommu_pages_list_empty(&freelist)) { 1080 + if (iommu_pages_list_empty(&gather->freelist)) { 1081 + riscv_iommu_iotlb_inval(domain, gather->start, gather->end); 1082 + } else { 1309 1083 /* 1310 1084 * In 1.0 spec version, the smallest scope we can use to 1311 1085 * invalidate all levels of page table (i.e. leaf and non-leaf) ··· 1165 1237 * capability.NL (non-leaf) IOTINVAL command. 
1166 1238 */ 1167 1239 riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX); 1168 - iommu_put_pages_list(&freelist); 1240 + iommu_put_pages_list(&gather->freelist); 1169 1241 } 1170 - 1171 - return rc; 1172 - } 1173 - 1174 - static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain, 1175 - unsigned long iova, size_t pgsize, 1176 - size_t pgcount, 1177 - struct iommu_iotlb_gather *gather) 1178 - { 1179 - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1180 - size_t size = pgcount << __ffs(pgsize); 1181 - unsigned long *ptr, old; 1182 - size_t unmapped = 0; 1183 - size_t pte_size; 1184 - 1185 - while (unmapped < size) { 1186 - ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); 1187 - if (!ptr) 1188 - return unmapped; 1189 - 1190 - /* partial unmap is not allowed, fail. */ 1191 - if (iova & (pte_size - 1)) 1192 - return unmapped; 1193 - 1194 - old = READ_ONCE(*ptr); 1195 - if (cmpxchg_relaxed(ptr, old, 0) != old) 1196 - continue; 1197 - 1198 - iommu_iotlb_gather_add_page(&domain->domain, gather, iova, 1199 - pte_size); 1200 - 1201 - iova += pte_size; 1202 - unmapped += pte_size; 1203 - } 1204 - 1205 - return unmapped; 1206 - } 1207 - 1208 - static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, 1209 - dma_addr_t iova) 1210 - { 1211 - struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1212 - size_t pte_size; 1213 - unsigned long *ptr; 1214 - 1215 - ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size); 1216 - if (!ptr) 1217 - return 0; 1218 - 1219 - return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1)); 1220 1242 } 1221 1243 1222 1244 static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain) 1223 1245 { 1224 1246 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1225 - const unsigned long pfn = virt_to_pfn(domain->pgd_root); 1226 1247 1227 1248 WARN_ON(!list_empty(&domain->bonds)); 1228 1249 1229 1250 if ((int)domain->pscid > 0) 1230 1251 ida_free(&riscv_iommu_pscids, domain->pscid); 1231 1252 1232 - riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL); 1253 + pt_iommu_deinit(&domain->riscvpt.iommu); 1233 1254 kfree(domain); 1234 1255 } 1235 1256 ··· 1204 1327 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1205 1328 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 1206 1329 struct riscv_iommu_info *info = dev_iommu_priv_get(dev); 1330 + struct pt_iommu_riscv_64_hw_info pt_info; 1207 1331 u64 fsc, ta; 1208 1332 1209 - if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) 1333 + pt_iommu_riscv_64_hw_info(&domain->riscvpt, &pt_info); 1334 + 1335 + if (!riscv_iommu_pt_supported(iommu, pt_info.fsc_iosatp_mode)) 1210 1336 return -ENODEV; 1211 1337 1212 - fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) | 1213 - FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root)); 1338 + fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, pt_info.fsc_iosatp_mode) | 1339 + FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, pt_info.ppn); 1214 1340 ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | 1215 1341 RISCV_IOMMU_PC_TA_V; 1216 1342 ··· 1228 1348 } 1229 1349 1230 1350 static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = { 1351 + IOMMU_PT_DOMAIN_OPS(riscv_64), 1231 1352 .attach_dev = riscv_iommu_attach_paging_domain, 1232 1353 .free = riscv_iommu_free_paging_domain, 1233 - .map_pages = riscv_iommu_map_pages, 1234 - .unmap_pages = riscv_iommu_unmap_pages, 1235 - .iova_to_phys = riscv_iommu_iova_to_phys, 1236 
1354 .iotlb_sync = riscv_iommu_iotlb_sync, 1237 1355 .flush_iotlb_all = riscv_iommu_iotlb_flush_all, 1238 1356 }; 1239 1357 1240 1358 static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev) 1241 1359 { 1360 + struct pt_iommu_riscv_64_cfg cfg = {}; 1242 1361 struct riscv_iommu_domain *domain; 1243 1362 struct riscv_iommu_device *iommu; 1244 - unsigned int pgd_mode; 1245 - dma_addr_t va_mask; 1246 - int va_bits; 1363 + int ret; 1247 1364 1248 1365 iommu = dev_to_iommu(dev); 1249 1366 if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) { 1250 - pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57; 1251 - va_bits = 57; 1367 + cfg.common.hw_max_vasz_lg2 = 57; 1252 1368 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV48) { 1253 - pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48; 1254 - va_bits = 48; 1369 + cfg.common.hw_max_vasz_lg2 = 48; 1255 1370 } else if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV39) { 1256 - pgd_mode = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39; 1257 - va_bits = 39; 1371 + cfg.common.hw_max_vasz_lg2 = 39; 1258 1372 } else { 1259 1373 dev_err(dev, "cannot find supported page table mode\n"); 1260 1374 return ERR_PTR(-ENODEV); 1261 1375 } 1376 + cfg.common.hw_max_oasz_lg2 = 56; 1262 1377 1263 1378 domain = kzalloc_obj(*domain); 1264 1379 if (!domain) ··· 1261 1386 1262 1387 INIT_LIST_HEAD_RCU(&domain->bonds); 1263 1388 spin_lock_init(&domain->lock); 1264 - domain->numa_node = dev_to_node(iommu->dev); 1265 - domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD); 1266 - domain->pgd_mode = pgd_mode; 1267 - domain->pgd_root = iommu_alloc_pages_node_sz(domain->numa_node, 1268 - GFP_KERNEL_ACCOUNT, SZ_4K); 1269 - if (!domain->pgd_root) { 1270 - kfree(domain); 1271 - return ERR_PTR(-ENOMEM); 1272 - } 1389 + /* 1390 + * 6.4 IOMMU capabilities [..] IOMMU implementations must support the 1391 + * Svnapot standard extension for NAPOT Translation Contiguity. 1392 + */ 1393 + cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) | 1394 + BIT(PT_FEAT_FLUSH_RANGE) | 1395 + BIT(PT_FEAT_RISCV_SVNAPOT_64K); 1396 + domain->riscvpt.iommu.nid = dev_to_node(iommu->dev); 1397 + domain->domain.ops = &riscv_iommu_paging_domain_ops; 1273 1398 1274 1399 domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, 1275 1400 RISCV_IOMMU_MAX_PSCID, GFP_KERNEL); 1276 1401 if (domain->pscid < 0) { 1277 - iommu_free_pages(domain->pgd_root); 1278 - kfree(domain); 1402 + riscv_iommu_free_paging_domain(&domain->domain); 1279 1403 return ERR_PTR(-ENOMEM); 1280 1404 } 1281 1405 1282 - /* 1283 - * Note: RISC-V Privilege spec mandates that virtual addresses 1284 - * need to be sign-extended, so if (VA_BITS - 1) is set, all 1285 - * bits >= VA_BITS need to also be set or else we'll get a 1286 - * page fault. However the code that creates the mappings 1287 - * above us (e.g. iommu_dma_alloc_iova()) won't do that for us 1288 - * for now, so we'll end up with invalid virtual addresses 1289 - * to map. As a workaround until we get this sorted out 1290 - * limit the available virtual addresses to VA_BITS - 1. 
1291 - */ 1292 - va_mask = DMA_BIT_MASK(va_bits - 1); 1293 - 1294 - domain->domain.geometry.aperture_start = 0; 1295 - domain->domain.geometry.aperture_end = va_mask; 1296 - domain->domain.geometry.force_aperture = true; 1297 - domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G); 1298 - 1299 - domain->domain.ops = &riscv_iommu_paging_domain_ops; 1300 - 1406 + ret = pt_iommu_riscv_64_init(&domain->riscvpt, &cfg, GFP_KERNEL); 1407 + if (ret) { 1408 + riscv_iommu_free_paging_domain(&domain->domain); 1409 + return ERR_PTR(ret); 1410 + } 1301 1411 return &domain->domain; 1302 1412 } 1303 1413 ··· 1372 1512 * the device directory. Do not mark the context valid yet. 1373 1513 */ 1374 1514 tc = 0; 1375 - if (iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD) 1376 - tc |= RISCV_IOMMU_DC_TC_SADE; 1377 1515 for (i = 0; i < fwspec->num_ids; i++) { 1378 1516 dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]); 1379 1517 if (!dc) { ··· 1538 1680 riscv_iommu_queue_disable(&iommu->cmdq); 1539 1681 return rc; 1540 1682 } 1683 + 1684 + MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
+16
include/linux/generic_pt/common.h
··· 175 175 PT_FEAT_VTDSS_FORCE_WRITEABLE, 176 176 }; 177 177 178 + struct pt_riscv_32 { 179 + struct pt_common common; 180 + }; 181 + 182 + struct pt_riscv_64 { 183 + struct pt_common common; 184 + }; 185 + 186 + enum { 187 + /* 188 + * Support the 64k contiguous page size following the Svnapot extension. 189 + */ 190 + PT_FEAT_RISCV_SVNAPOT_64K = PT_FEAT_FMT_START, 191 + 192 + }; 193 + 178 194 struct pt_x86_64 { 179 195 struct pt_common common; 180 196 };
+69 -11
include/linux/generic_pt/iommu.h
··· 66 66 struct device *iommu_device; 67 67 }; 68 68 69 + static inline struct pt_iommu *iommupt_from_domain(struct iommu_domain *domain) 70 + { 71 + if (!IS_ENABLED(CONFIG_IOMMU_PT) || !domain->is_iommupt) 72 + return NULL; 73 + return container_of(domain, struct pt_iommu, domain); 74 + } 75 + 69 76 /** 70 77 * struct pt_iommu_info - Details about the IOMMU page table 71 78 * ··· 87 80 }; 88 81 89 82 struct pt_iommu_ops { 83 + /** 84 + * @map_range: Install translation for an IOVA range 85 + * @iommu_table: Table to manipulate 86 + * @iova: IO virtual address to start 87 + * @paddr: Physical/Output address to start 88 + * @len: Length of the range starting from @iova 89 + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO 90 + * @gfp: GFP flags for any memory allocations 91 + * 92 + * The range starting at IOVA will have paddr installed into it. The 93 + * range is automatically segmented into optimally sized table entries, 94 + * and can have any valid alignment. 95 + * 96 + * On error the caller will probably want to invoke unmap on the range 97 + * from iova up to the amount indicated by @mapped to return the table 98 + * back to an unchanged state. 99 + * 100 + * Context: The caller must hold a write range lock that includes 101 + * the whole range. 102 + * 103 + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA 104 + * that were mapped are added to @mapped, @mapped is not zeroed first. 105 + */ 106 + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 107 + phys_addr_t paddr, dma_addr_t len, unsigned int prot, 108 + gfp_t gfp, size_t *mapped); 109 + 110 + /** 111 + * @unmap_range: Make a range of IOVA empty/not present 112 + * @iommu_table: Table to manipulate 113 + * @iova: IO virtual address to start 114 + * @len: Length of the range starting from @iova 115 + * @iotlb_gather: Gather struct that must be flushed on return 116 + * 117 + * unmap_range() will remove a translation created by map_range(). It 118 + * cannot subdivide a mapping created by map_range(), so it should be 119 + * called with IOVA ranges that match those passed to map_range(). The 120 + * IOVA range can aggregate contiguous map_range() calls so long as no 121 + * individual range is split. 122 + * 123 + * Context: The caller must hold a write range lock that includes 124 + * the whole range. 125 + * 126 + * Returns: Number of bytes of VA unmapped. iova + res will be the 127 + * point unmapping stopped. 
128 + */ 129 + size_t (*unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, 130 + dma_addr_t len, 131 + struct iommu_iotlb_gather *iotlb_gather); 132 + 90 133 /** 91 134 * @set_dirty: Make the iova write dirty 92 135 * @iommu_table: Table to manipulate ··· 251 194 #define IOMMU_PROTOTYPES(fmt) \ 252 195 phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ 253 196 dma_addr_t iova); \ 254 - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ 255 - unsigned long iova, phys_addr_t paddr, \ 256 - size_t pgsize, size_t pgcount, \ 257 - int prot, gfp_t gfp, size_t *mapped); \ 258 - size_t pt_iommu_##fmt##_unmap_pages( \ 259 - struct iommu_domain *domain, unsigned long iova, \ 260 - size_t pgsize, size_t pgcount, \ 261 - struct iommu_iotlb_gather *iotlb_gather); \ 262 197 int pt_iommu_##fmt##_read_and_clear_dirty( \ 263 198 struct iommu_domain *domain, unsigned long iova, size_t size, \ 264 199 unsigned long flags, struct iommu_dirty_bitmap *dirty); \ ··· 271 222 * iommu_pt 272 223 */ 273 224 #define IOMMU_PT_DOMAIN_OPS(fmt) \ 274 - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ 275 - .map_pages = &pt_iommu_##fmt##_map_pages, \ 276 - .unmap_pages = &pt_iommu_##fmt##_unmap_pages 225 + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys 277 226 #define IOMMU_PT_DIRTY_OPS(fmt) \ 278 227 .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty 279 228 ··· 321 274 }; 322 275 323 276 IOMMU_FORMAT(vtdss, vtdss_pt); 277 + 278 + struct pt_iommu_riscv_64_cfg { 279 + struct pt_iommu_cfg common; 280 + }; 281 + 282 + struct pt_iommu_riscv_64_hw_info { 283 + u64 ppn; 284 + u8 fsc_iosatp_mode; 285 + }; 286 + 287 + IOMMU_FORMAT(riscv_64, riscv_64pt); 324 288 325 289 struct pt_iommu_x86_64_cfg { 326 290 struct pt_iommu_cfg common;
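The kernel-doc above spells out the failure contract for the new ops: @mapped accumulates how many bytes were actually installed, and a caller that sees an error is expected to unmap exactly that much to restore the table, which is what the reworked iommu_map_nosync() in drivers/iommu/iommu.c does. A hedged userspace sketch of that caller-side pattern with mocked ops; mock_pt, mock_map and mock_unmap are invented for illustration and stand in for a real struct pt_iommu and its ops:

#include <stddef.h>
#include <stdio.h>

/* Mock of the two ops used below; the real ones live in struct pt_iommu_ops. */
struct mock_pt {
	int (*map_range)(struct mock_pt *pt, unsigned long iova,
			 unsigned long paddr, size_t len, size_t *mapped);
	size_t (*unmap_range)(struct mock_pt *pt, unsigned long iova, size_t len);
};

/* Pretend the table runs out of memory after installing the first 8KiB. */
static int mock_map(struct mock_pt *pt, unsigned long iova,
		    unsigned long paddr, size_t len, size_t *mapped)
{
	size_t done = len > 8192 ? 8192 : len;

	*mapped += done;		/* per the contract: added to, not zeroed */
	return len > 8192 ? -1 : 0;	/* -1 stands in for -ENOMEM */
}

static size_t mock_unmap(struct mock_pt *pt, unsigned long iova, size_t len)
{
	printf("unwinding %zu bytes at 0x%lx\n", len, iova);
	return len;
}

/* Caller-side pattern: on failure, unmap exactly what @mapped says was installed. */
static int map_or_unwind(struct mock_pt *pt, unsigned long iova,
			 unsigned long paddr, size_t len)
{
	size_t mapped = 0;
	int ret = pt->map_range(pt, iova, paddr, len, &mapped);

	if (ret)
		pt->unmap_range(pt, iova, mapped);
	return ret;
}

int main(void)
{
	struct mock_pt pt = { .map_range = mock_map, .unmap_range = mock_unmap };

	map_or_unwind(&pt, 0x100000, 0x40000000, 16384);
	return 0;
}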
+3
include/linux/iommu.h
··· 223 223 struct iommu_domain { 224 224 unsigned type; 225 225 enum iommu_domain_cookie_type cookie_type; 226 + bool is_iommupt; 226 227 const struct iommu_domain_ops *ops; 227 228 const struct iommu_dirty_ops *dirty_ops; 228 229 const struct iommu_ops *owner; /* Whose domain_alloc we came from */ ··· 272 271 */ 273 272 IOMMU_CAP_DEFERRED_FLUSH, 274 273 IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ 274 + /* ATS is supported and may be enabled for this device */ 275 + IOMMU_CAP_PCI_ATS_SUPPORTED, 275 276 }; 276 277 277 278 /* These are the possible reserved region types */
+9
include/uapi/linux/iommufd.h
··· 695 695 * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it 696 696 * when the struct 697 697 * iommu_hw_info::out_max_pasid_log2 is zero. 698 + * @IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED: ATS is not supported or cannot be used 699 + * on this device (absence implies ATS 700 + * may be enabled) 698 701 */ 699 702 enum iommufd_hw_capabilities { 700 703 IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, 701 704 IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, 702 705 IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, 706 + IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED = 1 << 3, 703 707 }; 704 708 705 709 /** ··· 1056 1052 enum iommu_viommu_type { 1057 1053 IOMMU_VIOMMU_TYPE_DEFAULT = 0, 1058 1054 IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, 1055 + /* 1056 + * TEGRA241_CMDQV requirements (otherwise, VCMDQs will not work) 1057 + * - Kernel will allocate a VINTF (HYP_OWN=0) to back this VIOMMU. So, 1058 + * VMM must wire the HYP_OWN bit to 0 in guest VINTF_CONFIG register 1059 + */ 1059 1060 IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2, 1060 1061 }; 1061 1062
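The new capability bit reports the negative case so that older kernels, which never set it, still read as "ATS may be enabled". A small sketch of how userspace that has already fetched struct iommu_hw_info::out_capabilities via IOMMU_GET_HW_INFO might test it; the surrounding ioctl plumbing is omitted and the constants are repeated only to keep the sketch standalone:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Values from the uAPI above, redefined here so the sketch compiles on its own. */
#define IOMMU_HW_CAP_DIRTY_TRACKING		(1ULL << 0)
#define IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED	(1ULL << 3)

/* The bit is deliberately inverted: its absence means ATS may be enabled. */
static bool ats_may_be_enabled(uint64_t out_capabilities)
{
	return !(out_capabilities & IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED);
}

int main(void)
{
	uint64_t caps = IOMMU_HW_CAP_DIRTY_TRACKING;	/* as if filled by IOMMU_GET_HW_INFO */

	printf("ATS may be enabled: %d\n", ats_may_be_enabled(caps));
	return 0;
}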
+27
tools/testing/selftests/iommu/iommufd.c
··· 2275 2275 test_ioctl_destroy(hwpt_id); 2276 2276 } 2277 2277 2278 + TEST_F(iommufd_dirty_tracking, pasid_set_dirty_tracking) 2279 + { 2280 + uint32_t stddev_id, ioas_id, hwpt_id, pasid = 100; 2281 + uint32_t dev_flags = MOCK_FLAGS_DEVICE_PASID; 2282 + 2283 + /* Regular case */ 2284 + test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, 2285 + IOMMU_HWPT_ALLOC_PASID | IOMMU_HWPT_ALLOC_DIRTY_TRACKING, 2286 + &hwpt_id); 2287 + test_cmd_mock_domain_flags(hwpt_id, dev_flags, &stddev_id, NULL, NULL); 2288 + ASSERT_EQ(0, _test_cmd_pasid_attach(self->fd, stddev_id, pasid, hwpt_id)); 2289 + test_cmd_set_dirty_tracking(hwpt_id, true); 2290 + test_cmd_set_dirty_tracking(hwpt_id, false); 2291 + ASSERT_EQ(0, _test_cmd_pasid_detach(self->fd, stddev_id, pasid)); 2292 + 2293 + test_ioctl_destroy(stddev_id); 2294 + 2295 + /* IOMMU device does not support dirty tracking */ 2296 + dev_flags |= MOCK_FLAGS_DEVICE_NO_DIRTY; 2297 + test_ioctl_ioas_alloc(&ioas_id); 2298 + test_cmd_mock_domain_flags(ioas_id, dev_flags, &stddev_id, NULL, NULL); 2299 + EXPECT_ERRNO(EINVAL, _test_cmd_pasid_attach(self->fd, stddev_id, pasid, hwpt_id)); 2300 + 2301 + test_ioctl_destroy(stddev_id); 2302 + test_ioctl_destroy(hwpt_id); 2303 + } 2304 + 2278 2305 TEST_F(iommufd_dirty_tracking, device_dirty_capability) 2279 2306 { 2280 2307 uint32_t caps = 0;