Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

iommupt: Add the Intel VT-d second stage page table format

The VT-d second stage format is almost the same as the x86 PAE format,
except that the bit encodings in the PTE are different and a few new PTE
features, such as force coherency, are present.

Among all the formats it is unique in not having a designated present bit.

Comparing the performance of several operations to the existing version:

iommu_map()
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 53,66 , 50,64 , 21.21
2^21, 59,70 , 56,67 , 16.16
2^30, 54,66 , 52,63 , 17.17
256*2^12, 384,524 , 337,516 , 34.34
256*2^21, 387,632 , 336,626 , 46.46
256*2^30, 376,629 , 323,623 , 48.48

iommu_unmap()
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 67,86 , 63,84 , 25.25
2^21, 64,84 , 59,80 , 26.26
2^30, 59,78 , 56,74 , 24.24
256*2^12, 216,335 , 198,317 , 37.37
256*2^21, 245,350 , 232,344 , 32.32
256*2^30, 248,345 , 226,339 , 33.33

Cc: Tina Zhang <tina.zhang@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

authored by

Jason Gunthorpe and committed by
Joerg Roedel
5448c155 efa03dab

+366
+1
drivers/iommu/generic_pt/.kunitconfig
··· 3 3 CONFIG_DEBUG_GENERIC_PT=y 4 4 CONFIG_IOMMU_PT=y 5 5 CONFIG_IOMMU_PT_AMDV1=y 6 + CONFIG_IOMMU_PT_VTDSS=y 6 7 CONFIG_IOMMU_PT_X86_64=y 7 8 CONFIG_IOMMU_PT_KUNIT_TEST=y 8 9
+11
drivers/iommu/generic_pt/Kconfig
··· 42 42 43 43 Selected automatically by an IOMMU driver that uses this format. 44 44 45 + config IOMMU_PT_VTDSS 46 + tristate "IOMMU page table for Intel VT-d Second Stage" 47 + depends on !GENERIC_ATOMIC64 # for cmpxchg64 48 + help 49 + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 50 + level Second Stage page table. It is similar to the X86_64 format with 51 + 4K/2M/1G page sizes. 52 + 53 + Selected automatically by an IOMMU driver that uses this format. 54 + 45 55 config IOMMU_PT_X86_64 46 56 tristate "IOMMU page table for x86 64-bit, 4/5 levels" 47 57 depends on !GENERIC_ATOMIC64 # for cmpxchg64 ··· 67 57 depends on KUNIT 68 58 depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 69 59 depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 60 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS 70 61 default KUNIT_ALL_TESTS 71 62 help 72 63 Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
+2
drivers/iommu/generic_pt/fmt/Makefile
··· 3 3 iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 4 4 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock 5 5 6 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss 7 + 6 8 iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 7 9 8 10 IOMMU_PT_KUNIT_TEST :=
+21
drivers/iommu/generic_pt/fmt/defs_vtdss.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + */ 6 + #ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H 7 + #define __GENERIC_PT_FMT_DEFS_VTDSS_H 8 + 9 + #include <linux/generic_pt/common.h> 10 + #include <linux/types.h> 11 + 12 + typedef u64 pt_vaddr_t; 13 + typedef u64 pt_oaddr_t; 14 + 15 + struct vtdss_pt_write_attrs { 16 + u64 descriptor_bits; 17 + gfp_t gfp; 18 + }; 19 + #define pt_write_attrs vtdss_pt_write_attrs 20 + 21 + #endif
+10
drivers/iommu/generic_pt/fmt/iommu_vtdss.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES 4 + */ 5 + #define PT_FMT vtdss 6 + #define PT_SUPPORTED_FEATURES \ 7 + (BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) | \ 8 + BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT)) 9 + 10 + #include "iommu_template.h"
+292
drivers/iommu/generic_pt/fmt/vtdss.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + * Intel VT-d Second Stage 5/4/3 level page table 6 + * 7 + * This is described in 8 + * Section "3.7 Second-Stage Translation" 9 + * Section "9.8 Second-Stage Paging Entries" 10 + * 11 + * Of the "Intel Virtualization Technology for Directed I/O Architecture 12 + * Specification". 13 + * 14 + * The named levels in the spec map to the pts->level as: 15 + * Table/SS-PTE - 0 16 + * Directory/SS-PDE - 1 17 + * Directory Ptr/SS-PDPTE - 2 18 + * PML4/SS-PML4E - 3 19 + * PML5/SS-PML5E - 4 20 + */ 21 + #ifndef __GENERIC_PT_FMT_VTDSS_H 22 + #define __GENERIC_PT_FMT_VTDSS_H 23 + 24 + #include "defs_vtdss.h" 25 + #include "../pt_defs.h" 26 + 27 + #include <linux/bitfield.h> 28 + #include <linux/container_of.h> 29 + #include <linux/log2.h> 30 + 31 + enum { 32 + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, 33 + PT_MAX_VA_ADDRESS_LG2 = 57, 34 + PT_ITEM_WORD_SIZE = sizeof(u64), 35 + PT_MAX_TOP_LEVEL = 4, 36 + PT_GRANULE_LG2SZ = 12, 37 + PT_TABLEMEM_LG2SZ = 12, 38 + 39 + /* SSPTPTR is 4k aligned and limited by HAW */ 40 + PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12), 41 + }; 42 + 43 + /* Shared descriptor bits */ 44 + enum { 45 + VTDSS_FMT_R = BIT(0), 46 + VTDSS_FMT_W = BIT(1), 47 + VTDSS_FMT_A = BIT(8), 48 + VTDSS_FMT_D = BIT(9), 49 + VTDSS_FMT_SNP = BIT(11), 50 + VTDSS_FMT_OA = GENMASK_ULL(51, 12), 51 + }; 52 + 53 + /* PDPTE/PDE */ 54 + enum { 55 + VTDSS_FMT_PS = BIT(7), 56 + }; 57 + 58 + #define common_to_vtdss_pt(common_ptr) \ 59 + container_of_const(common_ptr, struct pt_vtdss, common) 60 + #define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common) 61 + 62 + static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts) 63 + { 64 + return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), 65 + PT_TABLEMEM_LG2SZ); 66 + } 67 + #define pt_table_pa vtdss_pt_table_pa 68 + 69 + static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts) 70 + { 71 + 
return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry), 72 + PT_GRANULE_LG2SZ); 73 + } 74 + #define pt_entry_oa vtdss_pt_entry_oa 75 + 76 + static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts) 77 + { 78 + return pts->level <= 2; 79 + } 80 + #define pt_can_have_leaf vtdss_pt_can_have_leaf 81 + 82 + static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts) 83 + { 84 + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); 85 + } 86 + #define pt_num_items_lg2 vtdss_pt_num_items_lg2 87 + 88 + static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts) 89 + { 90 + const u64 *tablep = pt_cur_table(pts, u64); 91 + u64 entry; 92 + 93 + pts->entry = entry = READ_ONCE(tablep[pts->index]); 94 + if (!entry) 95 + return PT_ENTRY_EMPTY; 96 + if (pts->level == 0 || 97 + (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS))) 98 + return PT_ENTRY_OA; 99 + return PT_ENTRY_TABLE; 100 + } 101 + #define pt_load_entry_raw vtdss_pt_load_entry_raw 102 + 103 + static inline void 104 + vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, 105 + unsigned int oasz_lg2, 106 + const struct pt_write_attrs *attrs) 107 + { 108 + u64 *tablep = pt_cur_table(pts, u64); 109 + u64 entry; 110 + 111 + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) 112 + return; 113 + 114 + entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | 115 + attrs->descriptor_bits; 116 + if (pts->level != 0) 117 + entry |= VTDSS_FMT_PS; 118 + 119 + WRITE_ONCE(tablep[pts->index], entry); 120 + pts->entry = entry; 121 + } 122 + #define pt_install_leaf_entry vtdss_pt_install_leaf_entry 123 + 124 + static inline bool vtdss_pt_install_table(struct pt_state *pts, 125 + pt_oaddr_t table_pa, 126 + const struct pt_write_attrs *attrs) 127 + { 128 + u64 entry; 129 + 130 + entry = VTDSS_FMT_R | VTDSS_FMT_W | 131 + FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); 132 + return pt_table_install64(pts, entry); 133 + } 134 + #define pt_install_table 
vtdss_pt_install_table 135 + 136 + static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts, 137 + struct pt_write_attrs *attrs) 138 + { 139 + attrs->descriptor_bits = pts->entry & 140 + (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP); 141 + } 142 + #define pt_attr_from_entry vtdss_pt_attr_from_entry 143 + 144 + static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts) 145 + { 146 + u64 *tablep = pt_cur_table(pts, u64) + pts->index; 147 + 148 + return READ_ONCE(*tablep) & VTDSS_FMT_D; 149 + } 150 + #define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty 151 + 152 + static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts) 153 + { 154 + u64 *tablep = pt_cur_table(pts, u64) + pts->index; 155 + 156 + WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D); 157 + } 158 + #define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean 159 + 160 + static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts) 161 + { 162 + u64 *tablep = pt_cur_table(pts, u64) + pts->index; 163 + u64 new = pts->entry | VTDSS_FMT_D; 164 + 165 + return try_cmpxchg64(tablep, &pts->entry, new); 166 + } 167 + #define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty 168 + 169 + static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common) 170 + { 171 + return 10; 172 + } 173 + #define pt_max_sw_bit vtdss_pt_max_sw_bit 174 + 175 + static inline u64 vtdss_pt_sw_bit(unsigned int bitnr) 176 + { 177 + /* Bits marked Ignored in the specification */ 178 + switch (bitnr) { 179 + case 0: 180 + return BIT(10); 181 + case 1 ... 
9: 182 + return BIT_ULL((bitnr - 1) + 52); 183 + case 10: 184 + return BIT_ULL(63); 185 + /* Some bits in 9-3 are available in some entries */ 186 + default: 187 + if (__builtin_constant_p(bitnr)) 188 + BUILD_BUG(); 189 + else 190 + PT_WARN_ON(true); 191 + return 0; 192 + } 193 + } 194 + #define pt_sw_bit vtdss_pt_sw_bit 195 + 196 + /* --- iommu */ 197 + #include <linux/generic_pt/iommu.h> 198 + #include <linux/iommu.h> 199 + 200 + #define pt_iommu_table pt_iommu_vtdss 201 + 202 + /* The common struct is in the per-format common struct */ 203 + static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) 204 + { 205 + return &container_of(iommu_table, struct pt_iommu_table, iommu) 206 + ->vtdss_pt.common; 207 + } 208 + 209 + static inline struct pt_iommu *iommu_from_common(struct pt_common *common) 210 + { 211 + return &container_of(common, struct pt_iommu_table, vtdss_pt.common) 212 + ->iommu; 213 + } 214 + 215 + static inline int vtdss_pt_iommu_set_prot(struct pt_common *common, 216 + struct pt_write_attrs *attrs, 217 + unsigned int iommu_prot) 218 + { 219 + u64 pte = 0; 220 + 221 + /* 222 + * VTDSS does not have a present bit, so we tell if any entry is present 223 + * by checking for R or W. 
224 + */ 225 + if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE))) 226 + return -EINVAL; 227 + 228 + if (iommu_prot & IOMMU_READ) 229 + pte |= VTDSS_FMT_R; 230 + if (iommu_prot & IOMMU_WRITE) 231 + pte |= VTDSS_FMT_W; 232 + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE)) 233 + pte |= VTDSS_FMT_SNP; 234 + 235 + if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) && 236 + !(iommu_prot & IOMMU_WRITE)) { 237 + pr_err_ratelimited( 238 + "Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); 239 + return -EINVAL; 240 + } 241 + 242 + attrs->descriptor_bits = pte; 243 + return 0; 244 + } 245 + #define pt_iommu_set_prot vtdss_pt_iommu_set_prot 246 + 247 + static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table, 248 + const struct pt_iommu_vtdss_cfg *cfg) 249 + { 250 + struct pt_vtdss *table = &iommu_table->vtdss_pt; 251 + unsigned int vasz_lg2 = cfg->common.hw_max_vasz_lg2; 252 + 253 + if (vasz_lg2 > PT_MAX_VA_ADDRESS_LG2) 254 + return -EOPNOTSUPP; 255 + else if (vasz_lg2 > 48) 256 + pt_top_set_level(&table->common, 4); 257 + else if (vasz_lg2 > 39) 258 + pt_top_set_level(&table->common, 3); 259 + else if (vasz_lg2 > 30) 260 + pt_top_set_level(&table->common, 2); 261 + else 262 + return -EOPNOTSUPP; 263 + return 0; 264 + } 265 + #define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init 266 + 267 + static inline void 268 + vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table, 269 + const struct pt_range *top_range, 270 + struct pt_iommu_vtdss_hw_info *info) 271 + { 272 + info->ssptptr = virt_to_phys(top_range->top_table); 273 + PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK); 274 + /* 275 + * top_level = 2 = 3 level table aw=1 276 + * top_level = 3 = 4 level table aw=2 277 + * top_level = 4 = 5 level table aw=3 278 + */ 279 + info->aw = top_range->top_level - 1; 280 + } 281 + #define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info 282 + 283 + #if 
defined(GENERIC_PT_KUNIT) 284 + static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = { 285 + [0] = { .common.hw_max_vasz_lg2 = 39 }, 286 + [1] = { .common.hw_max_vasz_lg2 = 48 }, 287 + [2] = { .common.hw_max_vasz_lg2 = 57 }, 288 + }; 289 + #define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs 290 + enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) }; 291 + #endif 292 + #endif
+18
include/linux/generic_pt/common.h
··· 157 157 PT_FEAT_AMDV1_FORCE_COHERENCE, 158 158 }; 159 159 160 + struct pt_vtdss { 161 + struct pt_common common; 162 + }; 163 + 164 + enum { 165 + /* 166 + * The PTEs are set to prevent cache incoherent traffic, such as PCI no 167 + * snoop. This is set either at creation time or before the first map 168 + * operation. 169 + */ 170 + PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START, 171 + /* 172 + * Prevent creating read-only PTEs. Used to work around HW errata 173 + * ERRATA_772415_SPR17. 174 + */ 175 + PT_FEAT_VTDSS_FORCE_WRITEABLE, 176 + }; 177 + 160 178 struct pt_x86_64 { 161 179 struct pt_common common; 162 180 };
+11
include/linux/generic_pt/iommu.h
··· 262 262 struct pt_iommu_amdv1_mock_hw_info; 263 263 IOMMU_PROTOTYPES(amdv1_mock); 264 264 265 + struct pt_iommu_vtdss_cfg { 266 + struct pt_iommu_cfg common; 267 + }; 268 + 269 + struct pt_iommu_vtdss_hw_info { 270 + u64 ssptptr; 271 + u8 aw; 272 + }; 273 + 274 + IOMMU_FORMAT(vtdss, vtdss_pt); 275 + 265 276 struct pt_iommu_x86_64_cfg { 266 277 struct pt_iommu_cfg common; 267 278 };