Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

iommupt: Add the x86 64 bit page table format

This is used by x86 CPUs and can be used in AMD/VT-d x86 IOMMUs. When an
x86 IOMMU is running SVA the MM will be using this format.

This implementation follows the AMD v2 io-pgtable version.

There is nothing remarkable here, the format can have 4 or 5 levels and
limited support for different page sizes. No contiguous pages support.

x86 uses a sign extension mechanism where the top bits of the VA must
match the sign bit. The core code supports this through
PT_FEAT_SIGN_EXTEND which creates an upper and lower VA range. All the
new operations will work correctly in both spaces, however currently there
is no way to report the upper space to other layers. Future patches can
improve that.

In principle this can support 3 page table levels matching the 32 bit PAE
table format, but no iommu driver needs this. The focus is on the modern
64 bit 4 and 5 level formats.

Comparing the performance of several operations to the existing version:

iommu_map()
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 71,61 , 66,58 , -13.13
2^21, 66,60 , 61,55 , -10.10
2^30, 59,56 , 56,54 , -3.03
256*2^12, 392,1360 , 345,1289 , 73.73
256*2^21, 383,1159 , 335,1145 , 70.70
256*2^30, 378,965 , 331,892 , 62.62

iommu_unmap()
pgsz ,avg new,old ns, min new,old ns , min % (+ve is better)
2^12, 77,71 , 73,68 , -7.07
2^21, 76,70 , 70,66 , -6.06
2^30, 69,66 , 66,63 , -4.04
256*2^12, 225,899 , 210,870 , 75.75
256*2^21, 262,722 , 248,710 , 65.65
256*2^30, 251,643 , 244,634 , 61.61

The small -ve values in the iommu_unmap() are due to the core code calling
iommu_pgsize() before invoking the domain op. This is unnecessary with this
implementation. Future work optimizes this and gets to 2%, 4%, 3%.

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Vasant Hegde <vasant.hegde@amd.com>
Tested-by: Alejandro Jimenez <alejandro.j.jimenez@oracle.com>
Tested-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>

authored by

Jason Gunthorpe and committed by
Joerg Roedel
aef5de75 e93d5945

+325
+1
drivers/iommu/generic_pt/.kunitconfig
··· 3 3 CONFIG_DEBUG_GENERIC_PT=y 4 4 CONFIG_IOMMU_PT=y 5 5 CONFIG_IOMMU_PT_AMDV1=y 6 + CONFIG_IOMMU_PT_X86_64=y 6 7 CONFIG_IOMMU_PT_KUNIT_TEST=y 7 8 8 9 CONFIG_IOMMUFD=y
+11
drivers/iommu/generic_pt/Kconfig
··· 42 42 43 43 Selected automatically by an IOMMU driver that uses this format. 44 44 45 + config IOMMU_PT_X86_64 46 + tristate "IOMMU page table for x86 64-bit, 4/5 levels" 47 + depends on !GENERIC_ATOMIC64 # for cmpxchg64 48 + help 49 + iommu_domain implementation for the x86 64-bit 4/5 level page table. 50 + It supports 4K/2M/1G page sizes and can decode a sign-extended 51 + portion of the 64-bit IOVA space. 52 + 53 + Selected automatically by an IOMMU driver that uses this format. 54 + 45 55 config IOMMU_PT_KUNIT_TEST 46 56 tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS 47 57 depends on KUNIT 48 58 depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 59 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 49 60 default KUNIT_ALL_TESTS 50 61 help 51 62 Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the
+2
drivers/iommu/generic_pt/fmt/Makefile
··· 3 3 iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 4 4 iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock 5 5 6 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 7 + 6 8 IOMMU_PT_KUNIT_TEST := 7 9 define create_format 8 10 obj-$(2) += iommu_$(1).o
+21
drivers/iommu/generic_pt/fmt/defs_x86_64.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + */ 6 + #ifndef __GENERIC_PT_FMT_DEFS_X86_64_H 7 + #define __GENERIC_PT_FMT_DEFS_X86_64_H 8 + 9 + #include <linux/generic_pt/common.h> 10 + #include <linux/types.h> 11 + 12 + typedef u64 pt_vaddr_t; 13 + typedef u64 pt_oaddr_t; 14 + 15 + struct x86_64_pt_write_attrs { 16 + u64 descriptor_bits; 17 + gfp_t gfp; 18 + }; 19 + #define pt_write_attrs x86_64_pt_write_attrs 20 + 21 + #endif
+11
drivers/iommu/generic_pt/fmt/iommu_x86_64.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES 4 + */ 5 + #define PT_FMT x86_64 6 + #define PT_SUPPORTED_FEATURES \ 7 + (BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) | \ 8 + BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \ 9 + BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) 10 + 11 + #include "iommu_template.h"
+255
drivers/iommu/generic_pt/fmt/x86_64.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + * x86 page table. Supports the 4 and 5 level variations. 6 + * 7 + * The 4 and 5 level version is described in: 8 + * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software 9 + * Developer's Manual Volume 3 10 + * 11 + * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization 12 + * Technology for Directed I/O Architecture Specification" 13 + * 14 + * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O 15 + * Virtualization Technology (IOMMU) Specification" 16 + * 17 + * It is used by x86 CPUs, AMD and VT-d IOMMU HW. 18 + * 19 + * Note the 3 level format is very similar and almost implemented here. The 20 + * reserved/ignored layout is different and there are functional bit 21 + * differences. 22 + * 23 + * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower 24 + * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign 25 + * extended addressing with this page table format. 26 + * 27 + * The named levels in the spec map to the pts->level as: 28 + * Table/PTE - 0 29 + * Directory/PDE - 1 30 + * Directory Ptr/PDPTE - 2 31 + * PML4/PML4E - 3 32 + * PML5/PML5E - 4 33 + */ 34 + #ifndef __GENERIC_PT_FMT_X86_64_H 35 + #define __GENERIC_PT_FMT_X86_64_H 36 + 37 + #include "defs_x86_64.h" 38 + #include "../pt_defs.h" 39 + 40 + #include <linux/bitfield.h> 41 + #include <linux/container_of.h> 42 + #include <linux/log2.h> 43 + #include <linux/mem_encrypt.h> 44 + 45 + enum { 46 + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, 47 + PT_MAX_VA_ADDRESS_LG2 = 57, 48 + PT_ITEM_WORD_SIZE = sizeof(u64), 49 + PT_MAX_TOP_LEVEL = 4, 50 + PT_GRANULE_LG2SZ = 12, 51 + PT_TABLEMEM_LG2SZ = 12, 52 + 53 + /* 54 + * For AMD the GCR3 Base only has these bits. 
For VT-d FSPTPTR is 4k 55 + * aligned and is limited by the architected HAW 56 + */ 57 + PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12), 58 + }; 59 + 60 + /* Shared descriptor bits */ 61 + enum { 62 + X86_64_FMT_P = BIT(0), 63 + X86_64_FMT_RW = BIT(1), 64 + X86_64_FMT_U = BIT(2), 65 + X86_64_FMT_A = BIT(5), 66 + X86_64_FMT_D = BIT(6), 67 + X86_64_FMT_OA = GENMASK_ULL(51, 12), 68 + X86_64_FMT_XD = BIT_ULL(63), 69 + }; 70 + 71 + /* PDPTE/PDE */ 72 + enum { 73 + X86_64_FMT_PS = BIT(7), 74 + }; 75 + 76 + static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts) 77 + { 78 + u64 entry = pts->entry; 79 + 80 + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) 81 + entry = __sme_clr(entry); 82 + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), 83 + PT_TABLEMEM_LG2SZ); 84 + } 85 + #define pt_table_pa x86_64_pt_table_pa 86 + 87 + static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts) 88 + { 89 + u64 entry = pts->entry; 90 + 91 + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) 92 + entry = __sme_clr(entry); 93 + return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), 94 + PT_GRANULE_LG2SZ); 95 + } 96 + #define pt_entry_oa x86_64_pt_entry_oa 97 + 98 + static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts) 99 + { 100 + return pts->level <= 2; 101 + } 102 + #define pt_can_have_leaf x86_64_pt_can_have_leaf 103 + 104 + static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts) 105 + { 106 + return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64)); 107 + } 108 + #define pt_num_items_lg2 x86_64_pt_num_items_lg2 109 + 110 + static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts) 111 + { 112 + const u64 *tablep = pt_cur_table(pts, u64); 113 + u64 entry; 114 + 115 + pts->entry = entry = READ_ONCE(tablep[pts->index]); 116 + if (!(entry & X86_64_FMT_P)) 117 + return PT_ENTRY_EMPTY; 118 + if (pts->level == 0 || 119 + (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS))) 120 + return PT_ENTRY_OA; 121 + 
return PT_ENTRY_TABLE; 122 + } 123 + #define pt_load_entry_raw x86_64_pt_load_entry_raw 124 + 125 + static inline void 126 + x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa, 127 + unsigned int oasz_lg2, 128 + const struct pt_write_attrs *attrs) 129 + { 130 + u64 *tablep = pt_cur_table(pts, u64); 131 + u64 entry; 132 + 133 + if (!pt_check_install_leaf_args(pts, oa, oasz_lg2)) 134 + return; 135 + 136 + entry = X86_64_FMT_P | 137 + FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) | 138 + attrs->descriptor_bits; 139 + if (pts->level != 0) 140 + entry |= X86_64_FMT_PS; 141 + 142 + WRITE_ONCE(tablep[pts->index], entry); 143 + pts->entry = entry; 144 + } 145 + #define pt_install_leaf_entry x86_64_pt_install_leaf_entry 146 + 147 + static inline bool x86_64_pt_install_table(struct pt_state *pts, 148 + pt_oaddr_t table_pa, 149 + const struct pt_write_attrs *attrs) 150 + { 151 + u64 entry; 152 + 153 + entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | 154 + FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ)); 155 + if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) 156 + entry = __sme_set(entry); 157 + return pt_table_install64(pts, entry); 158 + } 159 + #define pt_install_table x86_64_pt_install_table 160 + 161 + static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts, 162 + struct pt_write_attrs *attrs) 163 + { 164 + attrs->descriptor_bits = pts->entry & 165 + (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A | 166 + X86_64_FMT_D | X86_64_FMT_XD); 167 + } 168 + #define pt_attr_from_entry x86_64_pt_attr_from_entry 169 + 170 + /* --- iommu */ 171 + #include <linux/generic_pt/iommu.h> 172 + #include <linux/iommu.h> 173 + 174 + #define pt_iommu_table pt_iommu_x86_64 175 + 176 + /* The common struct is in the per-format common struct */ 177 + static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table) 178 + { 179 + return &container_of(iommu_table, struct pt_iommu_table, iommu) 180 + 
->x86_64_pt.common; 181 + } 182 + 183 + static inline struct pt_iommu *iommu_from_common(struct pt_common *common) 184 + { 185 + return &container_of(common, struct pt_iommu_table, x86_64_pt.common) 186 + ->iommu; 187 + } 188 + 189 + static inline int x86_64_pt_iommu_set_prot(struct pt_common *common, 190 + struct pt_write_attrs *attrs, 191 + unsigned int iommu_prot) 192 + { 193 + u64 pte; 194 + 195 + pte = X86_64_FMT_U | X86_64_FMT_A | X86_64_FMT_D; 196 + if (iommu_prot & IOMMU_WRITE) 197 + pte |= X86_64_FMT_RW; 198 + 199 + /* 200 + * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to 201 + * control this. For now if the tables use sme_set then so do the ptes. 202 + */ 203 + if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES)) 204 + pte = __sme_set(pte); 205 + 206 + attrs->descriptor_bits = pte; 207 + return 0; 208 + } 209 + #define pt_iommu_set_prot x86_64_pt_iommu_set_prot 210 + 211 + static inline int 212 + x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table, 213 + const struct pt_iommu_x86_64_cfg *cfg) 214 + { 215 + struct pt_x86_64 *table = &iommu_table->x86_64_pt; 216 + 217 + if (cfg->common.hw_max_vasz_lg2 < 31 || 218 + cfg->common.hw_max_vasz_lg2 > 57) 219 + return -EINVAL; 220 + 221 + /* Top of 2, 3, 4 */ 222 + pt_top_set_level(&table->common, 223 + (cfg->common.hw_max_vasz_lg2 - 31) / 9 + 2); 224 + 225 + table->common.max_oasz_lg2 = 226 + min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2); 227 + return 0; 228 + } 229 + #define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init 230 + 231 + static inline void 232 + x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table, 233 + const struct pt_range *top_range, 234 + struct pt_iommu_x86_64_hw_info *info) 235 + { 236 + info->gcr3_pt = virt_to_phys(top_range->top_table); 237 + PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK); 238 + info->levels = top_range->top_level + 1; 239 + } 240 + #define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info 241 + 242 + #if 
defined(GENERIC_PT_KUNIT) 243 + static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = { 244 + [0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), 245 + .common.hw_max_vasz_lg2 = 48 }, 246 + [1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND), 247 + .common.hw_max_vasz_lg2 = 57 }, 248 + /* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */ 249 + [2] = { .common.hw_max_vasz_lg2 = 47 }, 250 + [3] = { .common.hw_max_vasz_lg2 = 56 }, 251 + }; 252 + #define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs 253 + enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND)}; 254 + #endif 255 + #endif
+13
include/linux/generic_pt/common.h
··· 151 151 PT_FEAT_AMDV1_FORCE_COHERENCE, 152 152 }; 153 153 154 + struct pt_x86_64 { 155 + struct pt_common common; 156 + }; 157 + 158 + enum { 159 + /* 160 + * The memory backing the tables is encrypted. Use __sme_set() to adjust 161 + * the page table pointers in the tree. This only works with 162 + * CONFIG_AMD_MEM_ENCRYPT. 163 + */ 164 + PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START, 165 + }; 166 + 154 167 #endif
+11
include/linux/generic_pt/iommu.h
··· 255 255 struct pt_iommu_amdv1_mock_hw_info; 256 256 IOMMU_PROTOTYPES(amdv1_mock); 257 257 258 + struct pt_iommu_x86_64_cfg { 259 + struct pt_iommu_cfg common; 260 + }; 261 + 262 + struct pt_iommu_x86_64_hw_info { 263 + u64 gcr3_pt; 264 + u8 levels; 265 + }; 266 + 267 + IOMMU_FORMAT(x86_64, x86_64_pt); 268 + 258 269 #undef IOMMU_PROTOTYPES 259 270 #undef IOMMU_FORMAT 260 271 #endif