Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

x86/mm/pat: restore large ROX pages after fragmentation

Changing the attributes of pages may lead to fragmentation of the direct
mapping over time and to performance degradation when these pages contain
executable code.

With the current code it's a one-way road: the kernel tries to avoid splitting
large pages, but it never restores them even when the page attributes become
compatible again.

Any change to the mapping may potentially allow restoring a large page.

Add a hook to the cpa_flush() path that checks whether the pages in the range
that was just touched can be mapped at the PMD level. If the collapse at the
PMD level succeeds, also attempt to collapse at the PUD level.
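
To illustrate the precondition that collapse_pmd_page() in this patch applies,
here is a standalone sketch modelled on plain C structures rather than real
page tables (the type and function names are illustrative, not from the
patch): every 4k entry covered by the PMD must be present, physically
contiguous, and carry the same attribute bits as the first entry.

  #include <stdbool.h>
  #include <stdint.h>

  #define ENTRIES_PER_PMD 512

  /* Illustrative stand-in for a 4k page table entry. */
  struct soft_pte {
      bool     present;
      uint64_t pfn;
      uint64_t flags;
  };

  /*
   * A PMD's worth of 4k entries can be merged back into one large mapping
   * only if all entries are present, physically contiguous and mapped with
   * identical attributes.
   */
  static bool can_collapse(const struct soft_pte *pte)
  {
      int i;

      for (i = 1; i < ENTRIES_PER_PMD; i++) {
          if (!pte[i].present ||
              pte[i].flags != pte[0].flags ||
              pte[i].pfn != pte[0].pfn + i)
              return false;
      }
      return true;
  }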

The collapse logic runs only when a set_memory_ method explicitly sets the
CPA_COLLAPSE flag; for now this is only enabled in set_memory_rox().
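
As a minimal usage sketch (the helper name and code buffer are hypothetical;
only set_memory_rox() itself is the real API): a caller that seals a code
buffer read-only and executable now also gets the collapse pass as part of
the final TLB flush.

  #include <linux/set_memory.h>

  /*
   * Hypothetical caller: seal a freshly written code buffer read-only and
   * executable.  With this patch, the CPA flush at the end of the call also
   * tries to merge the buffer's 4k direct-map entries back into large pages.
   */
  static int seal_code_buffer(void *buf, int npages)
  {
      return set_memory_rox((unsigned long)buf, npages);
  }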

CPUs don't like[1] having TLB entries of different sizes for the same
memory, but it appears to be okay as long as these entries have matching
attributes[2]. Therefore it is critical to flush the TLB before any
subsequent changes to the mapping.

Note that multiple TLB entries of different sizes for the same memory are
already allowed in the split_large_page() path, so this is not a new
situation.

set_memory_4k() provides a way to use 4k pages on purpose. The kernel must
not remap such pages as large. Reuse one of the software PTE bits to mark
such pages.
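
And the opt-out side, again as a hypothetical caller (only set_memory_4k()
is the real API): forcing 4k mappings now also tags each PTE with
_PAGE_KERNEL_4K, which the collapse path treats as a hard stop.

  #include <linux/set_memory.h>

  /*
   * Hypothetical caller: keep a range mapped with 4k pages.  With this patch
   * the call also sets _PAGE_KERNEL_4K on every PTE in the range, so
   * collapse_pmd_page() will never merge these pages back into a large page.
   */
  static int pin_small_mappings(unsigned long addr, int npages)
  {
      return set_memory_4k(addr, npages);
  }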

[1] See Erratum 383 of AMD Family 10h Processors
[2] https://lore.kernel.org/linux-mm/1da1b025-cabc-6f04-bde5-e50830d1ecf0@amd.com/

[rppt@kernel.org:
* s/restore/collapse/
* update formatting per peterz
* use 'struct ptdesc' instead of 'struct page' for list of page tables to
be freed
* try to collapse PMD first and if it succeeds move on to PUD as peterz
suggested
* flush TLB twice: for changes done in the original CPA call and after
collapsing of large pages
* update commit message
]

Signed-off-by: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Co-developed-by: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Signed-off-by: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250126074733.1384926-4-rppt@kernel.org

Authored by Kirill A. Shutemov, committed by Peter Zijlstra
41d88484 4ee788eb

+219 -4

arch/x86/include/asm/pgtable_types.h (+2)

···
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_KERNEL_4K	_PAGE_BIT_SOFTW3 /* page must not be converted to large */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4

 #ifdef CONFIG_X86_64
···
 #define _PAGE_PAT_LARGE	(_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST	(_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_KERNEL_4K	(_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 #define _PAGE_PKEY_BIT0	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
 #define _PAGE_PKEY_BIT1	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
arch/x86/mm/pat/set_memory.c (+213 -4)

···
 #define CPA_ARRAY		2
 #define CPA_PAGES_ARRAY	4
 #define CPA_NO_CHECK_ALIAS	8 /* Do not search for aliases */
+#define CPA_COLLAPSE		16 /* try to collapse large pages */

 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
 {
···
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }

+static void collapse_page_count(int level)
+{
+	direct_pages_count[level]++;
+	if (system_state == SYSTEM_RUNNING) {
+		if (level == PG_LEVEL_2M)
+			count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
+		else if (level == PG_LEVEL_1G)
+			count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
+	}
+	direct_pages_count[level - 1] -= PTRS_PER_PTE;
+}
+
 void arch_report_meminfo(struct seq_file *m)
 {
 	seq_printf(m, "DirectMap4k: %8lu kB\n",
···
 }
 #else
 static inline void split_page_count(int level) { }
+static inline void collapse_page_count(int level) { }
 #endif

 #ifdef CONFIG_X86_CPA_STATISTICS
···
 	flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
 }

+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_collapse_large_pages(struct cpa_data *cpa)
+{
+	unsigned long start, addr, end;
+	struct ptdesc *ptdesc, *tmp;
+	LIST_HEAD(pgtables);
+	int collapsed = 0;
+	int i;
+
+	if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+		for (i = 0; i < cpa->numpages; i++)
+			collapsed += collapse_large_pages(__cpa_addr(cpa, i),
+							  &pgtables);
+	} else {
+		addr = __cpa_addr(cpa, 0);
+		start = addr & PMD_MASK;
+		end = addr + PAGE_SIZE * cpa->numpages;
+
+		for (addr = start; within(addr, start, end); addr += PMD_SIZE)
+			collapsed += collapse_large_pages(addr, &pgtables);
+	}
+
+	if (!collapsed)
+		return;
+
+	flush_tlb_all();
+
+	list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+		list_del(&ptdesc->pt_list);
+		__free_page(ptdesc_page(ptdesc));
+	}
+}
+
 static void cpa_flush(struct cpa_data *cpa, int cache)
 {
 	unsigned int i;
···

 	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 		cpa_flush_all(cache);
-		return;
+		goto collapse_large_pages;
 	}

 	if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
···
 		on_each_cpu(__cpa_flush_tlb, cpa, 1);

 	if (!cache)
-		return;
+		goto collapse_large_pages;

 	mb();
 	for (i = 0; i < cpa->numpages; i++) {
···
 		clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
 	}
 	mb();
+
+collapse_large_pages:
+	if (cpa->flags & CPA_COLLAPSE)
+		cpa_collapse_large_pages(cpa);
 }

 static bool overlaps(unsigned long r1_start, unsigned long r1_end,
···
 	__free_page(base);

 	return 0;
+}
+
+static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
+			     struct list_head *pgtables)
+{
+	pmd_t _pmd, old_pmd;
+	pte_t *pte, first;
+	unsigned long pfn;
+	pgprot_t pgprot;
+	int i = 0;
+
+	addr &= PMD_MASK;
+	pte = pte_offset_kernel(pmd, addr);
+	first = *pte;
+	pfn = pte_pfn(first);
+
+	/* Make sure alignment is suitable */
+	if (PFN_PHYS(pfn) & ~PMD_MASK)
+		return 0;
+
+	/* The page is 4k intentionally */
+	if (pte_flags(first) & _PAGE_KERNEL_4K)
+		return 0;
+
+	/* Check that the rest of PTEs are compatible with the first one */
+	for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
+		pte_t entry = *pte;
+
+		if (!pte_present(entry))
+			return 0;
+		if (pte_flags(entry) != pte_flags(first))
+			return 0;
+		if (pte_pfn(entry) != pte_pfn(first) + i)
+			return 0;
+	}
+
+	old_pmd = *pmd;
+
+	/* Success: set up a large page */
+	pgprot = pgprot_4k_2_large(pte_pgprot(first));
+	pgprot_val(pgprot) |= _PAGE_PSE;
+	_pmd = pfn_pmd(pfn, pgprot);
+	set_pmd(pmd, _pmd);
+
+	/* Queue the page table to be freed after TLB flush */
+	list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
+
+	if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
+		struct page *page;
+
+		/* Update all PGD tables to use the same large page */
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+			p4d_t *p4d = p4d_offset(pgd, addr);
+			pud_t *pud = pud_offset(p4d, addr);
+			pmd_t *pmd = pmd_offset(pud, addr);
+			/* Something is wrong if entries doesn't match */
+			if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
+				continue;
+			set_pmd(pmd, _pmd);
+		}
+	}
+
+	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+		collapse_page_count(PG_LEVEL_2M);
+
+	return 1;
+}
+
+static int collapse_pud_page(pud_t *pud, unsigned long addr,
+			     struct list_head *pgtables)
+{
+	unsigned long pfn;
+	pmd_t *pmd, first;
+	int i;
+
+	if (!direct_gbpages)
+		return 0;
+
+	addr &= PUD_MASK;
+	pmd = pmd_offset(pud, addr);
+	first = *pmd;
+
+	/*
+	 * To restore PUD page all PMD entries must be large and
+	 * have suitable alignment
+	 */
+	pfn = pmd_pfn(first);
+	if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
+		return 0;
+
+	/*
+	 * To restore PUD page, all following PMDs must be compatible with the
+	 * first one.
+	 */
+	for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
+		pmd_t entry = *pmd;
+
+		if (!pmd_present(entry) || !pmd_leaf(entry))
+			return 0;
+		if (pmd_flags(entry) != pmd_flags(first))
+			return 0;
+		if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
+			return 0;
+	}
+
+	/* Restore PUD page and queue page table to be freed after TLB flush */
+	list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
+	set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
+
+	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+		collapse_page_count(PG_LEVEL_1G);
+
+	return 1;
+}
+
+/*
+ * Collapse PMD and PUD pages in the kernel mapping around the address where
+ * possible.
+ *
+ * Caller must flush TLB and free page tables queued on the list before
+ * touching the new entries. CPU must not see TLB entries of different size
+ * with different attributes.
+ */
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
+{
+	int collapsed = 0;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	addr &= PMD_MASK;
+
+	spin_lock(&pgd_lock);
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		goto out;
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		goto out;
+	pud = pud_offset(p4d, addr);
+	if (!pud_present(*pud) || pud_leaf(*pud))
+		goto out;
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd) || pmd_leaf(*pmd))
+		goto out;
+
+	collapsed = collapse_pmd_page(pmd, addr, pgtables);
+	if (collapsed)
+		collapsed += collapse_pud_page(pud, addr, pgtables);
+
+out:
+	spin_unlock(&pgd_lock);
+	return collapsed;
 }

 static bool try_to_free_pte_page(pte_t *pte)
···
 	if (__supported_pte_mask & _PAGE_NX)
 		clr.pgprot |= _PAGE_NX;

-	return change_page_attr_clear(&addr, numpages, clr, 0);
+	return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
+					CPA_COLLAPSE, NULL);
 }

 int set_memory_rw(unsigned long addr, int numpages)
···

 int set_memory_4k(unsigned long addr, int numpages)
 {
-	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+	return change_page_attr_set_clr(&addr, numpages,
+					__pgprot(_PAGE_KERNEL_4K),
 					__pgprot(0), 1, 0, NULL);
 }
include/linux/vm_event_item.h (+2)

···
 #ifdef CONFIG_X86
 	DIRECT_MAP_LEVEL2_SPLIT,
 	DIRECT_MAP_LEVEL3_SPLIT,
+	DIRECT_MAP_LEVEL2_COLLAPSE,
+	DIRECT_MAP_LEVEL3_COLLAPSE,
 #endif
 #ifdef CONFIG_PER_VMA_LOCK_STATS
 	VMA_LOCK_SUCCESS,
mm/vmstat.c (+2)

···
 #ifdef CONFIG_X86
 	"direct_map_level2_splits",
 	"direct_map_level3_splits",
+	"direct_map_level2_collapses",
+	"direct_map_level3_collapses",
 #endif
 #ifdef CONFIG_PER_VMA_LOCK_STATS
 	"vma_lock_success",