Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git


RISC-V: KVM: Transparent huge page support

Use block mapping if backed by a THP, as implemented in architectures
like ARM and x86_64.

Signed-off-by: Jessica Liu <liu.xuemei1@zte.com.cn>
Reviewed-by: Anup Patel <anup@brainfault.org>
Link: https://lore.kernel.org/r/20251127165137780QbUOVPKPAfWSGAFl5qtRy@zte.com.cn
Signed-off-by: Anup Patel <anup@brainfault.org>
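
As an aside for readers new to the constraint this patch enforces: a PMD-level block mapping is only legal when the guest physical address and the host virtual address that backs it share the same offset within a PMD-sized region (2 MiB with 4 KiB pages); otherwise a single block entry would translate to the wrong pages. Below is a minimal standalone sketch of that congruence check; the EXAMPLE_PMD_SIZE constant and the same_pmd_offset() helper are illustrative names, not taken from the patch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed 2 MiB PMD size (4 KiB pages, 512 entries per PMD table). */
#define EXAMPLE_PMD_SIZE (2UL * 1024 * 1024)

/*
 * A huge mapping is only valid when hva and gpa share the same offset
 * within a PMD-sized block; otherwise one 2 MiB g-stage entry would map
 * pages shifted relative to what the fault handler resolved.
 */
static bool same_pmd_offset(uint64_t hva, uint64_t gpa)
{
	return (hva & (EXAMPLE_PMD_SIZE - 1)) == (gpa & (EXAMPLE_PMD_SIZE - 1));
}

int main(void)
{
	/* Same offset within the 2 MiB block: a block mapping is possible. */
	printf("%d\n", same_pmd_offset(0x7f0000201000ULL, 0x80201000ULL));
	/* Offsets differ by one page: must fall back to 4 KiB mappings. */
	printf("%d\n", same_pmd_offset(0x7f0000201000ULL, 0x80202000ULL));
	return 0;
}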

Authored by Jessica Liu and committed by Anup Patel
ed7ae7a3 671995ff

142 additions in total

arch/riscv/kvm/mmu.c (+140)
···
 	return pte_young(ptep_get(ptep));
 }
 
+static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot,
+                                               unsigned long hva)
+{
+	hva_t uaddr_start, uaddr_end;
+	gpa_t gpa_start;
+	size_t size;
+
+	size = memslot->npages * PAGE_SIZE;
+	uaddr_start = memslot->userspace_addr;
+	uaddr_end = uaddr_start + size;
+
+	gpa_start = memslot->base_gfn << PAGE_SHIFT;
+
+	/*
+	 * Pages belonging to memslots that don't have the same alignment
+	 * within a PMD for userspace and GPA cannot be mapped with g-stage
+	 * PMD entries, because we'll end up mapping the wrong pages.
+	 *
+	 * Consider a layout like the following:
+	 *
+	 *    memslot->userspace_addr:
+	 *    +-----+--------------------+--------------------+---+
+	 *    |abcde|fgh  vs-stage block |  vs-stage block  tv|xyz|
+	 *    +-----+--------------------+--------------------+---+
+	 *
+	 *    memslot->base_gfn << PAGE_SHIFT:
+	 *      +---+--------------------+--------------------+-----+
+	 *      |abc|def  g-stage block  |   g-stage block    |tvxyz|
+	 *      +---+--------------------+--------------------+-----+
+	 *
+	 * If we create those g-stage blocks, we'll end up with this incorrect
+	 * mapping:
+	 *   d -> f
+	 *   e -> g
+	 *   f -> h
+	 */
+	if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1)))
+		return false;
+
+	/*
+	 * Next, let's make sure we're not trying to map anything not covered
+	 * by the memslot. This means we have to prohibit block size mappings
+	 * for the beginning and end of a non-block aligned and non-block sized
+	 * memory slot (illustrated by the head and tail parts of the
+	 * userspace view above containing pages 'abcde' and 'xyz',
+	 * respectively).
+	 *
+	 * Note that it doesn't matter if we do the check using the
+	 * userspace_addr or the base_gfn, as both are equally aligned (per
+	 * the check above) and equally sized.
+	 */
+	return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE));
+}
+
+static int get_hva_mapping_size(struct kvm *kvm,
+                                unsigned long hva)
+{
+	int size = PAGE_SIZE;
+	unsigned long flags;
+	pgd_t pgd;
+	p4d_t p4d;
+	pud_t pud;
+	pmd_t pmd;
+
+	/*
+	 * Disable IRQs to prevent concurrent tear down of host page tables,
+	 * e.g. if the primary MMU promotes a P*D to a huge page and then frees
+	 * the original page table.
+	 */
+	local_irq_save(flags);
+
+	/*
+	 * Read each entry once. As above, a non-leaf entry can be promoted to
+	 * a huge page _during_ this walk. Re-reading the entry could send the
+	 * walk into the weeds, e.g. p*d_leaf() returns false (sees the old
+	 * value) and then p*d_offset() walks into the target huge page instead
+	 * of the old page table (sees the new value).
+	 */
+	pgd = pgdp_get(pgd_offset(kvm->mm, hva));
+	if (pgd_none(pgd))
+		goto out;
+
+	p4d = p4dp_get(p4d_offset(&pgd, hva));
+	if (p4d_none(p4d) || !p4d_present(p4d))
+		goto out;
+
+	pud = pudp_get(pud_offset(&p4d, hva));
+	if (pud_none(pud) || !pud_present(pud))
+		goto out;
+
+	if (pud_leaf(pud)) {
+		size = PUD_SIZE;
+		goto out;
+	}
+
+	pmd = pmdp_get(pmd_offset(&pud, hva));
+	if (pmd_none(pmd) || !pmd_present(pmd))
+		goto out;
+
+	if (pmd_leaf(pmd))
+		size = PMD_SIZE;
+
+out:
+	local_irq_restore(flags);
+	return size;
+}
+
+static unsigned long transparent_hugepage_adjust(struct kvm *kvm,
+                                                 struct kvm_memory_slot *memslot,
+                                                 unsigned long hva,
+                                                 kvm_pfn_t *hfnp, gpa_t *gpa)
+{
+	kvm_pfn_t hfn = *hfnp;
+
+	/*
+	 * Make sure the adjustment is done only for THP pages. Also make
+	 * sure that the HVA and GPA are sufficiently aligned and that the
+	 * block map is contained within the memslot.
+	 */
+	if (fault_supports_gstage_huge_mapping(memslot, hva)) {
+		int sz;
+
+		sz = get_hva_mapping_size(kvm, hva);
+		if (sz < PMD_SIZE)
+			return sz;
+
+		*gpa &= PMD_MASK;
+		hfn &= ~(PTRS_PER_PMD - 1);
+		*hfnp = hfn;
+
+		return PMD_SIZE;
+	}
+
+	return PAGE_SIZE;
+}
+
 int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
                       gpa_t gpa, unsigned long hva, bool is_write,
                       struct kvm_gstage_mapping *out_map)
···
 
 	if (mmu_invalidate_retry(kvm, mmu_seq))
 		goto out_unlock;
+
+	/* Check if we are backed by a THP and thus use block mapping if possible */
+	if (vma_pagesize == PAGE_SIZE)
+		vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa);
 
 	if (writable) {
 		mark_page_dirty_in_slot(kvm, memslot, gfn);
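
To make the adjustment at the end of the hunk above concrete: once get_hva_mapping_size() reports a PMD-sized host mapping, transparent_hugepage_adjust() rounds both the GPA and the host PFN down to the start of their 2 MiB block, so a single PMD-level g-stage entry covers the whole THP. A small userspace sketch of that arithmetic follows, with assumed constants (the EX_* names are made up and mirror a 4 KiB page / 2 MiB PMD layout).

#include <stdint.h>
#include <stdio.h>

/* Assumed constants for a 4 KiB page / 2 MiB PMD configuration. */
#define EX_PAGE_SHIFT   12
#define EX_PTRS_PER_PMD 512ULL                               /* PTEs per PMD table */
#define EX_PMD_SIZE     (EX_PTRS_PER_PMD << EX_PAGE_SHIFT)   /* 2 MiB */
#define EX_PMD_MASK     (~(EX_PMD_SIZE - 1))

int main(void)
{
	uint64_t gpa = 0x80203000ULL;   /* faulting guest physical address */
	uint64_t hfn = 0x5a203ULL;      /* host PFN backing that guest page */

	/* Round both down to the start of their 2 MiB block, as the patch does. */
	uint64_t gpa_block = gpa & EX_PMD_MASK;
	uint64_t hfn_block = hfn & ~(EX_PTRS_PER_PMD - 1);

	printf("gpa 0x%llx -> block base 0x%llx\n",
	       (unsigned long long)gpa, (unsigned long long)gpa_block);
	printf("hfn 0x%llx -> block base 0x%llx\n",
	       (unsigned long long)hfn, (unsigned long long)hfn_block);
	return 0;
}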
arch/riscv/mm/pgtable.c (+2)

···
 
 	return (pud_t *)p4d;
 }
+EXPORT_SYMBOL_GPL(pud_offset);
 
 p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 {
···
 
 	return (p4d_t *)pgd;
 }
+EXPORT_SYMBOL_GPL(p4d_offset);
 #endif
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
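
The two exports are presumably needed because KVM can be built as a loadable module (kvm.ko), and the new walk in get_hva_mapping_size() calls the out-of-line pud_offset()/p4d_offset() helpers defined in this file; module code can only reference symbols the kernel exports. A hypothetical module fragment illustrating the dependency (not part of the patch, names invented for illustration):

/* Hypothetical out-of-tree module fragment, for illustration only. */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/pgtable.h>
#include <linux/sched.h>

static int __init walk_demo_init(void)
{
	struct mm_struct *mm = current->mm;
	unsigned long addr = PAGE_SIZE;  /* arbitrary user address */
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;

	if (!mm)
		return -EINVAL;

	mmap_read_lock(mm);
	pgdp = pgd_offset(mm, addr);
	if (!pgd_none(pgdp_get(pgdp))) {
		/* Both calls resolve against symbols exported by this patch. */
		p4dp = p4d_offset(pgdp, addr);
		if (!p4d_none(p4dp_get(p4dp))) {
			pudp = pud_offset(p4dp, addr);
			pr_info("pud entry for %lx: present=%d\n",
				addr, pud_present(pudp_get(pudp)));
		}
	}
	mmap_read_unlock(mm);

	return 0;
}

static void __exit walk_demo_exit(void)
{
}

module_init(walk_demo_init);
module_exit(walk_demo_exit);
MODULE_LICENSE("GPL");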