Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

KVM: s390: Switch to new gmap

Switch KVM/s390 to use the new gmap code.

Remove the includes of <asm/gmap.h> and include "gmap.h" instead; fix all
the existing users of the old gmap functions to use the new ones instead.

Fix guest storage key access functions to work with the new gmap.

Acked-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com>

+1129 -1736
+1 -1
arch/s390/Kconfig
··· 33 33 def_bool y if PREEMPTION 34 34 35 35 config PGSTE 36 - def_bool y if KVM 36 + def_bool n 37 37 38 38 config AUDIT_ARCH 39 39 def_bool y
+4 -1
arch/s390/include/asm/kvm_host.h
··· 442 442 bool acrs_loaded; 443 443 struct kvm_s390_pv_vcpu pv; 444 444 union diag318_info diag318_info; 445 - void *mc; /* Placeholder */ 445 + struct kvm_s390_mmu_cache *mc; 446 446 }; 447 447 448 448 struct kvm_vm_stat { ··· 636 636 struct mutex import_lock; 637 637 }; 638 638 639 + struct kvm_s390_mmu_cache; 640 + 639 641 struct kvm_arch { 640 642 struct esca_block *sca; 641 643 debug_info_t *dbf; ··· 677 675 struct kvm_s390_pv pv; 678 676 struct list_head kzdev_list; 679 677 spinlock_t kzdev_list_lock; 678 + struct kvm_s390_mmu_cache *mc; 680 679 }; 681 680 682 681 #define KVM_HVA_ERR_BAD (-1UL)
-4
arch/s390/include/asm/mmu_context.h
··· 30 30 mm->context.gmap_asce = 0; 31 31 mm->context.flush_mm = 0; 32 32 #if IS_ENABLED(CONFIG_KVM) 33 - mm->context.has_pgste = 0; 34 - mm->context.uses_skeys = 0; 35 - mm->context.uses_cmm = 0; 36 33 mm->context.allow_cow_sharing = 1; 37 - mm->context.allow_gmap_hpage_1m = 0; 38 34 #endif 39 35 switch (mm->context.asce_limit) { 40 36 default:
-3
arch/s390/include/asm/tlb.h
··· 36 36 37 37 #include <asm/tlbflush.h> 38 38 #include <asm-generic/tlb.h> 39 - #include <asm/gmap.h> 40 39 41 40 /* 42 41 * Release the page cache reference for a pte removed by ··· 84 85 tlb->mm->context.flush_mm = 1; 85 86 tlb->freed_tables = 1; 86 87 tlb->cleared_pmds = 1; 87 - if (mm_has_pgste(tlb->mm)) 88 - gmap_unlink(tlb->mm, (unsigned long *)pte, address); 89 88 tlb_remove_ptdesc(tlb, virt_to_ptdesc(pte)); 90 89 } 91 90
+10 -60
arch/s390/include/asm/uaccess.h
··· 471 471 #define arch_get_kernel_nofault __mvc_kernel_nofault 472 472 #define arch_put_kernel_nofault __mvc_kernel_nofault 473 473 474 - void __cmpxchg_user_key_called_with_bad_pointer(void); 475 - 476 - int __cmpxchg_user_key1(unsigned long address, unsigned char *uval, 477 - unsigned char old, unsigned char new, unsigned long key); 478 - int __cmpxchg_user_key2(unsigned long address, unsigned short *uval, 479 - unsigned short old, unsigned short new, unsigned long key); 480 - int __cmpxchg_user_key4(unsigned long address, unsigned int *uval, 481 - unsigned int old, unsigned int new, unsigned long key); 482 - int __cmpxchg_user_key8(unsigned long address, unsigned long *uval, 483 - unsigned long old, unsigned long new, unsigned long key); 484 - int __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, 485 - __uint128_t old, __uint128_t new, unsigned long key); 486 - 487 - static __always_inline int _cmpxchg_user_key(unsigned long address, void *uval, 488 - __uint128_t old, __uint128_t new, 489 - unsigned long key, int size) 490 - { 491 - switch (size) { 492 - case 1: return __cmpxchg_user_key1(address, uval, old, new, key); 493 - case 2: return __cmpxchg_user_key2(address, uval, old, new, key); 494 - case 4: return __cmpxchg_user_key4(address, uval, old, new, key); 495 - case 8: return __cmpxchg_user_key8(address, uval, old, new, key); 496 - case 16: return __cmpxchg_user_key16(address, uval, old, new, key); 497 - default: __cmpxchg_user_key_called_with_bad_pointer(); 498 - } 499 - return 0; 500 - } 501 - 502 - /** 503 - * cmpxchg_user_key() - cmpxchg with user space target, honoring storage keys 504 - * @ptr: User space address of value to compare to @old and exchange with 505 - * @new. Must be aligned to sizeof(*@ptr). 506 - * @uval: Address where the old value of *@ptr is written to. 507 - * @old: Old value. Compared to the content pointed to by @ptr in order to 508 - * determine if the exchange occurs. 
The old value read from *@ptr is 509 - * written to *@uval. 510 - * @new: New value to place at *@ptr. 511 - * @key: Access key to use for checking storage key protection. 512 - * 513 - * Perform a cmpxchg on a user space target, honoring storage key protection. 514 - * @key alone determines how key checking is performed, neither 515 - * storage-protection-override nor fetch-protection-override apply. 516 - * The caller must compare *@uval and @old to determine if values have been 517 - * exchanged. In case of an exception *@uval is set to zero. 518 - * 519 - * Return: 0: cmpxchg executed 520 - * -EFAULT: an exception happened when trying to access *@ptr 521 - * -EAGAIN: maxed out number of retries (byte and short only) 522 - */ 523 - #define cmpxchg_user_key(ptr, uval, old, new, key) \ 524 - ({ \ 525 - __typeof__(ptr) __ptr = (ptr); \ 526 - __typeof__(uval) __uval = (uval); \ 527 - \ 528 - BUILD_BUG_ON(sizeof(*(__ptr)) != sizeof(*(__uval))); \ 529 - might_fault(); \ 530 - __chk_user_ptr(__ptr); \ 531 - _cmpxchg_user_key((unsigned long)(__ptr), (void *)(__uval), \ 532 - (old), (new), (key), sizeof(*(__ptr))); \ 533 - }) 474 + int __cmpxchg_key1(void *address, unsigned char *uval, unsigned char old, 475 + unsigned char new, unsigned long key); 476 + int __cmpxchg_key2(void *address, unsigned short *uval, unsigned short old, 477 + unsigned short new, unsigned long key); 478 + int __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, 479 + unsigned int new, unsigned long key); 480 + int __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, 481 + unsigned long new, unsigned long key); 482 + int __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, 483 + __uint128_t new, unsigned long key); 534 484 535 485 #endif /* __S390_UACCESS_H */
-1
arch/s390/include/asm/uv.h
··· 631 631 int uv_destroy_folio(struct folio *folio); 632 632 int uv_destroy_pte(pte_t pte); 633 633 int uv_convert_from_secure_pte(pte_t pte); 634 - int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb); 635 634 int s390_wiggle_split_folio(struct mm_struct *mm, struct folio *folio); 636 635 int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb); 637 636 int uv_convert_from_secure(unsigned long paddr);
+7 -107
arch/s390/kernel/uv.c
··· 209 209 return uv_convert_from_secure_folio(pfn_folio(pte_pfn(pte))); 210 210 } 211 211 212 - /** 213 - * should_export_before_import - Determine whether an export is needed 214 - * before an import-like operation 215 - * @uvcb: the Ultravisor control block of the UVC to be performed 216 - * @mm: the mm of the process 217 - * 218 - * Returns whether an export is needed before every import-like operation. 219 - * This is needed for shared pages, which don't trigger a secure storage 220 - * exception when accessed from a different guest. 221 - * 222 - * Although considered as one, the Unpin Page UVC is not an actual import, 223 - * so it is not affected. 224 - * 225 - * No export is needed also when there is only one protected VM, because the 226 - * page cannot belong to the wrong VM in that case (there is no "other VM" 227 - * it can belong to). 228 - * 229 - * Return: true if an export is needed before every import, otherwise false. 230 - */ 231 - static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) 232 - { 233 - /* 234 - * The misc feature indicates, among other things, that importing a 235 - * shared page from a different protected VM will automatically also 236 - * transfer its ownership. 237 - */ 238 - if (uv_has_feature(BIT_UV_FEAT_MISC)) 239 - return false; 240 - if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) 241 - return false; 242 - return atomic_read(&mm->context.protected_count) > 1; 243 - } 244 - 245 212 /* 246 213 * Calculate the expected ref_count for a folio that would otherwise have no 247 214 * further pins. This was cribbed from similar functions in other places in ··· 279 312 return uvcb->rc == 0x10a ? 
-ENXIO : -EINVAL; 280 313 } 281 314 EXPORT_SYMBOL(__make_folio_secure); 282 - 283 - static int make_folio_secure(struct mm_struct *mm, struct folio *folio, struct uv_cb_header *uvcb) 284 - { 285 - int rc; 286 - 287 - if (!folio_trylock(folio)) 288 - return -EAGAIN; 289 - if (should_export_before_import(uvcb, mm)) 290 - uv_convert_from_secure(folio_to_phys(folio)); 291 - rc = __make_folio_secure(folio, uvcb); 292 - folio_unlock(folio); 293 - 294 - return rc; 295 - } 296 315 297 316 /** 298 317 * s390_wiggle_split_folio() - try to drain extra references to a folio and ··· 367 414 } 368 415 EXPORT_SYMBOL_GPL(s390_wiggle_split_folio); 369 416 370 - int make_hva_secure(struct mm_struct *mm, unsigned long hva, struct uv_cb_header *uvcb) 371 - { 372 - struct vm_area_struct *vma; 373 - struct folio_walk fw; 374 - struct folio *folio; 375 - int rc; 376 - 377 - mmap_read_lock(mm); 378 - vma = vma_lookup(mm, hva); 379 - if (!vma) { 380 - mmap_read_unlock(mm); 381 - return -EFAULT; 382 - } 383 - folio = folio_walk_start(&fw, vma, hva, 0); 384 - if (!folio) { 385 - mmap_read_unlock(mm); 386 - return -ENXIO; 387 - } 388 - 389 - folio_get(folio); 390 - /* 391 - * Secure pages cannot be huge and userspace should not combine both. 392 - * In case userspace does it anyway this will result in an -EFAULT for 393 - * the unpack. The guest is thus never reaching secure mode. 394 - * If userspace plays dirty tricks and decides to map huge pages at a 395 - * later point in time, it will receive a segmentation fault or 396 - * KVM_RUN will return -EFAULT. 
397 - */ 398 - if (folio_test_hugetlb(folio)) 399 - rc = -EFAULT; 400 - else if (folio_test_large(folio)) 401 - rc = -E2BIG; 402 - else if (!pte_write(fw.pte) || (pte_val(fw.pte) & _PAGE_INVALID)) 403 - rc = -ENXIO; 404 - else 405 - rc = make_folio_secure(mm, folio, uvcb); 406 - folio_walk_end(&fw, vma); 407 - mmap_read_unlock(mm); 408 - 409 - if (rc == -E2BIG || rc == -EBUSY) { 410 - rc = s390_wiggle_split_folio(mm, folio); 411 - if (!rc) 412 - rc = -EAGAIN; 413 - } 414 - folio_put(folio); 415 - 416 - return rc; 417 - } 418 - EXPORT_SYMBOL_GPL(make_hva_secure); 419 - 420 417 /* 421 418 * To be called with the folio locked or with an extra reference! This will 422 419 * prevent kvm_s390_pv_make_secure() from touching the folio concurrently. ··· 377 474 { 378 475 int rc = 0; 379 476 380 - /* Large folios cannot be secure */ 381 - if (unlikely(folio_test_large(folio))) 382 - return 0; 383 - 384 477 /* 385 - * PG_arch_1 is used in 2 places: 386 - * 1. for storage keys of hugetlb folios and KVM 387 - * 2. As an indication that this small folio might be secure. This can 388 - * overindicate, e.g. we set the bit before calling 389 - * convert_to_secure. 390 - * As secure pages are never large folios, both variants can co-exists. 478 + * PG_arch_1 is used as an indication that this small folio might be 479 + * secure. This can overindicate, e.g. we set the bit before calling 480 + * convert_to_secure. 391 481 */ 392 482 if (!test_bit(PG_arch_1, &folio->flags.f)) 393 483 return 0; 484 + 485 + /* Large folios cannot be secure. */ 486 + if (WARN_ON_ONCE(folio_test_large(folio))) 487 + return -EFAULT; 394 488 395 489 rc = uv_pin_shared(folio_to_phys(folio)); 396 490 if (!rc) {
+1 -1
arch/s390/kvm/Makefile
··· 8 8 ccflags-y := -Ivirt/kvm -Iarch/s390/kvm 9 9 10 10 kvm-y += kvm-s390.o intercept.o interrupt.o priv.o sigp.o 11 - kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o gmap-vsie.o 11 + kvm-y += diag.o gaccess.o guestdbg.o vsie.o pv.o 12 12 kvm-y += dat.o gmap.o faultin.o 13 13 14 14 kvm-$(CONFIG_VFIO_PCI_ZDEV_KVM) += pci.o
+1 -1
arch/s390/kvm/diag.c
··· 10 10 11 11 #include <linux/kvm.h> 12 12 #include <linux/kvm_host.h> 13 - #include <asm/gmap.h> 14 13 #include <asm/gmap_helpers.h> 15 14 #include <asm/virtio-ccw.h> 16 15 #include "kvm-s390.h" 17 16 #include "trace.h" 18 17 #include "trace-s390.h" 19 18 #include "gaccess.h" 19 + #include "gmap.h" 20 20 21 21 static void do_discard_gfn_range(struct kvm_vcpu *vcpu, gfn_t gfn_start, gfn_t gfn_end) 22 22 {
+537 -362
arch/s390/kvm/gaccess.c
··· 11 11 #include <linux/err.h> 12 12 #include <linux/pgtable.h> 13 13 #include <linux/bitfield.h> 14 + #include <linux/kvm_host.h> 15 + #include <linux/kvm_types.h> 16 + #include <asm/diag.h> 14 17 #include <asm/access-regs.h> 15 18 #include <asm/fault.h> 16 - #include <asm/gmap.h> 17 19 #include <asm/dat-bits.h> 18 20 #include "kvm-s390.h" 21 + #include "dat.h" 22 + #include "gmap.h" 19 23 #include "gaccess.h" 24 + #include "faultin.h" 20 25 21 26 #define GMAP_SHADOW_FAKE_TABLE 1ULL 27 + 28 + union dat_table_entry { 29 + unsigned long val; 30 + union region1_table_entry pgd; 31 + union region2_table_entry p4d; 32 + union region3_table_entry pud; 33 + union segment_table_entry pmd; 34 + union page_table_entry pte; 35 + }; 36 + 37 + #define WALK_N_ENTRIES 7 38 + #define LEVEL_MEM -2 39 + struct pgtwalk { 40 + struct guest_fault raw_entries[WALK_N_ENTRIES]; 41 + gpa_t last_addr; 42 + int level; 43 + bool p; 44 + }; 45 + 46 + static inline struct guest_fault *get_entries(struct pgtwalk *w) 47 + { 48 + return w->raw_entries - LEVEL_MEM; 49 + } 22 50 23 51 /* 24 52 * raddress union which will contain the result (real or absolute address) ··· 107 79 unsigned long ald : 32; 108 80 unsigned long astesn : 32; 109 81 /* .. 
more fields there */ 82 + }; 83 + 84 + union oac { 85 + unsigned int val; 86 + struct { 87 + struct { 88 + unsigned short key : 4; 89 + unsigned short : 4; 90 + unsigned short as : 2; 91 + unsigned short : 4; 92 + unsigned short k : 1; 93 + unsigned short a : 1; 94 + } oac1; 95 + struct { 96 + unsigned short key : 4; 97 + unsigned short : 4; 98 + unsigned short as : 2; 99 + unsigned short : 4; 100 + unsigned short k : 1; 101 + unsigned short a : 1; 102 + } oac2; 103 + }; 110 104 }; 111 105 112 106 int ipte_lock_held(struct kvm *kvm) ··· 653 603 static int vm_check_access_key_gpa(struct kvm *kvm, u8 access_key, 654 604 enum gacc_mode mode, gpa_t gpa) 655 605 { 656 - u8 storage_key, access_control; 657 - bool fetch_protected; 658 - unsigned long hva; 606 + union skey storage_key; 659 607 int r; 660 608 661 - if (access_key == 0) 662 - return 0; 663 - 664 - hva = gfn_to_hva(kvm, gpa_to_gfn(gpa)); 665 - if (kvm_is_error_hva(hva)) 666 - return PGM_ADDRESSING; 667 - 668 - mmap_read_lock(current->mm); 669 - r = get_guest_storage_key(current->mm, hva, &storage_key); 670 - mmap_read_unlock(current->mm); 609 + scoped_guard(read_lock, &kvm->mmu_lock) 610 + r = dat_get_storage_key(kvm->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); 671 611 if (r) 672 612 return r; 673 - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); 674 - if (access_control == access_key) 613 + if (access_key == 0 || storage_key.acc == access_key) 675 614 return 0; 676 - fetch_protected = storage_key & _PAGE_FP_BIT; 677 - if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !fetch_protected) 615 + if ((mode == GACC_FETCH || mode == GACC_IFETCH) && !storage_key.fp) 678 616 return 0; 679 617 return PGM_PROTECTION; 680 618 } ··· 705 667 enum gacc_mode mode, union asce asce, gpa_t gpa, 706 668 unsigned long ga, unsigned int len) 707 669 { 708 - u8 storage_key, access_control; 709 - unsigned long hva; 670 + union skey storage_key; 710 671 int r; 711 672 712 673 /* access key 0 matches any storage key -> 
allow */ ··· 715 678 * caller needs to ensure that gfn is accessible, so we can 716 679 * assume that this cannot fail 717 680 */ 718 - hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gpa)); 719 - mmap_read_lock(current->mm); 720 - r = get_guest_storage_key(current->mm, hva, &storage_key); 721 - mmap_read_unlock(current->mm); 681 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) 682 + r = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gpa), &storage_key); 722 683 if (r) 723 684 return r; 724 - access_control = FIELD_GET(_PAGE_ACC_BITS, storage_key); 725 685 /* access key matches storage key -> allow */ 726 - if (access_control == access_key) 686 + if (storage_key.acc == access_key) 727 687 return 0; 728 688 if (mode == GACC_FETCH || mode == GACC_IFETCH) { 729 689 /* it is a fetch and fetch protection is off -> allow */ 730 - if (!(storage_key & _PAGE_FP_BIT)) 690 + if (!storage_key.fp) 731 691 return 0; 732 692 if (fetch_prot_override_applicable(vcpu, mode, asce) && 733 693 fetch_prot_override_applies(ga, len)) 734 694 return 0; 735 695 } 736 696 if (storage_prot_override_applicable(vcpu) && 737 - storage_prot_override_applies(access_control)) 697 + storage_prot_override_applies(storage_key.acc)) 738 698 return 0; 739 699 return PGM_PROTECTION; 740 700 } ··· 831 797 return rc; 832 798 } 833 799 834 - static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, 835 - void *data, unsigned int len, u8 access_key) 800 + static int mvcos_key(void *to, const void *from, unsigned long size, u8 dst_key, u8 src_key) 836 801 { 837 - struct kvm_memory_slot *slot; 838 - bool writable; 839 - gfn_t gfn; 840 - hva_t hva; 802 + union oac spec = { 803 + .oac1.key = dst_key, 804 + .oac1.k = !!dst_key, 805 + .oac2.key = src_key, 806 + .oac2.k = !!src_key, 807 + }; 808 + int exception = PGM_PROTECTION; 809 + 810 + asm_inline volatile( 811 + " lr %%r0,%[spec]\n" 812 + "0: mvcos %[to],%[from],%[size]\n" 813 + "1: lhi %[exc],0\n" 814 + "2:\n" 815 + EX_TABLE(0b, 
2b) 816 + EX_TABLE(1b, 2b) 817 + : [size] "+d" (size), [to] "=Q" (*(char *)to), [exc] "+d" (exception) 818 + : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) 819 + : "memory", "cc", "0"); 820 + return exception; 821 + } 822 + 823 + struct acc_page_key_context { 824 + void *data; 825 + int exception; 826 + unsigned short offset; 827 + unsigned short len; 828 + bool store; 829 + u8 access_key; 830 + }; 831 + 832 + static void _access_guest_page_with_key_gpa(struct guest_fault *f) 833 + { 834 + struct acc_page_key_context *context = f->priv; 835 + void *ptr; 836 + int r; 837 + 838 + ptr = __va(PFN_PHYS(f->pfn) | context->offset); 839 + 840 + if (context->store) 841 + r = mvcos_key(ptr, context->data, context->len, context->access_key, 0); 842 + else 843 + r = mvcos_key(context->data, ptr, context->len, 0, context->access_key); 844 + 845 + context->exception = r; 846 + } 847 + 848 + static int access_guest_page_with_key_gpa(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa, 849 + void *data, unsigned int len, u8 acc) 850 + { 851 + struct acc_page_key_context context = { 852 + .offset = offset_in_page(gpa), 853 + .len = len, 854 + .data = data, 855 + .access_key = acc, 856 + .store = mode == GACC_STORE, 857 + }; 858 + struct guest_fault fault = { 859 + .gfn = gpa_to_gfn(gpa), 860 + .priv = &context, 861 + .write_attempt = mode == GACC_STORE, 862 + .callback = _access_guest_page_with_key_gpa, 863 + }; 841 864 int rc; 842 865 843 - gfn = gpa_to_gfn(gpa); 844 - slot = gfn_to_memslot(kvm, gfn); 845 - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); 866 + if (KVM_BUG_ON((len + context.offset) > PAGE_SIZE, kvm)) 867 + return -EINVAL; 846 868 847 - if (kvm_is_error_hva(hva)) 848 - return PGM_ADDRESSING; 849 - /* 850 - * Check if it's a ro memslot, even tho that can't occur (they're unsupported). 851 - * Don't try to actually handle that case. 
852 - */ 853 - if (!writable && mode == GACC_STORE) 854 - return -EOPNOTSUPP; 855 - hva += offset_in_page(gpa); 856 - if (mode == GACC_STORE) 857 - rc = copy_to_user_key((void __user *)hva, data, len, access_key); 858 - else 859 - rc = copy_from_user_key(data, (void __user *)hva, len, access_key); 869 + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); 860 870 if (rc) 861 - return PGM_PROTECTION; 862 - if (mode == GACC_STORE) 863 - mark_page_dirty_in_slot(kvm, slot, gfn); 864 - return 0; 871 + return rc; 872 + return context.exception; 865 873 } 866 874 867 875 int access_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, void *data, ··· 1027 951 } 1028 952 1029 953 /** 954 + * __cmpxchg_with_key() - Perform cmpxchg, honoring storage keys. 955 + * @ptr: Address of value to compare to *@old and exchange with 956 + * @new. Must be aligned to @size. 957 + * @old: Old value. Compared to the content pointed to by @ptr in order to 958 + * determine if the exchange occurs. The old value read from *@ptr is 959 + * written here. 960 + * @new: New value to place at *@ptr. 961 + * @size: Size of the operation in bytes, may only be a power of two up to 16. 962 + * @access_key: Access key to use for checking storage key protection. 963 + * 964 + * Perform a cmpxchg on guest memory, honoring storage key protection. 965 + * @access_key alone determines how key checking is performed, neither 966 + * storage-protection-override nor fetch-protection-override apply. 967 + * In case of an exception *@old is set to zero. 
968 + * 969 + * Return: 970 + * * %0: cmpxchg executed successfully 971 + * * %1: cmpxchg executed unsuccessfully 972 + * * %PGM_PROTECTION: an exception happened when trying to access *@ptr 973 + * * %-EAGAIN: maxed out number of retries (byte and short only) 974 + * * %-EINVAL: invalid value for @size 975 + */ 976 + static int __cmpxchg_with_key(union kvm_s390_quad *ptr, union kvm_s390_quad *old, 977 + union kvm_s390_quad new, int size, u8 access_key) 978 + { 979 + union kvm_s390_quad tmp = { .sixteen = 0 }; 980 + int rc; 981 + 982 + /* 983 + * The cmpxchg_key macro depends on the type of "old", so we need 984 + * a case for each valid length and get some code duplication as long 985 + * as we don't introduce a new macro. 986 + */ 987 + switch (size) { 988 + case 1: 989 + rc = __cmpxchg_key1(&ptr->one, &tmp.one, old->one, new.one, access_key); 990 + break; 991 + case 2: 992 + rc = __cmpxchg_key2(&ptr->two, &tmp.two, old->two, new.two, access_key); 993 + break; 994 + case 4: 995 + rc = __cmpxchg_key4(&ptr->four, &tmp.four, old->four, new.four, access_key); 996 + break; 997 + case 8: 998 + rc = __cmpxchg_key8(&ptr->eight, &tmp.eight, old->eight, new.eight, access_key); 999 + break; 1000 + case 16: 1001 + rc = __cmpxchg_key16(&ptr->sixteen, &tmp.sixteen, old->sixteen, new.sixteen, 1002 + access_key); 1003 + break; 1004 + default: 1005 + return -EINVAL; 1006 + } 1007 + if (!rc && memcmp(&tmp, old, size)) 1008 + rc = 1; 1009 + *old = tmp; 1010 + /* 1011 + * Assume that the fault is caused by protection, either key protection 1012 + * or user page write protection. 
1013 + */ 1014 + if (rc == -EFAULT) 1015 + rc = PGM_PROTECTION; 1016 + return rc; 1017 + } 1018 + 1019 + struct cmpxchg_key_context { 1020 + union kvm_s390_quad new; 1021 + union kvm_s390_quad *old; 1022 + int exception; 1023 + unsigned short offset; 1024 + u8 access_key; 1025 + u8 len; 1026 + }; 1027 + 1028 + static void _cmpxchg_guest_abs_with_key(struct guest_fault *f) 1029 + { 1030 + struct cmpxchg_key_context *context = f->priv; 1031 + 1032 + context->exception = __cmpxchg_with_key(__va(PFN_PHYS(f->pfn) | context->offset), 1033 + context->old, context->new, context->len, 1034 + context->access_key); 1035 + } 1036 + 1037 + /** 1030 1038 * cmpxchg_guest_abs_with_key() - Perform cmpxchg on guest absolute address. 1031 1039 * @kvm: Virtual machine instance. 1032 1040 * @gpa: Absolute guest address of the location to be changed. 1033 1041 * @len: Operand length of the cmpxchg, required: 1 <= len <= 16. Providing a 1034 1042 * non power of two will result in failure. 1035 - * @old_addr: Pointer to old value. If the location at @gpa contains this value, 1036 - * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() 1037 - * *@old_addr contains the value at @gpa before the attempt to 1038 - * exchange the value. 1043 + * @old: Pointer to old value. If the location at @gpa contains this value, 1044 + * the exchange will succeed. After calling cmpxchg_guest_abs_with_key() 1045 + * *@old contains the value at @gpa before the attempt to 1046 + * exchange the value. 1039 1047 * @new: The value to place at @gpa. 1040 1048 * @acc: The access key to use for the guest access. 1041 1049 * @success: output value indicating if an exchange occurred. 
··· 1134 974 * * -EAGAIN: transient failure (len 1 or 2) 1135 975 * * -EOPNOTSUPP: read-only memslot (should never occur) 1136 976 */ 1137 - int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr, 977 + int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, 1138 978 union kvm_s390_quad new, u8 acc, bool *success) 1139 979 { 1140 - gfn_t gfn = gpa_to_gfn(gpa); 1141 - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1142 - bool writable; 1143 - hva_t hva; 1144 - int ret; 980 + struct cmpxchg_key_context context = { 981 + .old = old, 982 + .new = new, 983 + .offset = offset_in_page(gpa), 984 + .len = len, 985 + .access_key = acc, 986 + }; 987 + struct guest_fault fault = { 988 + .gfn = gpa_to_gfn(gpa), 989 + .priv = &context, 990 + .write_attempt = true, 991 + .callback = _cmpxchg_guest_abs_with_key, 992 + }; 993 + int rc; 1145 994 1146 - if (!IS_ALIGNED(gpa, len)) 995 + lockdep_assert_held(&kvm->srcu); 996 + 997 + if (len > 16 || !IS_ALIGNED(gpa, len)) 1147 998 return -EINVAL; 1148 999 1149 - hva = gfn_to_hva_memslot_prot(slot, gfn, &writable); 1150 - if (kvm_is_error_hva(hva)) 1151 - return PGM_ADDRESSING; 1152 - /* 1153 - * Check if it's a read-only memslot, even though that cannot occur 1154 - * since those are unsupported. 1155 - * Don't try to actually handle that case. 1156 - */ 1157 - if (!writable) 1158 - return -EOPNOTSUPP; 1159 - 1160 - hva += offset_in_page(gpa); 1161 - /* 1162 - * The cmpxchg_user_key macro depends on the type of "old", so we need 1163 - * a case for each valid length and get some code duplication as long 1164 - * as we don't introduce a new macro. 
1165 - */ 1166 - switch (len) { 1167 - case 1: { 1168 - u8 old; 1169 - 1170 - ret = cmpxchg_user_key((u8 __user *)hva, &old, old_addr->one, new.one, acc); 1171 - *success = !ret && old == old_addr->one; 1172 - old_addr->one = old; 1173 - break; 1174 - } 1175 - case 2: { 1176 - u16 old; 1177 - 1178 - ret = cmpxchg_user_key((u16 __user *)hva, &old, old_addr->two, new.two, acc); 1179 - *success = !ret && old == old_addr->two; 1180 - old_addr->two = old; 1181 - break; 1182 - } 1183 - case 4: { 1184 - u32 old; 1185 - 1186 - ret = cmpxchg_user_key((u32 __user *)hva, &old, old_addr->four, new.four, acc); 1187 - *success = !ret && old == old_addr->four; 1188 - old_addr->four = old; 1189 - break; 1190 - } 1191 - case 8: { 1192 - u64 old; 1193 - 1194 - ret = cmpxchg_user_key((u64 __user *)hva, &old, old_addr->eight, new.eight, acc); 1195 - *success = !ret && old == old_addr->eight; 1196 - old_addr->eight = old; 1197 - break; 1198 - } 1199 - case 16: { 1200 - __uint128_t old; 1201 - 1202 - ret = cmpxchg_user_key((__uint128_t __user *)hva, &old, old_addr->sixteen, 1203 - new.sixteen, acc); 1204 - *success = !ret && old == old_addr->sixteen; 1205 - old_addr->sixteen = old; 1206 - break; 1207 - } 1208 - default: 1209 - return -EINVAL; 1210 - } 1211 - if (*success) 1212 - mark_page_dirty_in_slot(kvm, slot, gfn); 1213 - /* 1214 - * Assume that the fault is caused by protection, either key protection 1215 - * or user page write protection. 
1216 - */ 1217 - if (ret == -EFAULT) 1218 - ret = PGM_PROTECTION; 1219 - return ret; 1000 + rc = kvm_s390_faultin_gfn(NULL, kvm, &fault); 1001 + if (rc) 1002 + return rc; 1003 + *success = !context.exception; 1004 + if (context.exception == 1) 1005 + return 0; 1006 + return context.exception; 1220 1007 } 1221 1008 1222 1009 /** ··· 1265 1158 } 1266 1159 1267 1160 /** 1268 - * kvm_s390_shadow_tables - walk the guest page table and create shadow tables 1269 - * @sg: pointer to the shadow guest address space structure 1270 - * @saddr: faulting address in the shadow gmap 1271 - * @pgt: pointer to the beginning of the page table for the given address if 1272 - * successful (return value 0), or to the first invalid DAT entry in 1273 - * case of exceptions (return value > 0) 1274 - * @dat_protection: referenced memory is write protected 1275 - * @fake: pgt references contiguous guest memory block, not a pgtable 1161 + * walk_guest_tables() - Walk the guest page table and pin the dat tables. 1162 + * @sg: Pointer to the shadow guest address space structure. 1163 + * @saddr: Faulting address in the shadow gmap. 1164 + * @w: Will be filled with information on the pinned pages. 1165 + * @wr: Indicates a write access if true. 
1166 + * 1167 + * Return: 1168 + * * %0 in case of success, 1169 + * * a PIC code > 0 in case the address translation fails 1170 + * * an error code < 0 if other errors happen in the host 1276 1171 */ 1277 - static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr, 1278 - unsigned long *pgt, int *dat_protection, 1279 - int *fake) 1172 + static int walk_guest_tables(struct gmap *sg, unsigned long saddr, struct pgtwalk *w, bool wr) 1280 1173 { 1281 - struct kvm *kvm; 1282 - struct gmap *parent; 1283 - union asce asce; 1174 + struct gmap *parent = sg->parent; 1175 + struct guest_fault *entries; 1176 + union dat_table_entry table; 1284 1177 union vaddress vaddr; 1285 1178 unsigned long ptr; 1179 + struct kvm *kvm; 1180 + union asce asce; 1286 1181 int rc; 1287 1182 1288 - *fake = 0; 1289 - *dat_protection = 0; 1290 - kvm = sg->private; 1291 - parent = sg->parent; 1183 + kvm = parent->kvm; 1184 + asce = sg->guest_asce; 1185 + entries = get_entries(w); 1186 + 1187 + w->level = LEVEL_MEM; 1188 + w->last_addr = saddr; 1189 + if (asce.r) 1190 + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, gpa_to_gfn(saddr), false); 1191 + 1292 1192 vaddr.addr = saddr; 1293 - asce.val = sg->orig_asce; 1294 1193 ptr = asce.rsto * PAGE_SIZE; 1295 - if (asce.r) { 1296 - *fake = 1; 1297 - ptr = 0; 1298 - asce.dt = ASCE_TYPE_REGION1; 1299 - } 1194 + 1195 + if (!asce_contains_gfn(asce, gpa_to_gfn(saddr))) 1196 + return PGM_ASCE_TYPE; 1300 1197 switch (asce.dt) { 1301 1198 case ASCE_TYPE_REGION1: 1302 - if (vaddr.rfx01 > asce.tl && !*fake) 1199 + if (vaddr.rfx01 > asce.tl) 1303 1200 return PGM_REGION_FIRST_TRANS; 1304 1201 break; 1305 1202 case ASCE_TYPE_REGION2: 1306 - if (vaddr.rfx) 1307 - return PGM_ASCE_TYPE; 1308 1203 if (vaddr.rsx01 > asce.tl) 1309 1204 return PGM_REGION_SECOND_TRANS; 1310 1205 break; 1311 1206 case ASCE_TYPE_REGION3: 1312 - if (vaddr.rfx || vaddr.rsx) 1313 - return PGM_ASCE_TYPE; 1314 1207 if (vaddr.rtx01 > asce.tl) 1315 1208 return 
PGM_REGION_THIRD_TRANS; 1316 1209 break; 1317 1210 case ASCE_TYPE_SEGMENT: 1318 - if (vaddr.rfx || vaddr.rsx || vaddr.rtx) 1319 - return PGM_ASCE_TYPE; 1320 1211 if (vaddr.sx01 > asce.tl) 1321 1212 return PGM_SEGMENT_TRANSLATION; 1322 1213 break; 1323 1214 } 1324 1215 1216 + w->level = asce.dt; 1325 1217 switch (asce.dt) { 1326 - case ASCE_TYPE_REGION1: { 1327 - union region1_table_entry rfte; 1328 - 1329 - if (*fake) { 1330 - ptr += vaddr.rfx * _REGION1_SIZE; 1331 - rfte.val = ptr; 1332 - goto shadow_r2t; 1333 - } 1334 - *pgt = ptr + vaddr.rfx * 8; 1335 - rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val); 1218 + case ASCE_TYPE_REGION1: 1219 + w->last_addr = ptr + vaddr.rfx * 8; 1220 + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, 1221 + w->last_addr, &table.val); 1336 1222 if (rc) 1337 1223 return rc; 1338 - if (rfte.i) 1224 + if (table.pgd.i) 1339 1225 return PGM_REGION_FIRST_TRANS; 1340 - if (rfte.tt != TABLE_TYPE_REGION1) 1226 + if (table.pgd.tt != TABLE_TYPE_REGION1) 1341 1227 return PGM_TRANSLATION_SPEC; 1342 - if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl) 1228 + if (vaddr.rsx01 < table.pgd.tf || vaddr.rsx01 > table.pgd.tl) 1343 1229 return PGM_REGION_SECOND_TRANS; 1344 1230 if (sg->edat_level >= 1) 1345 - *dat_protection |= rfte.p; 1346 - ptr = rfte.rto * PAGE_SIZE; 1347 - shadow_r2t: 1348 - rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); 1349 - if (rc) 1350 - return rc; 1351 - kvm->stat.gmap_shadow_r1_entry++; 1352 - } 1231 + w->p |= table.pgd.p; 1232 + ptr = table.pgd.rto * PAGE_SIZE; 1233 + w->level--; 1353 1234 fallthrough; 1354 - case ASCE_TYPE_REGION2: { 1355 - union region2_table_entry rste; 1356 - 1357 - if (*fake) { 1358 - ptr += vaddr.rsx * _REGION2_SIZE; 1359 - rste.val = ptr; 1360 - goto shadow_r3t; 1361 - } 1362 - *pgt = ptr + vaddr.rsx * 8; 1363 - rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val); 1235 + case ASCE_TYPE_REGION2: 1236 + w->last_addr = ptr + vaddr.rsx * 8; 1237 + rc = 
kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, 1238 + w->last_addr, &table.val); 1364 1239 if (rc) 1365 1240 return rc; 1366 - if (rste.i) 1241 + if (table.p4d.i) 1367 1242 return PGM_REGION_SECOND_TRANS; 1368 - if (rste.tt != TABLE_TYPE_REGION2) 1243 + if (table.p4d.tt != TABLE_TYPE_REGION2) 1369 1244 return PGM_TRANSLATION_SPEC; 1370 - if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl) 1245 + if (vaddr.rtx01 < table.p4d.tf || vaddr.rtx01 > table.p4d.tl) 1371 1246 return PGM_REGION_THIRD_TRANS; 1372 1247 if (sg->edat_level >= 1) 1373 - *dat_protection |= rste.p; 1374 - ptr = rste.rto * PAGE_SIZE; 1375 - shadow_r3t: 1376 - rste.p |= *dat_protection; 1377 - rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); 1378 - if (rc) 1379 - return rc; 1380 - kvm->stat.gmap_shadow_r2_entry++; 1381 - } 1248 + w->p |= table.p4d.p; 1249 + ptr = table.p4d.rto * PAGE_SIZE; 1250 + w->level--; 1382 1251 fallthrough; 1383 - case ASCE_TYPE_REGION3: { 1384 - union region3_table_entry rtte; 1385 - 1386 - if (*fake) { 1387 - ptr += vaddr.rtx * _REGION3_SIZE; 1388 - rtte.val = ptr; 1389 - goto shadow_sgt; 1390 - } 1391 - *pgt = ptr + vaddr.rtx * 8; 1392 - rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val); 1252 + case ASCE_TYPE_REGION3: 1253 + w->last_addr = ptr + vaddr.rtx * 8; 1254 + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, 1255 + w->last_addr, &table.val); 1393 1256 if (rc) 1394 1257 return rc; 1395 - if (rtte.i) 1258 + if (table.pud.i) 1396 1259 return PGM_REGION_THIRD_TRANS; 1397 - if (rtte.tt != TABLE_TYPE_REGION3) 1260 + if (table.pud.tt != TABLE_TYPE_REGION3) 1398 1261 return PGM_TRANSLATION_SPEC; 1399 - if (rtte.cr && asce.p && sg->edat_level >= 2) 1262 + if (table.pud.cr && asce.p && sg->edat_level >= 2) 1400 1263 return PGM_TRANSLATION_SPEC; 1401 - if (rtte.fc && sg->edat_level >= 2) { 1402 - *dat_protection |= rtte.fc0.p; 1403 - *fake = 1; 1404 - ptr = rtte.fc1.rfaa * _REGION3_SIZE; 1405 - rtte.val = ptr; 1406 - goto shadow_sgt; 
1407 - } 1408 - if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl) 1409 - return PGM_SEGMENT_TRANSLATION; 1410 1264 if (sg->edat_level >= 1) 1411 - *dat_protection |= rtte.fc0.p; 1412 - ptr = rtte.fc0.sto * PAGE_SIZE; 1413 - shadow_sgt: 1414 - rtte.fc0.p |= *dat_protection; 1415 - rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); 1416 - if (rc) 1417 - return rc; 1418 - kvm->stat.gmap_shadow_r3_entry++; 1419 - } 1420 - fallthrough; 1421 - case ASCE_TYPE_SEGMENT: { 1422 - union segment_table_entry ste; 1423 - 1424 - if (*fake) { 1425 - ptr += vaddr.sx * _SEGMENT_SIZE; 1426 - ste.val = ptr; 1427 - goto shadow_pgt; 1265 + w->p |= table.pud.p; 1266 + if (table.pud.fc && sg->edat_level >= 2) { 1267 + table.val = u64_replace_bits(table.val, saddr, ~_REGION3_MASK); 1268 + goto edat_applies; 1428 1269 } 1429 - *pgt = ptr + vaddr.sx * 8; 1430 - rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val); 1431 - if (rc) 1432 - return rc; 1433 - if (ste.i) 1270 + if (vaddr.sx01 < table.pud.fc0.tf || vaddr.sx01 > table.pud.fc0.tl) 1434 1271 return PGM_SEGMENT_TRANSLATION; 1435 - if (ste.tt != TABLE_TYPE_SEGMENT) 1436 - return PGM_TRANSLATION_SPEC; 1437 - if (ste.cs && asce.p) 1438 - return PGM_TRANSLATION_SPEC; 1439 - *dat_protection |= ste.fc0.p; 1440 - if (ste.fc && sg->edat_level >= 1) { 1441 - *fake = 1; 1442 - ptr = ste.fc1.sfaa * _SEGMENT_SIZE; 1443 - ste.val = ptr; 1444 - goto shadow_pgt; 1445 - } 1446 - ptr = ste.fc0.pto * (PAGE_SIZE / 2); 1447 - shadow_pgt: 1448 - ste.fc0.p |= *dat_protection; 1449 - rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake); 1272 + ptr = table.pud.fc0.sto * PAGE_SIZE; 1273 + w->level--; 1274 + fallthrough; 1275 + case ASCE_TYPE_SEGMENT: 1276 + w->last_addr = ptr + vaddr.sx * 8; 1277 + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, 1278 + w->last_addr, &table.val); 1450 1279 if (rc) 1451 1280 return rc; 1452 - kvm->stat.gmap_shadow_sg_entry++; 1281 + if (table.pmd.i) 1282 + return PGM_SEGMENT_TRANSLATION; 1283 + if 
(table.pmd.tt != TABLE_TYPE_SEGMENT) 1284 + return PGM_TRANSLATION_SPEC; 1285 + if (table.pmd.cs && asce.p) 1286 + return PGM_TRANSLATION_SPEC; 1287 + w->p |= table.pmd.p; 1288 + if (table.pmd.fc && sg->edat_level >= 1) { 1289 + table.val = u64_replace_bits(table.val, saddr, ~_SEGMENT_MASK); 1290 + goto edat_applies; 1291 + } 1292 + ptr = table.pmd.fc0.pto * (PAGE_SIZE / 2); 1293 + w->level--; 1453 1294 } 1454 - } 1455 - /* Return the parent address of the page table */ 1456 - *pgt = ptr; 1295 + w->last_addr = ptr + vaddr.px * 8; 1296 + rc = kvm_s390_get_guest_page_and_read_gpa(kvm, entries + w->level, 1297 + w->last_addr, &table.val); 1298 + if (rc) 1299 + return rc; 1300 + if (table.pte.i) 1301 + return PGM_PAGE_TRANSLATION; 1302 + if (table.pte.z) 1303 + return PGM_TRANSLATION_SPEC; 1304 + w->p |= table.pte.p; 1305 + edat_applies: 1306 + if (wr && w->p) 1307 + return PGM_PROTECTION; 1308 + 1309 + return kvm_s390_get_guest_page(kvm, entries + LEVEL_MEM, table.pte.pfra, wr); 1310 + } 1311 + 1312 + static int _do_shadow_pte(struct gmap *sg, gpa_t raddr, union pte *ptep_h, union pte *ptep, 1313 + struct guest_fault *f, bool p) 1314 + { 1315 + union pgste pgste; 1316 + union pte newpte; 1317 + int rc; 1318 + 1319 + lockdep_assert_held(&sg->kvm->mmu_lock); 1320 + lockdep_assert_held(&sg->parent->children_lock); 1321 + 1322 + scoped_guard(spinlock, &sg->host_to_rmap_lock) 1323 + rc = gmap_insert_rmap(sg, f->gfn, gpa_to_gfn(raddr), TABLE_TYPE_PAGE_TABLE); 1324 + if (rc) 1325 + return rc; 1326 + 1327 + pgste = pgste_get_lock(ptep_h); 1328 + newpte = _pte(f->pfn, f->writable, !p, 0); 1329 + newpte.s.d |= ptep->s.d; 1330 + newpte.s.sd |= ptep->s.sd; 1331 + newpte.h.p &= ptep->h.p; 1332 + pgste = _gmap_ptep_xchg(sg->parent, ptep_h, newpte, pgste, f->gfn, false); 1333 + pgste.vsie_notif = 1; 1334 + pgste_set_unlock(ptep_h, pgste); 1335 + 1336 + newpte = _pte(f->pfn, 0, !p, 0); 1337 + pgste = pgste_get_lock(ptep); 1338 + pgste = __dat_ptep_xchg(ptep, pgste, newpte, 
gpa_to_gfn(raddr), sg->asce, uses_skeys(sg)); 1339 + pgste_set_unlock(ptep, pgste); 1340 + 1457 1341 return 0; 1458 1342 } 1459 1343 1460 - /** 1461 - * shadow_pgt_lookup() - find a shadow page table 1462 - * @sg: pointer to the shadow guest address space structure 1463 - * @saddr: the address in the shadow aguest address space 1464 - * @pgt: parent gmap address of the page table to get shadowed 1465 - * @dat_protection: if the pgtable is marked as protected by dat 1466 - * @fake: pgt references contiguous guest memory block, not a pgtable 1467 - * 1468 - * Returns 0 if the shadow page table was found and -EAGAIN if the page 1469 - * table was not found. 1470 - * 1471 - * Called with sg->mm->mmap_lock in read. 1472 - */ 1473 - static int shadow_pgt_lookup(struct gmap *sg, unsigned long saddr, unsigned long *pgt, 1474 - int *dat_protection, int *fake) 1344 + static int _do_shadow_crste(struct gmap *sg, gpa_t raddr, union crste *host, union crste *table, 1345 + struct guest_fault *f, bool p) 1475 1346 { 1476 - unsigned long pt_index; 1477 - unsigned long *table; 1478 - struct page *page; 1347 + union crste newcrste; 1348 + gfn_t gfn; 1479 1349 int rc; 1480 1350 1481 - spin_lock(&sg->guest_table_lock); 1482 - table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */ 1483 - if (table && !(*table & _SEGMENT_ENTRY_INVALID)) { 1484 - /* Shadow page tables are full pages (pte+pgste) */ 1485 - page = pfn_to_page(*table >> PAGE_SHIFT); 1486 - pt_index = gmap_pgste_get_pgt_addr(page_to_virt(page)); 1487 - *pgt = pt_index & ~GMAP_SHADOW_FAKE_TABLE; 1488 - *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT); 1489 - *fake = !!(pt_index & GMAP_SHADOW_FAKE_TABLE); 1490 - rc = 0; 1491 - } else { 1492 - rc = -EAGAIN; 1351 + lockdep_assert_held(&sg->kvm->mmu_lock); 1352 + lockdep_assert_held(&sg->parent->children_lock); 1353 + 1354 + gfn = f->gfn & gpa_to_gfn(is_pmd(*table) ? 
_SEGMENT_MASK : _REGION3_MASK); 1355 + scoped_guard(spinlock, &sg->host_to_rmap_lock) 1356 + rc = gmap_insert_rmap(sg, gfn, gpa_to_gfn(raddr), host->h.tt); 1357 + if (rc) 1358 + return rc; 1359 + 1360 + newcrste = _crste_fc1(f->pfn, host->h.tt, f->writable, !p); 1361 + newcrste.s.fc1.d |= host->s.fc1.d; 1362 + newcrste.s.fc1.sd |= host->s.fc1.sd; 1363 + newcrste.h.p &= host->h.p; 1364 + newcrste.s.fc1.vsie_notif = 1; 1365 + newcrste.s.fc1.prefix_notif = host->s.fc1.prefix_notif; 1366 + _gmap_crstep_xchg(sg->parent, host, newcrste, f->gfn, false); 1367 + 1368 + newcrste = _crste_fc1(f->pfn, host->h.tt, 0, !p); 1369 + dat_crstep_xchg(table, newcrste, gpa_to_gfn(raddr), sg->asce); 1370 + return 0; 1371 + } 1372 + 1373 + static int _gaccess_do_shadow(struct kvm_s390_mmu_cache *mc, struct gmap *sg, 1374 + unsigned long saddr, struct pgtwalk *w) 1375 + { 1376 + struct guest_fault *entries; 1377 + int flags, i, hl, gl, l, rc; 1378 + union crste *table, *host; 1379 + union pte *ptep, *ptep_h; 1380 + 1381 + lockdep_assert_held(&sg->kvm->mmu_lock); 1382 + lockdep_assert_held(&sg->parent->children_lock); 1383 + 1384 + entries = get_entries(w); 1385 + ptep_h = NULL; 1386 + ptep = NULL; 1387 + 1388 + rc = dat_entry_walk(NULL, gpa_to_gfn(saddr), sg->asce, DAT_WALK_ANY, TABLE_TYPE_PAGE_TABLE, 1389 + &table, &ptep); 1390 + if (rc) 1391 + return rc; 1392 + 1393 + /* A race occurred. The shadow mapping is already valid, nothing to do */ 1394 + if ((ptep && !ptep->h.i) || (!ptep && crste_leaf(*table))) 1395 + return 0; 1396 + 1397 + gl = get_level(table, ptep); 1398 + 1399 + /* 1400 + * Skip levels that are already protected. For each level, protect 1401 + * only the page containing the entry, not the whole table.
1402 + */ 1403 + for (i = gl ; i >= w->level; i--) { 1404 + rc = gmap_protect_rmap(mc, sg, entries[i - 1].gfn, gpa_to_gfn(saddr), 1405 + entries[i - 1].pfn, i, entries[i - 1].writable); 1406 + if (rc) 1407 + return rc; 1493 1408 } 1494 - spin_unlock(&sg->guest_table_lock); 1409 + 1410 + rc = dat_entry_walk(NULL, entries[LEVEL_MEM].gfn, sg->parent->asce, DAT_WALK_LEAF, 1411 + TABLE_TYPE_PAGE_TABLE, &host, &ptep_h); 1412 + if (rc) 1413 + return rc; 1414 + 1415 + hl = get_level(host, ptep_h); 1416 + /* Get the smallest granularity */ 1417 + l = min3(gl, hl, w->level); 1418 + 1419 + flags = DAT_WALK_SPLIT_ALLOC | (uses_skeys(sg->parent) ? DAT_WALK_USES_SKEYS : 0); 1420 + /* If necessary, create the shadow mapping */ 1421 + if (l < gl) { 1422 + rc = dat_entry_walk(mc, gpa_to_gfn(saddr), sg->asce, flags, l, &table, &ptep); 1423 + if (rc) 1424 + return rc; 1425 + } 1426 + if (l < hl) { 1427 + rc = dat_entry_walk(mc, entries[LEVEL_MEM].gfn, sg->parent->asce, 1428 + flags, l, &host, &ptep_h); 1429 + if (rc) 1430 + return rc; 1431 + } 1432 + 1433 + if (KVM_BUG_ON(l > TABLE_TYPE_REGION3, sg->kvm)) 1434 + return -EFAULT; 1435 + if (l == TABLE_TYPE_PAGE_TABLE) 1436 + return _do_shadow_pte(sg, saddr, ptep_h, ptep, entries + LEVEL_MEM, w->p); 1437 + return _do_shadow_crste(sg, saddr, host, table, entries + LEVEL_MEM, w->p); 1438 + } 1439 + 1440 + static inline int _gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, 1441 + unsigned long seq, struct pgtwalk *walk) 1442 + { 1443 + struct gmap *parent; 1444 + int rc; 1445 + 1446 + if (kvm_s390_array_needs_retry_unsafe(vcpu->kvm, seq, walk->raw_entries)) 1447 + return -EAGAIN; 1448 + again: 1449 + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); 1450 + if (rc) 1451 + return rc; 1452 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { 1453 + if (kvm_s390_array_needs_retry_safe(vcpu->kvm, seq, walk->raw_entries)) 1454 + return -EAGAIN; 1455 + parent = READ_ONCE(sg->parent); 1456 + if (!parent) 1457 + return -EAGAIN; 
1458 + scoped_guard(spinlock, &parent->children_lock) { 1459 + if (READ_ONCE(sg->parent) != parent) 1460 + return -EAGAIN; 1461 + rc = _gaccess_do_shadow(vcpu->arch.mc, sg, saddr, walk); 1462 + } 1463 + if (rc == -ENOMEM) 1464 + goto again; 1465 + if (!rc) 1466 + kvm_s390_release_faultin_array(vcpu->kvm, walk->raw_entries, false); 1467 + } 1495 1468 return rc; 1496 1469 } 1497 1470 1498 1471 /** 1499 - * kvm_s390_shadow_fault - handle fault on a shadow page table 1500 - * @vcpu: virtual cpu 1501 - * @sg: pointer to the shadow guest address space structure 1502 - * @saddr: faulting address in the shadow gmap 1503 - * @datptr: will contain the address of the faulting DAT table entry, or of 1504 - * the valid leaf, plus some flags 1472 + * __gaccess_shadow_fault() - Handle fault on a shadow page table. 1473 + * @vcpu: Virtual cpu that triggered the action. 1474 + * @sg: The shadow guest address space structure. 1475 + * @saddr: Faulting address in the shadow gmap. 1476 + * @datptr: Will contain the address of the faulting DAT table entry, or of 1477 + * the valid leaf, plus some flags. 1478 + * @wr: Whether this is a write access. 
1505 1479 * 1506 - * Returns: - 0 if the shadow fault was successfully resolved 1507 - * - > 0 (pgm exception code) on exceptions while faulting 1508 - * - -EAGAIN if the caller can retry immediately 1509 - * - -EFAULT when accessing invalid guest addresses 1510 - * - -ENOMEM if out of memory 1480 + * Return: 1481 + * * %0 if the shadow fault was successfully resolved 1482 + * * > 0 (pgm exception code) on exceptions while faulting 1483 + * * %-EAGAIN if the caller can retry immediately 1484 + * * %-EFAULT when accessing invalid guest addresses 1485 + * * %-ENOMEM if out of memory 1511 1486 */ 1512 - int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, 1513 - unsigned long saddr, unsigned long *datptr) 1487 + static int __gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, 1488 + union mvpg_pei *datptr, bool wr) 1514 1489 { 1515 - union vaddress vaddr; 1516 - union page_table_entry pte; 1517 - unsigned long pgt = 0; 1518 - int dat_protection, fake; 1490 + struct pgtwalk walk = { .p = false, }; 1491 + unsigned long seq; 1519 1492 int rc; 1520 1493 1521 - if (KVM_BUG_ON(!gmap_is_shadow(sg), vcpu->kvm)) 1494 + seq = vcpu->kvm->mmu_invalidate_seq; 1495 + /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). 
*/ 1496 + smp_rmb(); 1497 + 1498 + rc = walk_guest_tables(sg, saddr, &walk, wr); 1499 + if (datptr) { 1500 + datptr->val = walk.last_addr; 1501 + datptr->dat_prot = wr && walk.p; 1502 + datptr->not_pte = walk.level > TABLE_TYPE_PAGE_TABLE; 1503 + datptr->real = sg->guest_asce.r; 1504 + } 1505 + if (!rc) 1506 + rc = _gaccess_shadow_fault(vcpu, sg, saddr, seq, &walk); 1507 + if (rc) 1508 + kvm_s390_release_faultin_array(vcpu->kvm, walk.raw_entries, true); 1509 + return rc; 1510 + } 1511 + 1512 + int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, 1513 + union mvpg_pei *datptr, bool wr) 1514 + { 1515 + int rc; 1516 + 1517 + if (KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &sg->flags), vcpu->kvm)) 1522 1518 return -EFAULT; 1523 1519 1524 - mmap_read_lock(sg->mm); 1525 - /* 1526 - * We don't want any guest-2 tables to change - so the parent 1527 - * tables/pointers we read stay valid - unshadowing is however 1528 - * always possible - only guest_table_lock protects us. 1529 - */ 1530 - ipte_lock(vcpu->kvm); 1531 - 1532 - rc = shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake); 1520 + rc = kvm_s390_mmu_cache_topup(vcpu->arch.mc); 1533 1521 if (rc) 1534 - rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection, 1535 - &fake); 1522 + return rc; 1536 1523 1537 - vaddr.addr = saddr; 1538 - if (fake) { 1539 - pte.val = pgt + vaddr.px * PAGE_SIZE; 1540 - goto shadow_page; 1541 - } 1542 - 1543 - switch (rc) { 1544 - case PGM_SEGMENT_TRANSLATION: 1545 - case PGM_REGION_THIRD_TRANS: 1546 - case PGM_REGION_SECOND_TRANS: 1547 - case PGM_REGION_FIRST_TRANS: 1548 - pgt |= PEI_NOT_PTE; 1549 - break; 1550 - case 0: 1551 - pgt += vaddr.px * 8; 1552 - rc = gmap_read_table(sg->parent, pgt, &pte.val); 1553 - } 1554 - if (datptr) 1555 - *datptr = pgt | dat_protection * PEI_DAT_PROT; 1556 - if (!rc && pte.i) 1557 - rc = PGM_PAGE_TRANSLATION; 1558 - if (!rc && pte.z) 1559 - rc = PGM_TRANSLATION_SPEC; 1560 - shadow_page: 1561 - pte.p |= dat_protection; 1562 - 
if (!rc) 1563 - rc = gmap_shadow_page(sg, saddr, __pte(pte.val)); 1564 - vcpu->kvm->stat.gmap_shadow_pg_entry++; 1524 + ipte_lock(vcpu->kvm); 1525 + rc = __gaccess_shadow_fault(vcpu, sg, saddr, datptr, wr || sg->guest_asce.r); 1565 1526 ipte_unlock(vcpu->kvm); 1566 - mmap_read_unlock(sg->mm); 1527 + 1567 1528 return rc; 1568 1529 }
+12 -6
arch/s390/kvm/gaccess.h
··· 206 206 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, 207 207 void *data, unsigned long len, enum gacc_mode mode); 208 208 209 - int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old_addr, 209 + int cmpxchg_guest_abs_with_key(struct kvm *kvm, gpa_t gpa, int len, union kvm_s390_quad *old, 210 210 union kvm_s390_quad new, u8 access_key, bool *success); 211 211 212 212 /** ··· 450 450 int ipte_lock_held(struct kvm *kvm); 451 451 int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra); 452 452 453 - /* MVPG PEI indication bits */ 454 - #define PEI_DAT_PROT 2 455 - #define PEI_NOT_PTE 4 453 + union mvpg_pei { 454 + unsigned long val; 455 + struct { 456 + unsigned long addr : 61; 457 + unsigned long not_pte : 1; 458 + unsigned long dat_prot: 1; 459 + unsigned long real : 1; 460 + }; 461 + }; 456 462 457 - int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow, 458 - unsigned long saddr, unsigned long *datptr); 463 + int gaccess_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg, gpa_t saddr, 464 + union mvpg_pei *datptr, bool wr); 459 465 460 466 #endif /* __KVM_S390_GACCESS_H */
-141
arch/s390/kvm/gmap-vsie.c
··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Guest memory management for KVM/s390 nested VMs. 4 - * 5 - * Copyright IBM Corp. 2008, 2020, 2024 6 - * 7 - * Author(s): Claudio Imbrenda <imbrenda@linux.ibm.com> 8 - * Martin Schwidefsky <schwidefsky@de.ibm.com> 9 - * David Hildenbrand <david@redhat.com> 10 - * Janosch Frank <frankja@linux.vnet.ibm.com> 11 - */ 12 - 13 - #include <linux/compiler.h> 14 - #include <linux/kvm.h> 15 - #include <linux/kvm_host.h> 16 - #include <linux/pgtable.h> 17 - #include <linux/pagemap.h> 18 - #include <linux/mman.h> 19 - 20 - #include <asm/lowcore.h> 21 - #include <asm/gmap.h> 22 - #include <asm/uv.h> 23 - 24 - #include "kvm-s390.h" 25 - 26 - /** 27 - * gmap_find_shadow - find a specific asce in the list of shadow tables 28 - * @parent: pointer to the parent gmap 29 - * @asce: ASCE for which the shadow table is created 30 - * @edat_level: edat level to be used for the shadow translation 31 - * 32 - * Returns the pointer to a gmap if a shadow table with the given asce is 33 - * already available, ERR_PTR(-EAGAIN) if another one is just being created, 34 - * otherwise NULL 35 - * 36 - * Context: Called with parent->shadow_lock held 37 - */ 38 - static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce, int edat_level) 39 - { 40 - struct gmap *sg; 41 - 42 - lockdep_assert_held(&parent->shadow_lock); 43 - list_for_each_entry(sg, &parent->children, list) { 44 - if (!gmap_shadow_valid(sg, asce, edat_level)) 45 - continue; 46 - if (!sg->initialized) 47 - return ERR_PTR(-EAGAIN); 48 - refcount_inc(&sg->ref_count); 49 - return sg; 50 - } 51 - return NULL; 52 - } 53 - 54 - /** 55 - * gmap_shadow - create/find a shadow guest address space 56 - * @parent: pointer to the parent gmap 57 - * @asce: ASCE for which the shadow table is created 58 - * @edat_level: edat level to be used for the shadow translation 59 - * 60 - * The pages of the top level page table referred by the asce parameter 61 - * will be set to read-only 
and marked in the PGSTEs of the kvm process. 62 - * The shadow table will be removed automatically on any change to the 63 - * PTE mapping for the source table. 64 - * 65 - * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory, 66 - * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the 67 - * parent gmap table could not be protected. 68 - */ 69 - struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level) 70 - { 71 - struct gmap *sg, *new; 72 - unsigned long limit; 73 - int rc; 74 - 75 - if (KVM_BUG_ON(parent->mm->context.allow_gmap_hpage_1m, (struct kvm *)parent->private) || 76 - KVM_BUG_ON(gmap_is_shadow(parent), (struct kvm *)parent->private)) 77 - return ERR_PTR(-EFAULT); 78 - spin_lock(&parent->shadow_lock); 79 - sg = gmap_find_shadow(parent, asce, edat_level); 80 - spin_unlock(&parent->shadow_lock); 81 - if (sg) 82 - return sg; 83 - /* Create a new shadow gmap */ 84 - limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11)); 85 - if (asce & _ASCE_REAL_SPACE) 86 - limit = -1UL; 87 - new = gmap_alloc(limit); 88 - if (!new) 89 - return ERR_PTR(-ENOMEM); 90 - new->mm = parent->mm; 91 - new->parent = gmap_get(parent); 92 - new->private = parent->private; 93 - new->orig_asce = asce; 94 - new->edat_level = edat_level; 95 - new->initialized = false; 96 - spin_lock(&parent->shadow_lock); 97 - /* Recheck if another CPU created the same shadow */ 98 - sg = gmap_find_shadow(parent, asce, edat_level); 99 - if (sg) { 100 - spin_unlock(&parent->shadow_lock); 101 - gmap_free(new); 102 - return sg; 103 - } 104 - if (asce & _ASCE_REAL_SPACE) { 105 - /* only allow one real-space gmap shadow */ 106 - list_for_each_entry(sg, &parent->children, list) { 107 - if (sg->orig_asce & _ASCE_REAL_SPACE) { 108 - spin_lock(&sg->guest_table_lock); 109 - gmap_unshadow(sg); 110 - spin_unlock(&sg->guest_table_lock); 111 - list_del(&sg->list); 112 - gmap_put(sg); 113 - break; 114 - } 115 - } 116 - } 117 - 
refcount_set(&new->ref_count, 2); 118 - list_add(&new->list, &parent->children); 119 - if (asce & _ASCE_REAL_SPACE) { 120 - /* nothing to protect, return right away */ 121 - new->initialized = true; 122 - spin_unlock(&parent->shadow_lock); 123 - return new; 124 - } 125 - spin_unlock(&parent->shadow_lock); 126 - /* protect after insertion, so it will get properly invalidated */ 127 - mmap_read_lock(parent->mm); 128 - rc = __kvm_s390_mprotect_many(parent, asce & _ASCE_ORIGIN, 129 - ((asce & _ASCE_TABLE_LENGTH) + 1), 130 - PROT_READ, GMAP_NOTIFY_SHADOW); 131 - mmap_read_unlock(parent->mm); 132 - spin_lock(&parent->shadow_lock); 133 - new->initialized = true; 134 - if (rc) { 135 - list_del(&new->list); 136 - gmap_free(new); 137 - new = ERR_PTR(rc); 138 - } 139 - spin_unlock(&parent->shadow_lock); 140 - return new; 141 - }
+11 -4
arch/s390/kvm/intercept.c
··· 21 21 #include "gaccess.h" 22 22 #include "trace.h" 23 23 #include "trace-s390.h" 24 + #include "faultin.h" 24 25 25 26 u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) 26 27 { ··· 368 367 reg2, &srcaddr, GACC_FETCH, 0); 369 368 if (rc) 370 369 return kvm_s390_inject_prog_cond(vcpu, rc); 371 - rc = kvm_s390_handle_dat_fault(vcpu, srcaddr, 0); 372 - if (rc != 0) 370 + 371 + do { 372 + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(srcaddr), false); 373 + } while (rc == -EAGAIN); 374 + if (rc) 373 375 return rc; 374 376 375 377 /* Ensure that the source is paged-in, no actual access -> no key checking */ ··· 380 376 reg1, &dstaddr, GACC_STORE, 0); 381 377 if (rc) 382 378 return kvm_s390_inject_prog_cond(vcpu, rc); 383 - rc = kvm_s390_handle_dat_fault(vcpu, dstaddr, FOLL_WRITE); 384 - if (rc != 0) 379 + 380 + do { 381 + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gpa_to_gfn(dstaddr), true); 382 + } while (rc == -EAGAIN); 383 + if (rc) 385 384 return rc; 386 385 387 386 kvm_s390_retry_instr(vcpu);
+3 -3
arch/s390/kvm/interrupt.c
··· 26 26 #include <linux/uaccess.h> 27 27 #include <asm/sclp.h> 28 28 #include <asm/isc.h> 29 - #include <asm/gmap.h> 30 29 #include <asm/nmi.h> 31 30 #include <asm/airq.h> 32 31 #include <asm/tpi.h> ··· 33 34 #include "gaccess.h" 34 35 #include "trace-s390.h" 35 36 #include "pci.h" 37 + #include "gmap.h" 36 38 37 39 #define PFAULT_INIT 0x0600 38 40 #define PFAULT_DONE 0x0680 ··· 2632 2632 case KVM_DEV_FLIC_APF_ENABLE: 2633 2633 if (kvm_is_ucontrol(dev->kvm)) 2634 2634 return -EINVAL; 2635 - dev->kvm->arch.gmap->pfault_enabled = 1; 2635 + set_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); 2636 2636 break; 2637 2637 case KVM_DEV_FLIC_APF_DISABLE_WAIT: 2638 2638 if (kvm_is_ucontrol(dev->kvm)) 2639 2639 return -EINVAL; 2640 - dev->kvm->arch.gmap->pfault_enabled = 0; 2640 + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &dev->kvm->arch.gmap->flags); 2641 2641 /* 2642 2642 * Make sure no async faults are in transition when 2643 2643 * clearing the queues. So we don't need to worry
+228 -559
arch/s390/kvm/kvm-s390.c
··· 40 40 #include <asm/lowcore.h> 41 41 #include <asm/machine.h> 42 42 #include <asm/stp.h> 43 - #include <asm/gmap.h> 44 43 #include <asm/gmap_helpers.h> 45 44 #include <asm/nmi.h> 46 45 #include <asm/isc.h> ··· 52 53 #include <asm/uv.h> 53 54 #include "kvm-s390.h" 54 55 #include "gaccess.h" 56 + #include "gmap.h" 57 + #include "faultin.h" 55 58 #include "pci.h" 56 59 57 60 #define CREATE_TRACE_POINTS ··· 265 264 /* available subfunctions indicated via query / "test bit" */ 266 265 static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; 267 266 268 - static struct gmap_notifier gmap_notifier; 269 - static struct gmap_notifier vsie_gmap_notifier; 270 267 debug_info_t *kvm_s390_dbf; 271 268 debug_info_t *kvm_s390_dbf_uv; 272 269 273 270 /* Section: not file related */ 274 271 /* forward declarations */ 275 - static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, 276 - unsigned long end); 277 - 278 272 static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) 279 273 { 280 274 u8 delta_idx = 0; ··· 525 529 if (rc) 526 530 goto err_gib; 527 531 528 - gmap_notifier.notifier_call = kvm_gmap_notifier; 529 - gmap_register_pte_notifier(&gmap_notifier); 530 - vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier; 531 - gmap_register_pte_notifier(&vsie_gmap_notifier); 532 532 atomic_notifier_chain_register(&s390_epoch_delta_notifier, 533 533 &kvm_clock_notifier); 534 534 ··· 544 552 545 553 static void __kvm_s390_exit(void) 546 554 { 547 - gmap_unregister_pte_notifier(&gmap_notifier); 548 - gmap_unregister_pte_notifier(&vsie_gmap_notifier); 549 555 atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, 550 556 &kvm_clock_notifier); 551 557 ··· 559 569 unsigned int ioctl, unsigned long arg) 560 570 { 561 571 if (ioctl == KVM_S390_ENABLE_SIE) 562 - return s390_enable_sie(); 572 + return 0; 563 573 return -EINVAL; 564 574 } 565 575 ··· 688 698 689 699 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot 
*memslot) 690 700 { 691 - int i; 692 - gfn_t cur_gfn, last_gfn; 693 - unsigned long gaddr, vmaddr; 694 - struct gmap *gmap = kvm->arch.gmap; 695 - DECLARE_BITMAP(bitmap, _PAGE_ENTRIES); 701 + gfn_t last_gfn = memslot->base_gfn + memslot->npages; 696 702 697 - /* Loop over all guest segments */ 698 - cur_gfn = memslot->base_gfn; 699 - last_gfn = memslot->base_gfn + memslot->npages; 700 - for (; cur_gfn <= last_gfn; cur_gfn += _PAGE_ENTRIES) { 701 - gaddr = gfn_to_gpa(cur_gfn); 702 - vmaddr = gfn_to_hva_memslot(memslot, cur_gfn); 703 - if (kvm_is_error_hva(vmaddr)) 704 - continue; 705 - 706 - bitmap_zero(bitmap, _PAGE_ENTRIES); 707 - gmap_sync_dirty_log_pmd(gmap, bitmap, gaddr, vmaddr); 708 - for (i = 0; i < _PAGE_ENTRIES; i++) { 709 - if (test_bit(i, bitmap)) 710 - mark_page_dirty(kvm, cur_gfn + i); 711 - } 712 - 713 - if (fatal_signal_pending(current)) 714 - return; 715 - cond_resched(); 716 - } 703 + scoped_guard(read_lock, &kvm->mmu_lock) 704 + gmap_sync_dirty_log(kvm->arch.gmap, memslot->base_gfn, last_gfn); 717 705 } 718 706 719 707 /* Section: vm related */ ··· 851 883 r = -EINVAL; 852 884 else { 853 885 r = 0; 854 - mmap_write_lock(kvm->mm); 855 - kvm->mm->context.allow_gmap_hpage_1m = 1; 856 - mmap_write_unlock(kvm->mm); 857 886 /* 858 887 * We might have to create fake 4k page 859 888 * tables. To avoid that the hardware works on ··· 923 958 static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *attr) 924 959 { 925 960 int ret; 926 - unsigned int idx; 961 + 927 962 switch (attr->attr) { 928 963 case KVM_S390_VM_MEM_ENABLE_CMMA: 929 964 ret = -ENXIO; ··· 934 969 mutex_lock(&kvm->lock); 935 970 if (kvm->created_vcpus) 936 971 ret = -EBUSY; 937 - else if (kvm->mm->context.allow_gmap_hpage_1m) 938 - ret = -EINVAL; 939 972 else { 940 973 kvm->arch.use_cmma = 1; 941 974 /* Not compatible with cmma. 
*/ ··· 942 979 } 943 980 mutex_unlock(&kvm->lock); 944 981 break; 945 - case KVM_S390_VM_MEM_CLR_CMMA: 982 + case KVM_S390_VM_MEM_CLR_CMMA: { 983 + gfn_t start_gfn = 0; 984 + 946 985 ret = -ENXIO; 947 986 if (!sclp.has_cmma) 948 987 break; ··· 953 988 break; 954 989 955 990 VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); 956 - mutex_lock(&kvm->lock); 957 - idx = srcu_read_lock(&kvm->srcu); 958 - s390_reset_cmma(kvm->arch.gmap->mm); 959 - srcu_read_unlock(&kvm->srcu, idx); 960 - mutex_unlock(&kvm->lock); 991 + do { 992 + start_gfn = dat_reset_cmma(kvm->arch.gmap->asce, start_gfn); 993 + cond_resched(); 994 + } while (start_gfn); 961 995 ret = 0; 962 996 break; 997 + } 963 998 case KVM_S390_VM_MEM_LIMIT_SIZE: { 964 999 unsigned long new_limit; 965 1000 ··· 976 1011 if (!new_limit) 977 1012 return -EINVAL; 978 1013 979 - /* gmap_create takes last usable address */ 980 - if (new_limit != KVM_S390_NO_MEM_LIMIT) 981 - new_limit -= 1; 982 - 983 1014 ret = -EBUSY; 984 - mutex_lock(&kvm->lock); 985 - if (!kvm->created_vcpus) { 986 - /* gmap_create will round the limit up */ 987 - struct gmap *new = gmap_create(current->mm, new_limit); 988 - 989 - if (!new) { 990 - ret = -ENOMEM; 991 - } else { 992 - gmap_remove(kvm->arch.gmap); 993 - new->private = kvm; 994 - kvm->arch.gmap = new; 995 - ret = 0; 996 - } 997 - } 998 - mutex_unlock(&kvm->lock); 1015 + if (!kvm->created_vcpus) 1016 + ret = gmap_set_limit(kvm->arch.gmap, gpa_to_gfn(new_limit)); 999 1017 VM_EVENT(kvm, 3, "SET: max guest address: %lu", new_limit); 1000 1018 VM_EVENT(kvm, 3, "New guest asce: 0x%p", 1001 - (void *) kvm->arch.gmap->asce); 1019 + (void *)kvm->arch.gmap->asce.val); 1002 1020 break; 1003 1021 } 1004 1022 default: ··· 1146 1198 kvm->arch.migration_mode = 1; 1147 1199 return 0; 1148 1200 } 1149 - /* mark all the pages in active slots as dirty */ 1150 1201 kvm_for_each_memslot(ms, bkt, slots) { 1151 1202 if (!ms->dirty_bitmap) 1152 1203 return -EINVAL; 1153 - /* 1154 - * The second half of the bitmap is 
only used on x86, 1155 - * and would be wasted otherwise, so we put it to good 1156 - * use here to keep track of the state of the storage 1157 - * attributes. 1158 - */ 1159 - memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms)); 1160 1204 ram_pages += ms->npages; 1161 1205 } 1206 + /* mark all the pages as dirty */ 1207 + gmap_set_cmma_all_dirty(kvm->arch.gmap); 1162 1208 atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages); 1163 1209 kvm->arch.migration_mode = 1; 1164 1210 kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION); ··· 2058 2116 2059 2117 static int kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) 2060 2118 { 2061 - uint8_t *keys; 2062 - uint64_t hva; 2063 - int srcu_idx, i, r = 0; 2119 + union skey *keys; 2120 + int i, r = 0; 2064 2121 2065 2122 if (args->flags != 0) 2066 2123 return -EINVAL; 2067 2124 2068 2125 /* Is this guest using storage keys? */ 2069 - if (!mm_uses_skeys(current->mm)) 2126 + if (!uses_skeys(kvm->arch.gmap)) 2070 2127 return KVM_S390_GET_SKEYS_NONE; 2071 2128 2072 2129 /* Enforce sane limit on memory allocation */ 2073 2130 if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) 2074 2131 return -EINVAL; 2075 2132 2076 - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); 2133 + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); 2077 2134 if (!keys) 2078 2135 return -ENOMEM; 2079 2136 2080 - mmap_read_lock(current->mm); 2081 - srcu_idx = srcu_read_lock(&kvm->srcu); 2082 - for (i = 0; i < args->count; i++) { 2083 - hva = gfn_to_hva(kvm, args->start_gfn + i); 2084 - if (kvm_is_error_hva(hva)) { 2085 - r = -EFAULT; 2086 - break; 2137 + scoped_guard(read_lock, &kvm->mmu_lock) { 2138 + for (i = 0; i < args->count; i++) { 2139 + r = dat_get_storage_key(kvm->arch.gmap->asce, 2140 + args->start_gfn + i, keys + i); 2141 + if (r) 2142 + break; 2087 2143 } 2088 - 2089 - r = get_guest_storage_key(current->mm, hva, &keys[i]); 2090 - if (r) 2091 - break; 2092 
2144 } 2093 - srcu_read_unlock(&kvm->srcu, srcu_idx); 2094 - mmap_read_unlock(current->mm); 2095 2145 2096 2146 if (!r) { 2097 2147 r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys, ··· 2098 2164 2099 2165 static int kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) 2100 2166 { 2101 - uint8_t *keys; 2102 - uint64_t hva; 2103 - int srcu_idx, i, r = 0; 2104 - bool unlocked; 2167 + struct kvm_s390_mmu_cache *mc; 2168 + union skey *keys; 2169 + int i, r = 0; 2105 2170 2106 2171 if (args->flags != 0) 2107 2172 return -EINVAL; ··· 2109 2176 if (args->count < 1 || args->count > KVM_S390_SKEYS_MAX) 2110 2177 return -EINVAL; 2111 2178 2112 - keys = kvmalloc_array(args->count, sizeof(uint8_t), GFP_KERNEL_ACCOUNT); 2179 + keys = kvmalloc_array(args->count, sizeof(*keys), GFP_KERNEL_ACCOUNT); 2113 2180 if (!keys) 2114 2181 return -ENOMEM; 2115 2182 ··· 2121 2188 } 2122 2189 2123 2190 /* Enable storage key handling for the guest */ 2124 - r = s390_enable_skey(); 2191 + r = gmap_enable_skeys(kvm->arch.gmap); 2125 2192 if (r) 2126 2193 goto out; 2127 2194 2128 - i = 0; 2129 - mmap_read_lock(current->mm); 2130 - srcu_idx = srcu_read_lock(&kvm->srcu); 2131 - while (i < args->count) { 2132 - unlocked = false; 2133 - hva = gfn_to_hva(kvm, args->start_gfn + i); 2134 - if (kvm_is_error_hva(hva)) { 2135 - r = -EFAULT; 2136 - break; 2137 - } 2138 - 2195 + r = -EINVAL; 2196 + for (i = 0; i < args->count; i++) { 2139 2197 /* Lowest order bit is reserved */ 2140 - if (keys[i] & 0x01) { 2141 - r = -EINVAL; 2142 - break; 2143 - } 2144 - 2145 - r = set_guest_storage_key(current->mm, hva, keys[i], 0); 2146 - if (r) { 2147 - r = fixup_user_fault(current->mm, hva, 2148 - FAULT_FLAG_WRITE, &unlocked); 2149 - if (r) 2150 - break; 2151 - } 2152 - if (!r) 2153 - i++; 2198 + if (keys[i].zero) 2199 + goto out; 2154 2200 } 2155 - srcu_read_unlock(&kvm->srcu, srcu_idx); 2156 - mmap_read_unlock(current->mm); 2201 + 2202 + mc = kvm_s390_new_mmu_cache(); 2203 + if (!mc) { 2204 + r 
= -ENOMEM; 2205 + goto out; 2206 + } 2207 + 2208 + r = 0; 2209 + do { 2210 + r = kvm_s390_mmu_cache_topup(mc); 2211 + if (r == -ENOMEM) 2212 + break; 2213 + scoped_guard(read_lock, &kvm->mmu_lock) { 2214 + for (i = 0 ; i < args->count; i++) { 2215 + r = dat_set_storage_key(mc, kvm->arch.gmap->asce, 2216 + args->start_gfn + i, keys[i], 0); 2217 + if (r) 2218 + break; 2219 + } 2220 + } 2221 + } while (r == -ENOMEM); 2222 + kvm_s390_free_mmu_cache(mc); 2157 2223 out: 2158 2224 kvfree(keys); 2159 2225 return r; 2160 - } 2161 - 2162 - /* 2163 - * Base address and length must be sent at the start of each block, therefore 2164 - * it's cheaper to send some clean data, as long as it's less than the size of 2165 - * two longs. 2166 - */ 2167 - #define KVM_S390_MAX_BIT_DISTANCE (2 * sizeof(void *)) 2168 - /* for consistency */ 2169 - #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX) 2170 - 2171 - static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, 2172 - u8 *res, unsigned long bufsize) 2173 - { 2174 - unsigned long pgstev, hva, cur_gfn = args->start_gfn; 2175 - 2176 - args->count = 0; 2177 - while (args->count < bufsize) { 2178 - hva = gfn_to_hva(kvm, cur_gfn); 2179 - /* 2180 - * We return an error if the first value was invalid, but we 2181 - * return successfully if at least one value was copied. 2182 - */ 2183 - if (kvm_is_error_hva(hva)) 2184 - return args->count ? 
0 : -EFAULT; 2185 - if (get_pgste(kvm->mm, hva, &pgstev) < 0) 2186 - pgstev = 0; 2187 - res[args->count++] = (pgstev >> 24) & 0x43; 2188 - cur_gfn++; 2189 - } 2190 - 2191 - return 0; 2192 - } 2193 - 2194 - static struct kvm_memory_slot *gfn_to_memslot_approx(struct kvm_memslots *slots, 2195 - gfn_t gfn) 2196 - { 2197 - return ____gfn_to_memslot(slots, gfn, true); 2198 - } 2199 - 2200 - static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, 2201 - unsigned long cur_gfn) 2202 - { 2203 - struct kvm_memory_slot *ms = gfn_to_memslot_approx(slots, cur_gfn); 2204 - unsigned long ofs = cur_gfn - ms->base_gfn; 2205 - struct rb_node *mnode = &ms->gfn_node[slots->node_idx]; 2206 - 2207 - if (ms->base_gfn + ms->npages <= cur_gfn) { 2208 - mnode = rb_next(mnode); 2209 - /* If we are above the highest slot, wrap around */ 2210 - if (!mnode) 2211 - mnode = rb_first(&slots->gfn_tree); 2212 - 2213 - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); 2214 - ofs = 0; 2215 - } 2216 - 2217 - if (cur_gfn < ms->base_gfn) 2218 - ofs = 0; 2219 - 2220 - ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); 2221 - while (ofs >= ms->npages && (mnode = rb_next(mnode))) { 2222 - ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); 2223 - ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages); 2224 - } 2225 - return ms->base_gfn + ofs; 2226 - } 2227 - 2228 - static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, 2229 - u8 *res, unsigned long bufsize) 2230 - { 2231 - unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev; 2232 - struct kvm_memslots *slots = kvm_memslots(kvm); 2233 - struct kvm_memory_slot *ms; 2234 - 2235 - if (unlikely(kvm_memslots_empty(slots))) 2236 - return 0; 2237 - 2238 - cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); 2239 - ms = gfn_to_memslot(kvm, cur_gfn); 2240 - args->count = 0; 2241 - args->start_gfn = cur_gfn; 2242 - if (!ms) 2243 - return 0; 
2244 - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); 2245 - mem_end = kvm_s390_get_gfn_end(slots); 2246 - 2247 - while (args->count < bufsize) { 2248 - hva = gfn_to_hva(kvm, cur_gfn); 2249 - if (kvm_is_error_hva(hva)) 2250 - return 0; 2251 - /* Decrement only if we actually flipped the bit to 0 */ 2252 - if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) 2253 - atomic64_dec(&kvm->arch.cmma_dirty_pages); 2254 - if (get_pgste(kvm->mm, hva, &pgstev) < 0) 2255 - pgstev = 0; 2256 - /* Save the value */ 2257 - res[args->count++] = (pgstev >> 24) & 0x43; 2258 - /* If the next bit is too far away, stop. */ 2259 - if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE) 2260 - return 0; 2261 - /* If we reached the previous "next", find the next one */ 2262 - if (cur_gfn == next_gfn) 2263 - next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1); 2264 - /* Reached the end of memory or of the buffer, stop */ 2265 - if ((next_gfn >= mem_end) || 2266 - (next_gfn - args->start_gfn >= bufsize)) 2267 - return 0; 2268 - cur_gfn++; 2269 - /* Reached the end of the current memslot, take the next one. 
*/ 2270 - if (cur_gfn - ms->base_gfn >= ms->npages) { 2271 - ms = gfn_to_memslot(kvm, cur_gfn); 2272 - if (!ms) 2273 - return 0; 2274 - } 2275 - } 2276 - return 0; 2277 2226 } 2278 2227 2279 2228 /* ··· 2169 2354 static int kvm_s390_get_cmma_bits(struct kvm *kvm, 2170 2355 struct kvm_s390_cmma_log *args) 2171 2356 { 2172 - unsigned long bufsize; 2173 - int srcu_idx, peek, ret; 2357 + int peek, ret; 2174 2358 u8 *values; 2175 2359 2176 2360 if (!kvm->arch.use_cmma) ··· 2182 2368 if (!peek && !kvm->arch.migration_mode) 2183 2369 return -EINVAL; 2184 2370 /* CMMA is disabled or was not used, or the buffer has length zero */ 2185 - bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX); 2186 - if (!bufsize || !kvm->mm->context.uses_cmm) { 2371 + args->count = min(args->count, KVM_S390_CMMA_SIZE_MAX); 2372 + if (!args->count || !uses_cmm(kvm->arch.gmap)) { 2187 2373 memset(args, 0, sizeof(*args)); 2188 2374 return 0; 2189 2375 } ··· 2193 2379 return 0; 2194 2380 } 2195 2381 2196 - values = vmalloc(bufsize); 2382 + values = vmalloc(args->count); 2197 2383 if (!values) 2198 2384 return -ENOMEM; 2199 2385 2200 - mmap_read_lock(kvm->mm); 2201 - srcu_idx = srcu_read_lock(&kvm->srcu); 2202 - if (peek) 2203 - ret = kvm_s390_peek_cmma(kvm, args, values, bufsize); 2204 - else 2205 - ret = kvm_s390_get_cmma(kvm, args, values, bufsize); 2206 - srcu_read_unlock(&kvm->srcu, srcu_idx); 2207 - mmap_read_unlock(kvm->mm); 2386 + scoped_guard(read_lock, &kvm->mmu_lock) { 2387 + if (peek) 2388 + ret = dat_peek_cmma(args->start_gfn, kvm->arch.gmap->asce, &args->count, 2389 + values); 2390 + else 2391 + ret = dat_get_cmma(kvm->arch.gmap->asce, &args->start_gfn, &args->count, 2392 + values, &kvm->arch.cmma_dirty_pages); 2393 + } 2208 2394 2209 2395 if (kvm->arch.migration_mode) 2210 2396 args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages); ··· 2226 2412 static int kvm_s390_set_cmma_bits(struct kvm *kvm, 2227 2413 const struct kvm_s390_cmma_log *args) 2228 2414 { 2229 - unsigned long 
hva, mask, pgstev, i; 2230 - uint8_t *bits; 2231 - int srcu_idx, r = 0; 2232 - 2233 - mask = args->mask; 2415 + struct kvm_s390_mmu_cache *mc; 2416 + u8 *bits = NULL; 2417 + int r = 0; 2234 2418 2235 2419 if (!kvm->arch.use_cmma) 2236 2420 return -ENXIO; ··· 2242 2430 if (args->count == 0) 2243 2431 return 0; 2244 2432 2433 + mc = kvm_s390_new_mmu_cache(); 2434 + if (!mc) 2435 + return -ENOMEM; 2245 2436 bits = vmalloc(array_size(sizeof(*bits), args->count)); 2246 2437 if (!bits) 2247 - return -ENOMEM; 2438 + goto out; 2248 2439 2249 2440 r = copy_from_user(bits, (void __user *)args->values, args->count); 2250 2441 if (r) { ··· 2255 2440 goto out; 2256 2441 } 2257 2442 2258 - mmap_read_lock(kvm->mm); 2259 - srcu_idx = srcu_read_lock(&kvm->srcu); 2260 - for (i = 0; i < args->count; i++) { 2261 - hva = gfn_to_hva(kvm, args->start_gfn + i); 2262 - if (kvm_is_error_hva(hva)) { 2263 - r = -EFAULT; 2443 + do { 2444 + r = kvm_s390_mmu_cache_topup(mc); 2445 + if (r) 2264 2446 break; 2447 + scoped_guard(read_lock, &kvm->mmu_lock) { 2448 + r = dat_set_cmma_bits(mc, kvm->arch.gmap->asce, args->start_gfn, 2449 + args->count, args->mask, bits); 2265 2450 } 2451 + } while (r == -ENOMEM); 2266 2452 2267 - pgstev = bits[i]; 2268 - pgstev = pgstev << 24; 2269 - mask &= _PGSTE_GPS_USAGE_MASK | _PGSTE_GPS_NODAT; 2270 - set_pgste_bits(kvm->mm, hva, mask, pgstev); 2271 - } 2272 - srcu_read_unlock(&kvm->srcu, srcu_idx); 2273 - mmap_read_unlock(kvm->mm); 2274 - 2275 - if (!kvm->mm->context.uses_cmm) { 2276 - mmap_write_lock(kvm->mm); 2277 - kvm->mm->context.uses_cmm = 1; 2278 - mmap_write_unlock(kvm->mm); 2279 - } 2453 + set_bit(GMAP_FLAG_USES_CMM, &kvm->arch.gmap->flags); 2280 2454 out: 2455 + kvm_s390_free_mmu_cache(mc); 2281 2456 vfree(bits); 2282 2457 return r; 2283 2458 } ··· 2476 2671 break; 2477 2672 2478 2673 mmap_write_lock(kvm->mm); 2674 + /* 2675 + * Disable creation of new THPs. Existing THPs can stay, they 2676 + * will be split when any part of them gets imported. 
2677 + */ 2678 + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, kvm->mm); 2679 + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, kvm->mm); 2680 + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); 2479 2681 r = gmap_helper_disable_cow_sharing(); 2480 2682 mmap_write_unlock(kvm->mm); 2481 2683 if (r) ··· 2730 2918 acc_mode = mop->op == KVM_S390_MEMOP_ABSOLUTE_READ ? GACC_FETCH : GACC_STORE; 2731 2919 2732 2920 scoped_guard(srcu, &kvm->srcu) { 2733 - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) 2734 - return PGM_ADDRESSING; 2735 - 2736 2921 if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) 2737 2922 return check_gpa_range(kvm, mop->gaddr, mop->size, acc_mode, mop->key); 2738 2923 ··· 2742 2933 if (acc_mode != GACC_STORE && copy_to_user(uaddr, tmpbuf, mop->size)) 2743 2934 return -EFAULT; 2744 2935 } 2745 - 2746 2936 return 0; 2747 2937 } 2748 2938 ··· 2770 2962 return -EFAULT; 2771 2963 2772 2964 scoped_guard(srcu, &kvm->srcu) { 2773 - if (!kvm_is_gpa_in_memslot(kvm, mop->gaddr)) 2774 - return PGM_ADDRESSING; 2775 - 2776 2965 r = cmpxchg_guest_abs_with_key(kvm, mop->gaddr, mop->size, &old, new, 2777 2966 mop->key, &success); 2778 2967 ··· 3127 3322 if (type) 3128 3323 goto out_err; 3129 3324 #endif 3130 - 3131 - rc = s390_enable_sie(); 3132 - if (rc) 3133 - goto out_err; 3134 - 3135 3325 rc = -ENOMEM; 3136 3326 3137 3327 if (!sclp.has_64bscao) ··· 3200 3400 debug_register_view(kvm->arch.dbf, &debug_sprintf_view); 3201 3401 VM_EVENT(kvm, 3, "vm created with type %lu", type); 3202 3402 3403 + kvm->arch.mem_limit = type & KVM_VM_S390_UCONTROL ? 
KVM_S390_NO_MEM_LIMIT : sclp.hamax + 1; 3404 + kvm->arch.gmap = gmap_new(kvm, gpa_to_gfn(kvm->arch.mem_limit)); 3405 + if (!kvm->arch.gmap) 3406 + goto out_err; 3407 + clear_bit(GMAP_FLAG_PFAULT_ENABLED, &kvm->arch.gmap->flags); 3408 + 3203 3409 if (type & KVM_VM_S390_UCONTROL) { 3204 3410 struct kvm_userspace_memory_region2 fake_memslot = { 3205 3411 .slot = KVM_S390_UCONTROL_MEMSLOT, ··· 3215 3409 .flags = 0, 3216 3410 }; 3217 3411 3218 - kvm->arch.gmap = NULL; 3219 - kvm->arch.mem_limit = KVM_S390_NO_MEM_LIMIT; 3220 3412 /* one flat fake memslot covering the whole address-space */ 3221 3413 mutex_lock(&kvm->slots_lock); 3222 3414 KVM_BUG_ON(kvm_set_internal_memslot(kvm, &fake_memslot), kvm); 3223 3415 mutex_unlock(&kvm->slots_lock); 3416 + set_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); 3224 3417 } else { 3225 - if (sclp.hamax == U64_MAX) 3226 - kvm->arch.mem_limit = TASK_SIZE_MAX; 3227 - else 3228 - kvm->arch.mem_limit = min_t(unsigned long, TASK_SIZE_MAX, 3229 - sclp.hamax + 1); 3230 - kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1); 3231 - if (!kvm->arch.gmap) 3232 - goto out_err; 3233 - kvm->arch.gmap->private = kvm; 3234 - kvm->arch.gmap->pfault_enabled = 0; 3418 + struct crst_table *table = dereference_asce(kvm->arch.gmap->asce); 3419 + 3420 + crst_table_init((void *)table, _CRSTE_HOLE(table->crstes[0].h.tt).val); 3235 3421 } 3236 3422 3237 3423 kvm->arch.use_pfmfi = sclp.has_pfmfi; ··· 3257 3459 sca_del_vcpu(vcpu); 3258 3460 kvm_s390_update_topology_change_report(vcpu->kvm, 1); 3259 3461 3260 - if (kvm_is_ucontrol(vcpu->kvm)) 3261 - gmap_remove(vcpu->arch.gmap); 3462 + if (kvm_is_ucontrol(vcpu->kvm)) { 3463 + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) 3464 + gmap_remove_child(vcpu->arch.gmap); 3465 + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); 3466 + } 3262 3467 3263 3468 if (vcpu->kvm->arch.use_cmma) 3264 3469 kvm_s390_vcpu_unsetup_cmma(vcpu); ··· 3269 3468 if (kvm_s390_pv_cpu_get_handle(vcpu)) 3270 3469 
kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); 3271 3470 free_page((unsigned long)(vcpu->arch.sie_block)); 3471 + kvm_s390_free_mmu_cache(vcpu->arch.mc); 3272 3472 } 3273 3473 3274 3474 void kvm_arch_destroy_vm(struct kvm *kvm) ··· 3296 3494 3297 3495 debug_unregister(kvm->arch.dbf); 3298 3496 free_page((unsigned long)kvm->arch.sie_page2); 3299 - if (!kvm_is_ucontrol(kvm)) 3300 - gmap_remove(kvm->arch.gmap); 3301 3497 kvm_s390_destroy_adapters(kvm); 3302 3498 kvm_s390_clear_float_irqs(kvm); 3303 3499 kvm_s390_vsie_destroy(kvm); 3500 + kvm->arch.gmap = gmap_put(kvm->arch.gmap); 3304 3501 KVM_EVENT(3, "vm 0x%p destroyed", kvm); 3305 3502 } 3306 3503 3307 3504 /* Section: vcpu related */ 3308 - static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu) 3309 - { 3310 - vcpu->arch.gmap = gmap_create(current->mm, -1UL); 3311 - if (!vcpu->arch.gmap) 3312 - return -ENOMEM; 3313 - vcpu->arch.gmap->private = vcpu->kvm; 3314 - 3315 - return 0; 3316 - } 3317 - 3318 3505 static void sca_del_vcpu(struct kvm_vcpu *vcpu) 3319 3506 { 3320 3507 struct esca_block *sca = vcpu->kvm->arch.sca; ··· 3644 3853 int rc; 3645 3854 3646 3855 BUILD_BUG_ON(sizeof(struct sie_page) != 4096); 3647 - sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); 3648 - if (!sie_page) 3856 + vcpu->arch.mc = kvm_s390_new_mmu_cache(); 3857 + if (!vcpu->arch.mc) 3649 3858 return -ENOMEM; 3859 + sie_page = (struct sie_page *) get_zeroed_page(GFP_KERNEL_ACCOUNT); 3860 + if (!sie_page) { 3861 + kvm_s390_free_mmu_cache(vcpu->arch.mc); 3862 + vcpu->arch.mc = NULL; 3863 + return -ENOMEM; 3864 + } 3650 3865 3651 3866 vcpu->arch.sie_block = &sie_page->sie_block; 3652 3867 vcpu->arch.sie_block->itdba = virt_to_phys(&sie_page->itdb); ··· 3694 3897 vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS; 3695 3898 3696 3899 if (kvm_is_ucontrol(vcpu->kvm)) { 3697 - rc = __kvm_ucontrol_vcpu_init(vcpu); 3698 - if (rc) 3900 + rc = -ENOMEM; 3901 + vcpu->arch.gmap = gmap_new_child(vcpu->kvm->arch.gmap, -1UL); 3902 + if 
(!vcpu->arch.gmap) 3699 3903 goto out_free_sie_block; 3700 3904 } 3701 3905 ··· 3712 3914 return 0; 3713 3915 3714 3916 out_ucontrol_uninit: 3715 - if (kvm_is_ucontrol(vcpu->kvm)) 3716 - gmap_remove(vcpu->arch.gmap); 3917 + if (kvm_is_ucontrol(vcpu->kvm)) { 3918 + gmap_remove_child(vcpu->arch.gmap); 3919 + vcpu->arch.gmap = gmap_put(vcpu->arch.gmap); 3920 + } 3717 3921 out_free_sie_block: 3718 3922 free_page((unsigned long)(vcpu->arch.sie_block)); 3719 3923 return rc; ··· 3777 3977 { 3778 3978 __kvm_make_request(req, vcpu); 3779 3979 kvm_s390_vcpu_request(vcpu); 3780 - } 3781 - 3782 - static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, 3783 - unsigned long end) 3784 - { 3785 - struct kvm *kvm = gmap->private; 3786 - struct kvm_vcpu *vcpu; 3787 - unsigned long prefix; 3788 - unsigned long i; 3789 - 3790 - trace_kvm_s390_gmap_notifier(start, end, gmap_is_shadow(gmap)); 3791 - 3792 - if (gmap_is_shadow(gmap)) 3793 - return; 3794 - if (start >= 1UL << 31) 3795 - /* We are only interested in prefix pages */ 3796 - return; 3797 - kvm_for_each_vcpu(i, vcpu, kvm) { 3798 - /* match against both prefix pages */ 3799 - prefix = kvm_s390_get_prefix(vcpu); 3800 - if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) { 3801 - VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx", 3802 - start, end); 3803 - kvm_s390_sync_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); 3804 - } 3805 - } 3806 3980 } 3807 3981 3808 3982 bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) ··· 4160 4386 return kvm_s390_test_cpuflags(vcpu, CPUSTAT_IBS); 4161 4387 } 4162 4388 4163 - static int __kvm_s390_fixup_fault_sync(struct gmap *gmap, gpa_t gaddr, unsigned int flags) 4389 + static int vcpu_ucontrol_translate(struct kvm_vcpu *vcpu, gpa_t *gaddr) 4164 4390 { 4165 - struct kvm *kvm = gmap->private; 4166 - gfn_t gfn = gpa_to_gfn(gaddr); 4167 - bool unlocked; 4168 - hva_t vmaddr; 4169 - gpa_t tmp; 4170 4391 int rc; 4171 4392 4172 - if (kvm_is_ucontrol(kvm)) { 4173 - tmp = __gmap_translate(gmap, 
gaddr); 4174 - gfn = gpa_to_gfn(tmp); 4175 - } 4176 - 4177 - vmaddr = gfn_to_hva(kvm, gfn); 4178 - rc = fixup_user_fault(gmap->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); 4179 - if (!rc) 4180 - rc = __gmap_link(gmap, gaddr, vmaddr); 4181 - return rc; 4182 - } 4183 - 4184 - /** 4185 - * __kvm_s390_mprotect_many() - Apply specified protection to guest pages 4186 - * @gmap: the gmap of the guest 4187 - * @gpa: the starting guest address 4188 - * @npages: how many pages to protect 4189 - * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE 4190 - * @bits: pgste notification bits to set 4191 - * 4192 - * Returns: 0 in case of success, < 0 in case of error - see gmap_protect_one() 4193 - * 4194 - * Context: kvm->srcu and gmap->mm need to be held in read mode 4195 - */ 4196 - int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, 4197 - unsigned long bits) 4198 - { 4199 - unsigned int fault_flag = (prot & PROT_WRITE) ? FAULT_FLAG_WRITE : 0; 4200 - gpa_t end = gpa + npages * PAGE_SIZE; 4201 - int rc; 4202 - 4203 - for (; gpa < end; gpa = ALIGN(gpa + 1, rc)) { 4204 - rc = gmap_protect_one(gmap, gpa, prot, bits); 4205 - if (rc == -EAGAIN) { 4206 - __kvm_s390_fixup_fault_sync(gmap, gpa, fault_flag); 4207 - rc = gmap_protect_one(gmap, gpa, prot, bits); 4393 + if (kvm_is_ucontrol(vcpu->kvm)) { 4394 + rc = gmap_ucas_translate(vcpu->arch.mc, vcpu->arch.gmap, gaddr); 4395 + if (rc == -EREMOTE) { 4396 + vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; 4397 + vcpu->run->s390_ucontrol.trans_exc_code = *gaddr; 4398 + vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; 4208 4399 } 4209 - if (rc < 0) 4210 - return rc; 4400 + return rc; 4211 4401 } 4212 - 4213 4402 return 0; 4214 4403 } 4215 4404 4216 - static int kvm_s390_mprotect_notify_prefix(struct kvm_vcpu *vcpu) 4405 + static int kvm_s390_fixup_prefix(struct kvm_vcpu *vcpu) 4217 4406 { 4218 4407 gpa_t gaddr = kvm_s390_get_prefix(vcpu); 4219 - int idx, rc; 4408 + gfn_t gfn; 
4409 + int rc; 4220 4410 4221 - idx = srcu_read_lock(&vcpu->kvm->srcu); 4222 - mmap_read_lock(vcpu->arch.gmap->mm); 4411 + if (vcpu_ucontrol_translate(vcpu, &gaddr)) 4412 + return -EREMOTE; 4413 + gfn = gpa_to_gfn(gaddr); 4223 4414 4224 - rc = __kvm_s390_mprotect_many(vcpu->arch.gmap, gaddr, 2, PROT_WRITE, GMAP_NOTIFY_MPROT); 4415 + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn, true); 4416 + if (rc) 4417 + return rc; 4418 + rc = kvm_s390_faultin_gfn_simple(vcpu, NULL, gfn + 1, true); 4419 + if (rc) 4420 + return rc; 4225 4421 4226 - mmap_read_unlock(vcpu->arch.gmap->mm); 4227 - srcu_read_unlock(&vcpu->kvm->srcu, idx); 4228 - 4422 + scoped_guard(write_lock, &vcpu->kvm->mmu_lock) 4423 + rc = dat_set_prefix_notif_bit(vcpu->kvm->arch.gmap->asce, gfn); 4229 4424 return rc; 4230 4425 } 4231 4426 ··· 4214 4471 if (kvm_check_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu)) { 4215 4472 int rc; 4216 4473 4217 - rc = kvm_s390_mprotect_notify_prefix(vcpu); 4474 + rc = kvm_s390_fixup_prefix(vcpu); 4218 4475 if (rc) { 4219 4476 kvm_make_request(KVM_REQ_REFRESH_GUEST_PREFIX, vcpu); 4220 4477 return rc; ··· 4263 4520 * Re-enable CMM virtualization if CMMA is available and 4264 4521 * CMM has been used. 
4265 4522 */ 4266 - if ((vcpu->kvm->arch.use_cmma) && 4267 - (vcpu->kvm->mm->context.uses_cmm)) 4523 + if (vcpu->kvm->arch.use_cmma && uses_cmm(vcpu->arch.gmap)) 4268 4524 vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; 4269 4525 goto retry; 4270 4526 } ··· 4375 4633 return false; 4376 4634 if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) 4377 4635 return false; 4378 - if (!vcpu->arch.gmap->pfault_enabled) 4636 + if (!pfault_enabled(vcpu->arch.gmap)) 4379 4637 return false; 4380 4638 4381 4639 hva = gfn_to_hva(vcpu->kvm, current->thread.gmap_teid.addr); ··· 4468 4726 current->thread.gmap_int_code, current->thread.gmap_teid.val); 4469 4727 } 4470 4728 4471 - /* 4472 - * __kvm_s390_handle_dat_fault() - handle a dat fault for the gmap of a vcpu 4473 - * @vcpu: the vCPU whose gmap is to be fixed up 4474 - * @gfn: the guest frame number used for memslots (including fake memslots) 4475 - * @gaddr: the gmap address, does not have to match @gfn for ucontrol gmaps 4476 - * @foll: FOLL_* flags 4477 - * 4478 - * Return: 0 on success, < 0 in case of error. 4479 - * Context: The mm lock must not be held before calling. May sleep. 
4480 - */ 4481 - int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int foll) 4729 + static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, gpa_t gaddr, bool wr) 4482 4730 { 4483 - struct kvm_memory_slot *slot; 4484 - unsigned int fault_flags; 4485 - bool writable, unlocked; 4486 - unsigned long vmaddr; 4487 - struct page *page; 4488 - kvm_pfn_t pfn; 4731 + struct guest_fault f = { 4732 + .write_attempt = wr, 4733 + .attempt_pfault = pfault_enabled(vcpu->arch.gmap), 4734 + }; 4489 4735 int rc; 4490 4736 4491 - slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 4492 - if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 4737 + if (vcpu_ucontrol_translate(vcpu, &gaddr)) 4738 + return -EREMOTE; 4739 + f.gfn = gpa_to_gfn(gaddr); 4740 + 4741 + rc = kvm_s390_faultin_gfn(vcpu, NULL, &f); 4742 + if (rc <= 0) 4743 + return rc; 4744 + if (rc == PGM_ADDRESSING) 4493 4745 return vcpu_post_run_addressing_exception(vcpu); 4494 - 4495 - fault_flags = foll & FOLL_WRITE ? FAULT_FLAG_WRITE : 0; 4496 - if (vcpu->arch.gmap->pfault_enabled) 4497 - foll |= FOLL_NOWAIT; 4498 - vmaddr = __gfn_to_hva_memslot(slot, gfn); 4499 - 4500 - try_again: 4501 - pfn = __kvm_faultin_pfn(slot, gfn, foll, &writable, &page); 4502 - 4503 - /* Access outside memory, inject addressing exception */ 4504 - if (is_noslot_pfn(pfn)) 4505 - return vcpu_post_run_addressing_exception(vcpu); 4506 - /* Signal pending: try again */ 4507 - if (pfn == KVM_PFN_ERR_SIGPENDING) 4508 - return -EAGAIN; 4509 - 4510 - /* Needs I/O, try to setup async pfault (only possible with FOLL_NOWAIT) */ 4511 - if (pfn == KVM_PFN_ERR_NEEDS_IO) { 4512 - trace_kvm_s390_major_guest_pfault(vcpu); 4513 - if (kvm_arch_setup_async_pf(vcpu)) 4514 - return 0; 4515 - vcpu->stat.pfault_sync++; 4516 - /* Could not setup async pfault, try again synchronously */ 4517 - foll &= ~FOLL_NOWAIT; 4518 - goto try_again; 4519 - } 4520 - /* Any other error */ 4521 - if (is_error_pfn(pfn)) 4522 - return -EFAULT; 4523 - 4524 - /* Success 
*/ 4525 - mmap_read_lock(vcpu->arch.gmap->mm); 4526 - /* Mark the userspace PTEs as young and/or dirty, to avoid page fault loops */ 4527 - rc = fixup_user_fault(vcpu->arch.gmap->mm, vmaddr, fault_flags, &unlocked); 4528 - if (!rc) 4529 - rc = __gmap_link(vcpu->arch.gmap, gaddr, vmaddr); 4530 - scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { 4531 - kvm_release_faultin_page(vcpu->kvm, page, false, writable); 4532 - } 4533 - mmap_read_unlock(vcpu->arch.gmap->mm); 4534 - return rc; 4535 - } 4536 - 4537 - static int vcpu_dat_fault_handler(struct kvm_vcpu *vcpu, unsigned long gaddr, unsigned int foll) 4538 - { 4539 - unsigned long gaddr_tmp; 4540 - gfn_t gfn; 4541 - 4542 - gfn = gpa_to_gfn(gaddr); 4543 - if (kvm_is_ucontrol(vcpu->kvm)) { 4544 - /* 4545 - * This translates the per-vCPU guest address into a 4546 - * fake guest address, which can then be used with the 4547 - * fake memslots that are identity mapping userspace. 4548 - * This allows ucontrol VMs to use the normal fault 4549 - * resolution path, like normal VMs. 
4550 - */ 4551 - mmap_read_lock(vcpu->arch.gmap->mm); 4552 - gaddr_tmp = __gmap_translate(vcpu->arch.gmap, gaddr); 4553 - mmap_read_unlock(vcpu->arch.gmap->mm); 4554 - if (gaddr_tmp == -EFAULT) { 4555 - vcpu->run->exit_reason = KVM_EXIT_S390_UCONTROL; 4556 - vcpu->run->s390_ucontrol.trans_exc_code = gaddr; 4557 - vcpu->run->s390_ucontrol.pgm_code = PGM_SEGMENT_TRANSLATION; 4558 - return -EREMOTE; 4559 - } 4560 - gfn = gpa_to_gfn(gaddr_tmp); 4561 - } 4562 - return __kvm_s390_handle_dat_fault(vcpu, gfn, gaddr, foll); 4746 + KVM_BUG_ON(rc, vcpu->kvm); 4747 + return -EINVAL; 4563 4748 } 4564 4749 4565 4750 static int vcpu_post_run_handle_fault(struct kvm_vcpu *vcpu) ··· 4663 4994 4664 4995 exit_reason = kvm_s390_enter_exit_sie(vcpu->arch.sie_block, 4665 4996 vcpu->run->s.regs.gprs, 4666 - vcpu->arch.gmap->asce); 4997 + vcpu->arch.gmap->asce.val); 4667 4998 4668 4999 __enable_cpu_timer_accounting(vcpu); 4669 5000 guest_timing_exit_irqoff(); ··· 5198 5529 struct kvm_s390_mem_op *mop) 5199 5530 { 5200 5531 void __user *uaddr = (void __user *)mop->buf; 5532 + void *tmpbuf __free(kvfree) = NULL; 5201 5533 enum gacc_mode acc_mode; 5202 - void *tmpbuf = NULL; 5203 5534 int r; 5204 5535 5205 5536 r = mem_op_validate_common(mop, KVM_S390_MEMOP_F_INJECT_EXCEPTION | ··· 5221 5552 if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { 5222 5553 r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, 5223 5554 acc_mode, mop->key); 5224 - goto out_inject; 5225 - } 5226 - if (acc_mode == GACC_FETCH) { 5555 + } else if (acc_mode == GACC_FETCH) { 5227 5556 r = read_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, 5228 5557 mop->size, mop->key); 5229 - if (r) 5230 - goto out_inject; 5231 - if (copy_to_user(uaddr, tmpbuf, mop->size)) { 5232 - r = -EFAULT; 5233 - goto out_free; 5234 - } 5558 + if (!r && copy_to_user(uaddr, tmpbuf, mop->size)) 5559 + return -EFAULT; 5235 5560 } else { 5236 - if (copy_from_user(tmpbuf, uaddr, mop->size)) { 5237 - r = -EFAULT; 5238 - goto out_free; 5239 - } 5561 
+ if (copy_from_user(tmpbuf, uaddr, mop->size)) 5562 + return -EFAULT; 5240 5563 r = write_guest_with_key(vcpu, mop->gaddr, mop->ar, tmpbuf, 5241 5564 mop->size, mop->key); 5242 5565 } 5243 5566 5244 - out_inject: 5245 5567 if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) 5246 5568 kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); 5247 5569 5248 - out_free: 5249 - vfree(tmpbuf); 5250 5570 return r; 5251 5571 } 5252 5572 ··· 5425 5767 } 5426 5768 #ifdef CONFIG_KVM_S390_UCONTROL 5427 5769 case KVM_S390_UCAS_MAP: { 5428 - struct kvm_s390_ucas_mapping ucasmap; 5770 + struct kvm_s390_ucas_mapping ucas; 5429 5771 5430 - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { 5431 - r = -EFAULT; 5772 + r = -EFAULT; 5773 + if (copy_from_user(&ucas, argp, sizeof(ucas))) 5432 5774 break; 5433 - } 5434 5775 5435 - if (!kvm_is_ucontrol(vcpu->kvm)) { 5436 - r = -EINVAL; 5776 + r = -EINVAL; 5777 + if (!kvm_is_ucontrol(vcpu->kvm)) 5437 5778 break; 5438 - } 5779 + if (!IS_ALIGNED(ucas.user_addr | ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) 5780 + break; 5439 5781 5440 - r = gmap_map_segment(vcpu->arch.gmap, ucasmap.user_addr, 5441 - ucasmap.vcpu_addr, ucasmap.length); 5782 + r = gmap_ucas_map(vcpu->arch.gmap, gpa_to_gfn(ucas.user_addr), 5783 + gpa_to_gfn(ucas.vcpu_addr), 5784 + ucas.length >> _SEGMENT_SHIFT); 5442 5785 break; 5443 5786 } 5444 5787 case KVM_S390_UCAS_UNMAP: { 5445 - struct kvm_s390_ucas_mapping ucasmap; 5788 + struct kvm_s390_ucas_mapping ucas; 5446 5789 5447 - if (copy_from_user(&ucasmap, argp, sizeof(ucasmap))) { 5448 - r = -EFAULT; 5790 + r = -EFAULT; 5791 + if (copy_from_user(&ucas, argp, sizeof(ucas))) 5449 5792 break; 5450 - } 5451 5793 5452 - if (!kvm_is_ucontrol(vcpu->kvm)) { 5453 - r = -EINVAL; 5794 + r = -EINVAL; 5795 + if (!kvm_is_ucontrol(vcpu->kvm)) 5454 5796 break; 5455 - } 5797 + if (!IS_ALIGNED(ucas.vcpu_addr | ucas.length, _SEGMENT_SIZE)) 5798 + break; 5456 5799 5457 - r = gmap_unmap_segment(vcpu->arch.gmap, ucasmap.vcpu_addr, 
5458 - ucasmap.length); 5800 + gmap_ucas_unmap(vcpu->arch.gmap, gpa_to_gfn(ucas.vcpu_addr), 5801 + ucas.length >> _SEGMENT_SHIFT); 5802 + r = 0; 5459 5803 break; 5460 5804 } 5461 5805 #endif ··· 5630 5970 const struct kvm_memory_slot *new, 5631 5971 enum kvm_mr_change change) 5632 5972 { 5973 + struct kvm_s390_mmu_cache *mc = NULL; 5633 5974 int rc = 0; 5634 5975 5635 - if (kvm_is_ucontrol(kvm)) 5976 + if (change == KVM_MR_FLAGS_ONLY) 5636 5977 return; 5637 5978 5638 - switch (change) { 5639 - case KVM_MR_DELETE: 5640 - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, 5641 - old->npages * PAGE_SIZE); 5642 - break; 5643 - case KVM_MR_MOVE: 5644 - rc = gmap_unmap_segment(kvm->arch.gmap, old->base_gfn * PAGE_SIZE, 5645 - old->npages * PAGE_SIZE); 5646 - if (rc) 5647 - break; 5648 - fallthrough; 5649 - case KVM_MR_CREATE: 5650 - rc = gmap_map_segment(kvm->arch.gmap, new->userspace_addr, 5651 - new->base_gfn * PAGE_SIZE, 5652 - new->npages * PAGE_SIZE); 5653 - break; 5654 - case KVM_MR_FLAGS_ONLY: 5655 - break; 5656 - default: 5657 - WARN(1, "Unknown KVM MR CHANGE: %d\n", change); 5979 + mc = kvm_s390_new_mmu_cache(); 5980 + if (!mc) { 5981 + rc = -ENOMEM; 5982 + goto out; 5658 5983 } 5984 + 5985 + scoped_guard(write_lock, &kvm->mmu_lock) { 5986 + switch (change) { 5987 + case KVM_MR_DELETE: 5988 + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); 5989 + break; 5990 + case KVM_MR_MOVE: 5991 + rc = dat_delete_slot(mc, kvm->arch.gmap->asce, old->base_gfn, old->npages); 5992 + if (rc) 5993 + break; 5994 + fallthrough; 5995 + case KVM_MR_CREATE: 5996 + rc = dat_create_slot(mc, kvm->arch.gmap->asce, new->base_gfn, new->npages); 5997 + break; 5998 + case KVM_MR_FLAGS_ONLY: 5999 + break; 6000 + default: 6001 + WARN(1, "Unknown KVM MR CHANGE: %d\n", change); 6002 + } 6003 + } 6004 + out: 5659 6005 if (rc) 5660 6006 pr_warn("failed to commit memory region\n"); 6007 + kvm_s390_free_mmu_cache(mc); 5661 6008 return; 5662 6009 } 5663 6010 
··· 5678 6011 */ 5679 6012 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 5680 6013 { 5681 - return false; 6014 + scoped_guard(read_lock, &kvm->mmu_lock) 6015 + return dat_test_age_gfn(kvm->arch.gmap->asce, range->start, range->end); 5682 6016 } 5683 6017 5684 6018 /** ··· 5692 6024 */ 5693 6025 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) 5694 6026 { 5695 - return false; 6027 + scoped_guard(read_lock, &kvm->mmu_lock) 6028 + return gmap_age_gfn(kvm->arch.gmap, range->start, range->end); 5696 6029 } 5697 6030 5698 6031 /** ··· 5710 6041 */ 5711 6042 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) 5712 6043 { 5713 - return false; 6044 + return gmap_unmap_gfn_range(kvm->arch.gmap, range->slot, range->start, range->end); 5714 6045 } 5715 6046 5716 6047 static inline unsigned long nonhyp_mask(int i)
+4 -15
arch/s390/kvm/kvm-s390.h
··· 19 19 #include <asm/facility.h> 20 20 #include <asm/processor.h> 21 21 #include <asm/sclp.h> 22 + #include "dat.h" 23 + #include "gmap.h" 22 24 23 25 #define KVM_S390_UCONTROL_MEMSLOT (KVM_USER_MEM_SLOTS + 0) 24 26 ··· 116 114 static inline int kvm_is_ucontrol(struct kvm *kvm) 117 115 { 118 116 #ifdef CONFIG_KVM_S390_UCONTROL 119 - if (kvm->arch.gmap) 120 - return 0; 121 - return 1; 117 + return test_bit(GMAP_FLAG_IS_UCONTROL, &kvm->arch.gmap->flags); 122 118 #else 123 119 return 0; 124 120 #endif ··· 440 440 /* implemented in vsie.c */ 441 441 int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); 442 442 void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu); 443 - void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, 444 - unsigned long end); 443 + void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end); 445 444 void kvm_s390_vsie_init(struct kvm *kvm); 446 445 void kvm_s390_vsie_destroy(struct kvm *kvm); 447 - int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level); 448 - 449 - /* implemented in gmap-vsie.c */ 450 - struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce, int edat_level); 451 446 452 447 /* implemented in sigp.c */ 453 448 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); ··· 464 469 void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm); 465 470 __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu); 466 471 int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rc, u16 *rrc); 467 - int __kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gfn_t gfn, gpa_t gaddr, unsigned int flags); 468 472 int __kvm_s390_mprotect_many(struct gmap *gmap, gpa_t gpa, u8 npages, unsigned int prot, 469 473 unsigned long bits); 470 - 471 - static inline int kvm_s390_handle_dat_fault(struct kvm_vcpu *vcpu, gpa_t gaddr, unsigned int flags) 472 - { 473 - return __kvm_s390_handle_dat_fault(vcpu, gpa_to_gfn(gaddr), gaddr, flags); 474 - } 475 474 476 475 bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu); 477 476
+78 -135
arch/s390/kvm/priv.c
··· 21 21 #include <asm/ebcdic.h> 22 22 #include <asm/sysinfo.h> 23 23 #include <asm/page-states.h> 24 - #include <asm/gmap.h> 25 24 #include <asm/ptrace.h> 26 25 #include <asm/sclp.h> 27 26 #include <asm/ap.h> 27 + #include <asm/gmap_helpers.h> 28 28 #include "gaccess.h" 29 29 #include "kvm-s390.h" 30 30 #include "trace.h" 31 + #include "gmap.h" 31 32 32 33 static int handle_ri(struct kvm_vcpu *vcpu) 33 34 { ··· 223 222 if (vcpu->arch.skey_enabled) 224 223 return 0; 225 224 226 - rc = s390_enable_skey(); 225 + rc = gmap_enable_skeys(vcpu->arch.gmap); 227 226 VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); 228 227 if (rc) 229 228 return rc; ··· 256 255 257 256 static int handle_iske(struct kvm_vcpu *vcpu) 258 257 { 259 - unsigned long gaddr, vmaddr; 260 - unsigned char key; 258 + unsigned long gaddr; 261 259 int reg1, reg2; 262 - bool unlocked; 260 + union skey key; 263 261 int rc; 264 262 265 263 vcpu->stat.instruction_iske++; ··· 275 275 gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; 276 276 gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); 277 277 gaddr = kvm_s390_real_to_abs(vcpu, gaddr); 278 - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); 279 - if (kvm_is_error_hva(vmaddr)) 280 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 281 - retry: 282 - unlocked = false; 283 - mmap_read_lock(current->mm); 284 - rc = get_guest_storage_key(current->mm, vmaddr, &key); 285 - 286 - if (rc) { 287 - rc = fixup_user_fault(current->mm, vmaddr, 288 - FAULT_FLAG_WRITE, &unlocked); 289 - if (!rc) { 290 - mmap_read_unlock(current->mm); 291 - goto retry; 292 - } 293 - } 294 - mmap_read_unlock(current->mm); 295 - if (rc == -EFAULT) 296 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 278 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) 279 + rc = dat_get_storage_key(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr), &key); 280 + if (rc > 0) 281 + return kvm_s390_inject_program_int(vcpu, rc); 297 282 if (rc < 0) 298 283 return rc; 299 284 
vcpu->run->s.regs.gprs[reg1] &= ~0xff; 300 - vcpu->run->s.regs.gprs[reg1] |= key; 285 + vcpu->run->s.regs.gprs[reg1] |= key.skey; 301 286 return 0; 302 287 } 303 288 304 289 static int handle_rrbe(struct kvm_vcpu *vcpu) 305 290 { 306 - unsigned long vmaddr, gaddr; 291 + unsigned long gaddr; 307 292 int reg1, reg2; 308 - bool unlocked; 309 293 int rc; 310 294 311 295 vcpu->stat.instruction_rrbe++; ··· 306 322 gaddr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; 307 323 gaddr = kvm_s390_logical_to_effective(vcpu, gaddr); 308 324 gaddr = kvm_s390_real_to_abs(vcpu, gaddr); 309 - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(gaddr)); 310 - if (kvm_is_error_hva(vmaddr)) 311 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 312 - retry: 313 - unlocked = false; 314 - mmap_read_lock(current->mm); 315 - rc = reset_guest_reference_bit(current->mm, vmaddr); 316 - if (rc < 0) { 317 - rc = fixup_user_fault(current->mm, vmaddr, 318 - FAULT_FLAG_WRITE, &unlocked); 319 - if (!rc) { 320 - mmap_read_unlock(current->mm); 321 - goto retry; 322 - } 323 - } 324 - mmap_read_unlock(current->mm); 325 - if (rc == -EFAULT) 326 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 325 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) 326 + rc = dat_reset_reference_bit(vcpu->arch.gmap->asce, gpa_to_gfn(gaddr)); 327 + if (rc > 0) 328 + return kvm_s390_inject_program_int(vcpu, rc); 327 329 if (rc < 0) 328 330 return rc; 329 331 kvm_s390_set_psw_cc(vcpu, rc); ··· 324 354 { 325 355 unsigned char m3 = vcpu->arch.sie_block->ipb >> 28; 326 356 unsigned long start, end; 327 - unsigned char key, oldkey; 357 + union skey key, oldkey; 328 358 int reg1, reg2; 329 - bool unlocked; 330 359 int rc; 331 360 332 361 vcpu->stat.instruction_sske++; ··· 346 377 347 378 kvm_s390_get_regs_rre(vcpu, &reg1, &reg2); 348 379 349 - key = vcpu->run->s.regs.gprs[reg1] & 0xfe; 380 + key.skey = vcpu->run->s.regs.gprs[reg1] & 0xfe; 350 381 start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; 351 382 start = 
kvm_s390_logical_to_effective(vcpu, start); 352 383 if (m3 & SSKE_MB) { ··· 358 389 } 359 390 360 391 while (start != end) { 361 - unsigned long vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); 362 - unlocked = false; 363 - 364 - if (kvm_is_error_hva(vmaddr)) 365 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 366 - 367 - mmap_read_lock(current->mm); 368 - rc = cond_set_guest_storage_key(current->mm, vmaddr, key, &oldkey, 369 - m3 & SSKE_NQ, m3 & SSKE_MR, 370 - m3 & SSKE_MC); 371 - 372 - if (rc < 0) { 373 - rc = fixup_user_fault(current->mm, vmaddr, 374 - FAULT_FLAG_WRITE, &unlocked); 375 - rc = !rc ? -EAGAIN : rc; 392 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { 393 + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, 394 + gpa_to_gfn(start), key, &oldkey, 395 + m3 & SSKE_NQ, m3 & SSKE_MR, m3 & SSKE_MC); 376 396 } 377 - mmap_read_unlock(current->mm); 378 - if (rc == -EFAULT) 397 + if (rc > 1) 379 398 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 380 - if (rc == -EAGAIN) 399 + if (rc == -ENOMEM) { 400 + kvm_s390_mmu_cache_topup(vcpu->arch.mc); 381 401 continue; 402 + } 382 403 if (rc < 0) 383 404 return rc; 384 405 start += PAGE_SIZE; ··· 381 422 } else { 382 423 kvm_s390_set_psw_cc(vcpu, rc); 383 424 vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL; 384 - vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8; 425 + vcpu->run->s.regs.gprs[reg1] |= (u64)oldkey.skey << 8; 385 426 } 386 427 } 387 428 if (m3 & SSKE_MB) { ··· 1041 1082 bool mr = false, mc = false, nq; 1042 1083 int reg1, reg2; 1043 1084 unsigned long start, end; 1044 - unsigned char key; 1085 + union skey key; 1045 1086 1046 1087 vcpu->stat.instruction_pfmf++; 1047 1088 ··· 1069 1110 } 1070 1111 1071 1112 nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ; 1072 - key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; 1113 + key.skey = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY; 1073 1114 start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK; 1074 1115 start = 
kvm_s390_logical_to_effective(vcpu, start); 1075 1116 ··· 1100 1141 } 1101 1142 1102 1143 while (start != end) { 1103 - unsigned long vmaddr; 1104 - bool unlocked = false; 1105 - 1106 - /* Translate guest address to host address */ 1107 - vmaddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start)); 1108 - if (kvm_is_error_hva(vmaddr)) 1109 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 1110 - 1111 1144 if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) { 1112 1145 if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE)) 1113 1146 return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); ··· 1110 1159 1111 1160 if (rc) 1112 1161 return rc; 1113 - mmap_read_lock(current->mm); 1114 - rc = cond_set_guest_storage_key(current->mm, vmaddr, 1115 - key, NULL, nq, mr, mc); 1116 - if (rc < 0) { 1117 - rc = fixup_user_fault(current->mm, vmaddr, 1118 - FAULT_FLAG_WRITE, &unlocked); 1119 - rc = !rc ? -EAGAIN : rc; 1162 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) { 1163 + rc = dat_cond_set_storage_key(vcpu->arch.mc, vcpu->arch.gmap->asce, 1164 + gpa_to_gfn(start), key, 1165 + NULL, nq, mr, mc); 1120 1166 } 1121 - mmap_read_unlock(current->mm); 1122 - if (rc == -EFAULT) 1123 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 1124 - if (rc == -EAGAIN) 1167 + if (rc > 1) 1168 + return kvm_s390_inject_program_int(vcpu, rc); 1169 + if (rc == -ENOMEM) { 1170 + kvm_s390_mmu_cache_topup(vcpu->arch.mc); 1125 1171 continue; 1172 + } 1126 1173 if (rc < 0) 1127 1174 return rc; 1128 1175 } ··· 1144 1195 static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc) 1145 1196 { 1146 1197 int r1, r2, nappended, entries; 1147 - unsigned long gfn, hva, res, pgstev, ptev; 1198 + union essa_state state; 1148 1199 unsigned long *cbrlo; 1200 + unsigned long gfn; 1201 + bool dirtied; 1149 1202 1150 1203 /* 1151 1204 * We don't need to set SD.FPF.SK to 1 here, because if we have a ··· 1156 1205 1157 1206 kvm_s390_get_regs_rre(vcpu, &r1, &r2); 1158 1207 gfn = vcpu->run->s.regs.gprs[r2] >> 
PAGE_SHIFT; 1159 - hva = gfn_to_hva(vcpu->kvm, gfn); 1160 1208 entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; 1161 1209 1162 - if (kvm_is_error_hva(hva)) 1163 - return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); 1164 - 1165 - nappended = pgste_perform_essa(vcpu->kvm->mm, hva, orc, &ptev, &pgstev); 1166 - if (nappended < 0) { 1167 - res = orc ? 0x10 : 0; 1168 - vcpu->run->s.regs.gprs[r1] = res; /* Exception Indication */ 1210 + nappended = dat_perform_essa(vcpu->arch.gmap->asce, gfn, orc, &state, &dirtied); 1211 + vcpu->run->s.regs.gprs[r1] = state.val; 1212 + if (nappended < 0) 1169 1213 return 0; 1170 - } 1171 - res = (pgstev & _PGSTE_GPS_USAGE_MASK) >> 22; 1172 - /* 1173 - * Set the block-content state part of the result. 0 means resident, so 1174 - * nothing to do if the page is valid. 2 is for preserved pages 1175 - * (non-present and non-zero), and 3 for zero pages (non-present and 1176 - * zero). 1177 - */ 1178 - if (ptev & _PAGE_INVALID) { 1179 - res |= 2; 1180 - if (pgstev & _PGSTE_GPS_ZERO) 1181 - res |= 1; 1182 - } 1183 - if (pgstev & _PGSTE_GPS_NODAT) 1184 - res |= 0x20; 1185 - vcpu->run->s.regs.gprs[r1] = res; 1186 1214 /* 1187 1215 * It is possible that all the normal 511 slots were full, in which case 1188 1216 * we will now write in the 512th slot, which is reserved for host use. 
··· 1173 1243 cbrlo[entries] = gfn << PAGE_SHIFT; 1174 1244 } 1175 1245 1176 - if (orc) { 1177 - struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn); 1178 - 1179 - /* Increment only if we are really flipping the bit */ 1180 - if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms))) 1181 - atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); 1182 - } 1246 + if (dirtied) 1247 + atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages); 1183 1248 1184 1249 return nappended; 1250 + } 1251 + 1252 + static void _essa_clear_cbrl(struct kvm_vcpu *vcpu, unsigned long *cbrl, int len) 1253 + { 1254 + union crste *crstep; 1255 + union pgste pgste; 1256 + union pte *ptep; 1257 + int i; 1258 + 1259 + lockdep_assert_held(&vcpu->kvm->mmu_lock); 1260 + 1261 + for (i = 0; i < len; i++) { 1262 + if (dat_entry_walk(NULL, gpa_to_gfn(cbrl[i]), vcpu->arch.gmap->asce, 1263 + 0, TABLE_TYPE_PAGE_TABLE, &crstep, &ptep)) 1264 + continue; 1265 + if (!ptep || ptep->s.pr) 1266 + continue; 1267 + pgste = pgste_get_lock(ptep); 1268 + if (pgste.usage == PGSTE_GPS_USAGE_UNUSED || pgste.zero) 1269 + gmap_helper_zap_one_page(vcpu->kvm->mm, cbrl[i]); 1270 + pgste_set_unlock(ptep, pgste); 1271 + } 1185 1272 } 1186 1273 1187 1274 static int handle_essa(struct kvm_vcpu *vcpu) ··· 1208 1261 /* entries expected to be 1FF */ 1209 1262 int entries = (vcpu->arch.sie_block->cbrlo & ~PAGE_MASK) >> 3; 1210 1263 unsigned long *cbrlo; 1211 - struct gmap *gmap; 1212 1264 int i, orc; 1213 1265 1214 1266 VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); 1215 - gmap = vcpu->arch.gmap; 1216 1267 vcpu->stat.instruction_essa++; 1217 1268 if (!vcpu->kvm->arch.use_cmma) 1218 1269 return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); ··· 1234 1289 * value really needs to be written to; if the value is 1235 1290 * already correct, we do nothing and avoid the lock. 
1236 1291 */ 1237 - if (vcpu->kvm->mm->context.uses_cmm == 0) { 1238 - mmap_write_lock(vcpu->kvm->mm); 1239 - vcpu->kvm->mm->context.uses_cmm = 1; 1240 - mmap_write_unlock(vcpu->kvm->mm); 1241 - } 1292 + set_bit(GMAP_FLAG_USES_CMM, &vcpu->arch.gmap->flags); 1242 1293 /* 1243 1294 * If we are here, we are supposed to have CMMA enabled in 1244 1295 * the SIE block. Enabling CMMA works on a per-CPU basis, ··· 1248 1307 /* Retry the ESSA instruction */ 1249 1308 kvm_s390_retry_instr(vcpu); 1250 1309 } else { 1251 - mmap_read_lock(vcpu->kvm->mm); 1252 - i = __do_essa(vcpu, orc); 1253 - mmap_read_unlock(vcpu->kvm->mm); 1310 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) 1311 + i = __do_essa(vcpu, orc); 1254 1312 if (i < 0) 1255 1313 return i; 1256 1314 /* Account for the possible extra cbrl entry */ 1257 1315 entries += i; 1258 1316 } 1259 - vcpu->arch.sie_block->cbrlo &= PAGE_MASK; /* reset nceo */ 1317 + /* reset nceo */ 1318 + vcpu->arch.sie_block->cbrlo &= PAGE_MASK; 1260 1319 cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo); 1261 - mmap_read_lock(gmap->mm); 1262 - for (i = 0; i < entries; ++i) 1263 - __gmap_zap(gmap, cbrlo[i]); 1264 - mmap_read_unlock(gmap->mm); 1320 + 1321 + mmap_read_lock(vcpu->kvm->mm); 1322 + scoped_guard(read_lock, &vcpu->kvm->mmu_lock) 1323 + _essa_clear_cbrl(vcpu, cbrlo, entries); 1324 + mmap_read_unlock(vcpu->kvm->mm); 1325 + 1265 1326 return 0; 1266 1327 } 1267 1328
+116 -58
arch/s390/kvm/pv.c
··· 12 12 #include <linux/minmax.h> 13 13 #include <linux/pagemap.h> 14 14 #include <linux/sched/signal.h> 15 - #include <asm/gmap.h> 16 15 #include <asm/uv.h> 17 16 #include <asm/mman.h> 18 17 #include <linux/pagewalk.h> 19 18 #include <linux/sched/mm.h> 20 19 #include <linux/mmu_notifier.h> 21 20 #include "kvm-s390.h" 21 + #include "dat.h" 22 + #include "gaccess.h" 23 + #include "gmap.h" 24 + #include "faultin.h" 22 25 23 26 bool kvm_s390_pv_is_protected(struct kvm *kvm) 24 27 { ··· 38 35 EXPORT_SYMBOL_GPL(kvm_s390_pv_cpu_is_protected); 39 36 40 37 /** 38 + * should_export_before_import() - Determine whether an export is needed 39 + * before an import-like operation. 40 + * @uvcb: The Ultravisor control block of the UVC to be performed. 41 + * @mm: The mm of the process. 42 + * 43 + * Returns whether an export is needed before every import-like operation. 44 + * This is needed for shared pages, which don't trigger a secure storage 45 + * exception when accessed from a different guest. 46 + * 47 + * Although considered as one, the Unpin Page UVC is not an actual import, 48 + * so it is not affected. 49 + * 50 + * No export is needed also when there is only one protected VM, because the 51 + * page cannot belong to the wrong VM in that case (there is no "other VM" 52 + * it can belong to). 53 + * 54 + * Return: %true if an export is needed before every import, otherwise %false. 55 + */ 56 + static bool should_export_before_import(struct uv_cb_header *uvcb, struct mm_struct *mm) 57 + { 58 + /* 59 + * The misc feature indicates, among other things, that importing a 60 + * shared page from a different protected VM will automatically also 61 + * transfer its ownership. 
62 + */ 63 + if (uv_has_feature(BIT_UV_FEAT_MISC)) 64 + return false; 65 + if (uvcb->cmd == UVC_CMD_UNPIN_PAGE_SHARED) 66 + return false; 67 + return atomic_read(&mm->context.protected_count) > 1; 68 + } 69 + 70 + struct pv_make_secure { 71 + void *uvcb; 72 + struct folio *folio; 73 + int rc; 74 + bool needs_export; 75 + }; 76 + 77 + static int __kvm_s390_pv_make_secure(struct guest_fault *f, struct folio *folio) 78 + { 79 + struct pv_make_secure *priv = f->priv; 80 + int rc; 81 + 82 + if (priv->needs_export) 83 + uv_convert_from_secure(folio_to_phys(folio)); 84 + 85 + if (folio_test_hugetlb(folio)) 86 + return -EFAULT; 87 + if (folio_test_large(folio)) 88 + return -E2BIG; 89 + 90 + if (!f->page) 91 + folio_get(folio); 92 + rc = __make_folio_secure(folio, priv->uvcb); 93 + if (!f->page) 94 + folio_put(folio); 95 + 96 + return rc; 97 + } 98 + 99 + static void _kvm_s390_pv_make_secure(struct guest_fault *f) 100 + { 101 + struct pv_make_secure *priv = f->priv; 102 + struct folio *folio; 103 + 104 + folio = pfn_folio(f->pfn); 105 + priv->rc = -EAGAIN; 106 + if (folio_trylock(folio)) { 107 + priv->rc = __kvm_s390_pv_make_secure(f, folio); 108 + if (priv->rc == -E2BIG || priv->rc == -EBUSY) { 109 + priv->folio = folio; 110 + folio_get(folio); 111 + } 112 + folio_unlock(folio); 113 + } 114 + } 115 + 116 + /** 41 117 * kvm_s390_pv_make_secure() - make one guest page secure 42 118 * @kvm: the guest 43 119 * @gaddr: the guest address that needs to be made secure ··· 127 45 */ 128 46 int kvm_s390_pv_make_secure(struct kvm *kvm, unsigned long gaddr, void *uvcb) 129 47 { 130 - unsigned long vmaddr; 48 + struct pv_make_secure priv = { .uvcb = uvcb }; 49 + struct guest_fault f = { 50 + .write_attempt = true, 51 + .gfn = gpa_to_gfn(gaddr), 52 + .callback = _kvm_s390_pv_make_secure, 53 + .priv = &priv, 54 + }; 55 + int rc; 131 56 132 57 lockdep_assert_held(&kvm->srcu); 133 58 134 - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(gaddr)); 135 - if (kvm_is_error_hva(vmaddr)) 136 - return 
-EFAULT; 137 - return make_hva_secure(kvm->mm, vmaddr, uvcb); 59 + priv.needs_export = should_export_before_import(uvcb, kvm->mm); 60 + 61 + scoped_guard(mutex, &kvm->arch.pv.import_lock) { 62 + rc = kvm_s390_faultin_gfn(NULL, kvm, &f); 63 + 64 + if (!rc) { 65 + rc = priv.rc; 66 + if (priv.folio) { 67 + rc = s390_wiggle_split_folio(kvm->mm, priv.folio); 68 + if (!rc) 69 + rc = -EAGAIN; 70 + } 71 + } 72 + } 73 + if (priv.folio) 74 + folio_put(priv.folio); 75 + return rc; 138 76 } 139 77 140 78 int kvm_s390_pv_convert_to_secure(struct kvm *kvm, unsigned long gaddr) ··· 401 299 return 0; 402 300 } 403 301 404 - /** 405 - * kvm_s390_destroy_lower_2g - Destroy the first 2GB of protected guest memory. 406 - * @kvm: the VM whose memory is to be cleared. 407 - * 408 - * Destroy the first 2GB of guest memory, to avoid prefix issues after reboot. 409 - * The CPUs of the protected VM need to be destroyed beforehand. 410 - */ 411 - static void kvm_s390_destroy_lower_2g(struct kvm *kvm) 412 - { 413 - const unsigned long pages_2g = SZ_2G / PAGE_SIZE; 414 - struct kvm_memory_slot *slot; 415 - unsigned long len; 416 - int srcu_idx; 417 - 418 - srcu_idx = srcu_read_lock(&kvm->srcu); 419 - 420 - /* Take the memslot containing guest absolute address 0 */ 421 - slot = gfn_to_memslot(kvm, 0); 422 - /* Clear all slots or parts thereof that are below 2GB */ 423 - while (slot && slot->base_gfn < pages_2g) { 424 - len = min_t(u64, slot->npages, pages_2g - slot->base_gfn) * PAGE_SIZE; 425 - s390_uv_destroy_range(kvm->mm, slot->userspace_addr, slot->userspace_addr + len); 426 - /* Take the next memslot */ 427 - slot = gfn_to_memslot(kvm, slot->base_gfn + slot->npages); 428 - } 429 - 430 - srcu_read_unlock(&kvm->srcu, srcu_idx); 431 - } 432 - 433 302 static int kvm_s390_pv_deinit_vm_fast(struct kvm *kvm, u16 *rc, u16 *rrc) 434 303 { 435 304 struct uv_cb_destroy_fast uvcb = { ··· 415 342 *rc = uvcb.header.rc; 416 343 if (rrc) 417 344 *rrc = uvcb.header.rrc; 418 - 
WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); 419 345 KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM FAST: rc %x rrc %x", 420 346 uvcb.header.rc, uvcb.header.rrc); 421 347 WARN_ONCE(cc && uvcb.header.rc != 0x104, ··· 463 391 return -EINVAL; 464 392 465 393 /* Guest with segment type ASCE, refuse to destroy asynchronously */ 466 - if ((kvm->arch.gmap->asce & _ASCE_TYPE_MASK) == _ASCE_TYPE_SEGMENT) 394 + if (kvm->arch.gmap->asce.dt == TABLE_TYPE_SEGMENT) 467 395 return -EINVAL; 468 396 469 397 priv = kzalloc(sizeof(*priv), GFP_KERNEL); ··· 476 404 priv->stor_var = kvm->arch.pv.stor_var; 477 405 priv->stor_base = kvm->arch.pv.stor_base; 478 406 priv->handle = kvm_s390_pv_get_handle(kvm); 479 - priv->old_gmap_table = (unsigned long)kvm->arch.gmap->table; 480 - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); 407 + priv->old_gmap_table = (unsigned long)dereference_asce(kvm->arch.gmap->asce); 481 408 if (s390_replace_asce(kvm->arch.gmap)) 482 409 res = -ENOMEM; 483 410 } ··· 486 415 return res; 487 416 } 488 417 489 - kvm_s390_destroy_lower_2g(kvm); 418 + gmap_pv_destroy_range(kvm->arch.gmap, 0, gpa_to_gfn(SZ_2G), false); 490 419 kvm_s390_clear_pv_state(kvm); 491 420 kvm->arch.pv.set_aside = priv; 492 421 ··· 520 449 521 450 cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), 522 451 UVC_CMD_DESTROY_SEC_CONF, rc, rrc); 523 - WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); 524 452 if (!cc) { 525 453 atomic_dec(&kvm->mm->context.protected_count); 526 454 kvm_s390_pv_dealloc_vm(kvm); ··· 602 532 * cleanup has been performed. 
603 533 */ 604 534 if (need_zap && mmget_not_zero(kvm->mm)) { 605 - s390_uv_destroy_range(kvm->mm, 0, TASK_SIZE); 535 + gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), false); 606 536 mmput(kvm->mm); 607 537 } 608 538 ··· 640 570 return -EINVAL; 641 571 642 572 /* When a fatal signal is received, stop immediately */ 643 - if (s390_uv_destroy_range_interruptible(kvm->mm, 0, TASK_SIZE_MAX)) 573 + if (gmap_pv_destroy_range(kvm->arch.gmap, 0, asce_end(kvm->arch.gmap->asce), true)) 644 574 goto done; 645 575 if (kvm_s390_pv_dispose_one_leftover(kvm, p, rc, rrc)) 646 576 ret = -EIO; ··· 679 609 r = kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); 680 610 if (!r && is_destroy_fast_available() && kvm_s390_pv_get_handle(kvm)) 681 611 kvm_s390_pv_deinit_vm_fast(kvm, &dummy, &dummy); 612 + set_bit(GMAP_FLAG_EXPORT_ON_UNMAP, &kvm->arch.gmap->flags); 682 613 } 683 614 684 615 static const struct mmu_notifier_ops kvm_s390_pv_mmu_notifier_ops = { ··· 713 642 /* Inputs */ 714 643 uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ 715 644 uvcb.guest_stor_len = kvm->arch.pv.guest_len; 716 - uvcb.guest_asce = kvm->arch.gmap->asce; 645 + uvcb.guest_asce = kvm->arch.gmap->asce.val; 717 646 uvcb.guest_sca = virt_to_phys(kvm->arch.sca); 718 647 uvcb.conf_base_stor_origin = 719 648 virt_to_phys((void *)kvm->arch.pv.stor_base); ··· 740 669 } 741 670 return -EIO; 742 671 } 743 - kvm->arch.gmap->guest_handle = uvcb.guest_handle; 744 672 return 0; 745 673 } 746 674 ··· 774 704 .tweak[1] = offset, 775 705 }; 776 706 int ret = kvm_s390_pv_make_secure(kvm, addr, &uvcb); 777 - unsigned long vmaddr; 778 - bool unlocked; 779 707 780 708 *rc = uvcb.header.rc; 781 709 *rrc = uvcb.header.rrc; 782 710 783 711 if (ret == -ENXIO) { 784 - mmap_read_lock(kvm->mm); 785 - vmaddr = gfn_to_hva(kvm, gpa_to_gfn(addr)); 786 - if (kvm_is_error_hva(vmaddr)) { 787 - ret = -EFAULT; 788 - } else { 789 - ret = fixup_user_fault(kvm->mm, vmaddr, FAULT_FLAG_WRITE, &unlocked); 790 - if (!ret) 791 - ret 
= __gmap_link(kvm->arch.gmap, addr, vmaddr); 792 - } 793 - mmap_read_unlock(kvm->mm); 712 + ret = kvm_s390_faultin_gfn_simple(NULL, kvm, gpa_to_gfn(addr), true); 794 713 if (!ret) 795 714 return -EAGAIN; 796 - return ret; 797 715 } 798 716 799 717 if (ret && ret != -EAGAIN)
+88 -80
arch/s390/kvm/vsie.c
··· 15 15 #include <linux/io.h> 16 16 #include <linux/mman.h> 17 17 18 - #include <asm/gmap.h> 19 18 #include <asm/mmu_context.h> 20 19 #include <asm/sclp.h> 21 20 #include <asm/nmi.h> ··· 22 23 #include <asm/facility.h> 23 24 #include "kvm-s390.h" 24 25 #include "gaccess.h" 26 + #include "gmap.h" 25 27 26 28 enum vsie_page_flags { 27 29 VSIE_PAGE_IN_USE = 0, ··· 41 41 * are reused conditionally, should be accessed via READ_ONCE. 42 42 */ 43 43 struct kvm_s390_sie_block *scb_o; /* 0x0218 */ 44 - /* the shadow gmap in use by the vsie_page */ 45 - struct gmap *gmap; /* 0x0220 */ 44 + /* 45 + * Flags: must be set/cleared atomically after the vsie page can be 46 + * looked up by other CPUs. 47 + */ 48 + unsigned long flags; /* 0x0220 */ 46 49 /* address of the last reported fault to guest2 */ 47 50 unsigned long fault_addr; /* 0x0228 */ 48 51 /* calculated guest addresses of satellite control blocks */ ··· 60 57 * radix tree. 61 58 */ 62 59 gpa_t scb_gpa; /* 0x0258 */ 63 - /* 64 - * Flags: must be set/cleared atomically after the vsie page can be 65 - * looked up by other CPUs. 66 - */ 67 - unsigned long flags; /* 0x0260 */ 68 - __u8 reserved[0x0700 - 0x0268]; /* 0x0268 */ 60 + /* the shadow gmap in use by the vsie_page */ 61 + struct gmap_cache gmap_cache; /* 0x0260 */ 62 + __u8 reserved[0x0700 - 0x0278]; /* 0x0278 */ 69 63 struct kvm_s390_crypto_cb crycb; /* 0x0700 */ 70 64 __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */ 71 65 }; 72 66 73 - /** 74 - * gmap_shadow_valid() - check if a shadow guest address space matches the 75 - * given properties and is still valid 76 - * @sg: pointer to the shadow guest address space structure 77 - * @asce: ASCE for which the shadow table is requested 78 - * @edat_level: edat level to be used for the shadow translation 79 - * 80 - * Returns 1 if the gmap shadow is still valid and matches the given 81 - * properties, the caller can continue using it. 
Returns 0 otherwise; the 82 - * caller has to request a new shadow gmap in this case. 83 - */ 84 - int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level) 85 - { 86 - if (sg->removed) 87 - return 0; 88 - return sg->orig_asce == asce && sg->edat_level == edat_level; 89 - } 67 + static_assert(sizeof(struct vsie_page) == PAGE_SIZE); 90 68 91 69 /* trigger a validity icpt for the given scb */ 92 70 static int set_validity_icpt(struct kvm_s390_sie_block *scb, ··· 596 612 return rc; 597 613 } 598 614 599 - void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start, 600 - unsigned long end) 615 + void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, gpa_t start, gpa_t end) 601 616 { 602 - struct kvm *kvm = gmap->private; 603 - struct vsie_page *cur; 617 + struct vsie_page *cur, *next; 604 618 unsigned long prefix; 605 - int i; 606 619 607 - if (!gmap_is_shadow(gmap)) 608 - return; 620 + KVM_BUG_ON(!test_bit(GMAP_FLAG_SHADOW, &gmap->flags), gmap->kvm); 609 621 /* 610 622 * Only new shadow blocks are added to the list during runtime, 611 623 * therefore we can safely reference them all the time. 
612 624 */ 613 - for (i = 0; i < kvm->arch.vsie.page_count; i++) { 614 - cur = READ_ONCE(kvm->arch.vsie.pages[i]); 615 - if (!cur) 616 - continue; 617 - if (READ_ONCE(cur->gmap) != gmap) 618 - continue; 625 + list_for_each_entry_safe(cur, next, &gmap->scb_users, gmap_cache.list) { 619 626 prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT; 620 627 /* with mso/msl, the prefix lies at an offset */ 621 628 prefix += cur->scb_s.mso; ··· 642 667 /* with mso/msl, the prefix lies at offset *mso* */ 643 668 prefix += scb_s->mso; 644 669 645 - rc = kvm_s390_shadow_fault(vcpu, sg, prefix, NULL); 670 + rc = gaccess_shadow_fault(vcpu, sg, prefix, NULL, true); 646 671 if (!rc && (scb_s->ecb & ECB_TE)) 647 - rc = kvm_s390_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL); 672 + rc = gaccess_shadow_fault(vcpu, sg, prefix + PAGE_SIZE, NULL, true); 648 673 /* 649 674 * We don't have to mprotect, we will be called for all unshadows. 650 675 * SIE will detect if protection applies and trigger a validity. ··· 927 952 */ 928 953 static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) 929 954 { 955 + bool wr = kvm_s390_cur_gmap_fault_is_write(); 930 956 int rc; 931 957 932 958 if ((current->thread.gmap_int_code & PGM_INT_CODE_MASK) == PGM_PROTECTION) ··· 935 959 return inject_fault(vcpu, PGM_PROTECTION, 936 960 current->thread.gmap_teid.addr * PAGE_SIZE, 1); 937 961 938 - rc = kvm_s390_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL); 962 + rc = gaccess_shadow_fault(vcpu, sg, current->thread.gmap_teid.addr * PAGE_SIZE, NULL, wr); 939 963 if (rc > 0) { 940 964 rc = inject_fault(vcpu, rc, 941 - current->thread.gmap_teid.addr * PAGE_SIZE, 942 - kvm_s390_cur_gmap_fault_is_write()); 965 + current->thread.gmap_teid.addr * PAGE_SIZE, wr); 943 966 if (rc >= 0) 944 967 vsie_page->fault_addr = current->thread.gmap_teid.addr * PAGE_SIZE; 945 968 } ··· 954 979 static void handle_last_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, 
struct gmap *sg) 955 980 { 956 981 if (vsie_page->fault_addr) 957 - kvm_s390_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL); 982 + gaccess_shadow_fault(vcpu, sg, vsie_page->fault_addr, NULL, true); 958 983 vsie_page->fault_addr = 0; 959 984 } 960 985 ··· 1039 1064 static int vsie_handle_mvpg(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page, struct gmap *sg) 1040 1065 { 1041 1066 struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; 1042 - unsigned long pei_dest, pei_src, src, dest, mask, prefix; 1067 + unsigned long src, dest, mask, prefix; 1043 1068 u64 *pei_block = &vsie_page->scb_o->mcic; 1069 + union mvpg_pei pei_dest, pei_src; 1044 1070 int edat, rc_dest, rc_src; 1045 1071 union ctlreg0 cr0; 1046 1072 ··· 1055 1079 src = vsie_get_register(vcpu, vsie_page, scb_s->ipb >> 16) & mask; 1056 1080 src = _kvm_s390_real_to_abs(prefix, src) + scb_s->mso; 1057 1081 1058 - rc_dest = kvm_s390_shadow_fault(vcpu, sg, dest, &pei_dest); 1059 - rc_src = kvm_s390_shadow_fault(vcpu, sg, src, &pei_src); 1082 + rc_dest = gaccess_shadow_fault(vcpu, sg, dest, &pei_dest, true); 1083 + rc_src = gaccess_shadow_fault(vcpu, sg, src, &pei_src, false); 1060 1084 /* 1061 1085 * Either everything went well, or something non-critical went wrong 1062 1086 * e.g. because of a race. In either case, simply retry. ··· 1091 1115 rc_src = rc_src != PGM_PAGE_TRANSLATION ? 
rc_src : 0; 1092 1116 } 1093 1117 if (!rc_dest && !rc_src) { 1094 - pei_block[0] = pei_dest; 1095 - pei_block[1] = pei_src; 1118 + pei_block[0] = pei_dest.val; 1119 + pei_block[1] = pei_src.val; 1096 1120 return 1; 1097 1121 } 1098 1122 ··· 1163 1187 goto xfer_to_guest_mode_check; 1164 1188 } 1165 1189 guest_timing_enter_irqoff(); 1166 - rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce); 1190 + rc = kvm_s390_enter_exit_sie(scb_s, vcpu->run->s.regs.gprs, sg->asce.val); 1167 1191 guest_timing_exit_irqoff(); 1168 1192 local_irq_enable(); 1169 1193 } ··· 1213 1237 1214 1238 static void release_gmap_shadow(struct vsie_page *vsie_page) 1215 1239 { 1216 - if (vsie_page->gmap) 1217 - gmap_put(vsie_page->gmap); 1218 - WRITE_ONCE(vsie_page->gmap, NULL); 1240 + struct gmap *gmap = vsie_page->gmap_cache.gmap; 1241 + 1242 + lockdep_assert_held(&gmap->kvm->arch.gmap->children_lock); 1243 + 1244 + list_del(&vsie_page->gmap_cache.list); 1245 + vsie_page->gmap_cache.gmap = NULL; 1219 1246 prefix_unmapped(vsie_page); 1247 + 1248 + if (list_empty(&gmap->scb_users)) { 1249 + gmap_remove_child(gmap); 1250 + gmap_put(gmap); 1251 + } 1220 1252 } 1221 1253 1222 - static int acquire_gmap_shadow(struct kvm_vcpu *vcpu, 1223 - struct vsie_page *vsie_page) 1254 + static struct gmap *acquire_gmap_shadow(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) 1224 1255 { 1225 - unsigned long asce; 1226 1256 union ctlreg0 cr0; 1227 1257 struct gmap *gmap; 1258 + union asce asce; 1228 1259 int edat; 1229 1260 1230 - asce = vcpu->arch.sie_block->gcr[1]; 1261 + asce.val = vcpu->arch.sie_block->gcr[1]; 1231 1262 cr0.val = vcpu->arch.sie_block->gcr[0]; 1232 1263 edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8); 1233 1264 edat += edat && test_kvm_facility(vcpu->kvm, 78); 1234 1265 1235 - /* 1236 - * ASCE or EDAT could have changed since last icpt, or the gmap 1237 - * we're holding has been unshadowed. If the gmap is still valid, 1238 - * we can safely reuse it. 
1239 - */ 1240 - if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat)) { 1241 - vcpu->kvm->stat.gmap_shadow_reuse++; 1242 - return 0; 1266 + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { 1267 + gmap = vsie_page->gmap_cache.gmap; 1268 + if (gmap) { 1269 + /* 1270 + * ASCE or EDAT could have changed since last icpt, or the gmap 1271 + * we're holding has been unshadowed. If the gmap is still valid, 1272 + * we can safely reuse it. 1273 + */ 1274 + if (gmap_is_shadow_valid(gmap, asce, edat)) { 1275 + vcpu->kvm->stat.gmap_shadow_reuse++; 1276 + gmap_get(gmap); 1277 + return gmap; 1278 + } 1279 + /* release the old shadow and mark the prefix as unmapped */ 1280 + release_gmap_shadow(vsie_page); 1281 + } 1243 1282 } 1244 - 1245 - /* release the old shadow - if any, and mark the prefix as unmapped */ 1246 - release_gmap_shadow(vsie_page); 1247 - gmap = gmap_shadow(vcpu->arch.gmap, asce, edat); 1283 + gmap = gmap_create_shadow(vcpu->arch.mc, vcpu->kvm->arch.gmap, asce, edat); 1248 1284 if (IS_ERR(gmap)) 1249 - return PTR_ERR(gmap); 1250 - vcpu->kvm->stat.gmap_shadow_create++; 1251 - WRITE_ONCE(vsie_page->gmap, gmap); 1252 - return 0; 1285 + return gmap; 1286 + scoped_guard(spinlock, &vcpu->kvm->arch.gmap->children_lock) { 1287 + /* unlikely race condition, remove the previous shadow */ 1288 + if (vsie_page->gmap_cache.gmap) 1289 + release_gmap_shadow(vsie_page); 1290 + vcpu->kvm->stat.gmap_shadow_create++; 1291 + list_add(&vsie_page->gmap_cache.list, &gmap->scb_users); 1292 + vsie_page->gmap_cache.gmap = gmap; 1293 + prefix_unmapped(vsie_page); 1294 + gmap_get(gmap); 1295 + } 1296 + return gmap; 1253 1297 } 1254 1298 1255 1299 /* ··· 1326 1330 int rc = 0; 1327 1331 1328 1332 while (1) { 1329 - rc = acquire_gmap_shadow(vcpu, vsie_page); 1330 - sg = vsie_page->gmap; 1333 + sg = acquire_gmap_shadow(vcpu, vsie_page); 1334 + if (IS_ERR(sg)) { 1335 + rc = PTR_ERR(sg); 1336 + sg = NULL; 1337 + } 1331 1338 if (!rc) 1332 1339 rc = map_prefix(vcpu, 
vsie_page, sg); 1333 1340 if (!rc) { ··· 1358 1359 kvm_s390_rewind_psw(vcpu, 4); 1359 1360 break; 1360 1361 } 1362 + if (sg) 1363 + sg = gmap_put(sg); 1364 + cond_resched(); 1361 1365 } 1362 1366 1363 1367 if (rc == -EFAULT) { ··· 1457 1455 vsie_page->scb_gpa = ULONG_MAX; 1458 1456 1459 1457 /* Double use of the same address or allocation failure. */ 1460 - if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, 1461 - vsie_page)) { 1458 + if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, vsie_page)) { 1462 1459 put_vsie_page(vsie_page); 1463 1460 mutex_unlock(&kvm->arch.vsie.mutex); 1464 1461 return NULL; ··· 1466 1465 mutex_unlock(&kvm->arch.vsie.mutex); 1467 1466 1468 1467 memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block)); 1469 - release_gmap_shadow(vsie_page); 1468 + if (vsie_page->gmap_cache.gmap) { 1469 + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) 1470 + if (vsie_page->gmap_cache.gmap) 1471 + release_gmap_shadow(vsie_page); 1472 + } 1473 + prefix_unmapped(vsie_page); 1470 1474 vsie_page->fault_addr = 0; 1471 1475 vsie_page->scb_s.ihcpu = 0xffffU; 1472 1476 return vsie_page; ··· 1549 1543 mutex_lock(&kvm->arch.vsie.mutex); 1550 1544 for (i = 0; i < kvm->arch.vsie.page_count; i++) { 1551 1545 vsie_page = kvm->arch.vsie.pages[i]; 1546 + scoped_guard(spinlock, &kvm->arch.gmap->children_lock) 1547 + if (vsie_page->gmap_cache.gmap) 1548 + release_gmap_shadow(vsie_page); 1552 1549 kvm->arch.vsie.pages[i] = NULL; 1553 - release_gmap_shadow(vsie_page); 1554 1550 /* free the radix tree entry */ 1555 1551 if (vsie_page->scb_gpa != ULONG_MAX) 1556 1552 radix_tree_delete(&kvm->arch.vsie.addr_to_page,
+26 -158
arch/s390/lib/uaccess.c
··· 34 34 } 35 35 #endif /*CONFIG_DEBUG_ENTRY */ 36 36 37 - union oac { 38 - unsigned int val; 39 - struct { 40 - struct { 41 - unsigned short key : 4; 42 - unsigned short : 4; 43 - unsigned short as : 2; 44 - unsigned short : 4; 45 - unsigned short k : 1; 46 - unsigned short a : 1; 47 - } oac1; 48 - struct { 49 - unsigned short key : 4; 50 - unsigned short : 4; 51 - unsigned short as : 2; 52 - unsigned short : 4; 53 - unsigned short k : 1; 54 - unsigned short a : 1; 55 - } oac2; 56 - }; 57 - }; 58 - 59 - static uaccess_kmsan_or_inline __must_check unsigned long 60 - raw_copy_from_user_key(void *to, const void __user *from, unsigned long size, unsigned long key) 61 - { 62 - unsigned long osize; 63 - union oac spec = { 64 - .oac2.key = key, 65 - .oac2.as = PSW_BITS_AS_SECONDARY, 66 - .oac2.k = 1, 67 - .oac2.a = 1, 68 - }; 69 - int cc; 70 - 71 - while (1) { 72 - osize = size; 73 - asm_inline volatile( 74 - " lr %%r0,%[spec]\n" 75 - "0: mvcos %[to],%[from],%[size]\n" 76 - "1: nopr %%r7\n" 77 - CC_IPM(cc) 78 - EX_TABLE_UA_MVCOS_FROM(0b, 0b) 79 - EX_TABLE_UA_MVCOS_FROM(1b, 0b) 80 - : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char *)to) 81 - : [spec] "d" (spec.val), [from] "Q" (*(const char __user *)from) 82 - : CC_CLOBBER_LIST("memory", "0")); 83 - if (CC_TRANSFORM(cc) == 0) 84 - return osize - size; 85 - size -= 4096; 86 - to += 4096; 87 - from += 4096; 88 - } 89 - } 90 - 91 - unsigned long _copy_from_user_key(void *to, const void __user *from, 92 - unsigned long n, unsigned long key) 93 - { 94 - unsigned long res = n; 95 - 96 - might_fault(); 97 - if (!should_fail_usercopy()) { 98 - instrument_copy_from_user_before(to, from, n); 99 - res = raw_copy_from_user_key(to, from, n, key); 100 - instrument_copy_from_user_after(to, from, n, res); 101 - } 102 - if (unlikely(res)) 103 - memset(to + (n - res), 0, res); 104 - return res; 105 - } 106 - EXPORT_SYMBOL(_copy_from_user_key); 107 - 108 - static uaccess_kmsan_or_inline __must_check unsigned long 109 - 
raw_copy_to_user_key(void __user *to, const void *from, unsigned long size, unsigned long key) 110 - { 111 - unsigned long osize; 112 - union oac spec = { 113 - .oac1.key = key, 114 - .oac1.as = PSW_BITS_AS_SECONDARY, 115 - .oac1.k = 1, 116 - .oac1.a = 1, 117 - }; 118 - int cc; 119 - 120 - while (1) { 121 - osize = size; 122 - asm_inline volatile( 123 - " lr %%r0,%[spec]\n" 124 - "0: mvcos %[to],%[from],%[size]\n" 125 - "1: nopr %%r7\n" 126 - CC_IPM(cc) 127 - EX_TABLE_UA_MVCOS_TO(0b, 0b) 128 - EX_TABLE_UA_MVCOS_TO(1b, 0b) 129 - : CC_OUT(cc, cc), [size] "+d" (size), [to] "=Q" (*(char __user *)to) 130 - : [spec] "d" (spec.val), [from] "Q" (*(const char *)from) 131 - : CC_CLOBBER_LIST("memory", "0")); 132 - if (CC_TRANSFORM(cc) == 0) 133 - return osize - size; 134 - size -= 4096; 135 - to += 4096; 136 - from += 4096; 137 - } 138 - } 139 - 140 - unsigned long _copy_to_user_key(void __user *to, const void *from, 141 - unsigned long n, unsigned long key) 142 - { 143 - might_fault(); 144 - if (should_fail_usercopy()) 145 - return n; 146 - instrument_copy_to_user(to, from, n); 147 - return raw_copy_to_user_key(to, from, n, key); 148 - } 149 - EXPORT_SYMBOL(_copy_to_user_key); 150 - 151 37 #define CMPXCHG_USER_KEY_MAX_LOOPS 128 152 38 153 - static nokprobe_inline int __cmpxchg_user_key_small(unsigned long address, unsigned int *uval, 154 - unsigned int old, unsigned int new, 155 - unsigned int mask, unsigned long key) 39 + static nokprobe_inline int __cmpxchg_key_small(void *address, unsigned int *uval, 40 + unsigned int old, unsigned int new, 41 + unsigned int mask, unsigned long key) 156 42 { 157 43 unsigned long count; 158 44 unsigned int prev; 159 - bool sacf_flag; 160 45 int rc = 0; 161 46 162 47 skey_regions_initialize(); 163 - sacf_flag = enable_sacf_uaccess(); 164 48 asm_inline volatile( 165 49 "20: spka 0(%[key])\n" 166 - " sacf 256\n" 167 50 " llill %[count],%[max_loops]\n" 168 51 "0: l %[prev],%[address]\n" 169 52 "1: nr %[prev],%[mask]\n" ··· 61 178 " nr 
%[tmp],%[mask]\n" 62 179 " jnz 5f\n" 63 180 " brct %[count],2b\n" 64 - "5: sacf 768\n" 65 - " spka %[default_key]\n" 181 + "5: spka %[default_key]\n" 66 182 "21:\n" 67 183 EX_TABLE_UA_LOAD_REG(0b, 5b, %[rc], %[prev]) 68 184 EX_TABLE_UA_LOAD_REG(1b, 5b, %[rc], %[prev]) ··· 79 197 [default_key] "J" (PAGE_DEFAULT_KEY), 80 198 [max_loops] "J" (CMPXCHG_USER_KEY_MAX_LOOPS) 81 199 : "memory", "cc"); 82 - disable_sacf_uaccess(sacf_flag); 83 200 *uval = prev; 84 201 if (!count) 85 202 rc = -EAGAIN; 86 203 return rc; 87 204 } 88 205 89 - int __kprobes __cmpxchg_user_key1(unsigned long address, unsigned char *uval, 90 - unsigned char old, unsigned char new, unsigned long key) 206 + int __kprobes __cmpxchg_key1(void *addr, unsigned char *uval, unsigned char old, 207 + unsigned char new, unsigned long key) 91 208 { 209 + unsigned long address = (unsigned long)addr; 92 210 unsigned int prev, shift, mask, _old, _new; 93 211 int rc; 94 212 ··· 97 215 _old = (unsigned int)old << shift; 98 216 _new = (unsigned int)new << shift; 99 217 mask = ~(0xff << shift); 100 - rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key); 218 + rc = __cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); 101 219 *uval = prev >> shift; 102 220 return rc; 103 221 } 104 - EXPORT_SYMBOL(__cmpxchg_user_key1); 222 + EXPORT_SYMBOL(__cmpxchg_key1); 105 223 106 - int __kprobes __cmpxchg_user_key2(unsigned long address, unsigned short *uval, 107 - unsigned short old, unsigned short new, unsigned long key) 224 + int __kprobes __cmpxchg_key2(void *addr, unsigned short *uval, unsigned short old, 225 + unsigned short new, unsigned long key) 108 226 { 227 + unsigned long address = (unsigned long)addr; 109 228 unsigned int prev, shift, mask, _old, _new; 110 229 int rc; 111 230 ··· 115 232 _old = (unsigned int)old << shift; 116 233 _new = (unsigned int)new << shift; 117 234 mask = ~(0xffff << shift); 118 - rc = __cmpxchg_user_key_small(address, &prev, _old, _new, mask, key); 235 + rc = 
__cmpxchg_key_small((void *)address, &prev, _old, _new, mask, key); 119 236 *uval = prev >> shift; 120 237 return rc; 121 238 } 122 - EXPORT_SYMBOL(__cmpxchg_user_key2); 239 + EXPORT_SYMBOL(__cmpxchg_key2); 123 240 124 - int __kprobes __cmpxchg_user_key4(unsigned long address, unsigned int *uval, 125 - unsigned int old, unsigned int new, unsigned long key) 241 + int __kprobes __cmpxchg_key4(void *address, unsigned int *uval, unsigned int old, 242 + unsigned int new, unsigned long key) 126 243 { 127 244 unsigned int prev = old; 128 - bool sacf_flag; 129 245 int rc = 0; 130 246 131 247 skey_regions_initialize(); 132 - sacf_flag = enable_sacf_uaccess(); 133 248 asm_inline volatile( 134 249 "20: spka 0(%[key])\n" 135 - " sacf 256\n" 136 250 "0: cs %[prev],%[new],%[address]\n" 137 - "1: sacf 768\n" 138 - " spka %[default_key]\n" 251 + "1: spka %[default_key]\n" 139 252 "21:\n" 140 253 EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], %[prev]) 141 254 EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) ··· 143 264 [key] "a" (key << 4), 144 265 [default_key] "J" (PAGE_DEFAULT_KEY) 145 266 : "memory", "cc"); 146 - disable_sacf_uaccess(sacf_flag); 147 267 *uval = prev; 148 268 return rc; 149 269 } 150 - EXPORT_SYMBOL(__cmpxchg_user_key4); 270 + EXPORT_SYMBOL(__cmpxchg_key4); 151 271 152 - int __kprobes __cmpxchg_user_key8(unsigned long address, unsigned long *uval, 153 - unsigned long old, unsigned long new, unsigned long key) 272 + int __kprobes __cmpxchg_key8(void *address, unsigned long *uval, unsigned long old, 273 + unsigned long new, unsigned long key) 154 274 { 155 275 unsigned long prev = old; 156 - bool sacf_flag; 157 276 int rc = 0; 158 277 159 278 skey_regions_initialize(); 160 - sacf_flag = enable_sacf_uaccess(); 161 279 asm_inline volatile( 162 280 "20: spka 0(%[key])\n" 163 - " sacf 256\n" 164 281 "0: csg %[prev],%[new],%[address]\n" 165 - "1: sacf 768\n" 166 - " spka %[default_key]\n" 282 + "1: spka %[default_key]\n" 167 283 "21:\n" 168 284 EX_TABLE_UA_LOAD_REG(0b, 1b, %[rc], 
%[prev]) 169 285 EX_TABLE_UA_LOAD_REG(1b, 1b, %[rc], %[prev]) ··· 170 296 [key] "a" (key << 4), 171 297 [default_key] "J" (PAGE_DEFAULT_KEY) 172 298 : "memory", "cc"); 173 - disable_sacf_uaccess(sacf_flag); 174 299 *uval = prev; 175 300 return rc; 176 301 } 177 - EXPORT_SYMBOL(__cmpxchg_user_key8); 302 + EXPORT_SYMBOL(__cmpxchg_key8); 178 303 179 - int __kprobes __cmpxchg_user_key16(unsigned long address, __uint128_t *uval, 180 - __uint128_t old, __uint128_t new, unsigned long key) 304 + int __kprobes __cmpxchg_key16(void *address, __uint128_t *uval, __uint128_t old, 305 + __uint128_t new, unsigned long key) 181 306 { 182 307 __uint128_t prev = old; 183 - bool sacf_flag; 184 308 int rc = 0; 185 309 186 310 skey_regions_initialize(); 187 - sacf_flag = enable_sacf_uaccess(); 188 311 asm_inline volatile( 189 312 "20: spka 0(%[key])\n" 190 - " sacf 256\n" 191 313 "0: cdsg %[prev],%[new],%[address]\n" 192 - "1: sacf 768\n" 193 - " spka %[default_key]\n" 314 + "1: spka %[default_key]\n" 194 315 "21:\n" 195 316 EX_TABLE_UA_LOAD_REGPAIR(0b, 1b, %[rc], %[prev]) 196 317 EX_TABLE_UA_LOAD_REGPAIR(1b, 1b, %[rc], %[prev]) ··· 197 328 [key] "a" (key << 4), 198 329 [default_key] "J" (PAGE_DEFAULT_KEY) 199 330 : "memory", "cc"); 200 - disable_sacf_uaccess(sacf_flag); 201 331 *uval = prev; 202 332 return rc; 203 333 } 204 - EXPORT_SYMBOL(__cmpxchg_user_key16); 334 + EXPORT_SYMBOL(__cmpxchg_key16);
+2 -36
arch/s390/mm/gmap_helpers.c
··· 34 34 free_swap_and_cache(entry); 35 35 } 36 36 37 - static inline pgste_t pgste_get_lock(pte_t *ptep) 38 - { 39 - unsigned long value = 0; 40 - #ifdef CONFIG_PGSTE 41 - unsigned long *ptr = (unsigned long *)(ptep + PTRS_PER_PTE); 42 - 43 - do { 44 - value = __atomic64_or_barrier(PGSTE_PCL_BIT, ptr); 45 - } while (value & PGSTE_PCL_BIT); 46 - value |= PGSTE_PCL_BIT; 47 - #endif 48 - return __pgste(value); 49 - } 50 - 51 - static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste) 52 - { 53 - #ifdef CONFIG_PGSTE 54 - barrier(); 55 - WRITE_ONCE(*(unsigned long *)(ptep + PTRS_PER_PTE), pgste_val(pgste) & ~PGSTE_PCL_BIT); 56 - #endif 57 - } 58 - 59 37 /** 60 38 * gmap_helper_zap_one_page() - discard a page if it was swapped. 61 39 * @mm: the mm ··· 46 68 void gmap_helper_zap_one_page(struct mm_struct *mm, unsigned long vmaddr) 47 69 { 48 70 struct vm_area_struct *vma; 49 - unsigned long pgstev; 50 71 spinlock_t *ptl; 51 - pgste_t pgste; 52 72 pte_t *ptep; 53 73 54 74 mmap_assert_locked(mm); ··· 61 85 if (unlikely(!ptep)) 62 86 return; 63 87 if (pte_swap(*ptep)) { 64 - preempt_disable(); 65 - pgste = pgste_get_lock(ptep); 66 - pgstev = pgste_val(pgste); 67 - 68 - if ((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED || 69 - (pgstev & _PGSTE_GPS_ZERO)) { 70 - ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); 71 - pte_clear(mm, vmaddr, ptep); 72 - } 73 - 74 - pgste_set_unlock(ptep, pgste); 75 - preempt_enable(); 88 + ptep_zap_softleaf_entry(mm, softleaf_from_pte(*ptep)); 89 + pte_clear(mm, vmaddr, ptep); 76 90 } 77 91 pte_unmap_unlock(ptep, ptl); 78 92 }